From 0e01c72c5645259d9a08a1a7ed39cb5cc41ce311 Mon Sep 17 00:00:00 2001 From: yjijd Date: Tue, 2 Jan 2024 11:46:00 +0800 Subject: [PATCH 001/313] [Clang][LoongArch] Do not pass vector arguments via vector registers (#74990) psABI v2.30 clarifies that vector arguments are passed according to the base ABI by default. --- clang/lib/CodeGen/Targets/LoongArch.cpp | 7 - .../CodeGen/LoongArch/lasx/builtin-alias.c | 4876 ++++++++++++----- clang/test/CodeGen/LoongArch/lasx/builtin.c | 4874 +++++++++++----- .../CodeGen/LoongArch/lsx/builtin-alias.c | 4746 +++++++++++----- clang/test/CodeGen/LoongArch/lsx/builtin.c | 4746 +++++++++++----- 5 files changed, 13485 insertions(+), 5764 deletions(-) diff --git a/clang/lib/CodeGen/Targets/LoongArch.cpp b/clang/lib/CodeGen/Targets/LoongArch.cpp index 7b2c31139b0b2..63b9a1fdb988c 100644 --- a/clang/lib/CodeGen/Targets/LoongArch.cpp +++ b/clang/lib/CodeGen/Targets/LoongArch.cpp @@ -324,13 +324,6 @@ ABIArgInfo LoongArchABIInfo::classifyArgumentType(QualType Ty, bool IsFixed, return ABIArgInfo::getDirect(); } - // Pass 128-bit/256-bit vector values via vector registers directly. - if (Ty->isVectorType() && (((getContext().getTypeSize(Ty) == 128) && - (getTarget().hasFeature("lsx"))) || - ((getContext().getTypeSize(Ty) == 256) && - getTarget().hasFeature("lasx")))) - return ABIArgInfo::getDirect(); - // Complex types for the *f or *d ABI must be passed directly rather than // using CoerceAndExpand. if (IsFixed && Ty->isComplexType() && FRLen && FARsLeft >= 2) { diff --git a/clang/test/CodeGen/LoongArch/lasx/builtin-alias.c b/clang/test/CodeGen/LoongArch/lasx/builtin-alias.c index 09b2d5fcacf53..9a8ce224bcfd0 100644 --- a/clang/test/CodeGen/LoongArch/lasx/builtin-alias.c +++ b/clang/test/CodeGen/LoongArch/lasx/builtin-alias.c @@ -5,4426 +5,6382 @@ // CHECK-LABEL: @xvsll_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsll.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsll.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsll_b(v32i8 _1, v32i8 _2) { return __lasx_xvsll_b(_1, _2); } // CHECK-LABEL: @xvsll_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsll.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsll.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsll_h(v16i16 _1, v16i16 _2) { return __lasx_xvsll_h(_1, _2); } // CHECK-LABEL: @xvsll_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsll.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: 
[[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsll.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsll_w(v8i32 _1, v8i32 _2) { return __lasx_xvsll_w(_1, _2); } // CHECK-LABEL: @xvsll_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsll.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsll.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsll_d(v4i64 _1, v4i64 _2) { return __lasx_xvsll_d(_1, _2); } // CHECK-LABEL: @xvslli_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslli.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslli.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvslli_b(v32i8 _1) { return __lasx_xvslli_b(_1, 1); } // CHECK-LABEL: @xvslli_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslli.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslli.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvslli_h(v16i16 _1) { return __lasx_xvslli_h(_1, 1); } // CHECK-LABEL: @xvslli_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslli.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslli.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvslli_w(v8i32 _1) { return __lasx_xvslli_w(_1, 1); } // CHECK-LABEL: @xvslli_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslli.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslli.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvslli_d(v4i64 _1) { return __lasx_xvslli_d(_1, 1); } // CHECK-LABEL: @xvsra_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsra.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// 
CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsra.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsra_b(v32i8 _1, v32i8 _2) { return __lasx_xvsra_b(_1, _2); } // CHECK-LABEL: @xvsra_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsra.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsra.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsra_h(v16i16 _1, v16i16 _2) { return __lasx_xvsra_h(_1, _2); } // CHECK-LABEL: @xvsra_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsra.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsra.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsra_w(v8i32 _1, v8i32 _2) { return __lasx_xvsra_w(_1, _2); } // CHECK-LABEL: @xvsra_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsra.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsra.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsra_d(v4i64 _1, v4i64 _2) { return __lasx_xvsra_d(_1, _2); } // CHECK-LABEL: @xvsrai_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrai.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrai.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsrai_b(v32i8 _1) { return __lasx_xvsrai_b(_1, 1); } // CHECK-LABEL: @xvsrai_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrai.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> 
@llvm.loongarch.lasx.xvsrai.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsrai_h(v16i16 _1) { return __lasx_xvsrai_h(_1, 1); } // CHECK-LABEL: @xvsrai_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrai.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrai.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsrai_w(v8i32 _1) { return __lasx_xvsrai_w(_1, 1); } // CHECK-LABEL: @xvsrai_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrai.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrai.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsrai_d(v4i64 _1) { return __lasx_xvsrai_d(_1, 1); } // CHECK-LABEL: @xvsrar_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrar.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrar.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsrar_b(v32i8 _1, v32i8 _2) { return __lasx_xvsrar_b(_1, _2); } // CHECK-LABEL: @xvsrar_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrar.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrar.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsrar_h(v16i16 _1, v16i16 _2) { return __lasx_xvsrar_h(_1, _2); } // CHECK-LABEL: @xvsrar_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrar.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrar.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsrar_w(v8i32 _1, v8i32 _2) { return __lasx_xvsrar_w(_1, _2); } // CHECK-LABEL: @xvsrar_d( // 
CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrar.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrar.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsrar_d(v4i64 _1, v4i64 _2) { return __lasx_xvsrar_d(_1, _2); } // CHECK-LABEL: @xvsrari_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrari.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrari.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsrari_b(v32i8 _1) { return __lasx_xvsrari_b(_1, 1); } // CHECK-LABEL: @xvsrari_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrari.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrari.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsrari_h(v16i16 _1) { return __lasx_xvsrari_h(_1, 1); } // CHECK-LABEL: @xvsrari_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrari.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrari.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsrari_w(v8i32 _1) { return __lasx_xvsrari_w(_1, 1); } // CHECK-LABEL: @xvsrari_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrari.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrari.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsrari_d(v4i64 _1) { return __lasx_xvsrari_d(_1, 1); } // CHECK-LABEL: @xvsrl_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrl.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrl.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr 
[[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsrl_b(v32i8 _1, v32i8 _2) { return __lasx_xvsrl_b(_1, _2); } // CHECK-LABEL: @xvsrl_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrl.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrl.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsrl_h(v16i16 _1, v16i16 _2) { return __lasx_xvsrl_h(_1, _2); } // CHECK-LABEL: @xvsrl_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrl.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrl.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsrl_w(v8i32 _1, v8i32 _2) { return __lasx_xvsrl_w(_1, _2); } // CHECK-LABEL: @xvsrl_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrl.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrl.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsrl_d(v4i64 _1, v4i64 _2) { return __lasx_xvsrl_d(_1, _2); } // CHECK-LABEL: @xvsrli_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrli.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrli.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsrli_b(v32i8 _1) { return __lasx_xvsrli_b(_1, 1); } // CHECK-LABEL: @xvsrli_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrli.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrli.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsrli_h(v16i16 _1) { return __lasx_xvsrli_h(_1, 1); } // CHECK-LABEL: @xvsrli_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrli.w(<8 x i32> 
[[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrli.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsrli_w(v8i32 _1) { return __lasx_xvsrli_w(_1, 1); } // CHECK-LABEL: @xvsrli_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrli.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrli.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsrli_d(v4i64 _1) { return __lasx_xvsrli_d(_1, 1); } // CHECK-LABEL: @xvsrlr_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlr.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlr.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsrlr_b(v32i8 _1, v32i8 _2) { return __lasx_xvsrlr_b(_1, _2); } // CHECK-LABEL: @xvsrlr_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlr.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlr.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsrlr_h(v16i16 _1, v16i16 _2) { return __lasx_xvsrlr_h(_1, _2); } // CHECK-LABEL: @xvsrlr_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlr.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlr.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsrlr_w(v8i32 _1, v8i32 _2) { return __lasx_xvsrlr_w(_1, _2); } // CHECK-LABEL: @xvsrlr_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlr.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: 
[[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlr.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsrlr_d(v4i64 _1, v4i64 _2) { return __lasx_xvsrlr_d(_1, _2); } // CHECK-LABEL: @xvsrlri_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlri.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlri.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsrlri_b(v32i8 _1) { return __lasx_xvsrlri_b(_1, 1); } // CHECK-LABEL: @xvsrlri_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlri.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlri.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsrlri_h(v16i16 _1) { return __lasx_xvsrlri_h(_1, 1); } // CHECK-LABEL: @xvsrlri_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlri.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlri.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsrlri_w(v8i32 _1) { return __lasx_xvsrlri_w(_1, 1); } // CHECK-LABEL: @xvsrlri_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlri.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlri.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsrlri_d(v4i64 _1) { return __lasx_xvsrlri_d(_1, 1); } // CHECK-LABEL: @xvbitclr_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitclr.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitclr.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvbitclr_b(v32u8 _1, v32u8 _2) { return __lasx_xvbitclr_b(_1, _2); } // CHECK-LABEL: @xvbitclr_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitclr.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// 
CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitclr.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvbitclr_h(v16u16 _1, v16u16 _2) { return __lasx_xvbitclr_h(_1, _2); } // CHECK-LABEL: @xvbitclr_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitclr.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitclr.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvbitclr_w(v8u32 _1, v8u32 _2) { return __lasx_xvbitclr_w(_1, _2); } // CHECK-LABEL: @xvbitclr_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitclr.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitclr.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvbitclr_d(v4u64 _1, v4u64 _2) { return __lasx_xvbitclr_d(_1, _2); } // CHECK-LABEL: @xvbitclri_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitclri.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitclri.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvbitclri_b(v32u8 _1) { return __lasx_xvbitclri_b(_1, 1); } // CHECK-LABEL: @xvbitclri_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitclri.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitclri.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvbitclri_h(v16u16 _1) { return __lasx_xvbitclri_h(_1, 1); } // CHECK-LABEL: @xvbitclri_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitclri.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitclri.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> 
[[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvbitclri_w(v8u32 _1) { return __lasx_xvbitclri_w(_1, 1); } // CHECK-LABEL: @xvbitclri_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitclri.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitclri.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvbitclri_d(v4u64 _1) { return __lasx_xvbitclri_d(_1, 1); } // CHECK-LABEL: @xvbitset_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitset.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitset.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvbitset_b(v32u8 _1, v32u8 _2) { return __lasx_xvbitset_b(_1, _2); } // CHECK-LABEL: @xvbitset_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitset.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitset.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvbitset_h(v16u16 _1, v16u16 _2) { return __lasx_xvbitset_h(_1, _2); } // CHECK-LABEL: @xvbitset_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitset.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitset.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvbitset_w(v8u32 _1, v8u32 _2) { return __lasx_xvbitset_w(_1, _2); } // CHECK-LABEL: @xvbitset_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitset.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitset.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 
xvbitset_d(v4u64 _1, v4u64 _2) { return __lasx_xvbitset_d(_1, _2); } // CHECK-LABEL: @xvbitseti_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitseti.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitseti.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvbitseti_b(v32u8 _1) { return __lasx_xvbitseti_b(_1, 1); } // CHECK-LABEL: @xvbitseti_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitseti.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitseti.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvbitseti_h(v16u16 _1) { return __lasx_xvbitseti_h(_1, 1); } // CHECK-LABEL: @xvbitseti_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitseti.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitseti.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvbitseti_w(v8u32 _1) { return __lasx_xvbitseti_w(_1, 1); } // CHECK-LABEL: @xvbitseti_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitseti.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitseti.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvbitseti_d(v4u64 _1) { return __lasx_xvbitseti_d(_1, 1); } // CHECK-LABEL: @xvbitrev_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitrev.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitrev.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvbitrev_b(v32u8 _1, v32u8 _2) { return __lasx_xvbitrev_b(_1, _2); } // CHECK-LABEL: @xvbitrev_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitrev.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] 
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitrev.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvbitrev_h(v16u16 _1, v16u16 _2) { return __lasx_xvbitrev_h(_1, _2); } // CHECK-LABEL: @xvbitrev_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitrev.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitrev.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvbitrev_w(v8u32 _1, v8u32 _2) { return __lasx_xvbitrev_w(_1, _2); } // CHECK-LABEL: @xvbitrev_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitrev.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitrev.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvbitrev_d(v4u64 _1, v4u64 _2) { return __lasx_xvbitrev_d(_1, _2); } // CHECK-LABEL: @xvbitrevi_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitrevi.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitrevi.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvbitrevi_b(v32u8 _1) { return __lasx_xvbitrevi_b(_1, 1); } // CHECK-LABEL: @xvbitrevi_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitrevi.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitrevi.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvbitrevi_h(v16u16 _1) { return __lasx_xvbitrevi_h(_1, 1); } // CHECK-LABEL: @xvbitrevi_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitrevi.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitrevi.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvbitrevi_w(v8u32 _1) { return __lasx_xvbitrevi_w(_1, 1); } // CHECK-LABEL: @xvbitrevi_d( 
// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitrevi.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitrevi.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvbitrevi_d(v4u64 _1) { return __lasx_xvbitrevi_d(_1, 1); } // CHECK-LABEL: @xvadd_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvadd.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvadd.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvadd_b(v32i8 _1, v32i8 _2) { return __lasx_xvadd_b(_1, _2); } // CHECK-LABEL: @xvadd_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvadd.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvadd.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvadd_h(v16i16 _1, v16i16 _2) { return __lasx_xvadd_h(_1, _2); } // CHECK-LABEL: @xvadd_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvadd.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvadd.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvadd_w(v8i32 _1, v8i32 _2) { return __lasx_xvadd_w(_1, _2); } // CHECK-LABEL: @xvadd_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadd.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadd.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvadd_d(v4i64 _1, v4i64 _2) { return __lasx_xvadd_d(_1, _2); } // CHECK-LABEL: @xvaddi_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvaddi.bu(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret 
<32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvaddi.bu(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvaddi_bu(v32i8 _1) { return __lasx_xvaddi_bu(_1, 1); } // CHECK-LABEL: @xvaddi_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddi.hu(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddi.hu(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvaddi_hu(v16i16 _1) { return __lasx_xvaddi_hu(_1, 1); } // CHECK-LABEL: @xvaddi_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddi.wu(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddi.wu(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvaddi_wu(v8i32 _1) { return __lasx_xvaddi_wu(_1, 1); } // CHECK-LABEL: @xvaddi_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddi.du(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddi.du(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvaddi_du(v4i64 _1) { return __lasx_xvaddi_du(_1, 1); } // CHECK-LABEL: @xvsub_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsub.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsub.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsub_b(v32i8 _1, v32i8 _2) { return __lasx_xvsub_b(_1, _2); } // CHECK-LABEL: @xvsub_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsub.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsub.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsub_h(v16i16 _1, v16i16 _2) { return __lasx_xvsub_h(_1, _2); 
} // CHECK-LABEL: @xvsub_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsub.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsub.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsub_w(v8i32 _1, v8i32 _2) { return __lasx_xvsub_w(_1, _2); } // CHECK-LABEL: @xvsub_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsub.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsub.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsub_d(v4i64 _1, v4i64 _2) { return __lasx_xvsub_d(_1, _2); } // CHECK-LABEL: @xvsubi_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsubi.bu(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsubi.bu(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsubi_bu(v32i8 _1) { return __lasx_xvsubi_bu(_1, 1); } // CHECK-LABEL: @xvsubi_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubi.hu(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubi.hu(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsubi_hu(v16i16 _1) { return __lasx_xvsubi_hu(_1, 1); } // CHECK-LABEL: @xvsubi_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubi.wu(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubi.wu(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsubi_wu(v8i32 _1) { return __lasx_xvsubi_wu(_1, 1); } // CHECK-LABEL: @xvsubi_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubi.du(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubi.du(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: 
store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsubi_du(v4i64 _1) { return __lasx_xvsubi_du(_1, 1); } // CHECK-LABEL: @xvmax_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmax.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmax.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvmax_b(v32i8 _1, v32i8 _2) { return __lasx_xvmax_b(_1, _2); } // CHECK-LABEL: @xvmax_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmax.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmax.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmax_h(v16i16 _1, v16i16 _2) { return __lasx_xvmax_h(_1, _2); } // CHECK-LABEL: @xvmax_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmax.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmax.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmax_w(v8i32 _1, v8i32 _2) { return __lasx_xvmax_w(_1, _2); } // CHECK-LABEL: @xvmax_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmax.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmax.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmax_d(v4i64 _1, v4i64 _2) { return __lasx_xvmax_d(_1, _2); } // CHECK-LABEL: @xvmaxi_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmaxi.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmaxi.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvmaxi_b(v32i8 _1) { return __lasx_xvmaxi_b(_1, 1); } // 
CHECK-LABEL: @xvmaxi_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaxi.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaxi.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmaxi_h(v16i16 _1) { return __lasx_xvmaxi_h(_1, 1); } // CHECK-LABEL: @xvmaxi_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaxi.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaxi.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmaxi_w(v8i32 _1) { return __lasx_xvmaxi_w(_1, 1); } // CHECK-LABEL: @xvmaxi_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaxi.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaxi.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmaxi_d(v4i64 _1) { return __lasx_xvmaxi_d(_1, 1); } // CHECK-LABEL: @xvmax_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmax.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmax.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvmax_bu(v32u8 _1, v32u8 _2) { return __lasx_xvmax_bu(_1, _2); } // CHECK-LABEL: @xvmax_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmax.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmax.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvmax_hu(v16u16 _1, v16u16 _2) { return __lasx_xvmax_hu(_1, _2); } // CHECK-LABEL: @xvmax_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmax.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmax.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvmax_wu(v8u32 _1, v8u32 _2) { return __lasx_xvmax_wu(_1, _2); } // CHECK-LABEL: @xvmax_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmax.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmax.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvmax_du(v4u64 _1, v4u64 _2) { return __lasx_xvmax_du(_1, _2); } // CHECK-LABEL: @xvmaxi_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmaxi.bu(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmaxi.bu(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvmaxi_bu(v32u8 _1) { return __lasx_xvmaxi_bu(_1, 1); } // CHECK-LABEL: @xvmaxi_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaxi.hu(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaxi.hu(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvmaxi_hu(v16u16 _1) { return __lasx_xvmaxi_hu(_1, 1); } // CHECK-LABEL: @xvmaxi_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaxi.wu(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaxi.wu(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvmaxi_wu(v8u32 _1) { return __lasx_xvmaxi_wu(_1, 1); } // CHECK-LABEL: @xvmaxi_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaxi.du(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaxi.du(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvmaxi_du(v4u64 _1) { return __lasx_xvmaxi_du(_1, 1); } // CHECK-LABEL: @xvmin_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmin.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: 
[[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmin.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvmin_b(v32i8 _1, v32i8 _2) { return __lasx_xvmin_b(_1, _2); } // CHECK-LABEL: @xvmin_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmin.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmin.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmin_h(v16i16 _1, v16i16 _2) { return __lasx_xvmin_h(_1, _2); } // CHECK-LABEL: @xvmin_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmin.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmin.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmin_w(v8i32 _1, v8i32 _2) { return __lasx_xvmin_w(_1, _2); } // CHECK-LABEL: @xvmin_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmin.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmin.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmin_d(v4i64 _1, v4i64 _2) { return __lasx_xvmin_d(_1, _2); } // CHECK-LABEL: @xvmini_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmini.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmini.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvmini_b(v32i8 _1) { return __lasx_xvmini_b(_1, 1); } // CHECK-LABEL: @xvmini_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmini.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmini.h(<16 x i16> [[_1]], i32 
1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmini_h(v16i16 _1) { return __lasx_xvmini_h(_1, 1); } // CHECK-LABEL: @xvmini_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmini.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmini.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmini_w(v8i32 _1) { return __lasx_xvmini_w(_1, 1); } // CHECK-LABEL: @xvmini_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmini.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmini.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmini_d(v4i64 _1) { return __lasx_xvmini_d(_1, 1); } // CHECK-LABEL: @xvmin_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmin.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmin.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvmin_bu(v32u8 _1, v32u8 _2) { return __lasx_xvmin_bu(_1, _2); } // CHECK-LABEL: @xvmin_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmin.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmin.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvmin_hu(v16u16 _1, v16u16 _2) { return __lasx_xvmin_hu(_1, _2); } // CHECK-LABEL: @xvmin_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmin.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmin.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvmin_wu(v8u32 _1, v8u32 _2) { return __lasx_xvmin_wu(_1, _2); } // CHECK-LABEL: @xvmin_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail 
call <4 x i64> @llvm.loongarch.lasx.xvmin.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmin.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvmin_du(v4u64 _1, v4u64 _2) { return __lasx_xvmin_du(_1, _2); } // CHECK-LABEL: @xvmini_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmini.bu(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmini.bu(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvmini_bu(v32u8 _1) { return __lasx_xvmini_bu(_1, 1); } // CHECK-LABEL: @xvmini_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmini.hu(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmini.hu(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvmini_hu(v16u16 _1) { return __lasx_xvmini_hu(_1, 1); } // CHECK-LABEL: @xvmini_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmini.wu(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmini.wu(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvmini_wu(v8u32 _1) { return __lasx_xvmini_wu(_1, 1); } // CHECK-LABEL: @xvmini_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmini.du(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmini.du(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvmini_du(v4u64 _1) { return __lasx_xvmini_du(_1, 1); } // CHECK-LABEL: @xvseq_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvseq.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvseq.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-NEXT: ret void // v32i8 xvseq_b(v32i8 _1, v32i8 _2) { return __lasx_xvseq_b(_1, _2); } // CHECK-LABEL: @xvseq_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvseq.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvseq.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvseq_h(v16i16 _1, v16i16 _2) { return __lasx_xvseq_h(_1, _2); } // CHECK-LABEL: @xvseq_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvseq.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvseq.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvseq_w(v8i32 _1, v8i32 _2) { return __lasx_xvseq_w(_1, _2); } // CHECK-LABEL: @xvseq_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvseq.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvseq.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvseq_d(v4i64 _1, v4i64 _2) { return __lasx_xvseq_d(_1, _2); } // CHECK-LABEL: @xvseqi_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvseqi.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvseqi.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvseqi_b(v32i8 _1) { return __lasx_xvseqi_b(_1, 1); } // CHECK-LABEL: @xvseqi_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvseqi.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvseqi.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvseqi_h(v16i16 _1) { return __lasx_xvseqi_h(_1, 1); } // CHECK-LABEL: @xvseqi_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvseqi.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> 
[[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvseqi.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvseqi_w(v8i32 _1) { return __lasx_xvseqi_w(_1, 1); } // CHECK-LABEL: @xvseqi_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvseqi.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvseqi.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvseqi_d(v4i64 _1) { return __lasx_xvseqi_d(_1, 1); } // CHECK-LABEL: @xvslt_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslt.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslt.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvslt_b(v32i8 _1, v32i8 _2) { return __lasx_xvslt_b(_1, _2); } // CHECK-LABEL: @xvslt_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslt.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslt.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvslt_h(v16i16 _1, v16i16 _2) { return __lasx_xvslt_h(_1, _2); } // CHECK-LABEL: @xvslt_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslt.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslt.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvslt_w(v8i32 _1, v8i32 _2) { return __lasx_xvslt_w(_1, _2); } // CHECK-LABEL: @xvslt_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslt.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslt.d(<4 x i64> 
[[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvslt_d(v4i64 _1, v4i64 _2) { return __lasx_xvslt_d(_1, _2); } // CHECK-LABEL: @xvslti_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslti.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslti.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvslti_b(v32i8 _1) { return __lasx_xvslti_b(_1, 1); } // CHECK-LABEL: @xvslti_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslti.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslti.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvslti_h(v16i16 _1) { return __lasx_xvslti_h(_1, 1); } // CHECK-LABEL: @xvslti_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslti.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslti.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvslti_w(v8i32 _1) { return __lasx_xvslti_w(_1, 1); } // CHECK-LABEL: @xvslti_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslti.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslti.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvslti_d(v4i64 _1) { return __lasx_xvslti_d(_1, 1); } // CHECK-LABEL: @xvslt_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslt.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslt.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvslt_bu(v32u8 _1, v32u8 _2) { return __lasx_xvslt_bu(_1, _2); } // CHECK-LABEL: @xvslt_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslt.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: 
[[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslt.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvslt_hu(v16u16 _1, v16u16 _2) { return __lasx_xvslt_hu(_1, _2); } // CHECK-LABEL: @xvslt_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslt.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslt.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvslt_wu(v8u32 _1, v8u32 _2) { return __lasx_xvslt_wu(_1, _2); } // CHECK-LABEL: @xvslt_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslt.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslt.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvslt_du(v4u64 _1, v4u64 _2) { return __lasx_xvslt_du(_1, _2); } // CHECK-LABEL: @xvslti_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslti.bu(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslti.bu(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvslti_bu(v32u8 _1) { return __lasx_xvslti_bu(_1, 1); } // CHECK-LABEL: @xvslti_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslti.hu(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslti.hu(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvslti_hu(v16u16 _1) { return __lasx_xvslti_hu(_1, 1); } // CHECK-LABEL: @xvslti_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslti.wu(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslti.wu(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvslti_wu(v8u32 _1) { return __lasx_xvslti_wu(_1, 1); } // 
CHECK-LABEL: @xvslti_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslti.du(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslti.du(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvslti_du(v4u64 _1) { return __lasx_xvslti_du(_1, 1); } // CHECK-LABEL: @xvsle_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsle.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsle.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsle_b(v32i8 _1, v32i8 _2) { return __lasx_xvsle_b(_1, _2); } // CHECK-LABEL: @xvsle_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsle.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsle.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsle_h(v16i16 _1, v16i16 _2) { return __lasx_xvsle_h(_1, _2); } // CHECK-LABEL: @xvsle_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsle.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsle.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsle_w(v8i32 _1, v8i32 _2) { return __lasx_xvsle_w(_1, _2); } // CHECK-LABEL: @xvsle_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsle.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsle.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsle_d(v4i64 _1, v4i64 _2) { return __lasx_xvsle_d(_1, _2); } // CHECK-LABEL: @xvslei_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslei.b(<32 x i8> [[_1:%.*]], i32 1) -// 
CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslei.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvslei_b(v32i8 _1) { return __lasx_xvslei_b(_1, 1); } // CHECK-LABEL: @xvslei_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslei.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslei.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvslei_h(v16i16 _1) { return __lasx_xvslei_h(_1, 1); } // CHECK-LABEL: @xvslei_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslei.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslei.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvslei_w(v8i32 _1) { return __lasx_xvslei_w(_1, 1); } // CHECK-LABEL: @xvslei_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslei.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslei.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvslei_d(v4i64 _1) { return __lasx_xvslei_d(_1, 1); } // CHECK-LABEL: @xvsle_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsle.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsle.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsle_bu(v32u8 _1, v32u8 _2) { return __lasx_xvsle_bu(_1, _2); } // CHECK-LABEL: @xvsle_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsle.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsle.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsle_hu(v16u16 _1, v16u16 _2) { return 
__lasx_xvsle_hu(_1, _2); } // CHECK-LABEL: @xvsle_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsle.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsle.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsle_wu(v8u32 _1, v8u32 _2) { return __lasx_xvsle_wu(_1, _2); } // CHECK-LABEL: @xvsle_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsle.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsle.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsle_du(v4u64 _1, v4u64 _2) { return __lasx_xvsle_du(_1, _2); } // CHECK-LABEL: @xvslei_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslei.bu(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslei.bu(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvslei_bu(v32u8 _1) { return __lasx_xvslei_bu(_1, 1); } // CHECK-LABEL: @xvslei_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslei.hu(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslei.hu(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvslei_hu(v16u16 _1) { return __lasx_xvslei_hu(_1, 1); } // CHECK-LABEL: @xvslei_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslei.wu(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslei.wu(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvslei_wu(v8u32 _1) { return __lasx_xvslei_wu(_1, 1); } // CHECK-LABEL: @xvslei_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslei.du(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslei.du(<4 x 
i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvslei_du(v4u64 _1) { return __lasx_xvslei_du(_1, 1); } // CHECK-LABEL: @xvsat_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsat.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsat.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsat_b(v32i8 _1) { return __lasx_xvsat_b(_1, 1); } // CHECK-LABEL: @xvsat_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsat.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsat.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsat_h(v16i16 _1) { return __lasx_xvsat_h(_1, 1); } // CHECK-LABEL: @xvsat_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsat.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsat.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsat_w(v8i32 _1) { return __lasx_xvsat_w(_1, 1); } // CHECK-LABEL: @xvsat_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsat.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsat.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsat_d(v4i64 _1) { return __lasx_xvsat_d(_1, 1); } // CHECK-LABEL: @xvsat_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsat.bu(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsat.bu(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvsat_bu(v32u8 _1) { return __lasx_xvsat_bu(_1, 1); } // CHECK-LABEL: @xvsat_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsat.hu(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsat.hu(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-NEXT: ret void // v16u16 xvsat_hu(v16u16 _1) { return __lasx_xvsat_hu(_1, 1); } // CHECK-LABEL: @xvsat_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsat.wu(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsat.wu(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvsat_wu(v8u32 _1) { return __lasx_xvsat_wu(_1, 1); } // CHECK-LABEL: @xvsat_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsat.du(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsat.du(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvsat_du(v4u64 _1) { return __lasx_xvsat_du(_1, 1); } // CHECK-LABEL: @xvadda_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvadda.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvadda.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvadda_b(v32i8 _1, v32i8 _2) { return __lasx_xvadda_b(_1, _2); } // CHECK-LABEL: @xvadda_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvadda.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvadda.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvadda_h(v16i16 _1, v16i16 _2) { return __lasx_xvadda_h(_1, _2); } // CHECK-LABEL: @xvadda_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvadda.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvadda.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvadda_w(v8i32 _1, v8i32 _2) { return __lasx_xvadda_w(_1, _2); } // CHECK-LABEL: @xvadda_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadda.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// 
CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadda.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvadda_d(v4i64 _1, v4i64 _2) { return __lasx_xvadda_d(_1, _2); } // CHECK-LABEL: @xvsadd_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsadd.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsadd.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsadd_b(v32i8 _1, v32i8 _2) { return __lasx_xvsadd_b(_1, _2); } // CHECK-LABEL: @xvsadd_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsadd.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsadd.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsadd_h(v16i16 _1, v16i16 _2) { return __lasx_xvsadd_h(_1, _2); } // CHECK-LABEL: @xvsadd_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsadd.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsadd.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsadd_w(v8i32 _1, v8i32 _2) { return __lasx_xvsadd_w(_1, _2); } // CHECK-LABEL: @xvsadd_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsadd.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsadd.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsadd_d(v4i64 _1, v4i64 _2) { return __lasx_xvsadd_d(_1, _2); } // CHECK-LABEL: @xvsadd_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsadd.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 
x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsadd.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvsadd_bu(v32u8 _1, v32u8 _2) { return __lasx_xvsadd_bu(_1, _2); } // CHECK-LABEL: @xvsadd_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsadd.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsadd.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvsadd_hu(v16u16 _1, v16u16 _2) { return __lasx_xvsadd_hu(_1, _2); } // CHECK-LABEL: @xvsadd_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsadd.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsadd.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvsadd_wu(v8u32 _1, v8u32 _2) { return __lasx_xvsadd_wu(_1, _2); } // CHECK-LABEL: @xvsadd_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsadd.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsadd.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvsadd_du(v4u64 _1, v4u64 _2) { return __lasx_xvsadd_du(_1, _2); } // CHECK-LABEL: @xvavg_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavg.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavg.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvavg_b(v32i8 _1, v32i8 _2) { return __lasx_xvavg_b(_1, _2); } // CHECK-LABEL: @xvavg_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavg.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 
x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavg.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvavg_h(v16i16 _1, v16i16 _2) { return __lasx_xvavg_h(_1, _2); } // CHECK-LABEL: @xvavg_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavg.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavg.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvavg_w(v8i32 _1, v8i32 _2) { return __lasx_xvavg_w(_1, _2); } // CHECK-LABEL: @xvavg_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavg.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavg.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvavg_d(v4i64 _1, v4i64 _2) { return __lasx_xvavg_d(_1, _2); } // CHECK-LABEL: @xvavg_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavg.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavg.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvavg_bu(v32u8 _1, v32u8 _2) { return __lasx_xvavg_bu(_1, _2); } // CHECK-LABEL: @xvavg_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavg.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavg.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvavg_hu(v16u16 _1, v16u16 _2) { return __lasx_xvavg_hu(_1, _2); } // CHECK-LABEL: @xvavg_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavg.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] 
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavg.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvavg_wu(v8u32 _1, v8u32 _2) { return __lasx_xvavg_wu(_1, _2); } // CHECK-LABEL: @xvavg_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavg.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavg.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvavg_du(v4u64 _1, v4u64 _2) { return __lasx_xvavg_du(_1, _2); } // CHECK-LABEL: @xvavgr_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavgr.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavgr.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvavgr_b(v32i8 _1, v32i8 _2) { return __lasx_xvavgr_b(_1, _2); } // CHECK-LABEL: @xvavgr_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavgr.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavgr.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvavgr_h(v16i16 _1, v16i16 _2) { return __lasx_xvavgr_h(_1, _2); } // CHECK-LABEL: @xvavgr_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavgr.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavgr.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvavgr_w(v8i32 _1, v8i32 _2) { return __lasx_xvavgr_w(_1, _2); } // CHECK-LABEL: @xvavgr_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavgr.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// 
CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavgr.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvavgr_d(v4i64 _1, v4i64 _2) { return __lasx_xvavgr_d(_1, _2); } // CHECK-LABEL: @xvavgr_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavgr.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavgr.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvavgr_bu(v32u8 _1, v32u8 _2) { return __lasx_xvavgr_bu(_1, _2); } // CHECK-LABEL: @xvavgr_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavgr.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavgr.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvavgr_hu(v16u16 _1, v16u16 _2) { return __lasx_xvavgr_hu(_1, _2); } // CHECK-LABEL: @xvavgr_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavgr.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavgr.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvavgr_wu(v8u32 _1, v8u32 _2) { return __lasx_xvavgr_wu(_1, _2); } // CHECK-LABEL: @xvavgr_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavgr.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavgr.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvavgr_du(v4u64 _1, v4u64 _2) { return __lasx_xvavgr_du(_1, _2); } // CHECK-LABEL: @xvssub_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssub.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// 
CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssub.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvssub_b(v32i8 _1, v32i8 _2) { return __lasx_xvssub_b(_1, _2); } // CHECK-LABEL: @xvssub_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssub.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssub.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvssub_h(v16i16 _1, v16i16 _2) { return __lasx_xvssub_h(_1, _2); } // CHECK-LABEL: @xvssub_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssub.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssub.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvssub_w(v8i32 _1, v8i32 _2) { return __lasx_xvssub_w(_1, _2); } // CHECK-LABEL: @xvssub_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssub.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssub.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvssub_d(v4i64 _1, v4i64 _2) { return __lasx_xvssub_d(_1, _2); } // CHECK-LABEL: @xvssub_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssub.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssub.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvssub_bu(v32u8 _1, v32u8 _2) { return __lasx_xvssub_bu(_1, _2); } // CHECK-LABEL: @xvssub_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssub.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// 
CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssub.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvssub_hu(v16u16 _1, v16u16 _2) { return __lasx_xvssub_hu(_1, _2); } // CHECK-LABEL: @xvssub_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssub.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssub.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvssub_wu(v8u32 _1, v8u32 _2) { return __lasx_xvssub_wu(_1, _2); } // CHECK-LABEL: @xvssub_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssub.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssub.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvssub_du(v4u64 _1, v4u64 _2) { return __lasx_xvssub_du(_1, _2); } // CHECK-LABEL: @xvabsd_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvabsd.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvabsd.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvabsd_b(v32i8 _1, v32i8 _2) { return __lasx_xvabsd_b(_1, _2); } // CHECK-LABEL: @xvabsd_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvabsd.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvabsd.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvabsd_h(v16i16 _1, v16i16 _2) { return __lasx_xvabsd_h(_1, _2); } // CHECK-LABEL: @xvabsd_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvabsd.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> 
[[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvabsd.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvabsd_w(v8i32 _1, v8i32 _2) { return __lasx_xvabsd_w(_1, _2); } // CHECK-LABEL: @xvabsd_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvabsd.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvabsd.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvabsd_d(v4i64 _1, v4i64 _2) { return __lasx_xvabsd_d(_1, _2); } // CHECK-LABEL: @xvabsd_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvabsd.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvabsd.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvabsd_bu(v32u8 _1, v32u8 _2) { return __lasx_xvabsd_bu(_1, _2); } // CHECK-LABEL: @xvabsd_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvabsd.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvabsd.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvabsd_hu(v16u16 _1, v16u16 _2) { return __lasx_xvabsd_hu(_1, _2); } // CHECK-LABEL: @xvabsd_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvabsd.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvabsd.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvabsd_wu(v8u32 _1, v8u32 _2) { return __lasx_xvabsd_wu(_1, _2); } // CHECK-LABEL: @xvabsd_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvabsd.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x 
i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvabsd.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvabsd_du(v4u64 _1, v4u64 _2) { return __lasx_xvabsd_du(_1, _2); } // CHECK-LABEL: @xvmul_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmul.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmul.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvmul_b(v32i8 _1, v32i8 _2) { return __lasx_xvmul_b(_1, _2); } // CHECK-LABEL: @xvmul_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmul.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmul.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmul_h(v16i16 _1, v16i16 _2) { return __lasx_xvmul_h(_1, _2); } // CHECK-LABEL: @xvmul_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmul.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmul.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmul_w(v8i32 _1, v8i32 _2) { return __lasx_xvmul_w(_1, _2); } // CHECK-LABEL: @xvmul_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmul.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmul.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmul_d(v4i64 _1, v4i64 _2) { return __lasx_xvmul_d(_1, _2); } // CHECK-LABEL: @xvmadd_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmadd.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// 
CHECK-NEXT: [[_136:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmadd.b(<32 x i8> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]]) +// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvmadd_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __lasx_xvmadd_b(_1, _2, _3); } // CHECK-LABEL: @xvmadd_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmadd.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmadd.h(<16 x i16> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]]) +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmadd_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __lasx_xvmadd_h(_1, _2, _3); } // CHECK-LABEL: @xvmadd_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmadd.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmadd.w(<8 x i32> [[_136]], <8 x i32> [[_247]], <8 x i32> [[_358]]) +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmadd_w(v8i32 _1, v8i32 _2, v8i32 _3) { return __lasx_xvmadd_w(_1, _2, _3); } // CHECK-LABEL: @xvmadd_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmadd.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmadd.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmadd_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __lasx_xvmadd_d(_1, _2, _3); } // CHECK-LABEL: @xvmsub_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmsub.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_136:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, 
ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmsub.b(<32 x i8> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]]) +// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvmsub_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __lasx_xvmsub_b(_1, _2, _3); } // CHECK-LABEL: @xvmsub_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmsub.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmsub.h(<16 x i16> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]]) +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmsub_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __lasx_xvmsub_h(_1, _2, _3); } // CHECK-LABEL: @xvmsub_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmsub.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmsub.w(<8 x i32> [[_136]], <8 x i32> [[_247]], <8 x i32> [[_358]]) +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmsub_w(v8i32 _1, v8i32 _2, v8i32 _3) { return __lasx_xvmsub_w(_1, _2, _3); } // CHECK-LABEL: @xvmsub_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmsub.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmsub.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmsub_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __lasx_xvmsub_d(_1, _2, _3); } // CHECK-LABEL: @xvdiv_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvdiv.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvdiv.b(<32 x i8> [[_124]], <32 x i8> 
[[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvdiv_b(v32i8 _1, v32i8 _2) { return __lasx_xvdiv_b(_1, _2); } // CHECK-LABEL: @xvdiv_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvdiv.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvdiv.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvdiv_h(v16i16 _1, v16i16 _2) { return __lasx_xvdiv_h(_1, _2); } // CHECK-LABEL: @xvdiv_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvdiv.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvdiv.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvdiv_w(v8i32 _1, v8i32 _2) { return __lasx_xvdiv_w(_1, _2); } // CHECK-LABEL: @xvdiv_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvdiv.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvdiv.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvdiv_d(v4i64 _1, v4i64 _2) { return __lasx_xvdiv_d(_1, _2); } // CHECK-LABEL: @xvdiv_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvdiv.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvdiv.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvdiv_bu(v32u8 _1, v32u8 _2) { return __lasx_xvdiv_bu(_1, _2); } // CHECK-LABEL: @xvdiv_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvdiv.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvdiv.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// 
CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvdiv_hu(v16u16 _1, v16u16 _2) { return __lasx_xvdiv_hu(_1, _2); } // CHECK-LABEL: @xvdiv_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvdiv.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvdiv.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvdiv_wu(v8u32 _1, v8u32 _2) { return __lasx_xvdiv_wu(_1, _2); } // CHECK-LABEL: @xvdiv_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvdiv.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvdiv.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvdiv_du(v4u64 _1, v4u64 _2) { return __lasx_xvdiv_du(_1, _2); } // CHECK-LABEL: @xvhaddw_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhaddw.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhaddw.h.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvhaddw_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvhaddw_h_b(_1, _2); } // CHECK-LABEL: @xvhaddw_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhaddw.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhaddw.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvhaddw_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvhaddw_w_h(_1, _2); } // CHECK-LABEL: @xvhaddw_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.d.w(<8 x i32> 
[[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvhaddw_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvhaddw_d_w(_1, _2); } // CHECK-LABEL: @xvhaddw_hu_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhaddw.hu.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhaddw.hu.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvhaddw_hu_bu(v32u8 _1, v32u8 _2) { return __lasx_xvhaddw_hu_bu(_1, _2); } // CHECK-LABEL: @xvhaddw_wu_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhaddw.wu.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhaddw.wu.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvhaddw_wu_hu(v16u16 _1, v16u16 _2) { return __lasx_xvhaddw_wu_hu(_1, _2); } // CHECK-LABEL: @xvhaddw_du_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.du.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.du.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvhaddw_du_wu(v8u32 _1, v8u32 _2) { return __lasx_xvhaddw_du_wu(_1, _2); } // CHECK-LABEL: @xvhsubw_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhsubw.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhsubw.h.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvhsubw_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvhsubw_h_b(_1, _2); } // CHECK-LABEL: @xvhsubw_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhsubw.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, 
!tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhsubw.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvhsubw_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvhsubw_w_h(_1, _2); } // CHECK-LABEL: @xvhsubw_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.d.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvhsubw_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvhsubw_d_w(_1, _2); } // CHECK-LABEL: @xvhsubw_hu_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhsubw.hu.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhsubw.hu.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvhsubw_hu_bu(v32u8 _1, v32u8 _2) { return __lasx_xvhsubw_hu_bu(_1, _2); } // CHECK-LABEL: @xvhsubw_wu_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhsubw.wu.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhsubw.wu.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvhsubw_wu_hu(v16u16 _1, v16u16 _2) { return __lasx_xvhsubw_wu_hu(_1, _2); } // CHECK-LABEL: @xvhsubw_du_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.du.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.du.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvhsubw_du_wu(v8u32 _1, v8u32 _2) { return __lasx_xvhsubw_du_wu(_1, _2); } // CHECK-LABEL: @xvmod_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmod.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 
32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmod.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvmod_b(v32i8 _1, v32i8 _2) { return __lasx_xvmod_b(_1, _2); } // CHECK-LABEL: @xvmod_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmod.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmod.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmod_h(v16i16 _1, v16i16 _2) { return __lasx_xvmod_h(_1, _2); } // CHECK-LABEL: @xvmod_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmod.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmod.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmod_w(v8i32 _1, v8i32 _2) { return __lasx_xvmod_w(_1, _2); } // CHECK-LABEL: @xvmod_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmod.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmod.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmod_d(v4i64 _1, v4i64 _2) { return __lasx_xvmod_d(_1, _2); } // CHECK-LABEL: @xvmod_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmod.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmod.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvmod_bu(v32u8 _1, v32u8 _2) { return __lasx_xvmod_bu(_1, _2); } // CHECK-LABEL: @xvmod_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmod.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmod.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvmod_hu(v16u16 _1, v16u16 _2) { return __lasx_xvmod_hu(_1, _2); } // CHECK-LABEL: @xvmod_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmod.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmod.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvmod_wu(v8u32 _1, v8u32 _2) { return __lasx_xvmod_wu(_1, _2); } // CHECK-LABEL: @xvmod_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmod.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmod.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvmod_du(v4u64 _1, v4u64 _2) { return __lasx_xvmod_du(_1, _2); } // CHECK-LABEL: @xvrepl128vei_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrepl128vei.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrepl128vei.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvrepl128vei_b(v32i8 _1) { return __lasx_xvrepl128vei_b(_1, 1); } // CHECK-LABEL: @xvrepl128vei_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrepl128vei.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrepl128vei.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvrepl128vei_h(v16i16 _1) { return __lasx_xvrepl128vei_h(_1, 1); } // CHECK-LABEL: @xvrepl128vei_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrepl128vei.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrepl128vei.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret 
void // v8i32 xvrepl128vei_w(v8i32 _1) { return __lasx_xvrepl128vei_w(_1, 1); } // CHECK-LABEL: @xvrepl128vei_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrepl128vei.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrepl128vei.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvrepl128vei_d(v4i64 _1) { return __lasx_xvrepl128vei_d(_1, 1); } // CHECK-LABEL: @xvpickev_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpickev.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpickev.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvpickev_b(v32i8 _1, v32i8 _2) { return __lasx_xvpickev_b(_1, _2); } // CHECK-LABEL: @xvpickev_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpickev.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpickev.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvpickev_h(v16i16 _1, v16i16 _2) { return __lasx_xvpickev_h(_1, _2); } // CHECK-LABEL: @xvpickev_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickev.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickev.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvpickev_w(v8i32 _1, v8i32 _2) { return __lasx_xvpickev_w(_1, _2); } // CHECK-LABEL: @xvpickev_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickev.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickev.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvpickev_d(v4i64 _1, v4i64 _2) { return __lasx_xvpickev_d(_1, _2); 
} // CHECK-LABEL: @xvpickod_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpickod.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpickod.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvpickod_b(v32i8 _1, v32i8 _2) { return __lasx_xvpickod_b(_1, _2); } // CHECK-LABEL: @xvpickod_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpickod.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpickod.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvpickod_h(v16i16 _1, v16i16 _2) { return __lasx_xvpickod_h(_1, _2); } // CHECK-LABEL: @xvpickod_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickod.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickod.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvpickod_w(v8i32 _1, v8i32 _2) { return __lasx_xvpickod_w(_1, _2); } // CHECK-LABEL: @xvpickod_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickod.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickod.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvpickod_d(v4i64 _1, v4i64 _2) { return __lasx_xvpickod_d(_1, _2); } // CHECK-LABEL: @xvilvh_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvilvh.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvilvh.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvilvh_b(v32i8 _1, v32i8 _2) { return 
__lasx_xvilvh_b(_1, _2); } // CHECK-LABEL: @xvilvh_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvilvh.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvilvh.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvilvh_h(v16i16 _1, v16i16 _2) { return __lasx_xvilvh_h(_1, _2); } // CHECK-LABEL: @xvilvh_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvilvh.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvilvh.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvilvh_w(v8i32 _1, v8i32 _2) { return __lasx_xvilvh_w(_1, _2); } // CHECK-LABEL: @xvilvh_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvilvh.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvilvh.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvilvh_d(v4i64 _1, v4i64 _2) { return __lasx_xvilvh_d(_1, _2); } // CHECK-LABEL: @xvilvl_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvilvl.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvilvl.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvilvl_b(v32i8 _1, v32i8 _2) { return __lasx_xvilvl_b(_1, _2); } // CHECK-LABEL: @xvilvl_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvilvl.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvilvl.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvilvl_h(v16i16 _1, v16i16 _2) { return 
__lasx_xvilvl_h(_1, _2); } // CHECK-LABEL: @xvilvl_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvilvl.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvilvl.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvilvl_w(v8i32 _1, v8i32 _2) { return __lasx_xvilvl_w(_1, _2); } // CHECK-LABEL: @xvilvl_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvilvl.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvilvl.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvilvl_d(v4i64 _1, v4i64 _2) { return __lasx_xvilvl_d(_1, _2); } // CHECK-LABEL: @xvpackev_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpackev.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpackev.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvpackev_b(v32i8 _1, v32i8 _2) { return __lasx_xvpackev_b(_1, _2); } // CHECK-LABEL: @xvpackev_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpackev.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpackev.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvpackev_h(v16i16 _1, v16i16 _2) { return __lasx_xvpackev_h(_1, _2); } // CHECK-LABEL: @xvpackev_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpackev.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpackev.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvpackev_w(v8i32 _1, v8i32 
_2) { return __lasx_xvpackev_w(_1, _2); } // CHECK-LABEL: @xvpackev_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpackev.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpackev.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvpackev_d(v4i64 _1, v4i64 _2) { return __lasx_xvpackev_d(_1, _2); } // CHECK-LABEL: @xvpackod_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpackod.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpackod.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvpackod_b(v32i8 _1, v32i8 _2) { return __lasx_xvpackod_b(_1, _2); } // CHECK-LABEL: @xvpackod_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpackod.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpackod.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvpackod_h(v16i16 _1, v16i16 _2) { return __lasx_xvpackod_h(_1, _2); } // CHECK-LABEL: @xvpackod_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpackod.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpackod.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvpackod_w(v8i32 _1, v8i32 _2) { return __lasx_xvpackod_w(_1, _2); } // CHECK-LABEL: @xvpackod_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpackod.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpackod.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 
xvpackod_d(v4i64 _1, v4i64 _2) { return __lasx_xvpackod_d(_1, _2); } // CHECK-LABEL: @xvshuf_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvshuf.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_136:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvshuf.b(<32 x i8> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]]) +// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvshuf_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __lasx_xvshuf_b(_1, _2, _3); } // CHECK-LABEL: @xvshuf_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvshuf.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvshuf.h(<16 x i16> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]]) +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvshuf_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __lasx_xvshuf_h(_1, _2, _3); } // CHECK-LABEL: @xvshuf_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvshuf.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvshuf.w(<8 x i32> [[_136]], <8 x i32> [[_247]], <8 x i32> [[_358]]) +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvshuf_w(v8i32 _1, v8i32 _2, v8i32 _3) { return __lasx_xvshuf_w(_1, _2, _3); } // CHECK-LABEL: @xvshuf_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvshuf.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvshuf.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvshuf_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __lasx_xvshuf_d(_1, _2, _3); } // CHECK-LABEL: @xvand_v( // CHECK-NEXT: entry: -// CHECK-NEXT: 
[[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvand.v(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvand.v(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvand_v(v32u8 _1, v32u8 _2) { return __lasx_xvand_v(_1, _2); } // CHECK-LABEL: @xvandi_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvandi.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvandi.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvandi_b(v32u8 _1) { return __lasx_xvandi_b(_1, 1); } // CHECK-LABEL: @xvor_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvor.v(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvor.v(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvor_v(v32u8 _1, v32u8 _2) { return __lasx_xvor_v(_1, _2); } // CHECK-LABEL: @xvori_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvori.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvori.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvori_b(v32u8 _1) { return __lasx_xvori_b(_1, 1); } // CHECK-LABEL: @xvnor_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvnor.v(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvnor.v(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvnor_v(v32u8 _1, v32u8 _2) { return __lasx_xvnor_v(_1, _2); } // CHECK-LABEL: @xvnori_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvnori.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvnori.b(<32 x 
i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvnori_b(v32u8 _1) { return __lasx_xvnori_b(_1, 1); } // CHECK-LABEL: @xvxor_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvxor.v(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvxor.v(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvxor_v(v32u8 _1, v32u8 _2) { return __lasx_xvxor_v(_1, _2); } // CHECK-LABEL: @xvxori_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvxori.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvxori.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvxori_b(v32u8 _1) { return __lasx_xvxori_b(_1, 1); } // CHECK-LABEL: @xvbitsel_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitsel.v(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_136:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitsel.v(<32 x i8> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]]) +// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvbitsel_v(v32u8 _1, v32u8 _2, v32u8 _3) { return __lasx_xvbitsel_v(_1, _2, _3); } // CHECK-LABEL: @xvbitseli_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitseli.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitseli.b(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvbitseli_b(v32u8 _1, v32u8 _2) { return __lasx_xvbitseli_b(_1, _2, 1); } // CHECK-LABEL: @xvshuf4i_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvshuf4i.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvshuf4i.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa 
[[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvshuf4i_b(v32i8 _1) { return __lasx_xvshuf4i_b(_1, 1); } // CHECK-LABEL: @xvshuf4i_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvshuf4i.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvshuf4i.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvshuf4i_h(v16i16 _1) { return __lasx_xvshuf4i_h(_1, 1); } // CHECK-LABEL: @xvshuf4i_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvshuf4i.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvshuf4i.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvshuf4i_w(v8i32 _1) { return __lasx_xvshuf4i_w(_1, 1); } // CHECK-LABEL: @xvreplgr2vr_b( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplgr2vr.b(i32 [[_1:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvreplgr2vr_b(int _1) { return __lasx_xvreplgr2vr_b(_1); } // CHECK-LABEL: @xvreplgr2vr_h( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplgr2vr.h(i32 [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvreplgr2vr_h(int _1) { return __lasx_xvreplgr2vr_h(_1); } // CHECK-LABEL: @xvreplgr2vr_w( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplgr2vr.w(i32 [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvreplgr2vr_w(int _1) { return __lasx_xvreplgr2vr_w(_1); } // CHECK-LABEL: @xvreplgr2vr_d( // CHECK-NEXT: entry: // CHECK-NEXT: [[CONV:%.*]] = sext i32 [[_1:%.*]] to i64 // CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplgr2vr.d(i64 [[CONV]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvreplgr2vr_d(int _1) { return __lasx_xvreplgr2vr_d(_1); } // CHECK-LABEL: @xvpcnt_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpcnt.b(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpcnt.b(<32 x i8> [[_112]]) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvpcnt_b(v32i8 _1) { return __lasx_xvpcnt_b(_1); } // CHECK-LABEL: @xvpcnt_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpcnt.h(<16 x i16> 
[[_1:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpcnt.h(<16 x i16> [[_112]]) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvpcnt_h(v16i16 _1) { return __lasx_xvpcnt_h(_1); } // CHECK-LABEL: @xvpcnt_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpcnt.w(<8 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpcnt.w(<8 x i32> [[_112]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvpcnt_w(v8i32 _1) { return __lasx_xvpcnt_w(_1); } // CHECK-LABEL: @xvpcnt_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpcnt.d(<4 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpcnt.d(<4 x i64> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvpcnt_d(v4i64 _1) { return __lasx_xvpcnt_d(_1); } // CHECK-LABEL: @xvclo_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvclo.b(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvclo.b(<32 x i8> [[_112]]) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvclo_b(v32i8 _1) { return __lasx_xvclo_b(_1); } // CHECK-LABEL: @xvclo_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvclo.h(<16 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvclo.h(<16 x i16> [[_112]]) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvclo_h(v16i16 _1) { return __lasx_xvclo_h(_1); } // CHECK-LABEL: @xvclo_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvclo.w(<8 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvclo.w(<8 x i32> [[_112]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvclo_w(v8i32 _1) { return __lasx_xvclo_w(_1); } // CHECK-LABEL: @xvclo_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvclo.d(<4 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> 
@llvm.loongarch.lasx.xvclo.d(<4 x i64> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvclo_d(v4i64 _1) { return __lasx_xvclo_d(_1); } // CHECK-LABEL: @xvclz_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvclz.b(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvclz.b(<32 x i8> [[_112]]) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvclz_b(v32i8 _1) { return __lasx_xvclz_b(_1); } // CHECK-LABEL: @xvclz_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvclz.h(<16 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvclz.h(<16 x i16> [[_112]]) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvclz_h(v16i16 _1) { return __lasx_xvclz_h(_1); } // CHECK-LABEL: @xvclz_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvclz.w(<8 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvclz.w(<8 x i32> [[_112]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvclz_w(v8i32 _1) { return __lasx_xvclz_w(_1); } // CHECK-LABEL: @xvclz_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvclz.d(<4 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvclz.d(<4 x i64> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvclz_d(v4i64 _1) { return __lasx_xvclz_d(_1); } // CHECK-LABEL: @xvfadd_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfadd.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfadd.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfadd_s(v8f32 _1, v8f32 _2) { return __lasx_xvfadd_s(_1, _2); } // CHECK-LABEL: @xvfadd_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfadd.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 
32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfadd.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfadd_d(v4f64 _1, v4f64 _2) { return __lasx_xvfadd_d(_1, _2); } // CHECK-LABEL: @xvfsub_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfsub.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfsub.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfsub_s(v8f32 _1, v8f32 _2) { return __lasx_xvfsub_s(_1, _2); } // CHECK-LABEL: @xvfsub_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfsub.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfsub.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfsub_d(v4f64 _1, v4f64 _2) { return __lasx_xvfsub_d(_1, _2); } // CHECK-LABEL: @xvfmul_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmul.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmul.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfmul_s(v8f32 _1, v8f32 _2) { return __lasx_xvfmul_s(_1, _2); } // CHECK-LABEL: @xvfmul_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmul.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmul.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfmul_d(v4f64 _1, v4f64 _2) { return __lasx_xvfmul_d(_1, _2); } // CHECK-LABEL: @xvfdiv_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfdiv.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfdiv.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfdiv_s(v8f32 _1, v8f32 _2) { return __lasx_xvfdiv_s(_1, _2); } // CHECK-LABEL: @xvfdiv_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfdiv.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfdiv.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfdiv_d(v4f64 _1, v4f64 _2) { return __lasx_xvfdiv_d(_1, _2); } // CHECK-LABEL: @xvfcvt_h_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfcvt.h.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfcvt.h.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvfcvt_h_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcvt_h_s(_1, _2); } // CHECK-LABEL: @xvfcvt_s_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvt.s.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvt.s.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfcvt_s_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcvt_s_d(_1, _2); } // CHECK-LABEL: @xvfmin_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmin.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmin.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfmin_s(v8f32 _1, v8f32 _2) { return __lasx_xvfmin_s(_1, _2); } // CHECK-LABEL: @xvfmin_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmin.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: 
[[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmin.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfmin_d(v4f64 _1, v4f64 _2) { return __lasx_xvfmin_d(_1, _2); } // CHECK-LABEL: @xvfmina_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmina.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmina.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfmina_s(v8f32 _1, v8f32 _2) { return __lasx_xvfmina_s(_1, _2); } // CHECK-LABEL: @xvfmina_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmina.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmina.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfmina_d(v4f64 _1, v4f64 _2) { return __lasx_xvfmina_d(_1, _2); } // CHECK-LABEL: @xvfmax_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmax.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmax.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfmax_s(v8f32 _1, v8f32 _2) { return __lasx_xvfmax_s(_1, _2); } // CHECK-LABEL: @xvfmax_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmax.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmax.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfmax_d(v4f64 _1, v4f64 _2) { return __lasx_xvfmax_d(_1, _2); } // CHECK-LABEL: @xvfmaxa_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmaxa.s(<8 x float> [[_1:%.*]], <8 
x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmaxa.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfmaxa_s(v8f32 _1, v8f32 _2) { return __lasx_xvfmaxa_s(_1, _2); } // CHECK-LABEL: @xvfmaxa_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmaxa.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmaxa.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfmaxa_d(v4f64 _1, v4f64 _2) { return __lasx_xvfmaxa_d(_1, _2); } // CHECK-LABEL: @xvfclass_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfclass.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfclass.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfclass_s(v8f32 _1) { return __lasx_xvfclass_s(_1); } // CHECK-LABEL: @xvfclass_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfclass.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfclass.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfclass_d(v4f64 _1) { return __lasx_xvfclass_d(_1); } // CHECK-LABEL: @xvfsqrt_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfsqrt.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfsqrt.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfsqrt_s(v8f32 _1) { return __lasx_xvfsqrt_s(_1); } // CHECK-LABEL: @xvfsqrt_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfsqrt.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfsqrt.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 
xvfsqrt_d(v4f64 _1) { return __lasx_xvfsqrt_d(_1); } // CHECK-LABEL: @xvfrecip_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrecip.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrecip.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfrecip_s(v8f32 _1) { return __lasx_xvfrecip_s(_1); } // CHECK-LABEL: @xvfrecip_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrecip.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrecip.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfrecip_d(v4f64 _1) { return __lasx_xvfrecip_d(_1); } // CHECK-LABEL: @xvfrint_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrint.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrint.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfrint_s(v8f32 _1) { return __lasx_xvfrint_s(_1); } // CHECK-LABEL: @xvfrint_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrint.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrint.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfrint_d(v4f64 _1) { return __lasx_xvfrint_d(_1); } // CHECK-LABEL: @xvfrsqrt_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrsqrt.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrsqrt.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfrsqrt_s(v8f32 _1) { return __lasx_xvfrsqrt_s(_1); } // CHECK-LABEL: @xvfrsqrt_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrsqrt.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrsqrt.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfrsqrt_d(v4f64 _1) { return __lasx_xvfrsqrt_d(_1); } // 
CHECK-LABEL: @xvflogb_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvflogb.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvflogb.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvflogb_s(v8f32 _1) { return __lasx_xvflogb_s(_1); } // CHECK-LABEL: @xvflogb_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvflogb.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvflogb.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvflogb_d(v4f64 _1) { return __lasx_xvflogb_d(_1); } // CHECK-LABEL: @xvfcvth_s_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvth.s.h(<16 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvth.s.h(<16 x i16> [[_112]]) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfcvth_s_h(v16i16 _1) { return __lasx_xvfcvth_s_h(_1); } // CHECK-LABEL: @xvfcvth_d_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfcvth.d.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfcvth.d.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfcvth_d_s(v8f32 _1) { return __lasx_xvfcvth_d_s(_1); } // CHECK-LABEL: @xvfcvtl_s_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvtl.s.h(<16 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvtl.s.h(<16 x i16> [[_112]]) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfcvtl_s_h(v16i16 _1) { return __lasx_xvfcvtl_s_h(_1); } // CHECK-LABEL: @xvfcvtl_d_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfcvtl.d.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfcvtl.d.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfcvtl_d_s(v8f32 _1) { return __lasx_xvfcvtl_d_s(_1); } // CHECK-LABEL: @xvftint_w_s( // CHECK-NEXT: 
entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.w.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.w.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvftint_w_s(v8f32 _1) { return __lasx_xvftint_w_s(_1); } // CHECK-LABEL: @xvftint_l_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftint.l.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftint.l.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvftint_l_d(v4f64 _1) { return __lasx_xvftint_l_d(_1); } // CHECK-LABEL: @xvftint_wu_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.wu.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.wu.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvftint_wu_s(v8f32 _1) { return __lasx_xvftint_wu_s(_1); } // CHECK-LABEL: @xvftint_lu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftint.lu.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftint.lu.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvftint_lu_d(v4f64 _1) { return __lasx_xvftint_lu_d(_1); } // CHECK-LABEL: @xvftintrz_w_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.w.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.w.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvftintrz_w_s(v8f32 _1) { return __lasx_xvftintrz_w_s(_1); } // CHECK-LABEL: @xvftintrz_l_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrz.l.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrz.l.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvftintrz_l_d(v4f64 _1) { return __lasx_xvftintrz_l_d(_1); } // CHECK-LABEL: @xvftintrz_wu_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 
x i32> @llvm.loongarch.lasx.xvftintrz.wu.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.wu.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvftintrz_wu_s(v8f32 _1) { return __lasx_xvftintrz_wu_s(_1); } // CHECK-LABEL: @xvftintrz_lu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrz.lu.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrz.lu.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvftintrz_lu_d(v4f64 _1) { return __lasx_xvftintrz_lu_d(_1); } // CHECK-LABEL: @xvffint_s_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.w(<8 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.w(<8 x i32> [[_112]]) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvffint_s_w(v8i32 _1) { return __lasx_xvffint_s_w(_1); } // CHECK-LABEL: @xvffint_d_l( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffint.d.l(<4 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffint.d.l(<4 x i64> [[_1]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvffint_d_l(v4i64 _1) { return __lasx_xvffint_d_l(_1); } // CHECK-LABEL: @xvffint_s_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.wu(<8 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.wu(<8 x i32> [[_112]]) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvffint_s_wu(v8u32 _1) { return __lasx_xvffint_s_wu(_1); } // CHECK-LABEL: @xvffint_d_lu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffint.d.lu(<4 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffint.d.lu(<4 x i64> [[_1]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvffint_d_lu(v4u64 _1) { return __lasx_xvffint_d_lu(_1); } // CHECK-LABEL: @xvreplve_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> 
@llvm.loongarch.lasx.xvreplve.b(<32 x i8> [[_1:%.*]], i32 [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve.b(<32 x i8> [[_112]], i32 [[_2:%.*]]) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvreplve_b(v32i8 _1, int _2) { return __lasx_xvreplve_b(_1, _2); } // CHECK-LABEL: @xvreplve_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplve.h(<16 x i16> [[_1:%.*]], i32 [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplve.h(<16 x i16> [[_112]], i32 [[_2:%.*]]) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvreplve_h(v16i16 _1, int _2) { return __lasx_xvreplve_h(_1, _2); } // CHECK-LABEL: @xvreplve_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplve.w(<8 x i32> [[_1:%.*]], i32 [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplve.w(<8 x i32> [[_112]], i32 [[_2:%.*]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvreplve_w(v8i32 _1, int _2) { return __lasx_xvreplve_w(_1, _2); } // CHECK-LABEL: @xvreplve_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplve.d(<4 x i64> [[_1:%.*]], i32 [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplve.d(<4 x i64> [[_1]], i32 [[_2:%.*]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvreplve_d(v4i64 _1, int _2) { return __lasx_xvreplve_d(_1, _2); } // CHECK-LABEL: @xvpermi_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpermi.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpermi.w(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvpermi_w(v8i32 _1, v8i32 _2) { return __lasx_xvpermi_w(_1, _2, 1); } // CHECK-LABEL: @xvandn_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvandn.v(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> 
@llvm.loongarch.lasx.xvandn.v(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvandn_v(v32u8 _1, v32u8 _2) { return __lasx_xvandn_v(_1, _2); } // CHECK-LABEL: @xvneg_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvneg.b(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvneg.b(<32 x i8> [[_112]]) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvneg_b(v32i8 _1) { return __lasx_xvneg_b(_1); } // CHECK-LABEL: @xvneg_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvneg.h(<16 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvneg.h(<16 x i16> [[_112]]) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvneg_h(v16i16 _1) { return __lasx_xvneg_h(_1); } // CHECK-LABEL: @xvneg_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvneg.w(<8 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvneg.w(<8 x i32> [[_112]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvneg_w(v8i32 _1) { return __lasx_xvneg_w(_1); } // CHECK-LABEL: @xvneg_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvneg.d(<4 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvneg.d(<4 x i64> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvneg_d(v4i64 _1) { return __lasx_xvneg_d(_1); } // CHECK-LABEL: @xvmuh_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmuh.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmuh.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvmuh_b(v32i8 _1, v32i8 _2) { return __lasx_xvmuh_b(_1, _2); } // CHECK-LABEL: @xvmuh_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmuh.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr 
[[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmuh.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmuh_h(v16i16 _1, v16i16 _2) { return __lasx_xvmuh_h(_1, _2); } // CHECK-LABEL: @xvmuh_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmuh.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmuh.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmuh_w(v8i32 _1, v8i32 _2) { return __lasx_xvmuh_w(_1, _2); } // CHECK-LABEL: @xvmuh_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmuh.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmuh.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmuh_d(v4i64 _1, v4i64 _2) { return __lasx_xvmuh_d(_1, _2); } // CHECK-LABEL: @xvmuh_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmuh.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmuh.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvmuh_bu(v32u8 _1, v32u8 _2) { return __lasx_xvmuh_bu(_1, _2); } // CHECK-LABEL: @xvmuh_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmuh.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmuh.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvmuh_hu(v16u16 _1, v16u16 _2) { return __lasx_xvmuh_hu(_1, _2); } // CHECK-LABEL: @xvmuh_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmuh.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 
32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmuh.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvmuh_wu(v8u32 _1, v8u32 _2) { return __lasx_xvmuh_wu(_1, _2); } // CHECK-LABEL: @xvmuh_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmuh.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmuh.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvmuh_du(v4u64 _1, v4u64 _2) { return __lasx_xvmuh_du(_1, _2); } // CHECK-LABEL: @xvsllwil_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsllwil.h.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsllwil.h.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsllwil_h_b(v32i8 _1) { return __lasx_xvsllwil_h_b(_1, 1); } // CHECK-LABEL: @xvsllwil_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsllwil.w.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsllwil.w.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsllwil_w_h(v16i16 _1) { return __lasx_xvsllwil_w_h(_1, 1); } // CHECK-LABEL: @xvsllwil_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsllwil.d.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsllwil.d.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsllwil_d_w(v8i32 _1) { return __lasx_xvsllwil_d_w(_1, 1); } // CHECK-LABEL: @xvsllwil_hu_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsllwil.hu.bu(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsllwil.hu.bu(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvsllwil_hu_bu(v32u8 _1) { return __lasx_xvsllwil_hu_bu(_1, 1); } // CHECK-LABEL: @xvsllwil_wu_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> 
@llvm.loongarch.lasx.xvsllwil.wu.hu(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsllwil.wu.hu(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvsllwil_wu_hu(v16u16 _1) { return __lasx_xvsllwil_wu_hu(_1, 1); } // CHECK-LABEL: @xvsllwil_du_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsllwil.du.wu(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsllwil.du.wu(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvsllwil_du_wu(v8u32 _1) { return __lasx_xvsllwil_du_wu(_1, 1); } // CHECK-LABEL: @xvsran_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsran.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsran.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsran_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvsran_b_h(_1, _2); } // CHECK-LABEL: @xvsran_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsran.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsran.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsran_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvsran_h_w(_1, _2); } // CHECK-LABEL: @xvsran_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsran.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsran.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsran_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvsran_w_d(_1, _2); } // CHECK-LABEL: @xvssran_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssran.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 
32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssran.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvssran_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvssran_b_h(_1, _2); } // CHECK-LABEL: @xvssran_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssran.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssran.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvssran_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvssran_h_w(_1, _2); } // CHECK-LABEL: @xvssran_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssran.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssran.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvssran_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvssran_w_d(_1, _2); } // CHECK-LABEL: @xvssran_bu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssran.bu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssran.bu.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvssran_bu_h(v16u16 _1, v16u16 _2) { return __lasx_xvssran_bu_h(_1, _2); } // CHECK-LABEL: @xvssran_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssran.hu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssran.hu.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvssran_hu_w(v8u32 _1, v8u32 _2) { return __lasx_xvssran_hu_w(_1, _2); } // CHECK-LABEL: @xvssran_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssran.wu.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x 
i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssran.wu.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvssran_wu_d(v4u64 _1, v4u64 _2) { return __lasx_xvssran_wu_d(_1, _2); } // CHECK-LABEL: @xvsrarn_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrarn.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrarn.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsrarn_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvsrarn_b_h(_1, _2); } // CHECK-LABEL: @xvsrarn_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrarn.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrarn.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsrarn_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvsrarn_h_w(_1, _2); } // CHECK-LABEL: @xvsrarn_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrarn.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrarn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsrarn_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvsrarn_w_d(_1, _2); } // CHECK-LABEL: @xvssrarn_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarn.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarn.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvssrarn_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvssrarn_b_h(_1, _2); } // CHECK-LABEL: @xvssrarn_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> 
@llvm.loongarch.lasx.xvssrarn.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarn.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvssrarn_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvssrarn_h_w(_1, _2); } // CHECK-LABEL: @xvssrarn_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarn.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvssrarn_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvssrarn_w_d(_1, _2); } // CHECK-LABEL: @xvssrarn_bu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarn.bu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarn.bu.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvssrarn_bu_h(v16u16 _1, v16u16 _2) { return __lasx_xvssrarn_bu_h(_1, _2); } // CHECK-LABEL: @xvssrarn_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarn.hu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarn.hu.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvssrarn_hu_w(v8u32 _1, v8u32 _2) { return __lasx_xvssrarn_hu_w(_1, _2); } // CHECK-LABEL: @xvssrarn_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarn.wu.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarn.wu.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvssrarn_wu_d(v4u64 _1, v4u64 _2) { return __lasx_xvssrarn_wu_d(_1, _2); } // 
CHECK-LABEL: @xvsrln_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrln.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrln.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsrln_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvsrln_b_h(_1, _2); } // CHECK-LABEL: @xvsrln_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrln.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrln.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsrln_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvsrln_h_w(_1, _2); } // CHECK-LABEL: @xvsrln_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrln.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrln.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsrln_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvsrln_w_d(_1, _2); } // CHECK-LABEL: @xvssrln_bu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrln.bu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrln.bu.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvssrln_bu_h(v16u16 _1, v16u16 _2) { return __lasx_xvssrln_bu_h(_1, _2); } // CHECK-LABEL: @xvssrln_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrln.hu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrln.hu.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 
xvssrln_hu_w(v8u32 _1, v8u32 _2) { return __lasx_xvssrln_hu_w(_1, _2); } // CHECK-LABEL: @xvssrln_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrln.wu.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrln.wu.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvssrln_wu_d(v4u64 _1, v4u64 _2) { return __lasx_xvssrln_wu_d(_1, _2); } // CHECK-LABEL: @xvsrlrn_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlrn.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlrn.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsrlrn_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvsrlrn_b_h(_1, _2); } // CHECK-LABEL: @xvsrlrn_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlrn.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlrn.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsrlrn_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvsrlrn_h_w(_1, _2); } // CHECK-LABEL: @xvsrlrn_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlrn.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlrn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsrlrn_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvsrlrn_w_d(_1, _2); } // CHECK-LABEL: @xvssrlrn_bu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrn.bu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrn.bu.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr 
[[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvssrlrn_bu_h(v16u16 _1, v16u16 _2) { return __lasx_xvssrlrn_bu_h(_1, _2); } // CHECK-LABEL: @xvssrlrn_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrn.hu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrn.hu.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvssrlrn_hu_w(v8u32 _1, v8u32 _2) { return __lasx_xvssrlrn_hu_w(_1, _2); } // CHECK-LABEL: @xvssrlrn_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrn.wu.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrn.wu.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvssrlrn_wu_d(v4u64 _1, v4u64 _2) { return __lasx_xvssrlrn_wu_d(_1, _2); } // CHECK-LABEL: @xvfrstpi_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvfrstpi.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvfrstpi.b(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvfrstpi_b(v32i8 _1, v32i8 _2) { return __lasx_xvfrstpi_b(_1, _2, 1); } // CHECK-LABEL: @xvfrstpi_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfrstpi.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfrstpi.h(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvfrstpi_h(v16i16 _1, v16i16 _2) { return __lasx_xvfrstpi_h(_1, _2, 1); } // CHECK-LABEL: @xvfrstp_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvfrstp.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_136:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr 
[[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvfrstp.b(<32 x i8> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]]) +// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvfrstp_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __lasx_xvfrstp_b(_1, _2, _3); } // CHECK-LABEL: @xvfrstp_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfrstp.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfrstp.h(<16 x i16> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]]) +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvfrstp_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __lasx_xvfrstp_h(_1, _2, _3); } // CHECK-LABEL: @xvshuf4i_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvshuf4i.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvshuf4i.d(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvshuf4i_d(v4i64 _1, v4i64 _2) { return __lasx_xvshuf4i_d(_1, _2, 1); } // CHECK-LABEL: @xvbsrl_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbsrl.v(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbsrl.v(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvbsrl_v(v32i8 _1) { return __lasx_xvbsrl_v(_1, 1); } // CHECK-LABEL: @xvbsll_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbsll.v(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbsll.v(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvbsll_v(v32i8 _1) { return __lasx_xvbsll_v(_1, 1); } // CHECK-LABEL: @xvextrins_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvextrins.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvextrins.b(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvextrins_b(v32i8 _1, v32i8 _2) { return __lasx_xvextrins_b(_1, _2, 1); } // CHECK-LABEL: @xvextrins_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvextrins.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvextrins.h(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvextrins_h(v16i16 _1, v16i16 _2) { return __lasx_xvextrins_h(_1, _2, 1); } // CHECK-LABEL: @xvextrins_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvextrins.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvextrins.w(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvextrins_w(v8i32 _1, v8i32 _2) { return __lasx_xvextrins_w(_1, _2, 1); } // CHECK-LABEL: @xvextrins_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextrins.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextrins.d(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvextrins_d(v4i64 _1, v4i64 _2) { return __lasx_xvextrins_d(_1, _2, 1); } // CHECK-LABEL: @xvmskltz_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmskltz.b(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmskltz.b(<32 x i8> [[_112]]) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvmskltz_b(v32i8 _1) { return __lasx_xvmskltz_b(_1); } // CHECK-LABEL: @xvmskltz_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmskltz.h(<16 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmskltz.h(<16 x i16> [[_112]]) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr 
[[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmskltz_h(v16i16 _1) { return __lasx_xvmskltz_h(_1); } // CHECK-LABEL: @xvmskltz_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmskltz.w(<8 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmskltz.w(<8 x i32> [[_112]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmskltz_w(v8i32 _1) { return __lasx_xvmskltz_w(_1); } // CHECK-LABEL: @xvmskltz_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmskltz.d(<4 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmskltz.d(<4 x i64> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmskltz_d(v4i64 _1) { return __lasx_xvmskltz_d(_1); } // CHECK-LABEL: @xvsigncov_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsigncov.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsigncov.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsigncov_b(v32i8 _1, v32i8 _2) { return __lasx_xvsigncov_b(_1, _2); } // CHECK-LABEL: @xvsigncov_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsigncov.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsigncov.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsigncov_h(v16i16 _1, v16i16 _2) { return __lasx_xvsigncov_h(_1, _2); } // CHECK-LABEL: @xvsigncov_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsigncov.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsigncov.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsigncov_w(v8i32 _1, v8i32 _2) { return __lasx_xvsigncov_w(_1, _2); } // CHECK-LABEL: @xvsigncov_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> 
@llvm.loongarch.lasx.xvsigncov.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsigncov.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsigncov_d(v4i64 _1, v4i64 _2) { return __lasx_xvsigncov_d(_1, _2); } // CHECK-LABEL: @xvfmadd_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmadd.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]], <8 x float> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmadd.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]]) +// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfmadd_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __lasx_xvfmadd_s(_1, _2, _3); } // CHECK-LABEL: @xvfmadd_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmadd.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]], <4 x double> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmadd.d(<4 x double> [[_1]], <4 x double> [[_2]], <4 x double> [[_3]]) +// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfmadd_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __lasx_xvfmadd_d(_1, _2, _3); } // CHECK-LABEL: @xvfmsub_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmsub.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]], <8 x float> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmsub.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]]) +// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfmsub_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __lasx_xvfmsub_s(_1, _2, _3); } // CHECK-LABEL: @xvfmsub_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmsub.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]], <4 x double> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr 
[[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmsub.d(<4 x double> [[_1]], <4 x double> [[_2]], <4 x double> [[_3]]) +// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfmsub_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __lasx_xvfmsub_d(_1, _2, _3); } // CHECK-LABEL: @xvfnmadd_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfnmadd.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]], <8 x float> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfnmadd.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]]) +// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfnmadd_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __lasx_xvfnmadd_s(_1, _2, _3); } // CHECK-LABEL: @xvfnmadd_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfnmadd.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]], <4 x double> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfnmadd.d(<4 x double> [[_1]], <4 x double> [[_2]], <4 x double> [[_3]]) +// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfnmadd_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __lasx_xvfnmadd_d(_1, _2, _3); } // CHECK-LABEL: @xvfnmsub_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfnmsub.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]], <8 x float> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfnmsub.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]]) +// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfnmsub_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __lasx_xvfnmsub_s(_1, _2, _3); } // CHECK-LABEL: @xvfnmsub_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfnmsub.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]], <4 x double> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 
32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfnmsub.d(<4 x double> [[_1]], <4 x double> [[_2]], <4 x double> [[_3]]) +// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfnmsub_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __lasx_xvfnmsub_d(_1, _2, _3); } // CHECK-LABEL: @xvftintrne_w_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrne.w.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrne.w.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvftintrne_w_s(v8f32 _1) { return __lasx_xvftintrne_w_s(_1); } // CHECK-LABEL: @xvftintrne_l_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrne.l.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrne.l.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvftintrne_l_d(v4f64 _1) { return __lasx_xvftintrne_l_d(_1); } // CHECK-LABEL: @xvftintrp_w_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrp.w.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrp.w.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvftintrp_w_s(v8f32 _1) { return __lasx_xvftintrp_w_s(_1); } // CHECK-LABEL: @xvftintrp_l_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrp.l.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrp.l.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvftintrp_l_d(v4f64 _1) { return __lasx_xvftintrp_l_d(_1); } // CHECK-LABEL: @xvftintrm_w_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrm.w.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrm.w.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvftintrm_w_s(v8f32 _1) { return __lasx_xvftintrm_w_s(_1); } // CHECK-LABEL: @xvftintrm_l_d( // 
CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrm.l.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrm.l.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvftintrm_l_d(v4f64 _1) { return __lasx_xvftintrm_l_d(_1); } // CHECK-LABEL: @xvftint_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.w.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.w.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvftint_w_d(v4f64 _1, v4f64 _2) { return __lasx_xvftint_w_d(_1, _2); } // CHECK-LABEL: @xvffint_s_l( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.l(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.l(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvffint_s_l(v4i64 _1, v4i64 _2) { return __lasx_xvffint_s_l(_1, _2); } // CHECK-LABEL: @xvftintrz_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.w.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.w.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvftintrz_w_d(v4f64 _1, v4f64 _2) { return __lasx_xvftintrz_w_d(_1, _2); } // CHECK-LABEL: @xvftintrp_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrp.w.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrp.w.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvftintrp_w_d(v4f64 _1, v4f64 _2) { return __lasx_xvftintrp_w_d(_1, _2); } // CHECK-LABEL: @xvftintrm_w_d( // CHECK-NEXT: entry: -// 
CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrm.w.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrm.w.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvftintrm_w_d(v4f64 _1, v4f64 _2) { return __lasx_xvftintrm_w_d(_1, _2); } // CHECK-LABEL: @xvftintrne_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrne.w.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrne.w.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvftintrne_w_d(v4f64 _1, v4f64 _2) { return __lasx_xvftintrne_w_d(_1, _2); } // CHECK-LABEL: @xvftinth_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftinth.l.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftinth.l.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvftinth_l_s(v8f32 _1) { return __lasx_xvftinth_l_s(_1); } // CHECK-LABEL: @xvftintl_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintl.l.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintl.l.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvftintl_l_s(v8f32 _1) { return __lasx_xvftintl_l_s(_1); } // CHECK-LABEL: @xvffinth_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffinth.d.w(<8 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffinth.d.w(<8 x i32> [[_112]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvffinth_d_w(v8i32 _1) { return __lasx_xvffinth_d_w(_1); } // CHECK-LABEL: @xvffintl_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffintl.d.w(<8 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> 
@llvm.loongarch.lasx.xvffintl.d.w(<8 x i32> [[_112]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvffintl_d_w(v8i32 _1) { return __lasx_xvffintl_d_w(_1); } // CHECK-LABEL: @xvftintrzh_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrzh.l.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrzh.l.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvftintrzh_l_s(v8f32 _1) { return __lasx_xvftintrzh_l_s(_1); } // CHECK-LABEL: @xvftintrzl_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrzl.l.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrzl.l.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvftintrzl_l_s(v8f32 _1) { return __lasx_xvftintrzl_l_s(_1); } // CHECK-LABEL: @xvftintrph_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrph.l.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrph.l.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvftintrph_l_s(v8f32 _1) { return __lasx_xvftintrph_l_s(_1); } // CHECK-LABEL: @xvftintrpl_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrpl.l.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrpl.l.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvftintrpl_l_s(v8f32 _1) { return __lasx_xvftintrpl_l_s(_1); } // CHECK-LABEL: @xvftintrmh_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrmh.l.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrmh.l.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvftintrmh_l_s(v8f32 _1) { return __lasx_xvftintrmh_l_s(_1); } // CHECK-LABEL: @xvftintrml_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrml.l.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> 
@llvm.loongarch.lasx.xvftintrml.l.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvftintrml_l_s(v8f32 _1) { return __lasx_xvftintrml_l_s(_1); } // CHECK-LABEL: @xvftintrneh_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrneh.l.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrneh.l.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvftintrneh_l_s(v8f32 _1) { return __lasx_xvftintrneh_l_s(_1); } // CHECK-LABEL: @xvftintrnel_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrnel.l.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrnel.l.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvftintrnel_l_s(v8f32 _1) { return __lasx_xvftintrnel_l_s(_1); } // CHECK-LABEL: @xvfrintrne_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrne.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float> [[TMP0]] to <8 x i32> -// CHECK-NEXT: ret <8 x i32> [[TMP1]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrne.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfrintrne_s(v8f32 _1) { return __lasx_xvfrintrne_s(_1); } // CHECK-LABEL: @xvfrintrne_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrne.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x double> [[TMP0]] to <4 x i64> -// CHECK-NEXT: ret <4 x i64> [[TMP1]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrne.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfrintrne_d(v4f64 _1) { return __lasx_xvfrintrne_d(_1); } // CHECK-LABEL: @xvfrintrz_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrz.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float> [[TMP0]] to <8 x i32> -// CHECK-NEXT: ret <8 x i32> [[TMP1]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrz.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfrintrz_s(v8f32 _1) { return __lasx_xvfrintrz_s(_1); } // CHECK-LABEL: @xvfrintrz_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> 
@llvm.loongarch.lasx.xvfrintrz.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x double> [[TMP0]] to <4 x i64> -// CHECK-NEXT: ret <4 x i64> [[TMP1]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrz.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfrintrz_d(v4f64 _1) { return __lasx_xvfrintrz_d(_1); } // CHECK-LABEL: @xvfrintrp_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrp.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float> [[TMP0]] to <8 x i32> -// CHECK-NEXT: ret <8 x i32> [[TMP1]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrp.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfrintrp_s(v8f32 _1) { return __lasx_xvfrintrp_s(_1); } // CHECK-LABEL: @xvfrintrp_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrp.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x double> [[TMP0]] to <4 x i64> -// CHECK-NEXT: ret <4 x i64> [[TMP1]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrp.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfrintrp_d(v4f64 _1) { return __lasx_xvfrintrp_d(_1); } // CHECK-LABEL: @xvfrintrm_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrm.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float> [[TMP0]] to <8 x i32> -// CHECK-NEXT: ret <8 x i32> [[TMP1]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrm.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfrintrm_s(v8f32 _1) { return __lasx_xvfrintrm_s(_1); } // CHECK-LABEL: @xvfrintrm_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrm.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x double> [[TMP0]] to <4 x i64> -// CHECK-NEXT: ret <4 x i64> [[TMP1]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrm.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfrintrm_d(v4f64 _1) { return __lasx_xvfrintrm_d(_1); } // CHECK-LABEL: @xvld( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvld(ptr [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvld(void * _1) { return __lasx_xvld(_1, 1); } // 
CHECK-LABEL: @xvst( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvst(<32 x i8> [[_1:%.*]], ptr [[_2:%.*]], i32 1) +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvst(<32 x i8> [[_1]], ptr [[_2:%.*]], i32 1) // CHECK-NEXT: ret void // void xvst(v32i8 _1, void * _2) { return __lasx_xvst(_1, _2, 1); } // CHECK-LABEL: @xvstelm_b( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.b(<32 x i8> [[_1:%.*]], ptr [[_2:%.*]], i32 1, i32 1) +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.b(<32 x i8> [[_1]], ptr [[_2:%.*]], i32 1, i32 1) // CHECK-NEXT: ret void // void xvstelm_b(v32i8 _1, void * _2) { return __lasx_xvstelm_b(_1, _2, 1, 1); } // CHECK-LABEL: @xvstelm_h( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.h(<16 x i16> [[_1:%.*]], ptr [[_2:%.*]], i32 2, i32 1) +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.h(<16 x i16> [[_1]], ptr [[_2:%.*]], i32 2, i32 1) // CHECK-NEXT: ret void // void xvstelm_h(v16i16 _1, void * _2) { return __lasx_xvstelm_h(_1, _2, 2, 1); } // CHECK-LABEL: @xvstelm_w( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.w(<8 x i32> [[_1:%.*]], ptr [[_2:%.*]], i32 4, i32 1) +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.w(<8 x i32> [[_1]], ptr [[_2:%.*]], i32 4, i32 1) // CHECK-NEXT: ret void // void xvstelm_w(v8i32 _1, void * _2) { return __lasx_xvstelm_w(_1, _2, 4, 1); } // CHECK-LABEL: @xvstelm_d( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.d(<4 x i64> [[_1:%.*]], ptr [[_2:%.*]], i32 8, i32 1) +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.d(<4 x i64> [[_1]], ptr [[_2:%.*]], i32 8, i32 1) // CHECK-NEXT: ret void // void xvstelm_d(v4i64 _1, void * _2) { return __lasx_xvstelm_d(_1, _2, 8, 1); } // CHECK-LABEL: @xvinsve0_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvinsve0.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvinsve0.w(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvinsve0_w(v8i32 _1, v8i32 _2) { return __lasx_xvinsve0_w(_1, _2, 1); } // CHECK-LABEL: @xvinsve0_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvinsve0.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvinsve0.d(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 
1) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvinsve0_d(v4i64 _1, v4i64 _2) { return __lasx_xvinsve0_d(_1, _2, 1); } // CHECK-LABEL: @xvpickve_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickve.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickve.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvpickve_w(v8i32 _1) { return __lasx_xvpickve_w(_1, 1); } // CHECK-LABEL: @xvpickve_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickve.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickve.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvpickve_d(v4i64 _1) { return __lasx_xvpickve_d(_1, 1); } // CHECK-LABEL: @xvssrlrn_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrn.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrn.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvssrlrn_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvssrlrn_b_h(_1, _2); } // CHECK-LABEL: @xvssrlrn_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrn.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrn.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvssrlrn_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvssrlrn_h_w(_1, _2); } // CHECK-LABEL: @xvssrlrn_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrn.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvssrlrn_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvssrlrn_w_d(_1, _2); } 
// CHECK-LABEL: @xvssrln_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrln.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrln.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvssrln_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvssrln_b_h(_1, _2); } // CHECK-LABEL: @xvssrln_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrln.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrln.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvssrln_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvssrln_h_w(_1, _2); } // CHECK-LABEL: @xvssrln_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrln.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrln.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvssrln_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvssrln_w_d(_1, _2); } // CHECK-LABEL: @xvorn_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvorn.v(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvorn.v(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvorn_v(v32i8 _1, v32i8 _2) { return __lasx_xvorn_v(_1, _2); } // CHECK-LABEL: @xvldi( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvldi(i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvldi() { return __lasx_xvldi(1); } // CHECK-LABEL: @xvldx( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvldx(ptr [[_1:%.*]], i64 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvldx(ptr [[_1:%.*]], i64 1), !noalias [[META5:![0-9]+]] +// CHECK-NEXT: store <32 x i8> 
[[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvldx(void * _1) { return __lasx_xvldx(_1, 1); } // CHECK-LABEL: @xvstx( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstx(<32 x i8> [[_1:%.*]], ptr [[_2:%.*]], i64 1) +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstx(<32 x i8> [[_112]], ptr [[_2:%.*]], i64 1) // CHECK-NEXT: ret void // void xvstx(v32i8 _1, void * _2) { return __lasx_xvstx(_1, _2, 1); } // CHECK-LABEL: @xvextl_qu_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextl.qu.du(<4 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextl.qu.du(<4 x i64> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvextl_qu_du(v4u64 _1) { return __lasx_xvextl_qu_du(_1); } // CHECK-LABEL: @xvinsgr2vr_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvinsgr2vr.w(<8 x i32> [[_1:%.*]], i32 1, i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvinsgr2vr.w(<8 x i32> [[_1]], i32 1, i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvinsgr2vr_w(v8i32 _1) { return __lasx_xvinsgr2vr_w(_1, 1, 1); } // CHECK-LABEL: @xvinsgr2vr_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvinsgr2vr.d(<4 x i64> [[_1:%.*]], i64 1, i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvinsgr2vr.d(<4 x i64> [[_1]], i64 1, i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvinsgr2vr_d(v4i64 _1) { return __lasx_xvinsgr2vr_d(_1, 1, 1); } // CHECK-LABEL: @xvreplve0_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve0.b(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve0.b(<32 x i8> [[_112]]) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvreplve0_b(v32i8 _1) { return __lasx_xvreplve0_b(_1); } // CHECK-LABEL: @xvreplve0_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplve0.h(<16 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplve0.h(<16 x i16> [[_112]]) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvreplve0_h(v16i16 _1) { return __lasx_xvreplve0_h(_1); } // CHECK-LABEL: 
@xvreplve0_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplve0.w(<8 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplve0.w(<8 x i32> [[_112]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvreplve0_w(v8i32 _1) { return __lasx_xvreplve0_w(_1); } // CHECK-LABEL: @xvreplve0_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplve0.d(<4 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplve0.d(<4 x i64> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvreplve0_d(v4i64 _1) { return __lasx_xvreplve0_d(_1); } // CHECK-LABEL: @xvreplve0_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve0.q(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve0.q(<32 x i8> [[_112]]) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvreplve0_q(v32i8 _1) { return __lasx_xvreplve0_q(_1); } // CHECK-LABEL: @vext2xv_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.vext2xv.h.b(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.vext2xv.h.b(<32 x i8> [[_112]]) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 vext2xv_h_b(v32i8 _1) { return __lasx_vext2xv_h_b(_1); } // CHECK-LABEL: @vext2xv_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.w.h(<16 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.w.h(<16 x i16> [[_112]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 vext2xv_w_h(v16i16 _1) { return __lasx_vext2xv_w_h(_1); } // CHECK-LABEL: @vext2xv_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.w(<8 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.w(<8 x i32> [[_112]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 vext2xv_d_w(v8i32 _1) { return __lasx_vext2xv_d_w(_1); } // CHECK-LABEL: @vext2xv_w_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> 
@llvm.loongarch.lasx.vext2xv.w.b(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.w.b(<32 x i8> [[_112]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 vext2xv_w_b(v32i8 _1) { return __lasx_vext2xv_w_b(_1); } // CHECK-LABEL: @vext2xv_d_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.h(<16 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.h(<16 x i16> [[_112]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 vext2xv_d_h(v16i16 _1) { return __lasx_vext2xv_d_h(_1); } // CHECK-LABEL: @vext2xv_d_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.b(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.b(<32 x i8> [[_112]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 vext2xv_d_b(v32i8 _1) { return __lasx_vext2xv_d_b(_1); } // CHECK-LABEL: @vext2xv_hu_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.vext2xv.hu.bu(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.vext2xv.hu.bu(<32 x i8> [[_112]]) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 vext2xv_hu_bu(v32i8 _1) { return __lasx_vext2xv_hu_bu(_1); } // CHECK-LABEL: @vext2xv_wu_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.wu.hu(<16 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.wu.hu(<16 x i16> [[_112]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 vext2xv_wu_hu(v16i16 _1) { return __lasx_vext2xv_wu_hu(_1); } // CHECK-LABEL: @vext2xv_du_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.wu(<8 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.wu(<8 x i32> [[_112]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 vext2xv_du_wu(v8i32 _1) { return __lasx_vext2xv_du_wu(_1); } // CHECK-LABEL: @vext2xv_wu_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.wu.bu(<32 x i8> [[_1:%.*]]) 
-// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.wu.bu(<32 x i8> [[_112]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 vext2xv_wu_bu(v32i8 _1) { return __lasx_vext2xv_wu_bu(_1); } // CHECK-LABEL: @vext2xv_du_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.hu(<16 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.hu(<16 x i16> [[_112]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 vext2xv_du_hu(v16i16 _1) { return __lasx_vext2xv_du_hu(_1); } // CHECK-LABEL: @vext2xv_du_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.bu(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.bu(<32 x i8> [[_112]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 vext2xv_du_bu(v32i8 _1) { return __lasx_vext2xv_du_bu(_1); } // CHECK-LABEL: @xvpermi_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpermi.q(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpermi.q(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvpermi_q(v32i8 _1, v32i8 _2) { return __lasx_xvpermi_q(_1, _2, 1); } // CHECK-LABEL: @xvpermi_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpermi.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpermi.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvpermi_d(v4i64 _1) { return __lasx_xvpermi_d(_1, 1); } // CHECK-LABEL: @xvperm_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvperm.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvperm.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvperm_w(v8i32 
_1, v8i32 _2) { return __lasx_xvperm_w(_1, _2); } // CHECK-LABEL: @xvldrepl_b( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvldrepl.b(ptr [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvldrepl_b(void * _1) { return __lasx_xvldrepl_b(_1, 1); } // CHECK-LABEL: @xvldrepl_h( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvldrepl.h(ptr [[_1:%.*]], i32 2) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvldrepl_h(void * _1) { return __lasx_xvldrepl_h(_1, 2); } // CHECK-LABEL: @xvldrepl_w( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvldrepl.w(ptr [[_1:%.*]], i32 4) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvldrepl_w(void * _1) { return __lasx_xvldrepl_w(_1, 4); } // CHECK-LABEL: @xvldrepl_d( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvldrepl.d(ptr [[_1:%.*]], i32 8) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvldrepl_d(void * _1) { return __lasx_xvldrepl_d(_1, 8); } // CHECK-LABEL: @xvpickve2gr_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xvpickve2gr.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xvpickve2gr.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: ret i32 [[TMP1]] // int xvpickve2gr_w(v8i32 _1) { return __lasx_xvpickve2gr_w(_1, 1); } // CHECK-LABEL: @xvpickve2gr_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xvpickve2gr.wu(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xvpickve2gr.wu(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: ret i32 [[TMP1]] // unsigned int xvpickve2gr_wu(v8i32 _1) { return __lasx_xvpickve2gr_wu(_1, 1); } // CHECK-LABEL: @xvpickve2gr_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret i64 [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: ret i64 [[TMP1]] // long xvpickve2gr_d(v4i64 _1) { return __lasx_xvpickve2gr_d(_1, 1); } // CHECK-LABEL: @xvpickve2gr_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret i64 [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: ret i64 [[TMP1]] // unsigned long int xvpickve2gr_du(v4i64 _1) { return 
__lasx_xvpickve2gr_du(_1, 1); } // CHECK-LABEL: @xvaddwev_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvaddwev_q_d(v4i64 _1, v4i64 _2) { return __lasx_xvaddwev_q_d(_1, _2); } // CHECK-LABEL: @xvaddwev_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvaddwev_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvaddwev_d_w(_1, _2); } // CHECK-LABEL: @xvaddwev_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvaddwev_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvaddwev_w_h(_1, _2); } // CHECK-LABEL: @xvaddwev_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvaddwev_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvaddwev_h_b(_1, _2); } // CHECK-LABEL: @xvaddwev_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa 
[[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvaddwev_q_du(v4u64 _1, v4u64 _2) { return __lasx_xvaddwev_q_du(_1, _2); } // CHECK-LABEL: @xvaddwev_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvaddwev_d_wu(v8u32 _1, v8u32 _2) { return __lasx_xvaddwev_d_wu(_1, _2); } // CHECK-LABEL: @xvaddwev_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvaddwev_w_hu(v16u16 _1, v16u16 _2) { return __lasx_xvaddwev_w_hu(_1, _2); } // CHECK-LABEL: @xvaddwev_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvaddwev_h_bu(v32u8 _1, v32u8 _2) { return __lasx_xvaddwev_h_bu(_1, _2); } // CHECK-LABEL: @xvsubwev_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsubwev_q_d(v4i64 _1, v4i64 _2) { return __lasx_xvsubwev_q_d(_1, _2); } // CHECK-LABEL: @xvsubwev_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.d.w(<8 x i32> [[_124]], 
<8 x i32> [[_235]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsubwev_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvsubwev_d_w(_1, _2); } // CHECK-LABEL: @xvsubwev_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwev.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwev.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsubwev_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvsubwev_w_h(_1, _2); } // CHECK-LABEL: @xvsubwev_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwev.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwev.h.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsubwev_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvsubwev_h_b(_1, _2); } // CHECK-LABEL: @xvsubwev_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsubwev_q_du(v4u64 _1, v4u64 _2) { return __lasx_xvsubwev_q_du(_1, _2); } // CHECK-LABEL: @xvsubwev_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.d.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.d.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsubwev_d_wu(v8u32 _1, v8u32 _2) { return __lasx_xvsubwev_d_wu(_1, _2); } // CHECK-LABEL: @xvsubwev_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwev.w.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwev.w.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsubwev_w_hu(v16u16 _1, v16u16 _2) { return __lasx_xvsubwev_w_hu(_1, _2); } // CHECK-LABEL: @xvsubwev_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwev.h.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwev.h.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsubwev_h_bu(v32u8 _1, v32u8 _2) { return __lasx_xvsubwev_h_bu(_1, _2); } // CHECK-LABEL: @xvmulwev_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmulwev_q_d(v4i64 _1, v4i64 _2) { return __lasx_xvmulwev_q_d(_1, _2); } // CHECK-LABEL: @xvmulwev_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmulwev_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvmulwev_d_w(_1, _2); } // CHECK-LABEL: @xvmulwev_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmulwev_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvmulwev_w_h(_1, _2); } // CHECK-LABEL: @xvmulwev_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa 
[[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmulwev_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvmulwev_h_b(_1, _2); } // CHECK-LABEL: @xvmulwev_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmulwev_q_du(v4u64 _1, v4u64 _2) { return __lasx_xvmulwev_q_du(_1, _2); } // CHECK-LABEL: @xvmulwev_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmulwev_d_wu(v8u32 _1, v8u32 _2) { return __lasx_xvmulwev_d_wu(_1, _2); } // CHECK-LABEL: @xvmulwev_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmulwev_w_hu(v16u16 _1, v16u16 _2) { return __lasx_xvmulwev_w_hu(_1, _2); } // CHECK-LABEL: @xvmulwev_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmulwev_h_bu(v32u8 _1, v32u8 _2) { return __lasx_xvmulwev_h_bu(_1, _2); } // CHECK-LABEL: @xvaddwod_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// 
CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvaddwod_q_d(v4i64 _1, v4i64 _2) { return __lasx_xvaddwod_q_d(_1, _2); } // CHECK-LABEL: @xvaddwod_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvaddwod_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvaddwod_d_w(_1, _2); } // CHECK-LABEL: @xvaddwod_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvaddwod_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvaddwod_w_h(_1, _2); } // CHECK-LABEL: @xvaddwod_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvaddwod_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvaddwod_h_b(_1, _2); } // CHECK-LABEL: @xvaddwod_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvaddwod_q_du(v4u64 _1, v4u64 _2) { return __lasx_xvaddwod_q_du(_1, _2); } // CHECK-LABEL: @xvaddwod_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x 
i64> @llvm.loongarch.lasx.xvaddwod.d.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvaddwod_d_wu(v8u32 _1, v8u32 _2) { return __lasx_xvaddwod_d_wu(_1, _2); } // CHECK-LABEL: @xvaddwod_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvaddwod_w_hu(v16u16 _1, v16u16 _2) { return __lasx_xvaddwod_w_hu(_1, _2); } // CHECK-LABEL: @xvaddwod_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvaddwod_h_bu(v32u8 _1, v32u8 _2) { return __lasx_xvaddwod_h_bu(_1, _2); } // CHECK-LABEL: @xvsubwod_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsubwod_q_d(v4i64 _1, v4i64 _2) { return __lasx_xvsubwod_q_d(_1, _2); } // CHECK-LABEL: @xvsubwod_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.d.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsubwod_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvsubwod_d_w(_1, _2); } 
// CHECK-LABEL: @xvsubwod_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwod.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwod.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsubwod_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvsubwod_w_h(_1, _2); } // CHECK-LABEL: @xvsubwod_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwod.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwod.h.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsubwod_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvsubwod_h_b(_1, _2); } // CHECK-LABEL: @xvsubwod_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsubwod_q_du(v4u64 _1, v4u64 _2) { return __lasx_xvsubwod_q_du(_1, _2); } // CHECK-LABEL: @xvsubwod_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.d.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.d.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsubwod_d_wu(v8u32 _1, v8u32 _2) { return __lasx_xvsubwod_d_wu(_1, _2); } // CHECK-LABEL: @xvsubwod_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwod.w.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwod.w.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] 
+// CHECK-NEXT: ret void // v8i32 xvsubwod_w_hu(v16u16 _1, v16u16 _2) { return __lasx_xvsubwod_w_hu(_1, _2); } // CHECK-LABEL: @xvsubwod_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwod.h.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwod.h.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsubwod_h_bu(v32u8 _1, v32u8 _2) { return __lasx_xvsubwod_h_bu(_1, _2); } // CHECK-LABEL: @xvmulwod_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmulwod_q_d(v4i64 _1, v4i64 _2) { return __lasx_xvmulwod_q_d(_1, _2); } // CHECK-LABEL: @xvmulwod_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmulwod_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvmulwod_d_w(_1, _2); } // CHECK-LABEL: @xvmulwod_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmulwod_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvmulwod_w_h(_1, _2); } // CHECK-LABEL: @xvmulwod_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.b(<32 x i8> [[_124]], <32 x i8> 
[[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmulwod_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvmulwod_h_b(_1, _2); } // CHECK-LABEL: @xvmulwod_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmulwod_q_du(v4u64 _1, v4u64 _2) { return __lasx_xvmulwod_q_du(_1, _2); } // CHECK-LABEL: @xvmulwod_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmulwod_d_wu(v8u32 _1, v8u32 _2) { return __lasx_xvmulwod_d_wu(_1, _2); } // CHECK-LABEL: @xvmulwod_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmulwod_w_hu(v16u16 _1, v16u16 _2) { return __lasx_xvmulwod_w_hu(_1, _2); } // CHECK-LABEL: @xvmulwod_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmulwod_h_bu(v32u8 _1, v32u8 _2) { return __lasx_xvmulwod_h_bu(_1, _2); } // CHECK-LABEL: @xvaddwev_d_wu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.wu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] 
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.wu.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvaddwev_d_wu_w(v8u32 _1, v8i32 _2) { return __lasx_xvaddwev_d_wu_w(_1, _2); } // CHECK-LABEL: @xvaddwev_w_hu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.hu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.hu.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvaddwev_w_hu_h(v16u16 _1, v16i16 _2) { return __lasx_xvaddwev_w_hu_h(_1, _2); } // CHECK-LABEL: @xvaddwev_h_bu_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.bu.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.bu.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvaddwev_h_bu_b(v32u8 _1, v32i8 _2) { return __lasx_xvaddwev_h_bu_b(_1, _2); } // CHECK-LABEL: @xvmulwev_d_wu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.wu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.wu.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmulwev_d_wu_w(v8u32 _1, v8i32 _2) { return __lasx_xvmulwev_d_wu_w(_1, _2); } // CHECK-LABEL: @xvmulwev_w_hu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.hu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.hu.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmulwev_w_hu_h(v16u16 _1, v16i16 _2) { return __lasx_xvmulwev_w_hu_h(_1, _2); } // CHECK-LABEL: @xvmulwev_h_bu_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.bu.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// 
CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.bu.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmulwev_h_bu_b(v32u8 _1, v32i8 _2) { return __lasx_xvmulwev_h_bu_b(_1, _2); } // CHECK-LABEL: @xvaddwod_d_wu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.wu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.wu.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvaddwod_d_wu_w(v8u32 _1, v8i32 _2) { return __lasx_xvaddwod_d_wu_w(_1, _2); } // CHECK-LABEL: @xvaddwod_w_hu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.hu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.hu.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvaddwod_w_hu_h(v16u16 _1, v16i16 _2) { return __lasx_xvaddwod_w_hu_h(_1, _2); } // CHECK-LABEL: @xvaddwod_h_bu_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.bu.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.bu.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvaddwod_h_bu_b(v32u8 _1, v32i8 _2) { return __lasx_xvaddwod_h_bu_b(_1, _2); } // CHECK-LABEL: @xvmulwod_d_wu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.wu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.wu.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmulwod_d_wu_w(v8u32 _1, v8i32 _2) { return __lasx_xvmulwod_d_wu_w(_1, _2); } // CHECK-LABEL: @xvmulwod_w_hu_h( // CHECK-NEXT: entry: -// 
CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.hu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.hu.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmulwod_w_hu_h(v16u16 _1, v16i16 _2) { return __lasx_xvmulwod_w_hu_h(_1, _2); } // CHECK-LABEL: @xvmulwod_h_bu_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.bu.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.bu.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmulwod_h_bu_b(v32u8 _1, v32i8 _2) { return __lasx_xvmulwod_h_bu_b(_1, _2); } // CHECK-LABEL: @xvhaddw_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvhaddw_q_d(v4i64 _1, v4i64 _2) { return __lasx_xvhaddw_q_d(_1, _2); } // CHECK-LABEL: @xvhaddw_qu_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.qu.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.qu.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvhaddw_qu_du(v4u64 _1, v4u64 _2) { return __lasx_xvhaddw_qu_du(_1, _2); } // CHECK-LABEL: @xvhsubw_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvhsubw_q_d(v4i64 _1, v4i64 _2) { 
return __lasx_xvhsubw_q_d(_1, _2); } // CHECK-LABEL: @xvhsubw_qu_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.qu.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.qu.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvhsubw_qu_du(v4u64 _1, v4u64 _2) { return __lasx_xvhsubw_qu_du(_1, _2); } // CHECK-LABEL: @xvmaddwev_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmaddwev_q_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __lasx_xvmaddwev_q_d(_1, _2, _3); } // CHECK-LABEL: @xvmaddwev_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.w(<4 x i64> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.w(<4 x i64> [[_1]], <8 x i32> [[_235]], <8 x i32> [[_346]]) +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmaddwev_d_w(v4i64 _1, v8i32 _2, v8i32 _3) { return __lasx_xvmaddwev_d_w(_1, _2, _3); } // CHECK-LABEL: @xvmaddwev_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.h(<8 x i32> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.h(<8 x i32> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]]) +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmaddwev_w_h(v8i32 _1, v16i16 _2, v16i16 _3) { return __lasx_xvmaddwev_w_h(_1, _2, _3); } // CHECK-LABEL: @xvmaddwev_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.b(<16 x i16> [[_1:%.*]], 
<32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.b(<16 x i16> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]]) +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmaddwev_h_b(v16i16 _1, v32i8 _2, v32i8 _3) { return __lasx_xvmaddwev_h_b(_1, _2, _3); } // CHECK-LABEL: @xvmaddwev_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvmaddwev_q_du(v4u64 _1, v4u64 _2, v4u64 _3) { return __lasx_xvmaddwev_q_du(_1, _2, _3); } // CHECK-LABEL: @xvmaddwev_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.wu(<4 x i64> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.wu(<4 x i64> [[_1]], <8 x i32> [[_235]], <8 x i32> [[_346]]) +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvmaddwev_d_wu(v4u64 _1, v8u32 _2, v8u32 _3) { return __lasx_xvmaddwev_d_wu(_1, _2, _3); } // CHECK-LABEL: @xvmaddwev_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.hu(<8 x i32> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.hu(<8 x i32> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]]) +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvmaddwev_w_hu(v8u32 _1, v16u16 _2, v16u16 _3) { return __lasx_xvmaddwev_w_hu(_1, _2, _3); } // CHECK-LABEL: @xvmaddwev_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.bu(<16 x i16> [[_1:%.*]], <32 x i8> 
[[_2:%.*]], <32 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.bu(<16 x i16> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]]) +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvmaddwev_h_bu(v16u16 _1, v32u8 _2, v32u8 _3) { return __lasx_xvmaddwev_h_bu(_1, _2, _3); } // CHECK-LABEL: @xvmaddwod_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmaddwod_q_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __lasx_xvmaddwod_q_d(_1, _2, _3); } // CHECK-LABEL: @xvmaddwod_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.w(<4 x i64> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.w(<4 x i64> [[_1]], <8 x i32> [[_235]], <8 x i32> [[_346]]) +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmaddwod_d_w(v4i64 _1, v8i32 _2, v8i32 _3) { return __lasx_xvmaddwod_d_w(_1, _2, _3); } // CHECK-LABEL: @xvmaddwod_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.h(<8 x i32> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.h(<8 x i32> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]]) +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmaddwod_w_h(v8i32 _1, v16i16 _2, v16i16 _3) { return __lasx_xvmaddwod_w_h(_1, _2, _3); } // CHECK-LABEL: @xvmaddwod_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.b(<16 x i16> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]]) 
-// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.b(<16 x i16> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]]) +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmaddwod_h_b(v16i16 _1, v32i8 _2, v32i8 _3) { return __lasx_xvmaddwod_h_b(_1, _2, _3); } // CHECK-LABEL: @xvmaddwod_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvmaddwod_q_du(v4u64 _1, v4u64 _2, v4u64 _3) { return __lasx_xvmaddwod_q_du(_1, _2, _3); } // CHECK-LABEL: @xvmaddwod_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.wu(<4 x i64> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.wu(<4 x i64> [[_1]], <8 x i32> [[_235]], <8 x i32> [[_346]]) +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvmaddwod_d_wu(v4u64 _1, v8u32 _2, v8u32 _3) { return __lasx_xvmaddwod_d_wu(_1, _2, _3); } // CHECK-LABEL: @xvmaddwod_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.hu(<8 x i32> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.hu(<8 x i32> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]]) +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvmaddwod_w_hu(v8u32 _1, v16u16 _2, v16u16 _3) { return __lasx_xvmaddwod_w_hu(_1, _2, _3); } // CHECK-LABEL: @xvmaddwod_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.bu(<16 x i16> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret 
<16 x i16> [[TMP0]] +// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.bu(<16 x i16> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]]) +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvmaddwod_h_bu(v16u16 _1, v32u8 _2, v32u8 _3) { return __lasx_xvmaddwod_h_bu(_1, _2, _3); } // CHECK-LABEL: @xvmaddwev_q_du_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.du.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmaddwev_q_du_d(v4i64 _1, v4u64 _2, v4i64 _3) { return __lasx_xvmaddwev_q_du_d(_1, _2, _3); } // CHECK-LABEL: @xvmaddwev_d_wu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.wu.w(<4 x i64> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.wu.w(<4 x i64> [[_1]], <8 x i32> [[_235]], <8 x i32> [[_346]]) +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmaddwev_d_wu_w(v4i64 _1, v8u32 _2, v8i32 _3) { return __lasx_xvmaddwev_d_wu_w(_1, _2, _3); } // CHECK-LABEL: @xvmaddwev_w_hu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.hu.h(<8 x i32> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.hu.h(<8 x i32> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]]) +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmaddwev_w_hu_h(v8i32 _1, v16u16 _2, v16i16 _3) { return __lasx_xvmaddwev_w_hu_h(_1, _2, _3); } // CHECK-LABEL: @xvmaddwev_h_bu_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.bu.b(<16 x i16> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]]) -// 
CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.bu.b(<16 x i16> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]]) +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmaddwev_h_bu_b(v16i16 _1, v32u8 _2, v32i8 _3) { return __lasx_xvmaddwev_h_bu_b(_1, _2, _3); } // CHECK-LABEL: @xvmaddwod_q_du_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.du.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmaddwod_q_du_d(v4i64 _1, v4u64 _2, v4i64 _3) { return __lasx_xvmaddwod_q_du_d(_1, _2, _3); } // CHECK-LABEL: @xvmaddwod_d_wu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.wu.w(<4 x i64> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.wu.w(<4 x i64> [[_1]], <8 x i32> [[_235]], <8 x i32> [[_346]]) +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmaddwod_d_wu_w(v4i64 _1, v8u32 _2, v8i32 _3) { return __lasx_xvmaddwod_d_wu_w(_1, _2, _3); } // CHECK-LABEL: @xvmaddwod_w_hu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.hu.h(<8 x i32> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.hu.h(<8 x i32> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]]) +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmaddwod_w_hu_h(v8i32 _1, v16u16 _2, v16i16 _3) { return __lasx_xvmaddwod_w_hu_h(_1, _2, _3); } // CHECK-LABEL: @xvmaddwod_h_bu_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.bu.b(<16 x i16> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 
x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.bu.b(<16 x i16> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]]) +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmaddwod_h_bu_b(v16i16 _1, v32u8 _2, v32i8 _3) { return __lasx_xvmaddwod_h_bu_b(_1, _2, _3); } // CHECK-LABEL: @xvrotr_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrotr.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrotr.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvrotr_b(v32i8 _1, v32i8 _2) { return __lasx_xvrotr_b(_1, _2); } // CHECK-LABEL: @xvrotr_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrotr.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrotr.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvrotr_h(v16i16 _1, v16i16 _2) { return __lasx_xvrotr_h(_1, _2); } // CHECK-LABEL: @xvrotr_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrotr.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrotr.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvrotr_w(v8i32 _1, v8i32 _2) { return __lasx_xvrotr_w(_1, _2); } // CHECK-LABEL: @xvrotr_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrotr.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrotr.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvrotr_d(v4i64 _1, v4i64 _2) { return __lasx_xvrotr_d(_1, _2); } // 
CHECK-LABEL: @xvadd_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadd.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadd.q(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvadd_q(v4i64 _1, v4i64 _2) { return __lasx_xvadd_q(_1, _2); } // CHECK-LABEL: @xvsub_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsub.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsub.q(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsub_q(v4i64 _1, v4i64 _2) { return __lasx_xvsub_q(_1, _2); } // CHECK-LABEL: @xvaddwev_q_du_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.du.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvaddwev_q_du_d(v4u64 _1, v4i64 _2) { return __lasx_xvaddwev_q_du_d(_1, _2); } // CHECK-LABEL: @xvaddwod_q_du_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.du.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvaddwod_q_du_d(v4u64 _1, v4i64 _2) { return __lasx_xvaddwod_q_du_d(_1, _2); } // CHECK-LABEL: @xvmulwev_q_du_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.du.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmulwev_q_du_d(v4u64 _1, v4i64 _2) { return 
__lasx_xvmulwev_q_du_d(_1, _2); } // CHECK-LABEL: @xvmulwod_q_du_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.du.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmulwod_q_du_d(v4u64 _1, v4i64 _2) { return __lasx_xvmulwod_q_du_d(_1, _2); } // CHECK-LABEL: @xvmskgez_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmskgez.b(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmskgez.b(<32 x i8> [[_112]]) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvmskgez_b(v32i8 _1) { return __lasx_xvmskgez_b(_1); } // CHECK-LABEL: @xvmsknz_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmsknz.b(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmsknz.b(<32 x i8> [[_112]]) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvmsknz_b(v32i8 _1) { return __lasx_xvmsknz_b(_1); } // CHECK-LABEL: @xvexth_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvexth.h.b(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvexth.h.b(<32 x i8> [[_112]]) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvexth_h_b(v32i8 _1) { return __lasx_xvexth_h_b(_1); } // CHECK-LABEL: @xvexth_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvexth.w.h(<16 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvexth.w.h(<16 x i16> [[_112]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvexth_w_h(v16i16 _1) { return __lasx_xvexth_w_h(_1); } // CHECK-LABEL: @xvexth_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.d.w(<8 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.d.w(<8 x i32> [[_112]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret 
void // v4i64 xvexth_d_w(v8i32 _1) { return __lasx_xvexth_d_w(_1); } // CHECK-LABEL: @xvexth_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.q.d(<4 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.q.d(<4 x i64> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvexth_q_d(v4i64 _1) { return __lasx_xvexth_q_d(_1); } // CHECK-LABEL: @xvexth_hu_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvexth.hu.bu(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvexth.hu.bu(<32 x i8> [[_112]]) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvexth_hu_bu(v32u8 _1) { return __lasx_xvexth_hu_bu(_1); } // CHECK-LABEL: @xvexth_wu_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvexth.wu.hu(<16 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvexth.wu.hu(<16 x i16> [[_112]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvexth_wu_hu(v16u16 _1) { return __lasx_xvexth_wu_hu(_1); } // CHECK-LABEL: @xvexth_du_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.du.wu(<8 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.du.wu(<8 x i32> [[_112]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvexth_du_wu(v8u32 _1) { return __lasx_xvexth_du_wu(_1); } // CHECK-LABEL: @xvexth_qu_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.qu.du(<4 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.qu.du(<4 x i64> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvexth_qu_du(v4u64 _1) { return __lasx_xvexth_qu_du(_1); } // CHECK-LABEL: @xvrotri_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrotri.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrotri.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvrotri_b(v32i8 _1) { return __lasx_xvrotri_b(_1, 1); } // CHECK-LABEL: 
@xvrotri_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrotri.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrotri.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvrotri_h(v16i16 _1) { return __lasx_xvrotri_h(_1, 1); } // CHECK-LABEL: @xvrotri_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrotri.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrotri.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvrotri_w(v8i32 _1) { return __lasx_xvrotri_w(_1, 1); } // CHECK-LABEL: @xvrotri_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrotri.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrotri.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvrotri_d(v4i64 _1) { return __lasx_xvrotri_d(_1, 1); } // CHECK-LABEL: @xvextl_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextl.q.d(<4 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextl.q.d(<4 x i64> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvextl_q_d(v4i64 _1) { return __lasx_xvextl_q_d(_1); } // CHECK-LABEL: @xvsrlni_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlni.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsrlni_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvsrlni_b_h(_1, _2, 1); } // CHECK-LABEL: @xvsrlni_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlni.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlni.h.w(<16 x i16> [[_1]], <16 x i16> 
[[_2]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsrlni_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvsrlni_h_w(_1, _2, 1); } // CHECK-LABEL: @xvsrlni_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlni.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsrlni_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvsrlni_w_d(_1, _2, 1); } // CHECK-LABEL: @xvsrlni_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlni.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsrlni_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvsrlni_d_q(_1, _2, 1); } // CHECK-LABEL: @xvsrlrni_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlrni.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlrni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsrlrni_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvsrlrni_b_h(_1, _2, 1); } // CHECK-LABEL: @xvsrlrni_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlrni.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlrni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsrlrni_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvsrlrni_h_w(_1, _2, 1); } // CHECK-LABEL: @xvsrlrni_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlrni.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 
32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlrni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsrlrni_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvsrlrni_w_d(_1, _2, 1); } // CHECK-LABEL: @xvsrlrni_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlrni.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlrni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsrlrni_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvsrlrni_d_q(_1, _2, 1); } // CHECK-LABEL: @xvssrlni_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlni.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvssrlni_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvssrlni_b_h(_1, _2, 1); } // CHECK-LABEL: @xvssrlni_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlni.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvssrlni_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvssrlni_h_w(_1, _2, 1); } // CHECK-LABEL: @xvssrlni_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlni.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvssrlni_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvssrlni_w_d(_1, _2, 1); } // CHECK-LABEL: @xvssrlni_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlni.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: 
[[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvssrlni_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvssrlni_d_q(_1, _2, 1); } // CHECK-LABEL: @xvssrlni_bu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlni.bu.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlni.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvssrlni_bu_h(v32u8 _1, v32i8 _2) { return __lasx_xvssrlni_bu_h(_1, _2, 1); } // CHECK-LABEL: @xvssrlni_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlni.hu.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlni.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvssrlni_hu_w(v16u16 _1, v16i16 _2) { return __lasx_xvssrlni_hu_w(_1, _2, 1); } // CHECK-LABEL: @xvssrlni_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlni.wu.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlni.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvssrlni_wu_d(v8u32 _1, v8i32 _2) { return __lasx_xvssrlni_wu_d(_1, _2, 1); } // CHECK-LABEL: @xvssrlni_du_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlni.du.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlni.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvssrlni_du_q(v4u64 _1, v4i64 _2) { return __lasx_xvssrlni_du_q(_1, _2, 1); } // CHECK-LABEL: @xvssrlrni_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = 
tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvssrlrni_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvssrlrni_b_h(_1, _2, 1); } // CHECK-LABEL: @xvssrlrni_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvssrlrni_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvssrlrni_h_w(_1, _2, 1); } // CHECK-LABEL: @xvssrlrni_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrni.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvssrlrni_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvssrlrni_w_d(_1, _2, 1); } // CHECK-LABEL: @xvssrlrni_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvssrlrni_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvssrlrni_d_q(_1, _2, 1); } // CHECK-LABEL: @xvssrlrni_bu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.bu.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // 
v32u8 xvssrlrni_bu_h(v32u8 _1, v32i8 _2) { return __lasx_xvssrlrni_bu_h(_1, _2, 1); } // CHECK-LABEL: @xvssrlrni_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.hu.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvssrlrni_hu_w(v16u16 _1, v16i16 _2) { return __lasx_xvssrlrni_hu_w(_1, _2, 1); } // CHECK-LABEL: @xvssrlrni_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrni.wu.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrni.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvssrlrni_wu_d(v8u32 _1, v8i32 _2) { return __lasx_xvssrlrni_wu_d(_1, _2, 1); } // CHECK-LABEL: @xvssrlrni_du_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.du.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvssrlrni_du_q(v4u64 _1, v4i64 _2) { return __lasx_xvssrlrni_du_q(_1, _2, 1); } // CHECK-LABEL: @xvsrani_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrani.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrani.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsrani_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvsrani_b_h(_1, _2, 1); } // CHECK-LABEL: @xvsrani_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrani.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> 
@llvm.loongarch.lasx.xvsrani.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsrani_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvsrani_h_w(_1, _2, 1); } // CHECK-LABEL: @xvsrani_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrani.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrani.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsrani_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvsrani_w_d(_1, _2, 1); } // CHECK-LABEL: @xvsrani_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrani.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrani.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsrani_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvsrani_d_q(_1, _2, 1); } // CHECK-LABEL: @xvsrarni_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrarni.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrarni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsrarni_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvsrarni_b_h(_1, _2, 1); } // CHECK-LABEL: @xvsrarni_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrarni.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrarni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsrarni_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvsrarni_h_w(_1, _2, 1); } // CHECK-LABEL: @xvsrarni_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrarni.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrarni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsrarni_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvsrarni_w_d(_1, _2, 1); } // CHECK-LABEL: @xvsrarni_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrarni.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrarni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsrarni_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvsrarni_d_q(_1, _2, 1); } // CHECK-LABEL: @xvssrani_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrani.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrani.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvssrani_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvssrani_b_h(_1, _2, 1); } // CHECK-LABEL: @xvssrani_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrani.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrani.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvssrani_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvssrani_h_w(_1, _2, 1); } // CHECK-LABEL: @xvssrani_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrani.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrani.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvssrani_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvssrani_w_d(_1, _2, 1); } // CHECK-LABEL: @xvssrani_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrani.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], 
i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrani.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvssrani_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvssrani_d_q(_1, _2, 1); } // CHECK-LABEL: @xvssrani_bu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrani.bu.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrani.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvssrani_bu_h(v32u8 _1, v32i8 _2) { return __lasx_xvssrani_bu_h(_1, _2, 1); } // CHECK-LABEL: @xvssrani_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrani.hu.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrani.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvssrani_hu_w(v16u16 _1, v16i16 _2) { return __lasx_xvssrani_hu_w(_1, _2, 1); } // CHECK-LABEL: @xvssrani_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrani.wu.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrani.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvssrani_wu_d(v8u32 _1, v8i32 _2) { return __lasx_xvssrani_wu_d(_1, _2, 1); } // CHECK-LABEL: @xvssrani_du_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrani.du.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrani.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvssrani_du_q(v4u64 _1, v4i64 _2) { return __lasx_xvssrani_du_q(_1, _2, 1); } // CHECK-LABEL: 
@xvssrarni_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarni.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvssrarni_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvssrarni_b_h(_1, _2, 1); } // CHECK-LABEL: @xvssrarni_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarni.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvssrarni_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvssrarni_h_w(_1, _2, 1); } // CHECK-LABEL: @xvssrarni_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarni.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvssrarni_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvssrarni_w_d(_1, _2, 1); } // CHECK-LABEL: @xvssrarni_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrarni.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrarni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvssrarni_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvssrarni_d_q(_1, _2, 1); } // CHECK-LABEL: @xvssrarni_bu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarni.bu.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarni.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr 
[[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvssrarni_bu_h(v32u8 _1, v32i8 _2) { return __lasx_xvssrarni_bu_h(_1, _2, 1); } // CHECK-LABEL: @xvssrarni_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarni.hu.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarni.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvssrarni_hu_w(v16u16 _1, v16i16 _2) { return __lasx_xvssrarni_hu_w(_1, _2, 1); } // CHECK-LABEL: @xvssrarni_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarni.wu.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarni.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvssrarni_wu_d(v8u32 _1, v8i32 _2) { return __lasx_xvssrarni_wu_d(_1, _2, 1); } // CHECK-LABEL: @xvssrarni_du_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrarni.du.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrarni.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvssrarni_du_q(v4u64 _1, v4i64 _2) { return __lasx_xvssrarni_du_q(_1, _2, 1); } // CHECK-LABEL: @xbnz_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.b(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.b(<32 x i8> [[_1]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int xbnz_b(v32u8 _1) { return __lasx_xbnz_b(_1); } // CHECK-LABEL: @xbnz_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.d(<4 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.d(<4 x i64> [[_1]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int xbnz_d(v4u64 _1) { return __lasx_xbnz_d(_1); } // CHECK-LABEL: @xbnz_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.h(<16 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa 
[[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.h(<16 x i16> [[_1]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int xbnz_h(v16u16 _1) { return __lasx_xbnz_h(_1); } // CHECK-LABEL: @xbnz_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.v(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.v(<32 x i8> [[_1]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int xbnz_v(v32u8 _1) { return __lasx_xbnz_v(_1); } // CHECK-LABEL: @xbnz_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.w(<8 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.w(<8 x i32> [[_1]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int xbnz_w(v8u32 _1) { return __lasx_xbnz_w(_1); } // CHECK-LABEL: @xbz_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.b(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.b(<32 x i8> [[_1]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int xbz_b(v32u8 _1) { return __lasx_xbz_b(_1); } // CHECK-LABEL: @xbz_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.d(<4 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.d(<4 x i64> [[_1]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int xbz_d(v4u64 _1) { return __lasx_xbz_d(_1); } // CHECK-LABEL: @xbz_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.h(<16 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.h(<16 x i16> [[_1]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int xbz_h(v16u16 _1) { return __lasx_xbz_h(_1); } // CHECK-LABEL: @xbz_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.v(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.v(<32 x i8> [[_1]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int xbz_v(v32u8 _1) { return __lasx_xbz_v(_1); } // CHECK-LABEL: @xbz_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.w(<8 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.w(<8 x i32> [[_1]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int xbz_w(v8u32 _1) { return __lasx_xbz_w(_1); } // CHECK-LABEL: @xvfcmp_caf_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.caf.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr 
[[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.caf.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_caf_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_caf_d(_1, _2); } // CHECK-LABEL: @xvfcmp_caf_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.caf.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.caf.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_caf_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_caf_s(_1, _2); } // CHECK-LABEL: @xvfcmp_ceq_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.ceq.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.ceq.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_ceq_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_ceq_d(_1, _2); } // CHECK-LABEL: @xvfcmp_ceq_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.ceq.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.ceq.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_ceq_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_ceq_s(_1, _2); } // CHECK-LABEL: @xvfcmp_cle_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cle.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cle.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_cle_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cle_d(_1, _2); } // CHECK-LABEL: @xvfcmp_cle_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cle.s(<8 x float> 
[[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cle.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_cle_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_cle_s(_1, _2); } // CHECK-LABEL: @xvfcmp_clt_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.clt.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.clt.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_clt_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_clt_d(_1, _2); } // CHECK-LABEL: @xvfcmp_clt_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.clt.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.clt.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_clt_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_clt_s(_1, _2); } // CHECK-LABEL: @xvfcmp_cne_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cne.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cne.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_cne_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cne_d(_1, _2); } // CHECK-LABEL: @xvfcmp_cne_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cne.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cne.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_cne_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_cne_s(_1, _2); } // CHECK-LABEL: @xvfcmp_cor_d( // 
CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cor.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cor.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_cor_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cor_d(_1, _2); } // CHECK-LABEL: @xvfcmp_cor_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cor.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cor.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_cor_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_cor_s(_1, _2); } // CHECK-LABEL: @xvfcmp_cueq_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cueq.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cueq.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_cueq_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cueq_d(_1, _2); } // CHECK-LABEL: @xvfcmp_cueq_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cueq.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cueq.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_cueq_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_cueq_s(_1, _2); } // CHECK-LABEL: @xvfcmp_cule_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cule.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cule.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-NEXT: ret void // v4i64 xvfcmp_cule_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cule_d(_1, _2); } // CHECK-LABEL: @xvfcmp_cule_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cule.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cule.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_cule_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_cule_s(_1, _2); } // CHECK-LABEL: @xvfcmp_cult_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cult.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cult.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_cult_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cult_d(_1, _2); } // CHECK-LABEL: @xvfcmp_cult_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cult.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cult.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_cult_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_cult_s(_1, _2); } // CHECK-LABEL: @xvfcmp_cun_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cun.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cun.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_cun_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cun_d(_1, _2); } // CHECK-LABEL: @xvfcmp_cune_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cune.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> 
@llvm.loongarch.lasx.xvfcmp.cune.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_cune_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cune_d(_1, _2); } // CHECK-LABEL: @xvfcmp_cune_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cune.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cune.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_cune_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_cune_s(_1, _2); } // CHECK-LABEL: @xvfcmp_cun_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cun.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cun.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_cun_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_cun_s(_1, _2); } // CHECK-LABEL: @xvfcmp_saf_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.saf.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.saf.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_saf_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_saf_d(_1, _2); } // CHECK-LABEL: @xvfcmp_saf_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.saf.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.saf.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_saf_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_saf_s(_1, _2); } // CHECK-LABEL: @xvfcmp_seq_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.seq.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: 
[[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.seq.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_seq_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_seq_d(_1, _2); } // CHECK-LABEL: @xvfcmp_seq_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.seq.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.seq.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_seq_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_seq_s(_1, _2); } // CHECK-LABEL: @xvfcmp_sle_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sle.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sle.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_sle_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_sle_d(_1, _2); } // CHECK-LABEL: @xvfcmp_sle_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sle.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sle.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_sle_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sle_s(_1, _2); } // CHECK-LABEL: @xvfcmp_slt_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.slt.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.slt.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_slt_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_slt_d(_1, _2); } // CHECK-LABEL: @xvfcmp_slt_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.slt.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> 
[[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.slt.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_slt_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_slt_s(_1, _2); } // CHECK-LABEL: @xvfcmp_sne_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sne.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sne.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_sne_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_sne_d(_1, _2); } // CHECK-LABEL: @xvfcmp_sne_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sne.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sne.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_sne_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sne_s(_1, _2); } // CHECK-LABEL: @xvfcmp_sor_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sor.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sor.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_sor_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_sor_d(_1, _2); } // CHECK-LABEL: @xvfcmp_sor_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sor.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sor.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_sor_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sor_s(_1, _2); } // CHECK-LABEL: @xvfcmp_sueq_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x 
i64> @llvm.loongarch.lasx.xvfcmp.sueq.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sueq.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_sueq_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_sueq_d(_1, _2); } // CHECK-LABEL: @xvfcmp_sueq_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sueq.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sueq.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_sueq_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sueq_s(_1, _2); } // CHECK-LABEL: @xvfcmp_sule_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sule.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sule.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_sule_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_sule_d(_1, _2); } // CHECK-LABEL: @xvfcmp_sule_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sule.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sule.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_sule_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sule_s(_1, _2); } // CHECK-LABEL: @xvfcmp_sult_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sult.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sult.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_sult_d(v4f64 _1, v4f64 
_2) { return __lasx_xvfcmp_sult_d(_1, _2); } // CHECK-LABEL: @xvfcmp_sult_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sult.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sult.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_sult_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sult_s(_1, _2); } // CHECK-LABEL: @xvfcmp_sun_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sun.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sun.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_sun_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_sun_d(_1, _2); } // CHECK-LABEL: @xvfcmp_sune_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sune.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sune.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_sune_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_sune_d(_1, _2); } // CHECK-LABEL: @xvfcmp_sune_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sune.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sune.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_sune_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sune_s(_1, _2); } // CHECK-LABEL: @xvfcmp_sun_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sun.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sun.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x 
i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_sun_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sun_s(_1, _2); } // CHECK-LABEL: @xvpickve_d_f( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvpickve.d.f(<4 x double> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvpickve.d.f(<4 x double> [[_1]], i32 1) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvpickve_d_f(v4f64 _1) { return __lasx_xvpickve_d_f(_1, 1); } // CHECK-LABEL: @xvpickve_w_f( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvpickve.w.f(<8 x float> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvpickve.w.f(<8 x float> [[_1]], i32 1) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvpickve_w_f(v8f32 _1) { return __lasx_xvpickve_w_f(_1, 1); } // CHECK-LABEL: @xvrepli_b( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrepli.b(i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvrepli_b() { return __lasx_xvrepli_b(1); } // CHECK-LABEL: @xvrepli_d( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrepli.d(i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvrepli_d() { return __lasx_xvrepli_d(1); } // CHECK-LABEL: @xvrepli_h( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrepli.h(i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvrepli_h() { return __lasx_xvrepli_h(1); } // CHECK-LABEL: @xvrepli_w( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrepli.w(i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvrepli_w() { return __lasx_xvrepli_w(1); } diff --git a/clang/test/CodeGen/LoongArch/lasx/builtin.c b/clang/test/CodeGen/LoongArch/lasx/builtin.c index 0185f2004d526..f52a23a5faea7 100644 --- a/clang/test/CodeGen/LoongArch/lasx/builtin.c +++ b/clang/test/CodeGen/LoongArch/lasx/builtin.c @@ -27,4426 +27,6382 @@ typedef double v4f64_d __attribute__((vector_size(32), aligned(8))); // CHECK-LABEL: @xvsll_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsll.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = 
tail call <32 x i8> @llvm.loongarch.lasx.xvsll.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsll_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsll_b(_1, _2); } // CHECK-LABEL: @xvsll_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsll.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsll.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsll_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsll_h(_1, _2); } // CHECK-LABEL: @xvsll_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsll.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsll.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsll_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsll_w(_1, _2); } // CHECK-LABEL: @xvsll_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsll.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsll.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsll_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsll_d(_1, _2); } // CHECK-LABEL: @xvslli_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslli.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslli.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvslli_b(v32i8 _1) { return __builtin_lasx_xvslli_b(_1, 1); } // CHECK-LABEL: @xvslli_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslli.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslli.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvslli_h(v16i16 _1) { return 
__builtin_lasx_xvslli_h(_1, 1); } // CHECK-LABEL: @xvslli_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslli.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslli.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvslli_w(v8i32 _1) { return __builtin_lasx_xvslli_w(_1, 1); } // CHECK-LABEL: @xvslli_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslli.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslli.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvslli_d(v4i64 _1) { return __builtin_lasx_xvslli_d(_1, 1); } // CHECK-LABEL: @xvsra_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsra.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsra.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsra_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsra_b(_1, _2); } // CHECK-LABEL: @xvsra_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsra.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsra.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsra_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsra_h(_1, _2); } // CHECK-LABEL: @xvsra_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsra.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsra.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsra_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsra_w(_1, _2); } // CHECK-LABEL: @xvsra_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsra.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = 
load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsra.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsra_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsra_d(_1, _2); } // CHECK-LABEL: @xvsrai_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrai.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrai.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsrai_b(v32i8 _1) { return __builtin_lasx_xvsrai_b(_1, 1); } // CHECK-LABEL: @xvsrai_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrai.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrai.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsrai_h(v16i16 _1) { return __builtin_lasx_xvsrai_h(_1, 1); } // CHECK-LABEL: @xvsrai_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrai.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrai.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsrai_w(v8i32 _1) { return __builtin_lasx_xvsrai_w(_1, 1); } // CHECK-LABEL: @xvsrai_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrai.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrai.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsrai_d(v4i64 _1) { return __builtin_lasx_xvsrai_d(_1, 1); } // CHECK-LABEL: @xvsrar_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrar.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrar.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsrar_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsrar_b(_1, _2); } // CHECK-LABEL: @xvsrar_h( // CHECK-NEXT: entry: -// 
CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrar.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrar.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsrar_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrar_h(_1, _2); } // CHECK-LABEL: @xvsrar_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrar.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrar.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsrar_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrar_w(_1, _2); } // CHECK-LABEL: @xvsrar_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrar.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrar.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsrar_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrar_d(_1, _2); } // CHECK-LABEL: @xvsrari_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrari.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrari.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsrari_b(v32i8 _1) { return __builtin_lasx_xvsrari_b(_1, 1); } // CHECK-LABEL: @xvsrari_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrari.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrari.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsrari_h(v16i16 _1) { return __builtin_lasx_xvsrari_h(_1, 1); } // CHECK-LABEL: @xvsrari_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrari.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrari.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsrari_w(v8i32 _1) { return __builtin_lasx_xvsrari_w(_1, 1); } // CHECK-LABEL: @xvsrari_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrari.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrari.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsrari_d(v4i64 _1) { return __builtin_lasx_xvsrari_d(_1, 1); } // CHECK-LABEL: @xvsrl_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrl.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrl.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsrl_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsrl_b(_1, _2); } // CHECK-LABEL: @xvsrl_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrl.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrl.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsrl_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrl_h(_1, _2); } // CHECK-LABEL: @xvsrl_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrl.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrl.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsrl_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrl_w(_1, _2); } // CHECK-LABEL: @xvsrl_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrl.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrl.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr 
[[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsrl_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrl_d(_1, _2); } // CHECK-LABEL: @xvsrli_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrli.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrli.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsrli_b(v32i8 _1) { return __builtin_lasx_xvsrli_b(_1, 1); } // CHECK-LABEL: @xvsrli_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrli.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrli.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsrli_h(v16i16 _1) { return __builtin_lasx_xvsrli_h(_1, 1); } // CHECK-LABEL: @xvsrli_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrli.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrli.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsrli_w(v8i32 _1) { return __builtin_lasx_xvsrli_w(_1, 1); } // CHECK-LABEL: @xvsrli_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrli.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrli.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsrli_d(v4i64 _1) { return __builtin_lasx_xvsrli_d(_1, 1); } // CHECK-LABEL: @xvsrlr_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlr.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlr.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsrlr_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsrlr_b(_1, _2); } // CHECK-LABEL: @xvsrlr_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlr.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr 
[[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlr.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsrlr_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrlr_h(_1, _2); } // CHECK-LABEL: @xvsrlr_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlr.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlr.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsrlr_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrlr_w(_1, _2); } // CHECK-LABEL: @xvsrlr_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlr.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlr.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsrlr_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrlr_d(_1, _2); } // CHECK-LABEL: @xvsrlri_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlri.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlri.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsrlri_b(v32i8 _1) { return __builtin_lasx_xvsrlri_b(_1, 1); } // CHECK-LABEL: @xvsrlri_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlri.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlri.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsrlri_h(v16i16 _1) { return __builtin_lasx_xvsrlri_h(_1, 1); } // CHECK-LABEL: @xvsrlri_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlri.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlri.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsrlri_w(v8i32 _1) { return __builtin_lasx_xvsrlri_w(_1, 1); } // 
CHECK-LABEL: @xvsrlri_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlri.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlri.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsrlri_d(v4i64 _1) { return __builtin_lasx_xvsrlri_d(_1, 1); } // CHECK-LABEL: @xvbitclr_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitclr.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitclr.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvbitclr_b(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvbitclr_b(_1, _2); } // CHECK-LABEL: @xvbitclr_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitclr.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitclr.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvbitclr_h(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvbitclr_h(_1, _2); } // CHECK-LABEL: @xvbitclr_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitclr.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitclr.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvbitclr_w(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvbitclr_w(_1, _2); } // CHECK-LABEL: @xvbitclr_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitclr.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitclr.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvbitclr_d(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvbitclr_d(_1, _2); } // CHECK-LABEL: @xvbitclri_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail 
call <32 x i8> @llvm.loongarch.lasx.xvbitclri.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitclri.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvbitclri_b(v32u8 _1) { return __builtin_lasx_xvbitclri_b(_1, 1); } // CHECK-LABEL: @xvbitclri_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitclri.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitclri.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvbitclri_h(v16u16 _1) { return __builtin_lasx_xvbitclri_h(_1, 1); } // CHECK-LABEL: @xvbitclri_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitclri.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitclri.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvbitclri_w(v8u32 _1) { return __builtin_lasx_xvbitclri_w(_1, 1); } // CHECK-LABEL: @xvbitclri_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitclri.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitclri.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvbitclri_d(v4u64 _1) { return __builtin_lasx_xvbitclri_d(_1, 1); } // CHECK-LABEL: @xvbitset_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitset.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitset.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvbitset_b(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvbitset_b(_1, _2); } // CHECK-LABEL: @xvbitset_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitset.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitset.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// 
CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvbitset_h(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvbitset_h(_1, _2); } // CHECK-LABEL: @xvbitset_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitset.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitset.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvbitset_w(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvbitset_w(_1, _2); } // CHECK-LABEL: @xvbitset_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitset.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitset.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvbitset_d(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvbitset_d(_1, _2); } // CHECK-LABEL: @xvbitseti_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitseti.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitseti.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvbitseti_b(v32u8 _1) { return __builtin_lasx_xvbitseti_b(_1, 1); } // CHECK-LABEL: @xvbitseti_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitseti.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitseti.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvbitseti_h(v16u16 _1) { return __builtin_lasx_xvbitseti_h(_1, 1); } // CHECK-LABEL: @xvbitseti_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitseti.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitseti.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvbitseti_w(v8u32 _1) { return __builtin_lasx_xvbitseti_w(_1, 1); } // CHECK-LABEL: @xvbitseti_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> 
@llvm.loongarch.lasx.xvbitseti.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitseti.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvbitseti_d(v4u64 _1) { return __builtin_lasx_xvbitseti_d(_1, 1); } // CHECK-LABEL: @xvbitrev_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitrev.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitrev.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvbitrev_b(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvbitrev_b(_1, _2); } // CHECK-LABEL: @xvbitrev_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitrev.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitrev.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvbitrev_h(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvbitrev_h(_1, _2); } // CHECK-LABEL: @xvbitrev_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitrev.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitrev.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvbitrev_w(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvbitrev_w(_1, _2); } // CHECK-LABEL: @xvbitrev_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitrev.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitrev.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvbitrev_d(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvbitrev_d(_1, _2); } // CHECK-LABEL: @xvbitrevi_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitrevi.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: 
ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitrevi.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvbitrevi_b(v32u8 _1) { return __builtin_lasx_xvbitrevi_b(_1, 1); } // CHECK-LABEL: @xvbitrevi_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitrevi.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitrevi.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvbitrevi_h(v16u16 _1) { return __builtin_lasx_xvbitrevi_h(_1, 1); } // CHECK-LABEL: @xvbitrevi_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitrevi.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitrevi.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvbitrevi_w(v8u32 _1) { return __builtin_lasx_xvbitrevi_w(_1, 1); } // CHECK-LABEL: @xvbitrevi_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitrevi.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitrevi.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvbitrevi_d(v4u64 _1) { return __builtin_lasx_xvbitrevi_d(_1, 1); } // CHECK-LABEL: @xvadd_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvadd.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvadd.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvadd_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvadd_b(_1, _2); } // CHECK-LABEL: @xvadd_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvadd.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvadd.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // 
v16i16 xvadd_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvadd_h(_1, _2); } // CHECK-LABEL: @xvadd_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvadd.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvadd.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvadd_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvadd_w(_1, _2); } // CHECK-LABEL: @xvadd_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadd.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadd.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvadd_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvadd_d(_1, _2); } // CHECK-LABEL: @xvaddi_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvaddi.bu(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvaddi.bu(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvaddi_bu(v32i8 _1) { return __builtin_lasx_xvaddi_bu(_1, 1); } // CHECK-LABEL: @xvaddi_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddi.hu(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddi.hu(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvaddi_hu(v16i16 _1) { return __builtin_lasx_xvaddi_hu(_1, 1); } // CHECK-LABEL: @xvaddi_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddi.wu(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddi.wu(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvaddi_wu(v8i32 _1) { return __builtin_lasx_xvaddi_wu(_1, 1); } // CHECK-LABEL: @xvaddi_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddi.du(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddi.du(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvaddi_du(v4i64 _1) { return __builtin_lasx_xvaddi_du(_1, 1); } // CHECK-LABEL: @xvsub_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsub.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsub.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsub_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsub_b(_1, _2); } // CHECK-LABEL: @xvsub_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsub.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsub.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsub_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsub_h(_1, _2); } // CHECK-LABEL: @xvsub_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsub.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsub.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsub_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsub_w(_1, _2); } // CHECK-LABEL: @xvsub_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsub.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsub.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsub_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsub_d(_1, _2); } // CHECK-LABEL: @xvsubi_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsubi.bu(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsubi.bu(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], 
ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsubi_bu(v32i8 _1) { return __builtin_lasx_xvsubi_bu(_1, 1); } // CHECK-LABEL: @xvsubi_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubi.hu(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubi.hu(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsubi_hu(v16i16 _1) { return __builtin_lasx_xvsubi_hu(_1, 1); } // CHECK-LABEL: @xvsubi_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubi.wu(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubi.wu(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsubi_wu(v8i32 _1) { return __builtin_lasx_xvsubi_wu(_1, 1); } // CHECK-LABEL: @xvsubi_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubi.du(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubi.du(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsubi_du(v4i64 _1) { return __builtin_lasx_xvsubi_du(_1, 1); } // CHECK-LABEL: @xvmax_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmax.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmax.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvmax_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvmax_b(_1, _2); } // CHECK-LABEL: @xvmax_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmax.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmax.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmax_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvmax_h(_1, _2); } // CHECK-LABEL: @xvmax_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmax.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// 
CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmax.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmax_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvmax_w(_1, _2); } // CHECK-LABEL: @xvmax_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmax.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmax.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmax_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvmax_d(_1, _2); } // CHECK-LABEL: @xvmaxi_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmaxi.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmaxi.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvmaxi_b(v32i8 _1) { return __builtin_lasx_xvmaxi_b(_1, 1); } // CHECK-LABEL: @xvmaxi_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaxi.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaxi.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmaxi_h(v16i16 _1) { return __builtin_lasx_xvmaxi_h(_1, 1); } // CHECK-LABEL: @xvmaxi_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaxi.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaxi.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmaxi_w(v8i32 _1) { return __builtin_lasx_xvmaxi_w(_1, 1); } // CHECK-LABEL: @xvmaxi_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaxi.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaxi.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmaxi_d(v4i64 _1) { return __builtin_lasx_xvmaxi_d(_1, 1); } // CHECK-LABEL: @xvmax_bu( // 
CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmax.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmax.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvmax_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvmax_bu(_1, _2); } // CHECK-LABEL: @xvmax_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmax.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmax.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvmax_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvmax_hu(_1, _2); } // CHECK-LABEL: @xvmax_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmax.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmax.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvmax_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvmax_wu(_1, _2); } // CHECK-LABEL: @xvmax_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmax.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmax.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvmax_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvmax_du(_1, _2); } // CHECK-LABEL: @xvmaxi_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmaxi.bu(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmaxi.bu(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvmaxi_bu(v32u8 _1) { return __builtin_lasx_xvmaxi_bu(_1, 1); } // CHECK-LABEL: @xvmaxi_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaxi.hu(<16 x i16> [[_1:%.*]], 
i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaxi.hu(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvmaxi_hu(v16u16 _1) { return __builtin_lasx_xvmaxi_hu(_1, 1); } // CHECK-LABEL: @xvmaxi_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaxi.wu(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaxi.wu(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvmaxi_wu(v8u32 _1) { return __builtin_lasx_xvmaxi_wu(_1, 1); } // CHECK-LABEL: @xvmaxi_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaxi.du(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaxi.du(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvmaxi_du(v4u64 _1) { return __builtin_lasx_xvmaxi_du(_1, 1); } // CHECK-LABEL: @xvmin_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmin.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmin.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvmin_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvmin_b(_1, _2); } // CHECK-LABEL: @xvmin_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmin.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmin.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmin_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvmin_h(_1, _2); } // CHECK-LABEL: @xvmin_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmin.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmin.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store 
<8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmin_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvmin_w(_1, _2); } // CHECK-LABEL: @xvmin_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmin.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmin.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmin_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvmin_d(_1, _2); } // CHECK-LABEL: @xvmini_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmini.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmini.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvmini_b(v32i8 _1) { return __builtin_lasx_xvmini_b(_1, 1); } // CHECK-LABEL: @xvmini_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmini.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmini.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmini_h(v16i16 _1) { return __builtin_lasx_xvmini_h(_1, 1); } // CHECK-LABEL: @xvmini_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmini.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmini.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmini_w(v8i32 _1) { return __builtin_lasx_xvmini_w(_1, 1); } // CHECK-LABEL: @xvmini_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmini.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmini.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmini_d(v4i64 _1) { return __builtin_lasx_xvmini_d(_1, 1); } // CHECK-LABEL: @xvmin_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmin.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x 
i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmin.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvmin_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvmin_bu(_1, _2); } // CHECK-LABEL: @xvmin_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmin.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmin.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvmin_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvmin_hu(_1, _2); } // CHECK-LABEL: @xvmin_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmin.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmin.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvmin_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvmin_wu(_1, _2); } // CHECK-LABEL: @xvmin_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmin.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmin.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvmin_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvmin_du(_1, _2); } // CHECK-LABEL: @xvmini_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmini.bu(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmini.bu(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvmini_bu(v32u8 _1) { return __builtin_lasx_xvmini_bu(_1, 1); } // CHECK-LABEL: @xvmini_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmini.hu(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmini.hu(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], 
align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvmini_hu(v16u16 _1) { return __builtin_lasx_xvmini_hu(_1, 1); } // CHECK-LABEL: @xvmini_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmini.wu(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmini.wu(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvmini_wu(v8u32 _1) { return __builtin_lasx_xvmini_wu(_1, 1); } // CHECK-LABEL: @xvmini_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmini.du(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmini.du(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvmini_du(v4u64 _1) { return __builtin_lasx_xvmini_du(_1, 1); } // CHECK-LABEL: @xvseq_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvseq.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvseq.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvseq_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvseq_b(_1, _2); } // CHECK-LABEL: @xvseq_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvseq.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvseq.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvseq_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvseq_h(_1, _2); } // CHECK-LABEL: @xvseq_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvseq.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvseq.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvseq_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvseq_w(_1, _2); } // CHECK-LABEL: @xvseq_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvseq.d(<4 x i64> 
[[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvseq.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvseq_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvseq_d(_1, _2); } // CHECK-LABEL: @xvseqi_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvseqi.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvseqi.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvseqi_b(v32i8 _1) { return __builtin_lasx_xvseqi_b(_1, 1); } // CHECK-LABEL: @xvseqi_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvseqi.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvseqi.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvseqi_h(v16i16 _1) { return __builtin_lasx_xvseqi_h(_1, 1); } // CHECK-LABEL: @xvseqi_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvseqi.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvseqi.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvseqi_w(v8i32 _1) { return __builtin_lasx_xvseqi_w(_1, 1); } // CHECK-LABEL: @xvseqi_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvseqi.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvseqi.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvseqi_d(v4i64 _1) { return __builtin_lasx_xvseqi_d(_1, 1); } // CHECK-LABEL: @xvslt_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslt.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslt.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvslt_b(v32i8 _1, v32i8 
_2) { return __builtin_lasx_xvslt_b(_1, _2); } // CHECK-LABEL: @xvslt_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslt.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslt.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvslt_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvslt_h(_1, _2); } // CHECK-LABEL: @xvslt_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslt.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslt.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvslt_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvslt_w(_1, _2); } // CHECK-LABEL: @xvslt_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslt.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslt.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvslt_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvslt_d(_1, _2); } // CHECK-LABEL: @xvslti_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslti.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslti.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvslti_b(v32i8 _1) { return __builtin_lasx_xvslti_b(_1, 1); } // CHECK-LABEL: @xvslti_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslti.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslti.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvslti_h(v16i16 _1) { return __builtin_lasx_xvslti_h(_1, 1); } // CHECK-LABEL: @xvslti_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslti.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: 
[[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslti.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvslti_w(v8i32 _1) { return __builtin_lasx_xvslti_w(_1, 1); } // CHECK-LABEL: @xvslti_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslti.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslti.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvslti_d(v4i64 _1) { return __builtin_lasx_xvslti_d(_1, 1); } // CHECK-LABEL: @xvslt_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslt.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslt.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvslt_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvslt_bu(_1, _2); } // CHECK-LABEL: @xvslt_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslt.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslt.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvslt_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvslt_hu(_1, _2); } // CHECK-LABEL: @xvslt_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslt.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslt.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvslt_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvslt_wu(_1, _2); } // CHECK-LABEL: @xvslt_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslt.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> 
@llvm.loongarch.lasx.xvslt.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvslt_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvslt_du(_1, _2); } // CHECK-LABEL: @xvslti_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslti.bu(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslti.bu(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvslti_bu(v32u8 _1) { return __builtin_lasx_xvslti_bu(_1, 1); } // CHECK-LABEL: @xvslti_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslti.hu(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslti.hu(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvslti_hu(v16u16 _1) { return __builtin_lasx_xvslti_hu(_1, 1); } // CHECK-LABEL: @xvslti_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslti.wu(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslti.wu(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvslti_wu(v8u32 _1) { return __builtin_lasx_xvslti_wu(_1, 1); } // CHECK-LABEL: @xvslti_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslti.du(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslti.du(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvslti_du(v4u64 _1) { return __builtin_lasx_xvslti_du(_1, 1); } // CHECK-LABEL: @xvsle_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsle.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsle.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsle_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsle_b(_1, _2); } // CHECK-LABEL: @xvsle_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsle.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: 
[[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsle.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsle_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsle_h(_1, _2); } // CHECK-LABEL: @xvsle_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsle.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsle.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsle_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsle_w(_1, _2); } // CHECK-LABEL: @xvsle_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsle.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsle.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsle_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsle_d(_1, _2); } // CHECK-LABEL: @xvslei_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslei.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslei.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvslei_b(v32i8 _1) { return __builtin_lasx_xvslei_b(_1, 1); } // CHECK-LABEL: @xvslei_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslei.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslei.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvslei_h(v16i16 _1) { return __builtin_lasx_xvslei_h(_1, 1); } // CHECK-LABEL: @xvslei_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslei.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslei.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-NEXT: ret void // v8i32 xvslei_w(v8i32 _1) { return __builtin_lasx_xvslei_w(_1, 1); } // CHECK-LABEL: @xvslei_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslei.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslei.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvslei_d(v4i64 _1) { return __builtin_lasx_xvslei_d(_1, 1); } // CHECK-LABEL: @xvsle_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsle.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsle.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsle_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvsle_bu(_1, _2); } // CHECK-LABEL: @xvsle_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsle.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsle.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsle_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvsle_hu(_1, _2); } // CHECK-LABEL: @xvsle_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsle.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsle.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsle_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvsle_wu(_1, _2); } // CHECK-LABEL: @xvsle_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsle.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsle.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsle_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvsle_du(_1, _2); } // CHECK-LABEL: @xvslei_bu( // 
CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslei.bu(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslei.bu(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvslei_bu(v32u8 _1) { return __builtin_lasx_xvslei_bu(_1, 1); } // CHECK-LABEL: @xvslei_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslei.hu(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslei.hu(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvslei_hu(v16u16 _1) { return __builtin_lasx_xvslei_hu(_1, 1); } // CHECK-LABEL: @xvslei_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslei.wu(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslei.wu(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvslei_wu(v8u32 _1) { return __builtin_lasx_xvslei_wu(_1, 1); } // CHECK-LABEL: @xvslei_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslei.du(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslei.du(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvslei_du(v4u64 _1) { return __builtin_lasx_xvslei_du(_1, 1); } // CHECK-LABEL: @xvsat_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsat.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsat.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsat_b(v32i8 _1) { return __builtin_lasx_xvsat_b(_1, 1); } // CHECK-LABEL: @xvsat_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsat.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsat.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsat_h(v16i16 _1) { return __builtin_lasx_xvsat_h(_1, 1); } // CHECK-LABEL: @xvsat_w( // CHECK-NEXT: entry: -// CHECK-NEXT: 
[[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsat.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsat.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsat_w(v8i32 _1) { return __builtin_lasx_xvsat_w(_1, 1); } // CHECK-LABEL: @xvsat_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsat.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsat.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsat_d(v4i64 _1) { return __builtin_lasx_xvsat_d(_1, 1); } // CHECK-LABEL: @xvsat_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsat.bu(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsat.bu(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvsat_bu(v32u8 _1) { return __builtin_lasx_xvsat_bu(_1, 1); } // CHECK-LABEL: @xvsat_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsat.hu(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsat.hu(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvsat_hu(v16u16 _1) { return __builtin_lasx_xvsat_hu(_1, 1); } // CHECK-LABEL: @xvsat_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsat.wu(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsat.wu(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvsat_wu(v8u32 _1) { return __builtin_lasx_xvsat_wu(_1, 1); } // CHECK-LABEL: @xvsat_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsat.du(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsat.du(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvsat_du(v4u64 _1) { return __builtin_lasx_xvsat_du(_1, 1); } // CHECK-LABEL: @xvadda_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvadda.b(<32 
x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvadda.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvadda_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvadda_b(_1, _2); } // CHECK-LABEL: @xvadda_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvadda.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvadda.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvadda_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvadda_h(_1, _2); } // CHECK-LABEL: @xvadda_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvadda.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvadda.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvadda_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvadda_w(_1, _2); } // CHECK-LABEL: @xvadda_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadda.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadda.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvadda_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvadda_d(_1, _2); } // CHECK-LABEL: @xvsadd_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsadd.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsadd.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsadd_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsadd_b(_1, _2); } // CHECK-LABEL: @xvsadd_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsadd.h(<16 x 
i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsadd.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsadd_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsadd_h(_1, _2); } // CHECK-LABEL: @xvsadd_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsadd.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsadd.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsadd_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsadd_w(_1, _2); } // CHECK-LABEL: @xvsadd_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsadd.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsadd.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsadd_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsadd_d(_1, _2); } // CHECK-LABEL: @xvsadd_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsadd.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsadd.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvsadd_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvsadd_bu(_1, _2); } // CHECK-LABEL: @xvsadd_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsadd.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsadd.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvsadd_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvsadd_hu(_1, _2); } // CHECK-LABEL: @xvsadd_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> 
@llvm.loongarch.lasx.xvsadd.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsadd.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvsadd_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvsadd_wu(_1, _2); } // CHECK-LABEL: @xvsadd_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsadd.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsadd.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvsadd_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvsadd_du(_1, _2); } // CHECK-LABEL: @xvavg_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavg.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavg.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvavg_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvavg_b(_1, _2); } // CHECK-LABEL: @xvavg_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavg.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavg.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvavg_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvavg_h(_1, _2); } // CHECK-LABEL: @xvavg_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavg.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavg.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvavg_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvavg_w(_1, _2); } // CHECK-LABEL: @xvavg_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> 
@llvm.loongarch.lasx.xvavg.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavg.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvavg_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvavg_d(_1, _2); } // CHECK-LABEL: @xvavg_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavg.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavg.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvavg_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvavg_bu(_1, _2); } // CHECK-LABEL: @xvavg_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavg.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavg.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvavg_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvavg_hu(_1, _2); } // CHECK-LABEL: @xvavg_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavg.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavg.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvavg_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvavg_wu(_1, _2); } // CHECK-LABEL: @xvavg_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavg.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavg.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvavg_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvavg_du(_1, _2); } // CHECK-LABEL: @xvavgr_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> 
@llvm.loongarch.lasx.xvavgr.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavgr.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvavgr_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvavgr_b(_1, _2); } // CHECK-LABEL: @xvavgr_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavgr.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavgr.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvavgr_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvavgr_h(_1, _2); } // CHECK-LABEL: @xvavgr_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavgr.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavgr.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvavgr_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvavgr_w(_1, _2); } // CHECK-LABEL: @xvavgr_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavgr.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavgr.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvavgr_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvavgr_d(_1, _2); } // CHECK-LABEL: @xvavgr_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavgr.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavgr.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvavgr_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvavgr_bu(_1, _2); } // CHECK-LABEL: @xvavgr_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> 
@llvm.loongarch.lasx.xvavgr.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavgr.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvavgr_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvavgr_hu(_1, _2); } // CHECK-LABEL: @xvavgr_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavgr.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavgr.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvavgr_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvavgr_wu(_1, _2); } // CHECK-LABEL: @xvavgr_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavgr.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavgr.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvavgr_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvavgr_du(_1, _2); } // CHECK-LABEL: @xvssub_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssub.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssub.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvssub_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvssub_b(_1, _2); } // CHECK-LABEL: @xvssub_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssub.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssub.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvssub_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssub_h(_1, _2); } // CHECK-LABEL: @xvssub_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = 
tail call <8 x i32> @llvm.loongarch.lasx.xvssub.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssub.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvssub_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssub_w(_1, _2); } // CHECK-LABEL: @xvssub_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssub.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssub.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvssub_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssub_d(_1, _2); } // CHECK-LABEL: @xvssub_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssub.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssub.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvssub_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvssub_bu(_1, _2); } // CHECK-LABEL: @xvssub_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssub.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssub.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvssub_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvssub_hu(_1, _2); } // CHECK-LABEL: @xvssub_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssub.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssub.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvssub_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvssub_wu(_1, _2); } // CHECK-LABEL: @xvssub_du( // CHECK-NEXT: entry: -// CHECK-NEXT: 
[[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssub.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssub.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvssub_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvssub_du(_1, _2); } // CHECK-LABEL: @xvabsd_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvabsd.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvabsd.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvabsd_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvabsd_b(_1, _2); } // CHECK-LABEL: @xvabsd_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvabsd.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvabsd.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvabsd_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvabsd_h(_1, _2); } // CHECK-LABEL: @xvabsd_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvabsd.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvabsd.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvabsd_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvabsd_w(_1, _2); } // CHECK-LABEL: @xvabsd_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvabsd.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvabsd.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvabsd_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvabsd_d(_1, _2); } // CHECK-LABEL: @xvabsd_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: 
[[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvabsd.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvabsd.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvabsd_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvabsd_bu(_1, _2); } // CHECK-LABEL: @xvabsd_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvabsd.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvabsd.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvabsd_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvabsd_hu(_1, _2); } // CHECK-LABEL: @xvabsd_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvabsd.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvabsd.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvabsd_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvabsd_wu(_1, _2); } // CHECK-LABEL: @xvabsd_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvabsd.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvabsd.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvabsd_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvabsd_du(_1, _2); } // CHECK-LABEL: @xvmul_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmul.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmul.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvmul_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvmul_b(_1, _2); } // CHECK-LABEL: @xvmul_h( // CHECK-NEXT: entry: -// 
CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmul.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmul.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmul_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvmul_h(_1, _2); } // CHECK-LABEL: @xvmul_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmul.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmul.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmul_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvmul_w(_1, _2); } // CHECK-LABEL: @xvmul_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmul.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmul.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmul_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvmul_d(_1, _2); } // CHECK-LABEL: @xvmadd_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmadd.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmadd.b(<32 x i8> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]]) +// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvmadd_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __builtin_lasx_xvmadd_b(_1, _2, _3); } // CHECK-LABEL: @xvmadd_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmadd.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmadd.h(<16 x i16> [[_1]], <16 x i16> [[_2]], <16 x 
i16> [[_3]]) +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmadd_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __builtin_lasx_xvmadd_h(_1, _2, _3); } // CHECK-LABEL: @xvmadd_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmadd.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmadd.w(<8 x i32> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]]) +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmadd_w(v8i32 _1, v8i32 _2, v8i32 _3) { return __builtin_lasx_xvmadd_w(_1, _2, _3); } // CHECK-LABEL: @xvmadd_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmadd.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmadd.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmadd_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __builtin_lasx_xvmadd_d(_1, _2, _3); } // CHECK-LABEL: @xvmsub_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmsub.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmsub.b(<32 x i8> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]]) +// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvmsub_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __builtin_lasx_xvmsub_b(_1, _2, _3); } // CHECK-LABEL: @xvmsub_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmsub.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmsub.h(<16 x i16> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]]) +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // 
v16i16 xvmsub_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __builtin_lasx_xvmsub_h(_1, _2, _3); } // CHECK-LABEL: @xvmsub_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmsub.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmsub.w(<8 x i32> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]]) +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmsub_w(v8i32 _1, v8i32 _2, v8i32 _3) { return __builtin_lasx_xvmsub_w(_1, _2, _3); } // CHECK-LABEL: @xvmsub_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmsub.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmsub.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmsub_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __builtin_lasx_xvmsub_d(_1, _2, _3); } // CHECK-LABEL: @xvdiv_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvdiv.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvdiv.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvdiv_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvdiv_b(_1, _2); } // CHECK-LABEL: @xvdiv_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvdiv.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvdiv.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvdiv_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvdiv_h(_1, _2); } // CHECK-LABEL: @xvdiv_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvdiv.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x 
i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvdiv.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvdiv_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvdiv_w(_1, _2); } // CHECK-LABEL: @xvdiv_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvdiv.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvdiv.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvdiv_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvdiv_d(_1, _2); } // CHECK-LABEL: @xvdiv_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvdiv.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvdiv.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvdiv_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvdiv_bu(_1, _2); } // CHECK-LABEL: @xvdiv_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvdiv.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvdiv.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvdiv_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvdiv_hu(_1, _2); } // CHECK-LABEL: @xvdiv_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvdiv.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvdiv.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvdiv_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvdiv_wu(_1, _2); } // CHECK-LABEL: @xvdiv_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvdiv.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr 
[[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvdiv.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvdiv_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvdiv_du(_1, _2); } // CHECK-LABEL: @xvhaddw_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhaddw.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhaddw.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvhaddw_h_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvhaddw_h_b(_1, _2); } // CHECK-LABEL: @xvhaddw_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhaddw.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhaddw.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvhaddw_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvhaddw_w_h(_1, _2); } // CHECK-LABEL: @xvhaddw_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvhaddw_d_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvhaddw_d_w(_1, _2); } // CHECK-LABEL: @xvhaddw_hu_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhaddw.hu.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhaddw.hu.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvhaddw_hu_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvhaddw_hu_bu(_1, _2); } // CHECK-LABEL: @xvhaddw_wu_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhaddw.wu.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr 
[[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhaddw.wu.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvhaddw_wu_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvhaddw_wu_hu(_1, _2); } // CHECK-LABEL: @xvhaddw_du_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.du.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.du.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvhaddw_du_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvhaddw_du_wu(_1, _2); } // CHECK-LABEL: @xvhsubw_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhsubw.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhsubw.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvhsubw_h_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvhsubw_h_b(_1, _2); } // CHECK-LABEL: @xvhsubw_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhsubw.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhsubw.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvhsubw_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvhsubw_w_h(_1, _2); } // CHECK-LABEL: @xvhsubw_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvhsubw_d_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvhsubw_d_w(_1, _2); } // CHECK-LABEL: @xvhsubw_hu_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhsubw.hu.bu(<32 x i8> [[_1:%.*]], <32 x i8> 
[[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhsubw.hu.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvhsubw_hu_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvhsubw_hu_bu(_1, _2); } // CHECK-LABEL: @xvhsubw_wu_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhsubw.wu.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhsubw.wu.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvhsubw_wu_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvhsubw_wu_hu(_1, _2); } // CHECK-LABEL: @xvhsubw_du_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.du.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.du.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvhsubw_du_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvhsubw_du_wu(_1, _2); } // CHECK-LABEL: @xvmod_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmod.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmod.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvmod_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvmod_b(_1, _2); } // CHECK-LABEL: @xvmod_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmod.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmod.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmod_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvmod_h(_1, _2); } // CHECK-LABEL: @xvmod_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> 
@llvm.loongarch.lasx.xvmod.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmod.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmod_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvmod_w(_1, _2); } // CHECK-LABEL: @xvmod_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmod.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmod.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmod_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvmod_d(_1, _2); } // CHECK-LABEL: @xvmod_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmod.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmod.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvmod_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvmod_bu(_1, _2); } // CHECK-LABEL: @xvmod_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmod.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmod.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvmod_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvmod_hu(_1, _2); } // CHECK-LABEL: @xvmod_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmod.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmod.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvmod_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvmod_wu(_1, _2); } // CHECK-LABEL: @xvmod_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> 
@llvm.loongarch.lasx.xvmod.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmod.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvmod_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvmod_du(_1, _2); } // CHECK-LABEL: @xvrepl128vei_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrepl128vei.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrepl128vei.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvrepl128vei_b(v32i8 _1) { return __builtin_lasx_xvrepl128vei_b(_1, 1); } // CHECK-LABEL: @xvrepl128vei_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrepl128vei.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrepl128vei.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvrepl128vei_h(v16i16 _1) { return __builtin_lasx_xvrepl128vei_h(_1, 1); } // CHECK-LABEL: @xvrepl128vei_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrepl128vei.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrepl128vei.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvrepl128vei_w(v8i32 _1) { return __builtin_lasx_xvrepl128vei_w(_1, 1); } // CHECK-LABEL: @xvrepl128vei_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrepl128vei.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrepl128vei.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvrepl128vei_d(v4i64 _1) { return __builtin_lasx_xvrepl128vei_d(_1, 1); } // CHECK-LABEL: @xvpickev_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpickev.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpickev.b(<32 x i8> 
[[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvpickev_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvpickev_b(_1, _2); } // CHECK-LABEL: @xvpickev_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpickev.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpickev.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvpickev_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvpickev_h(_1, _2); } // CHECK-LABEL: @xvpickev_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickev.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickev.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvpickev_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvpickev_w(_1, _2); } // CHECK-LABEL: @xvpickev_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickev.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickev.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvpickev_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvpickev_d(_1, _2); } // CHECK-LABEL: @xvpickod_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpickod.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpickod.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvpickod_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvpickod_b(_1, _2); } // CHECK-LABEL: @xvpickod_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpickod.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
<16 x i16> @llvm.loongarch.lasx.xvpickod.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvpickod_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvpickod_h(_1, _2); } // CHECK-LABEL: @xvpickod_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickod.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickod.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvpickod_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvpickod_w(_1, _2); } // CHECK-LABEL: @xvpickod_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickod.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickod.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvpickod_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvpickod_d(_1, _2); } // CHECK-LABEL: @xvilvh_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvilvh.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvilvh.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvilvh_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvilvh_b(_1, _2); } // CHECK-LABEL: @xvilvh_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvilvh.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvilvh.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvilvh_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvilvh_h(_1, _2); } // CHECK-LABEL: @xvilvh_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvilvh.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvilvh.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvilvh_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvilvh_w(_1, _2); } // CHECK-LABEL: @xvilvh_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvilvh.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvilvh.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvilvh_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvilvh_d(_1, _2); } // CHECK-LABEL: @xvilvl_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvilvl.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvilvl.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvilvl_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvilvl_b(_1, _2); } // CHECK-LABEL: @xvilvl_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvilvl.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvilvl.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvilvl_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvilvl_h(_1, _2); } // CHECK-LABEL: @xvilvl_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvilvl.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvilvl.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvilvl_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvilvl_w(_1, _2); } // CHECK-LABEL: @xvilvl_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvilvl.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvilvl.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvilvl_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvilvl_d(_1, _2); } // CHECK-LABEL: @xvpackev_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpackev.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpackev.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvpackev_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvpackev_b(_1, _2); } // CHECK-LABEL: @xvpackev_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpackev.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpackev.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvpackev_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvpackev_h(_1, _2); } // CHECK-LABEL: @xvpackev_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpackev.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpackev.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvpackev_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvpackev_w(_1, _2); } // CHECK-LABEL: @xvpackev_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpackev.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpackev.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvpackev_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvpackev_d(_1, _2); } // CHECK-LABEL: @xvpackod_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpackod.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr 
[[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpackod.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvpackod_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvpackod_b(_1, _2); } // CHECK-LABEL: @xvpackod_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpackod.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpackod.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvpackod_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvpackod_h(_1, _2); } // CHECK-LABEL: @xvpackod_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpackod.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpackod.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvpackod_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvpackod_w(_1, _2); } // CHECK-LABEL: @xvpackod_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpackod.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpackod.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvpackod_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvpackod_d(_1, _2); } // CHECK-LABEL: @xvshuf_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvshuf.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvshuf.b(<32 x i8> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]]) +// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvshuf_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __builtin_lasx_xvshuf_b(_1, _2, _3); } // CHECK-LABEL: @xvshuf_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvshuf.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], 
<16 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvshuf.h(<16 x i16> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]]) +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvshuf_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __builtin_lasx_xvshuf_h(_1, _2, _3); } // CHECK-LABEL: @xvshuf_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvshuf.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvshuf.w(<8 x i32> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]]) +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvshuf_w(v8i32 _1, v8i32 _2, v8i32 _3) { return __builtin_lasx_xvshuf_w(_1, _2, _3); } // CHECK-LABEL: @xvshuf_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvshuf.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvshuf.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvshuf_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __builtin_lasx_xvshuf_d(_1, _2, _3); } // CHECK-LABEL: @xvand_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvand.v(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvand.v(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvand_v(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvand_v(_1, _2); } // CHECK-LABEL: @xvandi_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvandi.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvandi.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], 
align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvandi_b(v32u8 _1) { return __builtin_lasx_xvandi_b(_1, 1); } // CHECK-LABEL: @xvor_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvor.v(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvor.v(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvor_v(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvor_v(_1, _2); } // CHECK-LABEL: @xvori_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvori.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvori.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvori_b(v32u8 _1) { return __builtin_lasx_xvori_b(_1, 1); } // CHECK-LABEL: @xvnor_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvnor.v(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvnor.v(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvnor_v(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvnor_v(_1, _2); } // CHECK-LABEL: @xvnori_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvnori.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvnori.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvnori_b(v32u8 _1) { return __builtin_lasx_xvnori_b(_1, 1); } // CHECK-LABEL: @xvxor_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvxor.v(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvxor.v(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvxor_v(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvxor_v(_1, _2); } // CHECK-LABEL: @xvxori_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvxori.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: 
ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvxori.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvxori_b(v32u8 _1) { return __builtin_lasx_xvxori_b(_1, 1); } // CHECK-LABEL: @xvbitsel_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitsel.v(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitsel.v(<32 x i8> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]]) +// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvbitsel_v(v32u8 _1, v32u8 _2, v32u8 _3) { return __builtin_lasx_xvbitsel_v(_1, _2, _3); } // CHECK-LABEL: @xvbitseli_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitseli.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitseli.b(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvbitseli_b(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvbitseli_b(_1, _2, 1); } // CHECK-LABEL: @xvshuf4i_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvshuf4i.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvshuf4i.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvshuf4i_b(v32i8 _1) { return __builtin_lasx_xvshuf4i_b(_1, 1); } // CHECK-LABEL: @xvshuf4i_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvshuf4i.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvshuf4i.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvshuf4i_h(v16i16 _1) { return __builtin_lasx_xvshuf4i_h(_1, 1); } // CHECK-LABEL: @xvshuf4i_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvshuf4i.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> 
@llvm.loongarch.lasx.xvshuf4i.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvshuf4i_w(v8i32 _1) { return __builtin_lasx_xvshuf4i_w(_1, 1); } // CHECK-LABEL: @xvreplgr2vr_b( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplgr2vr.b(i32 [[_1:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvreplgr2vr_b(int _1) { return __builtin_lasx_xvreplgr2vr_b(_1); } // CHECK-LABEL: @xvreplgr2vr_h( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplgr2vr.h(i32 [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvreplgr2vr_h(int _1) { return __builtin_lasx_xvreplgr2vr_h(_1); } // CHECK-LABEL: @xvreplgr2vr_w( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplgr2vr.w(i32 [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvreplgr2vr_w(int _1) { return __builtin_lasx_xvreplgr2vr_w(_1); } // CHECK-LABEL: @xvreplgr2vr_d( // CHECK-NEXT: entry: // CHECK-NEXT: [[CONV:%.*]] = sext i32 [[_1:%.*]] to i64 // CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplgr2vr.d(i64 [[CONV]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvreplgr2vr_d(int _1) { return __builtin_lasx_xvreplgr2vr_d(_1); } // CHECK-LABEL: @xvpcnt_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpcnt.b(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpcnt.b(<32 x i8> [[_1]]) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvpcnt_b(v32i8 _1) { return __builtin_lasx_xvpcnt_b(_1); } // CHECK-LABEL: @xvpcnt_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpcnt.h(<16 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpcnt.h(<16 x i16> [[_1]]) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvpcnt_h(v16i16 _1) { return __builtin_lasx_xvpcnt_h(_1); } // CHECK-LABEL: @xvpcnt_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpcnt.w(<8 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpcnt.w(<8 x i32> [[_1]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvpcnt_w(v8i32 _1) { return __builtin_lasx_xvpcnt_w(_1); 
} // CHECK-LABEL: @xvpcnt_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpcnt.d(<4 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpcnt.d(<4 x i64> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvpcnt_d(v4i64 _1) { return __builtin_lasx_xvpcnt_d(_1); } // CHECK-LABEL: @xvclo_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvclo.b(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvclo.b(<32 x i8> [[_1]]) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvclo_b(v32i8 _1) { return __builtin_lasx_xvclo_b(_1); } // CHECK-LABEL: @xvclo_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvclo.h(<16 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvclo.h(<16 x i16> [[_1]]) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvclo_h(v16i16 _1) { return __builtin_lasx_xvclo_h(_1); } // CHECK-LABEL: @xvclo_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvclo.w(<8 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvclo.w(<8 x i32> [[_1]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvclo_w(v8i32 _1) { return __builtin_lasx_xvclo_w(_1); } // CHECK-LABEL: @xvclo_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvclo.d(<4 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvclo.d(<4 x i64> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvclo_d(v4i64 _1) { return __builtin_lasx_xvclo_d(_1); } // CHECK-LABEL: @xvclz_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvclz.b(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvclz.b(<32 x i8> [[_1]]) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvclz_b(v32i8 _1) { return __builtin_lasx_xvclz_b(_1); } // CHECK-LABEL: @xvclz_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvclz.h(<16 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i16> 
[[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvclz.h(<16 x i16> [[_1]]) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvclz_h(v16i16 _1) { return __builtin_lasx_xvclz_h(_1); } // CHECK-LABEL: @xvclz_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvclz.w(<8 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvclz.w(<8 x i32> [[_1]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvclz_w(v8i32 _1) { return __builtin_lasx_xvclz_w(_1); } // CHECK-LABEL: @xvclz_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvclz.d(<4 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvclz.d(<4 x i64> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvclz_d(v4i64 _1) { return __builtin_lasx_xvclz_d(_1); } // CHECK-LABEL: @xvfadd_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfadd.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfadd.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfadd_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfadd_s(_1, _2); } // CHECK-LABEL: @xvfadd_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfadd.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfadd.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfadd_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfadd_d(_1, _2); } // CHECK-LABEL: @xvfsub_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfsub.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfsub.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr 
[[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfsub_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfsub_s(_1, _2); } // CHECK-LABEL: @xvfsub_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfsub.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfsub.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfsub_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfsub_d(_1, _2); } // CHECK-LABEL: @xvfmul_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmul.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmul.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfmul_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfmul_s(_1, _2); } // CHECK-LABEL: @xvfmul_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmul.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmul.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfmul_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfmul_d(_1, _2); } // CHECK-LABEL: @xvfdiv_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfdiv.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfdiv.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfdiv_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfdiv_s(_1, _2); } // CHECK-LABEL: @xvfdiv_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfdiv.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> 
@llvm.loongarch.lasx.xvfdiv.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfdiv_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfdiv_d(_1, _2); } // CHECK-LABEL: @xvfcvt_h_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfcvt.h.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfcvt.h.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvfcvt_h_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcvt_h_s(_1, _2); } // CHECK-LABEL: @xvfcvt_s_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvt.s.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvt.s.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfcvt_s_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcvt_s_d(_1, _2); } // CHECK-LABEL: @xvfmin_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmin.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmin.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfmin_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfmin_s(_1, _2); } // CHECK-LABEL: @xvfmin_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmin.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmin.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfmin_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfmin_d(_1, _2); } // CHECK-LABEL: @xvfmina_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmina.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: 
[[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmina.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfmina_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfmina_s(_1, _2); } // CHECK-LABEL: @xvfmina_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmina.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmina.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfmina_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfmina_d(_1, _2); } // CHECK-LABEL: @xvfmax_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmax.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmax.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfmax_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfmax_s(_1, _2); } // CHECK-LABEL: @xvfmax_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmax.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmax.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfmax_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfmax_d(_1, _2); } // CHECK-LABEL: @xvfmaxa_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmaxa.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmaxa.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfmaxa_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfmaxa_s(_1, _2); } // CHECK-LABEL: @xvfmaxa_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmaxa.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x 
double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmaxa.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfmaxa_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfmaxa_d(_1, _2); } // CHECK-LABEL: @xvfclass_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfclass.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfclass.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfclass_s(v8f32 _1) { return __builtin_lasx_xvfclass_s(_1); } // CHECK-LABEL: @xvfclass_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfclass.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfclass.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfclass_d(v4f64 _1) { return __builtin_lasx_xvfclass_d(_1); } // CHECK-LABEL: @xvfsqrt_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfsqrt.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfsqrt.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfsqrt_s(v8f32 _1) { return __builtin_lasx_xvfsqrt_s(_1); } // CHECK-LABEL: @xvfsqrt_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfsqrt.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfsqrt.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfsqrt_d(v4f64 _1) { return __builtin_lasx_xvfsqrt_d(_1); } // CHECK-LABEL: @xvfrecip_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrecip.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrecip.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfrecip_s(v8f32 _1) { return __builtin_lasx_xvfrecip_s(_1); } // CHECK-LABEL: @xvfrecip_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x 
double> @llvm.loongarch.lasx.xvfrecip.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrecip.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfrecip_d(v4f64 _1) { return __builtin_lasx_xvfrecip_d(_1); } // CHECK-LABEL: @xvfrint_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrint.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrint.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfrint_s(v8f32 _1) { return __builtin_lasx_xvfrint_s(_1); } // CHECK-LABEL: @xvfrint_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrint.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrint.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfrint_d(v4f64 _1) { return __builtin_lasx_xvfrint_d(_1); } // CHECK-LABEL: @xvfrsqrt_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrsqrt.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrsqrt.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfrsqrt_s(v8f32 _1) { return __builtin_lasx_xvfrsqrt_s(_1); } // CHECK-LABEL: @xvfrsqrt_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrsqrt.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrsqrt.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfrsqrt_d(v4f64 _1) { return __builtin_lasx_xvfrsqrt_d(_1); } // CHECK-LABEL: @xvflogb_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvflogb.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvflogb.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvflogb_s(v8f32 _1) { return __builtin_lasx_xvflogb_s(_1); } // CHECK-LABEL: @xvflogb_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> 
@llvm.loongarch.lasx.xvflogb.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvflogb.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvflogb_d(v4f64 _1) { return __builtin_lasx_xvflogb_d(_1); } // CHECK-LABEL: @xvfcvth_s_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvth.s.h(<16 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvth.s.h(<16 x i16> [[_1]]) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfcvth_s_h(v16i16 _1) { return __builtin_lasx_xvfcvth_s_h(_1); } // CHECK-LABEL: @xvfcvth_d_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfcvth.d.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfcvth.d.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfcvth_d_s(v8f32 _1) { return __builtin_lasx_xvfcvth_d_s(_1); } // CHECK-LABEL: @xvfcvtl_s_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvtl.s.h(<16 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvtl.s.h(<16 x i16> [[_1]]) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfcvtl_s_h(v16i16 _1) { return __builtin_lasx_xvfcvtl_s_h(_1); } // CHECK-LABEL: @xvfcvtl_d_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfcvtl.d.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfcvtl.d.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfcvtl_d_s(v8f32 _1) { return __builtin_lasx_xvfcvtl_d_s(_1); } // CHECK-LABEL: @xvftint_w_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.w.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.w.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvftint_w_s(v8f32 _1) { return __builtin_lasx_xvftint_w_s(_1); } // CHECK-LABEL: @xvftint_l_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> 
@llvm.loongarch.lasx.xvftint.l.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftint.l.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvftint_l_d(v4f64 _1) { return __builtin_lasx_xvftint_l_d(_1); } // CHECK-LABEL: @xvftint_wu_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.wu.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.wu.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvftint_wu_s(v8f32 _1) { return __builtin_lasx_xvftint_wu_s(_1); } // CHECK-LABEL: @xvftint_lu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftint.lu.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftint.lu.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvftint_lu_d(v4f64 _1) { return __builtin_lasx_xvftint_lu_d(_1); } // CHECK-LABEL: @xvftintrz_w_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.w.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.w.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvftintrz_w_s(v8f32 _1) { return __builtin_lasx_xvftintrz_w_s(_1); } // CHECK-LABEL: @xvftintrz_l_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrz.l.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrz.l.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvftintrz_l_d(v4f64 _1) { return __builtin_lasx_xvftintrz_l_d(_1); } // CHECK-LABEL: @xvftintrz_wu_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.wu.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.wu.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvftintrz_wu_s(v8f32 _1) { return __builtin_lasx_xvftintrz_wu_s(_1); } // CHECK-LABEL: @xvftintrz_lu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail 
call <4 x i64> @llvm.loongarch.lasx.xvftintrz.lu.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrz.lu.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvftintrz_lu_d(v4f64 _1) { return __builtin_lasx_xvftintrz_lu_d(_1); } // CHECK-LABEL: @xvffint_s_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.w(<8 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.w(<8 x i32> [[_1]]) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvffint_s_w(v8i32 _1) { return __builtin_lasx_xvffint_s_w(_1); } // CHECK-LABEL: @xvffint_d_l( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffint.d.l(<4 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffint.d.l(<4 x i64> [[_1]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvffint_d_l(v4i64 _1) { return __builtin_lasx_xvffint_d_l(_1); } // CHECK-LABEL: @xvffint_s_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.wu(<8 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.wu(<8 x i32> [[_1]]) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvffint_s_wu(v8u32 _1) { return __builtin_lasx_xvffint_s_wu(_1); } // CHECK-LABEL: @xvffint_d_lu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffint.d.lu(<4 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffint.d.lu(<4 x i64> [[_1]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvffint_d_lu(v4u64 _1) { return __builtin_lasx_xvffint_d_lu(_1); } // CHECK-LABEL: @xvreplve_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve.b(<32 x i8> [[_1:%.*]], i32 [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve.b(<32 x i8> [[_1]], i32 [[_2:%.*]]) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvreplve_b(v32i8 _1, int _2) { return __builtin_lasx_xvreplve_b(_1, _2); } // CHECK-LABEL: @xvreplve_h( // CHECK-NEXT: entry: -// 
CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplve.h(<16 x i16> [[_1:%.*]], i32 [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplve.h(<16 x i16> [[_1]], i32 [[_2:%.*]]) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvreplve_h(v16i16 _1, int _2) { return __builtin_lasx_xvreplve_h(_1, _2); } // CHECK-LABEL: @xvreplve_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplve.w(<8 x i32> [[_1:%.*]], i32 [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplve.w(<8 x i32> [[_1]], i32 [[_2:%.*]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvreplve_w(v8i32 _1, int _2) { return __builtin_lasx_xvreplve_w(_1, _2); } // CHECK-LABEL: @xvreplve_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplve.d(<4 x i64> [[_1:%.*]], i32 [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplve.d(<4 x i64> [[_1]], i32 [[_2:%.*]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvreplve_d(v4i64 _1, int _2) { return __builtin_lasx_xvreplve_d(_1, _2); } // CHECK-LABEL: @xvpermi_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpermi.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpermi.w(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvpermi_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvpermi_w(_1, _2, 1); } // CHECK-LABEL: @xvandn_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvandn.v(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvandn.v(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvandn_v(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvandn_v(_1, _2); } // CHECK-LABEL: @xvneg_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvneg.b(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = 
tail call <32 x i8> @llvm.loongarch.lasx.xvneg.b(<32 x i8> [[_1]]) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvneg_b(v32i8 _1) { return __builtin_lasx_xvneg_b(_1); } // CHECK-LABEL: @xvneg_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvneg.h(<16 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvneg.h(<16 x i16> [[_1]]) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvneg_h(v16i16 _1) { return __builtin_lasx_xvneg_h(_1); } // CHECK-LABEL: @xvneg_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvneg.w(<8 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvneg.w(<8 x i32> [[_1]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvneg_w(v8i32 _1) { return __builtin_lasx_xvneg_w(_1); } // CHECK-LABEL: @xvneg_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvneg.d(<4 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvneg.d(<4 x i64> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvneg_d(v4i64 _1) { return __builtin_lasx_xvneg_d(_1); } // CHECK-LABEL: @xvmuh_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmuh.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmuh.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvmuh_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvmuh_b(_1, _2); } // CHECK-LABEL: @xvmuh_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmuh.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmuh.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmuh_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvmuh_h(_1, _2); } // CHECK-LABEL: @xvmuh_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmuh.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> 
[[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmuh.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmuh_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvmuh_w(_1, _2); } // CHECK-LABEL: @xvmuh_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmuh.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmuh.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmuh_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvmuh_d(_1, _2); } // CHECK-LABEL: @xvmuh_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmuh.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmuh.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvmuh_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvmuh_bu(_1, _2); } // CHECK-LABEL: @xvmuh_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmuh.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmuh.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvmuh_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvmuh_hu(_1, _2); } // CHECK-LABEL: @xvmuh_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmuh.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmuh.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvmuh_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvmuh_wu(_1, _2); } // CHECK-LABEL: @xvmuh_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmuh.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// 
CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmuh.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvmuh_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvmuh_du(_1, _2); } // CHECK-LABEL: @xvsllwil_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsllwil.h.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsllwil.h.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsllwil_h_b(v32i8 _1) { return __builtin_lasx_xvsllwil_h_b(_1, 1); } // CHECK-LABEL: @xvsllwil_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsllwil.w.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsllwil.w.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsllwil_w_h(v16i16 _1) { return __builtin_lasx_xvsllwil_w_h(_1, 1); } // CHECK-LABEL: @xvsllwil_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsllwil.d.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsllwil.d.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsllwil_d_w(v8i32 _1) { return __builtin_lasx_xvsllwil_d_w(_1, 1); } // CHECK-LABEL: @xvsllwil_hu_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsllwil.hu.bu(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsllwil.hu.bu(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvsllwil_hu_bu(v32u8 _1) { return __builtin_lasx_xvsllwil_hu_bu(_1, 1); } // CHECK-LABEL: @xvsllwil_wu_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsllwil.wu.hu(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsllwil.wu.hu(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvsllwil_wu_hu(v16u16 _1) { return __builtin_lasx_xvsllwil_wu_hu(_1, 1); } // CHECK-LABEL: 
@xvsllwil_du_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsllwil.du.wu(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsllwil.du.wu(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvsllwil_du_wu(v8u32 _1) { return __builtin_lasx_xvsllwil_du_wu(_1, 1); } // CHECK-LABEL: @xvsran_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsran.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsran.b.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsran_b_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsran_b_h(_1, _2); } // CHECK-LABEL: @xvsran_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsran.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsran.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsran_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsran_h_w(_1, _2); } // CHECK-LABEL: @xvsran_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsran.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsran.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsran_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsran_w_d(_1, _2); } // CHECK-LABEL: @xvssran_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssran.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssran.b.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvssran_b_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssran_b_h(_1, _2); } // CHECK-LABEL: @xvssran_h_w( // CHECK-NEXT: entry: -// 
CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssran.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssran.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvssran_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssran_h_w(_1, _2); } // CHECK-LABEL: @xvssran_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssran.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssran.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvssran_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssran_w_d(_1, _2); } // CHECK-LABEL: @xvssran_bu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssran.bu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssran.bu.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvssran_bu_h(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvssran_bu_h(_1, _2); } // CHECK-LABEL: @xvssran_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssran.hu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssran.hu.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvssran_hu_w(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvssran_hu_w(_1, _2); } // CHECK-LABEL: @xvssran_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssran.wu.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssran.wu.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvssran_wu_d(v4u64 _1, v4u64 _2) { return 
__builtin_lasx_xvssran_wu_d(_1, _2); } // CHECK-LABEL: @xvsrarn_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrarn.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrarn.b.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsrarn_b_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrarn_b_h(_1, _2); } // CHECK-LABEL: @xvsrarn_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrarn.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrarn.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsrarn_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrarn_h_w(_1, _2); } // CHECK-LABEL: @xvsrarn_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrarn.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrarn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsrarn_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrarn_w_d(_1, _2); } // CHECK-LABEL: @xvssrarn_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarn.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarn.b.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvssrarn_b_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssrarn_b_h(_1, _2); } // CHECK-LABEL: @xvssrarn_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarn.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarn.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, 
!tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvssrarn_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssrarn_h_w(_1, _2); } // CHECK-LABEL: @xvssrarn_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarn.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvssrarn_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssrarn_w_d(_1, _2); } // CHECK-LABEL: @xvssrarn_bu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarn.bu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarn.bu.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvssrarn_bu_h(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvssrarn_bu_h(_1, _2); } // CHECK-LABEL: @xvssrarn_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarn.hu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarn.hu.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvssrarn_hu_w(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvssrarn_hu_w(_1, _2); } // CHECK-LABEL: @xvssrarn_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarn.wu.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarn.wu.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvssrarn_wu_d(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvssrarn_wu_d(_1, _2); } // CHECK-LABEL: @xvsrln_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrln.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrln.b.h(<16 
x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsrln_b_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrln_b_h(_1, _2); } // CHECK-LABEL: @xvsrln_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrln.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrln.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsrln_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrln_h_w(_1, _2); } // CHECK-LABEL: @xvsrln_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrln.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrln.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsrln_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrln_w_d(_1, _2); } // CHECK-LABEL: @xvssrln_bu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrln.bu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrln.bu.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvssrln_bu_h(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvssrln_bu_h(_1, _2); } // CHECK-LABEL: @xvssrln_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrln.hu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrln.hu.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvssrln_hu_w(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvssrln_hu_w(_1, _2); } // CHECK-LABEL: @xvssrln_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrln.wu.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrln.wu.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvssrln_wu_d(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvssrln_wu_d(_1, _2); } // CHECK-LABEL: @xvsrlrn_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlrn.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlrn.b.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsrlrn_b_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrlrn_b_h(_1, _2); } // CHECK-LABEL: @xvsrlrn_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlrn.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlrn.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsrlrn_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrlrn_h_w(_1, _2); } // CHECK-LABEL: @xvsrlrn_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlrn.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlrn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsrlrn_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrlrn_w_d(_1, _2); } // CHECK-LABEL: @xvssrlrn_bu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrn.bu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrn.bu.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvssrlrn_bu_h(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvssrlrn_bu_h(_1, _2); } // CHECK-LABEL: @xvssrlrn_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrn.hu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa 
[[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrn.hu.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvssrlrn_hu_w(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvssrlrn_hu_w(_1, _2); } // CHECK-LABEL: @xvssrlrn_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrn.wu.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrn.wu.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvssrlrn_wu_d(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvssrlrn_wu_d(_1, _2); } // CHECK-LABEL: @xvfrstpi_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvfrstpi.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvfrstpi.b(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvfrstpi_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvfrstpi_b(_1, _2, 1); } // CHECK-LABEL: @xvfrstpi_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfrstpi.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfrstpi.h(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvfrstpi_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvfrstpi_h(_1, _2, 1); } // CHECK-LABEL: @xvfrstp_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvfrstp.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvfrstp.b(<32 x i8> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]]) +// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvfrstp_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __builtin_lasx_xvfrstp_b(_1, _2, _3); } // CHECK-LABEL: @xvfrstp_h( // CHECK-NEXT: entry: -// 
CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfrstp.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfrstp.h(<16 x i16> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]]) +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvfrstp_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __builtin_lasx_xvfrstp_h(_1, _2, _3); } // CHECK-LABEL: @xvshuf4i_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvshuf4i.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvshuf4i.d(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvshuf4i_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvshuf4i_d(_1, _2, 1); } // CHECK-LABEL: @xvbsrl_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbsrl.v(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbsrl.v(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvbsrl_v(v32i8 _1) { return __builtin_lasx_xvbsrl_v(_1, 1); } // CHECK-LABEL: @xvbsll_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbsll.v(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbsll.v(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvbsll_v(v32i8 _1) { return __builtin_lasx_xvbsll_v(_1, 1); } // CHECK-LABEL: @xvextrins_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvextrins.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvextrins.b(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvextrins_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvextrins_b(_1, _2, 1); } // CHECK-LABEL: @xvextrins_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x 
i16> @llvm.loongarch.lasx.xvextrins.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvextrins.h(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvextrins_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvextrins_h(_1, _2, 1); } // CHECK-LABEL: @xvextrins_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvextrins.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvextrins.w(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvextrins_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvextrins_w(_1, _2, 1); } // CHECK-LABEL: @xvextrins_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextrins.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextrins.d(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvextrins_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvextrins_d(_1, _2, 1); } // CHECK-LABEL: @xvmskltz_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmskltz.b(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmskltz.b(<32 x i8> [[_1]]) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvmskltz_b(v32i8 _1) { return __builtin_lasx_xvmskltz_b(_1); } // CHECK-LABEL: @xvmskltz_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmskltz.h(<16 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmskltz.h(<16 x i16> [[_1]]) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmskltz_h(v16i16 _1) { return __builtin_lasx_xvmskltz_h(_1); } // CHECK-LABEL: @xvmskltz_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmskltz.w(<8 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 
32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmskltz.w(<8 x i32> [[_1]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmskltz_w(v8i32 _1) { return __builtin_lasx_xvmskltz_w(_1); } // CHECK-LABEL: @xvmskltz_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmskltz.d(<4 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmskltz.d(<4 x i64> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmskltz_d(v4i64 _1) { return __builtin_lasx_xvmskltz_d(_1); } // CHECK-LABEL: @xvsigncov_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsigncov.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsigncov.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsigncov_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsigncov_b(_1, _2); } // CHECK-LABEL: @xvsigncov_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsigncov.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsigncov.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsigncov_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsigncov_h(_1, _2); } // CHECK-LABEL: @xvsigncov_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsigncov.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsigncov.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsigncov_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsigncov_w(_1, _2); } // CHECK-LABEL: @xvsigncov_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsigncov.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsigncov.d(<4 x i64> 
[[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsigncov_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsigncov_d(_1, _2); } // CHECK-LABEL: @xvfmadd_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmadd.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]], <8 x float> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmadd.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]]) +// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfmadd_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __builtin_lasx_xvfmadd_s(_1, _2, _3); } // CHECK-LABEL: @xvfmadd_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmadd.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]], <4 x double> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmadd.d(<4 x double> [[_1]], <4 x double> [[_2]], <4 x double> [[_3]]) +// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfmadd_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __builtin_lasx_xvfmadd_d(_1, _2, _3); } // CHECK-LABEL: @xvfmsub_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmsub.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]], <8 x float> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmsub.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]]) +// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfmsub_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __builtin_lasx_xvfmsub_s(_1, _2, _3); } // CHECK-LABEL: @xvfmsub_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmsub.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]], <4 x double> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmsub.d(<4 x double> [[_1]], <4 x double> [[_2]], <4 x double> 
[[_3]]) +// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfmsub_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __builtin_lasx_xvfmsub_d(_1, _2, _3); } // CHECK-LABEL: @xvfnmadd_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfnmadd.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]], <8 x float> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfnmadd.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]]) +// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfnmadd_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __builtin_lasx_xvfnmadd_s(_1, _2, _3); } // CHECK-LABEL: @xvfnmadd_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfnmadd.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]], <4 x double> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfnmadd.d(<4 x double> [[_1]], <4 x double> [[_2]], <4 x double> [[_3]]) +// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfnmadd_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __builtin_lasx_xvfnmadd_d(_1, _2, _3); } // CHECK-LABEL: @xvfnmsub_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfnmsub.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]], <8 x float> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfnmsub.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]]) +// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvfnmsub_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __builtin_lasx_xvfnmsub_s(_1, _2, _3); } // CHECK-LABEL: @xvfnmsub_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfnmsub.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]], <4 x double> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfnmsub.d(<4 x double> [[_1]], <4 x double> [[_2]], 
<4 x double> [[_3]]) +// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvfnmsub_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __builtin_lasx_xvfnmsub_d(_1, _2, _3); } // CHECK-LABEL: @xvftintrne_w_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrne.w.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrne.w.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvftintrne_w_s(v8f32 _1) { return __builtin_lasx_xvftintrne_w_s(_1); } // CHECK-LABEL: @xvftintrne_l_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrne.l.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrne.l.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvftintrne_l_d(v4f64 _1) { return __builtin_lasx_xvftintrne_l_d(_1); } // CHECK-LABEL: @xvftintrp_w_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrp.w.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrp.w.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvftintrp_w_s(v8f32 _1) { return __builtin_lasx_xvftintrp_w_s(_1); } // CHECK-LABEL: @xvftintrp_l_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrp.l.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrp.l.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvftintrp_l_d(v4f64 _1) { return __builtin_lasx_xvftintrp_l_d(_1); } // CHECK-LABEL: @xvftintrm_w_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrm.w.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrm.w.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvftintrm_w_s(v8f32 _1) { return __builtin_lasx_xvftintrm_w_s(_1); } // CHECK-LABEL: @xvftintrm_l_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrm.l.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: 
[[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrm.l.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvftintrm_l_d(v4f64 _1) { return __builtin_lasx_xvftintrm_l_d(_1); } // CHECK-LABEL: @xvftint_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.w.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.w.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvftint_w_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvftint_w_d(_1, _2); } // CHECK-LABEL: @xvffint_s_l( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.l(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.l(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvffint_s_l(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvffint_s_l(_1, _2); } // CHECK-LABEL: @xvftintrz_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.w.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.w.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvftintrz_w_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvftintrz_w_d(_1, _2); } // CHECK-LABEL: @xvftintrp_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrp.w.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrp.w.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvftintrp_w_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvftintrp_w_d(_1, _2); } // CHECK-LABEL: @xvftintrm_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrm.w.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], 
align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrm.w.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvftintrm_w_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvftintrm_w_d(_1, _2); } // CHECK-LABEL: @xvftintrne_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrne.w.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrne.w.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvftintrne_w_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvftintrne_w_d(_1, _2); } // CHECK-LABEL: @xvftinth_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftinth.l.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftinth.l.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvftinth_l_s(v8f32 _1) { return __builtin_lasx_xvftinth_l_s(_1); } // CHECK-LABEL: @xvftintl_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintl.l.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintl.l.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvftintl_l_s(v8f32 _1) { return __builtin_lasx_xvftintl_l_s(_1); } // CHECK-LABEL: @xvffinth_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffinth.d.w(<8 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffinth.d.w(<8 x i32> [[_1]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvffinth_d_w(v8i32 _1) { return __builtin_lasx_xvffinth_d_w(_1); } // CHECK-LABEL: @xvffintl_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffintl.d.w(<8 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffintl.d.w(<8 x i32> [[_1]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvffintl_d_w(v8i32 _1) { return 
__builtin_lasx_xvffintl_d_w(_1); } // CHECK-LABEL: @xvftintrzh_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrzh.l.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrzh.l.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvftintrzh_l_s(v8f32 _1) { return __builtin_lasx_xvftintrzh_l_s(_1); } // CHECK-LABEL: @xvftintrzl_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrzl.l.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrzl.l.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvftintrzl_l_s(v8f32 _1) { return __builtin_lasx_xvftintrzl_l_s(_1); } // CHECK-LABEL: @xvftintrph_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrph.l.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrph.l.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvftintrph_l_s(v8f32 _1) { return __builtin_lasx_xvftintrph_l_s(_1); } // CHECK-LABEL: @xvftintrpl_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrpl.l.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrpl.l.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvftintrpl_l_s(v8f32 _1) { return __builtin_lasx_xvftintrpl_l_s(_1); } // CHECK-LABEL: @xvftintrmh_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrmh.l.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrmh.l.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvftintrmh_l_s(v8f32 _1) { return __builtin_lasx_xvftintrmh_l_s(_1); } // CHECK-LABEL: @xvftintrml_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrml.l.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrml.l.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void 
// v4i64 xvftintrml_l_s(v8f32 _1) { return __builtin_lasx_xvftintrml_l_s(_1); } // CHECK-LABEL: @xvftintrneh_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrneh.l.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrneh.l.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvftintrneh_l_s(v8f32 _1) { return __builtin_lasx_xvftintrneh_l_s(_1); } // CHECK-LABEL: @xvftintrnel_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrnel.l.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrnel.l.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvftintrnel_l_s(v8f32 _1) { return __builtin_lasx_xvftintrnel_l_s(_1); } // CHECK-LABEL: @xvfrintrne_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrne.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float> [[TMP0]] to <8 x i32> -// CHECK-NEXT: ret <8 x i32> [[TMP1]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrne.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfrintrne_s(v8f32 _1) { return __builtin_lasx_xvfrintrne_s(_1); } // CHECK-LABEL: @xvfrintrne_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrne.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x double> [[TMP0]] to <4 x i64> -// CHECK-NEXT: ret <4 x i64> [[TMP1]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrne.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfrintrne_d(v4f64 _1) { return __builtin_lasx_xvfrintrne_d(_1); } // CHECK-LABEL: @xvfrintrz_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrz.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float> [[TMP0]] to <8 x i32> -// CHECK-NEXT: ret <8 x i32> [[TMP1]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrz.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfrintrz_s(v8f32 _1) { return __builtin_lasx_xvfrintrz_s(_1); } // CHECK-LABEL: @xvfrintrz_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrz.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x double> [[TMP0]] to <4 x i64> -// CHECK-NEXT: 
ret <4 x i64> [[TMP1]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrz.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfrintrz_d(v4f64 _1) { return __builtin_lasx_xvfrintrz_d(_1); } // CHECK-LABEL: @xvfrintrp_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrp.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float> [[TMP0]] to <8 x i32> -// CHECK-NEXT: ret <8 x i32> [[TMP1]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrp.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfrintrp_s(v8f32 _1) { return __builtin_lasx_xvfrintrp_s(_1); } // CHECK-LABEL: @xvfrintrp_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrp.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x double> [[TMP0]] to <4 x i64> -// CHECK-NEXT: ret <4 x i64> [[TMP1]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrp.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfrintrp_d(v4f64 _1) { return __builtin_lasx_xvfrintrp_d(_1); } // CHECK-LABEL: @xvfrintrm_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrm.s(<8 x float> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float> [[TMP0]] to <8 x i32> -// CHECK-NEXT: ret <8 x i32> [[TMP1]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrm.s(<8 x float> [[_1]]) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfrintrm_s(v8f32 _1) { return __builtin_lasx_xvfrintrm_s(_1); } // CHECK-LABEL: @xvfrintrm_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrm.d(<4 x double> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x double> [[TMP0]] to <4 x i64> -// CHECK-NEXT: ret <4 x i64> [[TMP1]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrm.d(<4 x double> [[_1]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfrintrm_d(v4f64 _1) { return __builtin_lasx_xvfrintrm_d(_1); } // CHECK-LABEL: @xvld( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvld(ptr [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvld(void *_1) { return __builtin_lasx_xvld(_1, 1); } // CHECK-LABEL: @xvst( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void 
@llvm.loongarch.lasx.xvst(<32 x i8> [[_1:%.*]], ptr [[_2:%.*]], i32 1) +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvst(<32 x i8> [[_1]], ptr [[_2:%.*]], i32 1) // CHECK-NEXT: ret void // void xvst(v32i8 _1, void *_2) { return __builtin_lasx_xvst(_1, _2, 1); } // CHECK-LABEL: @xvstelm_b( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.b(<32 x i8> [[_1:%.*]], ptr [[_2:%.*]], i32 1, i32 1) +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.b(<32 x i8> [[_1]], ptr [[_2:%.*]], i32 1, i32 1) // CHECK-NEXT: ret void // void xvstelm_b(v32i8 _1, void * _2) { return __builtin_lasx_xvstelm_b(_1, _2, 1, 1); } // CHECK-LABEL: @xvstelm_h( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.h(<16 x i16> [[_1:%.*]], ptr [[_2:%.*]], i32 2, i32 1) +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.h(<16 x i16> [[_1]], ptr [[_2:%.*]], i32 2, i32 1) // CHECK-NEXT: ret void // void xvstelm_h(v16i16 _1, void * _2) { return __builtin_lasx_xvstelm_h(_1, _2, 2, 1); } // CHECK-LABEL: @xvstelm_w( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.w(<8 x i32> [[_1:%.*]], ptr [[_2:%.*]], i32 4, i32 1) +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.w(<8 x i32> [[_1]], ptr [[_2:%.*]], i32 4, i32 1) // CHECK-NEXT: ret void // void xvstelm_w(v8i32 _1, void * _2) { return __builtin_lasx_xvstelm_w(_1, _2, 4, 1); } // CHECK-LABEL: @xvstelm_d( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.d(<4 x i64> [[_1:%.*]], ptr [[_2:%.*]], i32 8, i32 1) +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.d(<4 x i64> [[_1]], ptr [[_2:%.*]], i32 8, i32 1) // CHECK-NEXT: ret void // void xvstelm_d(v4i64 _1, void * _2) { return __builtin_lasx_xvstelm_d(_1, _2, 8, 1); } // CHECK-LABEL: @xvinsve0_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvinsve0.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvinsve0.w(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvinsve0_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvinsve0_w(_1, _2, 1); } // CHECK-LABEL: @xvinsve0_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvinsve0.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvinsve0.d(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) +// CHECK-NEXT: store 
<4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvinsve0_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvinsve0_d(_1, _2, 1); } // CHECK-LABEL: @xvpickve_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickve.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickve.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvpickve_w(v8i32 _1) { return __builtin_lasx_xvpickve_w(_1, 1); } // CHECK-LABEL: @xvpickve_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickve.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickve.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvpickve_d(v4i64 _1) { return __builtin_lasx_xvpickve_d(_1, 1); } // CHECK-LABEL: @xvssrlrn_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrn.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrn.b.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvssrlrn_b_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssrlrn_b_h(_1, _2); } // CHECK-LABEL: @xvssrlrn_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrn.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrn.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvssrlrn_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssrlrn_h_w(_1, _2); } // CHECK-LABEL: @xvssrlrn_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrn.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvssrlrn_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssrlrn_w_d(_1, 
_2); } // CHECK-LABEL: @xvssrln_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrln.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrln.b.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvssrln_b_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssrln_b_h(_1, _2); } // CHECK-LABEL: @xvssrln_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrln.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrln.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvssrln_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssrln_h_w(_1, _2); } // CHECK-LABEL: @xvssrln_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrln.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrln.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvssrln_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssrln_w_d(_1, _2); } // CHECK-LABEL: @xvorn_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvorn.v(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvorn.v(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvorn_v(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvorn_v(_1, _2); } // CHECK-LABEL: @xvldi( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvldi(i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvldi() { return __builtin_lasx_xvldi(1); } // CHECK-LABEL: @xvldx( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvldx(ptr [[_1:%.*]], i64 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvldx(void *_1) 
{ return __builtin_lasx_xvldx(_1, 1); } // CHECK-LABEL: @xvstx( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstx(<32 x i8> [[_1:%.*]], ptr [[_2:%.*]], i64 1) +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstx(<32 x i8> [[_1]], ptr [[_2:%.*]], i64 1) // CHECK-NEXT: ret void // void xvstx(v32i8 _1, void *_2) { return __builtin_lasx_xvstx(_1, _2, 1); } // CHECK-LABEL: @xvextl_qu_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextl.qu.du(<4 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextl.qu.du(<4 x i64> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvextl_qu_du(v4u64 _1) { return __builtin_lasx_xvextl_qu_du(_1); } // CHECK-LABEL: @xvinsgr2vr_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvinsgr2vr.w(<8 x i32> [[_1:%.*]], i32 1, i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvinsgr2vr.w(<8 x i32> [[_1]], i32 1, i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvinsgr2vr_w(v8i32 _1) { return __builtin_lasx_xvinsgr2vr_w(_1, 1, 1); } // CHECK-LABEL: @xvinsgr2vr_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvinsgr2vr.d(<4 x i64> [[_1:%.*]], i64 1, i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvinsgr2vr.d(<4 x i64> [[_1]], i64 1, i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvinsgr2vr_d(v4i64 _1) { return __builtin_lasx_xvinsgr2vr_d(_1, 1, 1); } // CHECK-LABEL: @xvreplve0_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve0.b(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve0.b(<32 x i8> [[_1]]) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvreplve0_b(v32i8 _1) { return __builtin_lasx_xvreplve0_b(_1); } // CHECK-LABEL: @xvreplve0_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplve0.h(<16 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplve0.h(<16 x i16> [[_1]]) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvreplve0_h(v16i16 _1) { return __builtin_lasx_xvreplve0_h(_1); } // CHECK-LABEL: @xvreplve0_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail 
call <8 x i32> @llvm.loongarch.lasx.xvreplve0.w(<8 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplve0.w(<8 x i32> [[_1]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvreplve0_w(v8i32 _1) { return __builtin_lasx_xvreplve0_w(_1); } // CHECK-LABEL: @xvreplve0_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplve0.d(<4 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplve0.d(<4 x i64> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvreplve0_d(v4i64 _1) { return __builtin_lasx_xvreplve0_d(_1); } // CHECK-LABEL: @xvreplve0_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve0.q(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve0.q(<32 x i8> [[_1]]) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvreplve0_q(v32i8 _1) { return __builtin_lasx_xvreplve0_q(_1); } // CHECK-LABEL: @vext2xv_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.vext2xv.h.b(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.vext2xv.h.b(<32 x i8> [[_1]]) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 vext2xv_h_b(v32i8 _1) { return __builtin_lasx_vext2xv_h_b(_1); } // CHECK-LABEL: @vext2xv_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.w.h(<16 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.w.h(<16 x i16> [[_1]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 vext2xv_w_h(v16i16 _1) { return __builtin_lasx_vext2xv_w_h(_1); } // CHECK-LABEL: @vext2xv_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.w(<8 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.w(<8 x i32> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 vext2xv_d_w(v8i32 _1) { return __builtin_lasx_vext2xv_d_w(_1); } // CHECK-LABEL: @vext2xv_w_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.w.b(<32 x i8> [[_1:%.*]]) 
-// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.w.b(<32 x i8> [[_1]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 vext2xv_w_b(v32i8 _1) { return __builtin_lasx_vext2xv_w_b(_1); } // CHECK-LABEL: @vext2xv_d_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.h(<16 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.h(<16 x i16> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 vext2xv_d_h(v16i16 _1) { return __builtin_lasx_vext2xv_d_h(_1); } // CHECK-LABEL: @vext2xv_d_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.b(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.b(<32 x i8> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 vext2xv_d_b(v32i8 _1) { return __builtin_lasx_vext2xv_d_b(_1); } // CHECK-LABEL: @vext2xv_hu_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.vext2xv.hu.bu(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.vext2xv.hu.bu(<32 x i8> [[_1]]) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 vext2xv_hu_bu(v32i8 _1) { return __builtin_lasx_vext2xv_hu_bu(_1); } // CHECK-LABEL: @vext2xv_wu_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.wu.hu(<16 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.wu.hu(<16 x i16> [[_1]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 vext2xv_wu_hu(v16i16 _1) { return __builtin_lasx_vext2xv_wu_hu(_1); } // CHECK-LABEL: @vext2xv_du_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.wu(<8 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.wu(<8 x i32> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 vext2xv_du_wu(v8i32 _1) { return __builtin_lasx_vext2xv_du_wu(_1); } // CHECK-LABEL: @vext2xv_wu_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.wu.bu(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> 
[[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.wu.bu(<32 x i8> [[_1]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 vext2xv_wu_bu(v32i8 _1) { return __builtin_lasx_vext2xv_wu_bu(_1); } // CHECK-LABEL: @vext2xv_du_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.hu(<16 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.hu(<16 x i16> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 vext2xv_du_hu(v16i16 _1) { return __builtin_lasx_vext2xv_du_hu(_1); } // CHECK-LABEL: @vext2xv_du_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.bu(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.bu(<32 x i8> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 vext2xv_du_bu(v32i8 _1) { return __builtin_lasx_vext2xv_du_bu(_1); } // CHECK-LABEL: @xvpermi_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpermi.q(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpermi.q(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvpermi_q(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvpermi_q(_1, _2, 1); } // CHECK-LABEL: @xvpermi_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpermi.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpermi.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvpermi_d(v4i64 _1) { return __builtin_lasx_xvpermi_d(_1, 1); } // CHECK-LABEL: @xvperm_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvperm.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvperm.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvperm_w(v8i32 _1, v8i32 
_2) { return __builtin_lasx_xvperm_w(_1, _2); } // CHECK-LABEL: @xvldrepl_b( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvldrepl.b(ptr [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvldrepl_b(void *_1) { return __builtin_lasx_xvldrepl_b(_1, 1); } // CHECK-LABEL: @xvldrepl_h( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvldrepl.h(ptr [[_1:%.*]], i32 2) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvldrepl_h(void *_1) { return __builtin_lasx_xvldrepl_h(_1, 2); } // CHECK-LABEL: @xvldrepl_w( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvldrepl.w(ptr [[_1:%.*]], i32 4) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvldrepl_w(void *_1) { return __builtin_lasx_xvldrepl_w(_1, 4); } // CHECK-LABEL: @xvldrepl_d( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvldrepl.d(ptr [[_1:%.*]], i32 8) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvldrepl_d(void *_1) { return __builtin_lasx_xvldrepl_d(_1, 8); } // CHECK-LABEL: @xvpickve2gr_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xvpickve2gr.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xvpickve2gr.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: ret i32 [[TMP1]] // int xvpickve2gr_w(v8i32 _1) { return __builtin_lasx_xvpickve2gr_w(_1, 1); } // CHECK-LABEL: @xvpickve2gr_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xvpickve2gr.wu(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xvpickve2gr.wu(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: ret i32 [[TMP1]] // unsigned int xvpickve2gr_wu(v8i32 _1) { return __builtin_lasx_xvpickve2gr_wu(_1, 1); } // CHECK-LABEL: @xvpickve2gr_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret i64 [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: ret i64 [[TMP1]] // long xvpickve2gr_d(v4i64 _1) { return __builtin_lasx_xvpickve2gr_d(_1, 1); } // CHECK-LABEL: @xvpickve2gr_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret i64 [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: ret i64 [[TMP1]] // 
unsigned long int xvpickve2gr_du(v4i64 _1) { return __builtin_lasx_xvpickve2gr_du(_1, 1); } // CHECK-LABEL: @xvaddwev_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvaddwev_q_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvaddwev_q_d(_1, _2); } // CHECK-LABEL: @xvaddwev_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvaddwev_d_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvaddwev_d_w(_1, _2); } // CHECK-LABEL: @xvaddwev_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvaddwev_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvaddwev_w_h(_1, _2); } // CHECK-LABEL: @xvaddwev_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvaddwev_h_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvaddwev_h_b(_1, _2); } // CHECK-LABEL: @xvaddwev_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: 
store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvaddwev_q_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvaddwev_q_du(_1, _2); } // CHECK-LABEL: @xvaddwev_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvaddwev_d_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvaddwev_d_wu(_1, _2); } // CHECK-LABEL: @xvaddwev_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvaddwev_w_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvaddwev_w_hu(_1, _2); } // CHECK-LABEL: @xvaddwev_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvaddwev_h_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvaddwev_h_bu(_1, _2); } // CHECK-LABEL: @xvsubwev_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsubwev_q_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsubwev_q_d(_1, _2); } // CHECK-LABEL: @xvsubwev_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: 
[[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsubwev_d_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsubwev_d_w(_1, _2); } // CHECK-LABEL: @xvsubwev_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwev.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwev.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsubwev_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsubwev_w_h(_1, _2); } // CHECK-LABEL: @xvsubwev_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwev.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwev.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsubwev_h_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsubwev_h_b(_1, _2); } // CHECK-LABEL: @xvsubwev_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsubwev_q_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvsubwev_q_du(_1, _2); } // CHECK-LABEL: @xvsubwev_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.d.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.d.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsubwev_d_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvsubwev_d_wu(_1, _2); } // CHECK-LABEL: @xvsubwev_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwev.w.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa 
[[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwev.w.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsubwev_w_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvsubwev_w_hu(_1, _2); } // CHECK-LABEL: @xvsubwev_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwev.h.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwev.h.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsubwev_h_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvsubwev_h_bu(_1, _2); } // CHECK-LABEL: @xvmulwev_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmulwev_q_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvmulwev_q_d(_1, _2); } // CHECK-LABEL: @xvmulwev_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmulwev_d_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvmulwev_d_w(_1, _2); } // CHECK-LABEL: @xvmulwev_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmulwev_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvmulwev_w_h(_1, _2); } // CHECK-LABEL: @xvmulwev_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// 
CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmulwev_h_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvmulwev_h_b(_1, _2); } // CHECK-LABEL: @xvmulwev_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmulwev_q_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvmulwev_q_du(_1, _2); } // CHECK-LABEL: @xvmulwev_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmulwev_d_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvmulwev_d_wu(_1, _2); } // CHECK-LABEL: @xvmulwev_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmulwev_w_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvmulwev_w_hu(_1, _2); } // CHECK-LABEL: @xvmulwev_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmulwev_h_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvmulwev_h_bu(_1, _2); } // CHECK-LABEL: @xvaddwod_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: 
[[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvaddwod_q_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvaddwod_q_d(_1, _2); } // CHECK-LABEL: @xvaddwod_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvaddwod_d_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvaddwod_d_w(_1, _2); } // CHECK-LABEL: @xvaddwod_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvaddwod_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvaddwod_w_h(_1, _2); } // CHECK-LABEL: @xvaddwod_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvaddwod_h_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvaddwod_h_b(_1, _2); } // CHECK-LABEL: @xvaddwod_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvaddwod_q_du(v4u64 _1, v4u64 _2) { return 
__builtin_lasx_xvaddwod_q_du(_1, _2); } // CHECK-LABEL: @xvaddwod_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvaddwod_d_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvaddwod_d_wu(_1, _2); } // CHECK-LABEL: @xvaddwod_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvaddwod_w_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvaddwod_w_hu(_1, _2); } // CHECK-LABEL: @xvaddwod_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvaddwod_h_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvaddwod_h_bu(_1, _2); } // CHECK-LABEL: @xvsubwod_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsubwod_q_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsubwod_q_d(_1, _2); } // CHECK-LABEL: @xvsubwod_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr 
[[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsubwod_d_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsubwod_d_w(_1, _2); } // CHECK-LABEL: @xvsubwod_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwod.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwod.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsubwod_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsubwod_w_h(_1, _2); } // CHECK-LABEL: @xvsubwod_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwod.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwod.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsubwod_h_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsubwod_h_b(_1, _2); } // CHECK-LABEL: @xvsubwod_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsubwod_q_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvsubwod_q_du(_1, _2); } // CHECK-LABEL: @xvsubwod_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.d.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.d.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsubwod_d_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvsubwod_d_wu(_1, _2); } // CHECK-LABEL: @xvsubwod_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwod.w.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> 
@llvm.loongarch.lasx.xvsubwod.w.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsubwod_w_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvsubwod_w_hu(_1, _2); } // CHECK-LABEL: @xvsubwod_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwod.h.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwod.h.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsubwod_h_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvsubwod_h_bu(_1, _2); } // CHECK-LABEL: @xvmulwod_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmulwod_q_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvmulwod_q_d(_1, _2); } // CHECK-LABEL: @xvmulwod_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmulwod_d_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvmulwod_d_w(_1, _2); } // CHECK-LABEL: @xvmulwod_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmulwod_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvmulwod_w_h(_1, _2); } // CHECK-LABEL: @xvmulwod_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = 
load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmulwod_h_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvmulwod_h_b(_1, _2); } // CHECK-LABEL: @xvmulwod_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmulwod_q_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvmulwod_q_du(_1, _2); } // CHECK-LABEL: @xvmulwod_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmulwod_d_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvmulwod_d_wu(_1, _2); } // CHECK-LABEL: @xvmulwod_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmulwod_w_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvmulwod_w_hu(_1, _2); } // CHECK-LABEL: @xvmulwod_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmulwod_h_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvmulwod_h_bu(_1, _2); } // CHECK-LABEL: @xvaddwev_d_wu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.wu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> 
[[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.wu.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvaddwev_d_wu_w(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvaddwev_d_wu_w(_1, _2); } // CHECK-LABEL: @xvaddwev_w_hu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.hu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.hu.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvaddwev_w_hu_h(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvaddwev_w_hu_h(_1, _2); } // CHECK-LABEL: @xvaddwev_h_bu_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.bu.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.bu.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvaddwev_h_bu_b(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvaddwev_h_bu_b(_1, _2); } // CHECK-LABEL: @xvmulwev_d_wu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.wu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.wu.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmulwev_d_wu_w(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvmulwev_d_wu_w(_1, _2); } // CHECK-LABEL: @xvmulwev_w_hu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.hu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.hu.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmulwev_w_hu_h(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvmulwev_w_hu_h(_1, _2); } // CHECK-LABEL: @xvmulwev_h_bu_b( // 
CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.bu.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.bu.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmulwev_h_bu_b(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvmulwev_h_bu_b(_1, _2); } // CHECK-LABEL: @xvaddwod_d_wu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.wu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.wu.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvaddwod_d_wu_w(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvaddwod_d_wu_w(_1, _2); } // CHECK-LABEL: @xvaddwod_w_hu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.hu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.hu.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvaddwod_w_hu_h(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvaddwod_w_hu_h(_1, _2); } // CHECK-LABEL: @xvaddwod_h_bu_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.bu.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.bu.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvaddwod_h_bu_b(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvaddwod_h_bu_b(_1, _2); } // CHECK-LABEL: @xvmulwod_d_wu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.wu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.wu.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, 
!tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmulwod_d_wu_w(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvmulwod_d_wu_w(_1, _2); } // CHECK-LABEL: @xvmulwod_w_hu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.hu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.hu.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmulwod_w_hu_h(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvmulwod_w_hu_h(_1, _2); } // CHECK-LABEL: @xvmulwod_h_bu_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.bu.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.bu.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmulwod_h_bu_b(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvmulwod_h_bu_b(_1, _2); } // CHECK-LABEL: @xvhaddw_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvhaddw_q_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvhaddw_q_d(_1, _2); } // CHECK-LABEL: @xvhaddw_qu_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.qu.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.qu.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvhaddw_qu_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvhaddw_qu_du(_1, _2); } // CHECK-LABEL: @xvhsubw_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> 
@llvm.loongarch.lasx.xvhsubw.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvhsubw_q_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvhsubw_q_d(_1, _2); } // CHECK-LABEL: @xvhsubw_qu_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.qu.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.qu.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvhsubw_qu_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvhsubw_qu_du(_1, _2); } // CHECK-LABEL: @xvmaddwev_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmaddwev_q_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __builtin_lasx_xvmaddwev_q_d(_1, _2, _3); } // CHECK-LABEL: @xvmaddwev_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.w(<4 x i64> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.w(<4 x i64> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]]) +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmaddwev_d_w(v4i64 _1, v8i32 _2, v8i32 _3) { return __builtin_lasx_xvmaddwev_d_w(_1, _2, _3); } // CHECK-LABEL: @xvmaddwev_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.h(<8 x i32> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.h(<8 x i32> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]]) +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 
xvmaddwev_w_h(v8i32 _1, v16i16 _2, v16i16 _3) { return __builtin_lasx_xvmaddwev_w_h(_1, _2, _3); } // CHECK-LABEL: @xvmaddwev_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.b(<16 x i16> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.b(<16 x i16> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]]) +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmaddwev_h_b(v16i16 _1, v32i8 _2, v32i8 _3) { return __builtin_lasx_xvmaddwev_h_b(_1, _2, _3); } // CHECK-LABEL: @xvmaddwev_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvmaddwev_q_du(v4u64 _1, v4u64 _2, v4u64 _3) { return __builtin_lasx_xvmaddwev_q_du(_1, _2, _3); } // CHECK-LABEL: @xvmaddwev_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.wu(<4 x i64> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.wu(<4 x i64> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]]) +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvmaddwev_d_wu(v4u64 _1, v8u32 _2, v8u32 _3) { return __builtin_lasx_xvmaddwev_d_wu(_1, _2, _3); } // CHECK-LABEL: @xvmaddwev_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.hu(<8 x i32> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.hu(<8 x i32> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]]) +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvmaddwev_w_hu(v8u32 _1, 
v16u16 _2, v16u16 _3) { return __builtin_lasx_xvmaddwev_w_hu(_1, _2, _3); } // CHECK-LABEL: @xvmaddwev_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.bu(<16 x i16> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.bu(<16 x i16> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]]) +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvmaddwev_h_bu(v16u16 _1, v32u8 _2, v32u8 _3) { return __builtin_lasx_xvmaddwev_h_bu(_1, _2, _3); } // CHECK-LABEL: @xvmaddwod_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmaddwod_q_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __builtin_lasx_xvmaddwod_q_d(_1, _2, _3); } // CHECK-LABEL: @xvmaddwod_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.w(<4 x i64> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.w(<4 x i64> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]]) +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmaddwod_d_w(v4i64 _1, v8i32 _2, v8i32 _3) { return __builtin_lasx_xvmaddwod_d_w(_1, _2, _3); } // CHECK-LABEL: @xvmaddwod_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.h(<8 x i32> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.h(<8 x i32> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]]) +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmaddwod_w_h(v8i32 _1, v16i16 _2, v16i16 _3) { return 
__builtin_lasx_xvmaddwod_w_h(_1, _2, _3); } // CHECK-LABEL: @xvmaddwod_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.b(<16 x i16> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.b(<16 x i16> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]]) +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmaddwod_h_b(v16i16 _1, v32i8 _2, v32i8 _3) { return __builtin_lasx_xvmaddwod_h_b(_1, _2, _3); } // CHECK-LABEL: @xvmaddwod_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvmaddwod_q_du(v4u64 _1, v4u64 _2, v4u64 _3) { return __builtin_lasx_xvmaddwod_q_du(_1, _2, _3); } // CHECK-LABEL: @xvmaddwod_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.wu(<4 x i64> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.wu(<4 x i64> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]]) +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvmaddwod_d_wu(v4u64 _1, v8u32 _2, v8u32 _3) { return __builtin_lasx_xvmaddwod_d_wu(_1, _2, _3); } // CHECK-LABEL: @xvmaddwod_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.hu(<8 x i32> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.hu(<8 x i32> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]]) +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvmaddwod_w_hu(v8u32 _1, v16u16 _2, v16u16 _3) { return 
__builtin_lasx_xvmaddwod_w_hu(_1, _2, _3); } // CHECK-LABEL: @xvmaddwod_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.bu(<16 x i16> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.bu(<16 x i16> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]]) +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvmaddwod_h_bu(v16u16 _1, v32u8 _2, v32u8 _3) { return __builtin_lasx_xvmaddwod_h_bu(_1, _2, _3); } // CHECK-LABEL: @xvmaddwev_q_du_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.du.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmaddwev_q_du_d(v4i64 _1, v4u64 _2, v4i64 _3) { return __builtin_lasx_xvmaddwev_q_du_d(_1, _2, _3); } // CHECK-LABEL: @xvmaddwev_d_wu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.wu.w(<4 x i64> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.wu.w(<4 x i64> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]]) +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmaddwev_d_wu_w(v4i64 _1, v8u32 _2, v8i32 _3) { return __builtin_lasx_xvmaddwev_d_wu_w(_1, _2, _3); } // CHECK-LABEL: @xvmaddwev_w_hu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.hu.h(<8 x i32> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.hu.h(<8 x i32> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]]) +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmaddwev_w_hu_h(v8i32 _1, v16u16 _2, v16i16 
_3) { return __builtin_lasx_xvmaddwev_w_hu_h(_1, _2, _3); } // CHECK-LABEL: @xvmaddwev_h_bu_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.bu.b(<16 x i16> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.bu.b(<16 x i16> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]]) +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmaddwev_h_bu_b(v16i16 _1, v32u8 _2, v32i8 _3) { return __builtin_lasx_xvmaddwev_h_bu_b(_1, _2, _3); } // CHECK-LABEL: @xvmaddwod_q_du_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.du.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmaddwod_q_du_d(v4i64 _1, v4u64 _2, v4i64 _3) { return __builtin_lasx_xvmaddwod_q_du_d(_1, _2, _3); } // CHECK-LABEL: @xvmaddwod_d_wu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.wu.w(<4 x i64> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.wu.w(<4 x i64> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]]) +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmaddwod_d_wu_w(v4i64 _1, v8u32 _2, v8i32 _3) { return __builtin_lasx_xvmaddwod_d_wu_w(_1, _2, _3); } // CHECK-LABEL: @xvmaddwod_w_hu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.hu.h(<8 x i32> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.hu.h(<8 x i32> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]]) +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvmaddwod_w_hu_h(v8i32 
_1, v16u16 _2, v16i16 _3) { return __builtin_lasx_xvmaddwod_w_hu_h(_1, _2, _3); } // CHECK-LABEL: @xvmaddwod_h_bu_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.bu.b(<16 x i16> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.bu.b(<16 x i16> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]]) +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvmaddwod_h_bu_b(v16i16 _1, v32u8 _2, v32i8 _3) { return __builtin_lasx_xvmaddwod_h_bu_b(_1, _2, _3); } // CHECK-LABEL: @xvrotr_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrotr.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrotr.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvrotr_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvrotr_b(_1, _2); } // CHECK-LABEL: @xvrotr_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrotr.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrotr.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvrotr_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvrotr_h(_1, _2); } // CHECK-LABEL: @xvrotr_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrotr.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrotr.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvrotr_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvrotr_w(_1, _2); } // CHECK-LABEL: @xvrotr_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrotr.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x 
i64> @llvm.loongarch.lasx.xvrotr.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvrotr_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvrotr_d(_1, _2); } // CHECK-LABEL: @xvadd_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadd.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadd.q(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvadd_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvadd_q(_1, _2); } // CHECK-LABEL: @xvsub_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsub.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsub.q(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsub_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsub_q(_1, _2); } // CHECK-LABEL: @xvaddwev_q_du_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.du.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvaddwev_q_du_d(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvaddwev_q_du_d(_1, _2); } // CHECK-LABEL: @xvaddwod_q_du_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.du.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvaddwod_q_du_d(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvaddwod_q_du_d(_1, _2); } // CHECK-LABEL: @xvmulwev_q_du_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.du.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa 
[[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmulwev_q_du_d(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvmulwev_q_du_d(_1, _2); } // CHECK-LABEL: @xvmulwod_q_du_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.du.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvmulwod_q_du_d(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvmulwod_q_du_d(_1, _2); } // CHECK-LABEL: @xvmskgez_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmskgez.b(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmskgez.b(<32 x i8> [[_1]]) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvmskgez_b(v32i8 _1) { return __builtin_lasx_xvmskgez_b(_1); } // CHECK-LABEL: @xvmsknz_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmsknz.b(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmsknz.b(<32 x i8> [[_1]]) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvmsknz_b(v32i8 _1) { return __builtin_lasx_xvmsknz_b(_1); } // CHECK-LABEL: @xvexth_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvexth.h.b(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvexth.h.b(<32 x i8> [[_1]]) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvexth_h_b(v32i8 _1) { return __builtin_lasx_xvexth_h_b(_1); } // CHECK-LABEL: @xvexth_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvexth.w.h(<16 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvexth.w.h(<16 x i16> [[_1]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvexth_w_h(v16i16 _1) { return __builtin_lasx_xvexth_w_h(_1); } // CHECK-LABEL: @xvexth_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.d.w(<8 x i32> [[_1:%.*]]) -// 
CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.d.w(<8 x i32> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvexth_d_w(v8i32 _1) { return __builtin_lasx_xvexth_d_w(_1); } // CHECK-LABEL: @xvexth_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.q.d(<4 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.q.d(<4 x i64> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvexth_q_d(v4i64 _1) { return __builtin_lasx_xvexth_q_d(_1); } // CHECK-LABEL: @xvexth_hu_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvexth.hu.bu(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvexth.hu.bu(<32 x i8> [[_1]]) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvexth_hu_bu(v32u8 _1) { return __builtin_lasx_xvexth_hu_bu(_1); } // CHECK-LABEL: @xvexth_wu_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvexth.wu.hu(<16 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvexth.wu.hu(<16 x i16> [[_1]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvexth_wu_hu(v16u16 _1) { return __builtin_lasx_xvexth_wu_hu(_1); } // CHECK-LABEL: @xvexth_du_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.du.wu(<8 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.du.wu(<8 x i32> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvexth_du_wu(v8u32 _1) { return __builtin_lasx_xvexth_du_wu(_1); } // CHECK-LABEL: @xvexth_qu_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.qu.du(<4 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.qu.du(<4 x i64> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvexth_qu_du(v4u64 _1) { return __builtin_lasx_xvexth_qu_du(_1); } // CHECK-LABEL: @xvrotri_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrotri.b(<32 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: 
[[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrotri.b(<32 x i8> [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvrotri_b(v32i8 _1) { return __builtin_lasx_xvrotri_b(_1, 1); } // CHECK-LABEL: @xvrotri_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrotri.h(<16 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrotri.h(<16 x i16> [[_1]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvrotri_h(v16i16 _1) { return __builtin_lasx_xvrotri_h(_1, 1); } // CHECK-LABEL: @xvrotri_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrotri.w(<8 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrotri.w(<8 x i32> [[_1]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvrotri_w(v8i32 _1) { return __builtin_lasx_xvrotri_w(_1, 1); } // CHECK-LABEL: @xvrotri_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrotri.d(<4 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrotri.d(<4 x i64> [[_1]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvrotri_d(v4i64 _1) { return __builtin_lasx_xvrotri_d(_1, 1); } // CHECK-LABEL: @xvextl_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextl.q.d(<4 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextl.q.d(<4 x i64> [[_1]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvextl_q_d(v4i64 _1) { return __builtin_lasx_xvextl_q_d(_1); } // CHECK-LABEL: @xvsrlni_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlni.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsrlni_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsrlni_b_h(_1, _2, 1); } // CHECK-LABEL: @xvsrlni_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> 
@llvm.loongarch.lasx.xvsrlni.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsrlni_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrlni_h_w(_1, _2, 1); } // CHECK-LABEL: @xvsrlni_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlni.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsrlni_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrlni_w_d(_1, _2, 1); } // CHECK-LABEL: @xvsrlni_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlni.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsrlni_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrlni_d_q(_1, _2, 1); } // CHECK-LABEL: @xvsrlrni_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlrni.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlrni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsrlrni_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsrlrni_b_h(_1, _2, 1); } // CHECK-LABEL: @xvsrlrni_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlrni.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlrni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 
xvsrlrni_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrlrni_h_w(_1, _2, 1); } // CHECK-LABEL: @xvsrlrni_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlrni.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlrni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsrlrni_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrlrni_w_d(_1, _2, 1); } // CHECK-LABEL: @xvsrlrni_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlrni.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlrni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsrlrni_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrlrni_d_q(_1, _2, 1); } // CHECK-LABEL: @xvssrlni_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlni.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvssrlni_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvssrlni_b_h(_1, _2, 1); } // CHECK-LABEL: @xvssrlni_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlni.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvssrlni_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssrlni_h_w(_1, _2, 1); } // CHECK-LABEL: @xvssrlni_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlni.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> 
@llvm.loongarch.lasx.xvssrlni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvssrlni_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssrlni_w_d(_1, _2, 1); } // CHECK-LABEL: @xvssrlni_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlni.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvssrlni_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssrlni_d_q(_1, _2, 1); } // CHECK-LABEL: @xvssrlni_bu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlni.bu.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlni.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvssrlni_bu_h(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvssrlni_bu_h(_1, _2, 1); } // CHECK-LABEL: @xvssrlni_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlni.hu.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlni.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvssrlni_hu_w(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvssrlni_hu_w(_1, _2, 1); } // CHECK-LABEL: @xvssrlni_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlni.wu.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlni.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvssrlni_wu_d(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvssrlni_wu_d(_1, _2, 1); } // CHECK-LABEL: @xvssrlni_du_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlni.du.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load 
<4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlni.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvssrlni_du_q(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvssrlni_du_q(_1, _2, 1); } // CHECK-LABEL: @xvssrlrni_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvssrlrni_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvssrlrni_b_h(_1, _2, 1); } // CHECK-LABEL: @xvssrlrni_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvssrlrni_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssrlrni_h_w(_1, _2, 1); } // CHECK-LABEL: @xvssrlrni_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrni.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvssrlrni_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssrlrni_w_d(_1, _2, 1); } // CHECK-LABEL: @xvssrlrni_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvssrlrni_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssrlrni_d_q(_1, _2, 1); } // CHECK-LABEL: @xvssrlrni_bu_h( // CHECK-NEXT: entry: -// 
CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.bu.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvssrlrni_bu_h(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvssrlrni_bu_h(_1, _2, 1); } // CHECK-LABEL: @xvssrlrni_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.hu.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvssrlrni_hu_w(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvssrlrni_hu_w(_1, _2, 1); } // CHECK-LABEL: @xvssrlrni_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrni.wu.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrni.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvssrlrni_wu_d(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvssrlrni_wu_d(_1, _2, 1); } // CHECK-LABEL: @xvssrlrni_du_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.du.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvssrlrni_du_q(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvssrlrni_du_q(_1, _2, 1); } // CHECK-LABEL: @xvsrani_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrani.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrani.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr 
[[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsrani_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsrani_b_h(_1, _2, 1); } // CHECK-LABEL: @xvsrani_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrani.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrani.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsrani_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrani_h_w(_1, _2, 1); } // CHECK-LABEL: @xvsrani_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrani.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrani.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsrani_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrani_w_d(_1, _2, 1); } // CHECK-LABEL: @xvsrani_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrani.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrani.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsrani_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrani_d_q(_1, _2, 1); } // CHECK-LABEL: @xvsrarni_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrarni.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrarni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvsrarni_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsrarni_b_h(_1, _2, 1); } // CHECK-LABEL: @xvsrarni_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrarni.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrarni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvsrarni_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrarni_h_w(_1, _2, 1); } // CHECK-LABEL: @xvsrarni_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrarni.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrarni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvsrarni_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrarni_w_d(_1, _2, 1); } // CHECK-LABEL: @xvsrarni_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrarni.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrarni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvsrarni_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrarni_d_q(_1, _2, 1); } // CHECK-LABEL: @xvssrani_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrani.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrani.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvssrani_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvssrani_b_h(_1, _2, 1); } // CHECK-LABEL: @xvssrani_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrani.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrani.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvssrani_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssrani_h_w(_1, _2, 1); } // CHECK-LABEL: @xvssrani_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrani.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> 
[[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrani.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvssrani_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssrani_w_d(_1, _2, 1); } // CHECK-LABEL: @xvssrani_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrani.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrani.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvssrani_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssrani_d_q(_1, _2, 1); } // CHECK-LABEL: @xvssrani_bu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrani.bu.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrani.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvssrani_bu_h(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvssrani_bu_h(_1, _2, 1); } // CHECK-LABEL: @xvssrani_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrani.hu.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrani.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvssrani_hu_w(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvssrani_hu_w(_1, _2, 1); } // CHECK-LABEL: @xvssrani_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrani.wu.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrani.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvssrani_wu_d(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvssrani_wu_d(_1, _2, 1); } // CHECK-LABEL: 
@xvssrani_du_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrani.du.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrani.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvssrani_du_q(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvssrani_du_q(_1, _2, 1); } // CHECK-LABEL: @xvssrarni_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarni.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvssrarni_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvssrarni_b_h(_1, _2, 1); } // CHECK-LABEL: @xvssrarni_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarni.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvssrarni_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssrarni_h_w(_1, _2, 1); } // CHECK-LABEL: @xvssrarni_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarni.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvssrarni_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssrarni_w_d(_1, _2, 1); } // CHECK-LABEL: @xvssrarni_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrarni.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrarni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) +// CHECK-NEXT: store <4 
x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvssrarni_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssrarni_d_q(_1, _2, 1); } // CHECK-LABEL: @xvssrarni_bu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarni.bu.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarni.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32u8 xvssrarni_bu_h(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvssrarni_bu_h(_1, _2, 1); } // CHECK-LABEL: @xvssrarni_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarni.hu.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarni.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16u16 xvssrarni_hu_w(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvssrarni_hu_w(_1, _2, 1); } // CHECK-LABEL: @xvssrarni_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarni.wu.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarni.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8u32 xvssrarni_wu_d(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvssrarni_wu_d(_1, _2, 1); } // CHECK-LABEL: @xvssrarni_du_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrarni.du.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrarni.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4u64 xvssrarni_du_q(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvssrarni_du_q(_1, _2, 1); } // CHECK-LABEL: @xbnz_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.b(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 
@llvm.loongarch.lasx.xbnz.b(<32 x i8> [[_1]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int xbnz_b(v32u8 _1) { return __builtin_lasx_xbnz_b(_1); } // CHECK-LABEL: @xbnz_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.d(<4 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.d(<4 x i64> [[_1]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int xbnz_d(v4u64 _1) { return __builtin_lasx_xbnz_d(_1); } // CHECK-LABEL: @xbnz_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.h(<16 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.h(<16 x i16> [[_1]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int xbnz_h(v16u16 _1) { return __builtin_lasx_xbnz_h(_1); } // CHECK-LABEL: @xbnz_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.v(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.v(<32 x i8> [[_1]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int xbnz_v(v32u8 _1) { return __builtin_lasx_xbnz_v(_1); } // CHECK-LABEL: @xbnz_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.w(<8 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.w(<8 x i32> [[_1]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int xbnz_w(v8u32 _1) { return __builtin_lasx_xbnz_w(_1); } // CHECK-LABEL: @xbz_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.b(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.b(<32 x i8> [[_1]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int xbz_b(v32u8 _1) { return __builtin_lasx_xbz_b(_1); } // CHECK-LABEL: @xbz_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.d(<4 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.d(<4 x i64> [[_1]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int xbz_d(v4u64 _1) { return __builtin_lasx_xbz_d(_1); } // CHECK-LABEL: @xbz_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.h(<16 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.h(<16 x i16> [[_1]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int xbz_h(v16u16 _1) { return __builtin_lasx_xbz_h(_1); } // CHECK-LABEL: @xbz_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.v(<32 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.v(<32 x i8> [[_1]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int xbz_v(v32u8 _1) { return __builtin_lasx_xbz_v(_1); } // CHECK-LABEL: @xbz_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.w(<8 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.w(<8 x i32> [[_1]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int xbz_w(v8u32 _1) { return __builtin_lasx_xbz_w(_1); } // CHECK-LABEL: @xvfcmp_caf_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.caf.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.caf.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_caf_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_caf_d(_1, _2); } // CHECK-LABEL: @xvfcmp_caf_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.caf.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.caf.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_caf_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_caf_s(_1, _2); } // CHECK-LABEL: @xvfcmp_ceq_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.ceq.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.ceq.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_ceq_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_ceq_d(_1, _2); } // CHECK-LABEL: @xvfcmp_ceq_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.ceq.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.ceq.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_ceq_s(v8f32 _1, v8f32 _2) { return 
__builtin_lasx_xvfcmp_ceq_s(_1, _2); } // CHECK-LABEL: @xvfcmp_cle_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cle.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cle.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_cle_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_cle_d(_1, _2); } // CHECK-LABEL: @xvfcmp_cle_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cle.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cle.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_cle_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cle_s(_1, _2); } // CHECK-LABEL: @xvfcmp_clt_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.clt.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.clt.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_clt_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_clt_d(_1, _2); } // CHECK-LABEL: @xvfcmp_clt_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.clt.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.clt.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_clt_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_clt_s(_1, _2); } // CHECK-LABEL: @xvfcmp_cne_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cne.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cne.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// 
CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_cne_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_cne_d(_1, _2); } // CHECK-LABEL: @xvfcmp_cne_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cne.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cne.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_cne_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cne_s(_1, _2); } // CHECK-LABEL: @xvfcmp_cor_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cor.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cor.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_cor_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_cor_d(_1, _2); } // CHECK-LABEL: @xvfcmp_cor_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cor.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cor.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_cor_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cor_s(_1, _2); } // CHECK-LABEL: @xvfcmp_cueq_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cueq.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cueq.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_cueq_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_cueq_d(_1, _2); } // CHECK-LABEL: @xvfcmp_cueq_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cueq.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr 
[[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cueq.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_cueq_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cueq_s(_1, _2); } // CHECK-LABEL: @xvfcmp_cule_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cule.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cule.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_cule_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_cule_d(_1, _2); } // CHECK-LABEL: @xvfcmp_cule_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cule.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cule.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_cule_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cule_s(_1, _2); } // CHECK-LABEL: @xvfcmp_cult_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cult.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cult.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_cult_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_cult_d(_1, _2); } // CHECK-LABEL: @xvfcmp_cult_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cult.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cult.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_cult_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cult_s(_1, _2); } // CHECK-LABEL: @xvfcmp_cun_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cun.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// 
CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cun.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_cun_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_cun_d(_1, _2); } // CHECK-LABEL: @xvfcmp_cune_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cune.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cune.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_cune_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_cune_d(_1, _2); } // CHECK-LABEL: @xvfcmp_cune_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cune.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cune.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_cune_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cune_s(_1, _2); } // CHECK-LABEL: @xvfcmp_cun_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cun.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cun.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_cun_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cun_s(_1, _2); } // CHECK-LABEL: @xvfcmp_saf_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.saf.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.saf.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_saf_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_saf_d(_1, _2); } // CHECK-LABEL: 
@xvfcmp_saf_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.saf.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.saf.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_saf_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_saf_s(_1, _2); } // CHECK-LABEL: @xvfcmp_seq_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.seq.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.seq.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_seq_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_seq_d(_1, _2); } // CHECK-LABEL: @xvfcmp_seq_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.seq.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.seq.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_seq_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_seq_s(_1, _2); } // CHECK-LABEL: @xvfcmp_sle_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sle.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sle.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_sle_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sle_d(_1, _2); } // CHECK-LABEL: @xvfcmp_sle_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sle.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sle.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], 
align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_sle_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sle_s(_1, _2); } // CHECK-LABEL: @xvfcmp_slt_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.slt.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.slt.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_slt_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_slt_d(_1, _2); } // CHECK-LABEL: @xvfcmp_slt_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.slt.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.slt.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_slt_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_slt_s(_1, _2); } // CHECK-LABEL: @xvfcmp_sne_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sne.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sne.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_sne_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sne_d(_1, _2); } // CHECK-LABEL: @xvfcmp_sne_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sne.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sne.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_sne_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sne_s(_1, _2); } // CHECK-LABEL: @xvfcmp_sor_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sor.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = 
tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sor.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_sor_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sor_d(_1, _2); } // CHECK-LABEL: @xvfcmp_sor_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sor.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sor.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_sor_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sor_s(_1, _2); } // CHECK-LABEL: @xvfcmp_sueq_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sueq.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sueq.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_sueq_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sueq_d(_1, _2); } // CHECK-LABEL: @xvfcmp_sueq_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sueq.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sueq.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_sueq_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sueq_s(_1, _2); } // CHECK-LABEL: @xvfcmp_sule_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sule.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sule.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_sule_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sule_d(_1, _2); } // CHECK-LABEL: @xvfcmp_sule_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sule.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x 
float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sule.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_sule_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sule_s(_1, _2); } // CHECK-LABEL: @xvfcmp_sult_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sult.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sult.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_sult_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sult_d(_1, _2); } // CHECK-LABEL: @xvfcmp_sult_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sult.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sult.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_sult_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sult_s(_1, _2); } // CHECK-LABEL: @xvfcmp_sun_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sun.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sun.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_sun_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sun_d(_1, _2); } // CHECK-LABEL: @xvfcmp_sune_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sune.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sune.d(<4 x double> [[_1]], <4 x double> [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvfcmp_sune_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sune_d(_1, _2); } // CHECK-LABEL: @xvfcmp_sune_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = 
tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sune.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sune.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_sune_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sune_s(_1, _2); } // CHECK-LABEL: @xvfcmp_sun_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sun.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sun.s(<8 x float> [[_1]], <8 x float> [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvfcmp_sun_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sun_s(_1, _2); } // CHECK-LABEL: @xvpickve_d_f( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvpickve.d.f(<4 x double> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x double> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvpickve.d.f(<4 x double> [[_1]], i32 1) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4f64 xvpickve_d_f(v4f64 _1) { return __builtin_lasx_xvpickve_d_f(_1, 1); } // CHECK-LABEL: @xvpickve_w_f( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvpickve.w.f(<8 x float> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x float> [[TMP0]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvpickve.w.f(<8 x float> [[_1]], i32 1) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8f32 xvpickve_w_f(v8f32 _1) { return __builtin_lasx_xvpickve_w_f(_1, 1); } // CHECK-LABEL: @xvrepli_b( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrepli.b(i32 1) -// CHECK-NEXT: ret <32 x i8> [[TMP0]] +// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v32i8 xvrepli_b() { return __builtin_lasx_xvrepli_b(1); } // CHECK-LABEL: @xvrepli_d( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrepli.d(i32 1) -// CHECK-NEXT: ret <4 x i64> [[TMP0]] +// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v4i64 xvrepli_d() { return __builtin_lasx_xvrepli_d(1); } // CHECK-LABEL: @xvrepli_h( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrepli.h(i32 1) -// CHECK-NEXT: ret <16 x i16> [[TMP0]] +// CHECK-NEXT: store <16 x 
i16> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v16i16 xvrepli_h() { return __builtin_lasx_xvrepli_h(1); } // CHECK-LABEL: @xvrepli_w( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrepli.w(i32 1) -// CHECK-NEXT: ret <8 x i32> [[TMP0]] +// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void // v8i32 xvrepli_w() { return __builtin_lasx_xvrepli_w(1); } diff --git a/clang/test/CodeGen/LoongArch/lsx/builtin-alias.c b/clang/test/CodeGen/LoongArch/lsx/builtin-alias.c index 331e29fb7d17f..7a84e0ae24f95 100644 --- a/clang/test/CodeGen/LoongArch/lsx/builtin-alias.c +++ b/clang/test/CodeGen/LoongArch/lsx/builtin-alias.c @@ -5,4080 +5,5838 @@ // CHECK-LABEL: @vsll_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsll.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsll.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsll_b(v16i8 _1, v16i8 _2) { return __lsx_vsll_b(_1, _2); } // CHECK-LABEL: @vsll_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsll.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsll.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsll_h(v8i16 _1, v8i16 _2) { return __lsx_vsll_h(_1, _2); } // CHECK-LABEL: @vsll_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsll.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsll.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsll_w(v4i32 _1, v4i32 _2) { return __lsx_vsll_w(_1, _2); } // CHECK-LABEL: @vsll_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsll.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsll.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsll_d(v2i64 _1, v2i64 _2) { return __lsx_vsll_d(_1, _2); } // CHECK-LABEL: @vslli_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslli.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 
x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslli.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vslli_b(v16i8 _1) { return __lsx_vslli_b(_1, 1); } // CHECK-LABEL: @vslli_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslli.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslli.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vslli_h(v8i16 _1) { return __lsx_vslli_h(_1, 1); } // CHECK-LABEL: @vslli_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslli.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslli.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vslli_w(v4i32 _1) { return __lsx_vslli_w(_1, 1); } // CHECK-LABEL: @vslli_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslli.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslli.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vslli_d(v2i64 _1) { return __lsx_vslli_d(_1, 1); } // CHECK-LABEL: @vsra_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsra.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsra.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsra_b(v16i8 _1, v16i8 _2) { return __lsx_vsra_b(_1, _2); } // CHECK-LABEL: @vsra_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsra.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsra.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsra_h(v8i16 _1, v8i16 _2) { return __lsx_vsra_h(_1, _2); } // CHECK-LABEL: @vsra_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsra.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: 
[[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsra.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsra_w(v4i32 _1, v4i32 _2) { return __lsx_vsra_w(_1, _2); } // CHECK-LABEL: @vsra_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsra.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsra.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsra_d(v2i64 _1, v2i64 _2) { return __lsx_vsra_d(_1, _2); } // CHECK-LABEL: @vsrai_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrai.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrai.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vsrai_b(v16i8 _1) { return __lsx_vsrai_b(_1, 1); } // CHECK-LABEL: @vsrai_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrai.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrai.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vsrai_h(v8i16 _1) { return __lsx_vsrai_h(_1, 1); } // CHECK-LABEL: @vsrai_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrai.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrai.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vsrai_w(v4i32 _1) { return __lsx_vsrai_w(_1, 1); } // CHECK-LABEL: @vsrai_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrai.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrai.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vsrai_d(v2i64 _1) { return __lsx_vsrai_d(_1, 1); } // CHECK-LABEL: @vsrar_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrar.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> 
@llvm.loongarch.lsx.vsrar.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsrar_b(v16i8 _1, v16i8 _2) { return __lsx_vsrar_b(_1, _2); } // CHECK-LABEL: @vsrar_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrar.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrar.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsrar_h(v8i16 _1, v8i16 _2) { return __lsx_vsrar_h(_1, _2); } // CHECK-LABEL: @vsrar_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrar.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrar.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsrar_w(v4i32 _1, v4i32 _2) { return __lsx_vsrar_w(_1, _2); } // CHECK-LABEL: @vsrar_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrar.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrar.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsrar_d(v2i64 _1, v2i64 _2) { return __lsx_vsrar_d(_1, _2); } // CHECK-LABEL: @vsrari_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrari.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrari.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vsrari_b(v16i8 _1) { return __lsx_vsrari_b(_1, 1); } // CHECK-LABEL: @vsrari_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrari.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrari.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vsrari_h(v8i16 _1) { return __lsx_vsrari_h(_1, 1); } // CHECK-LABEL: @vsrari_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrari.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// 
CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrari.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vsrari_w(v4i32 _1) { return __lsx_vsrari_w(_1, 1); } // CHECK-LABEL: @vsrari_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrari.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrari.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vsrari_d(v2i64 _1) { return __lsx_vsrari_d(_1, 1); } // CHECK-LABEL: @vsrl_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrl.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrl.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsrl_b(v16i8 _1, v16i8 _2) { return __lsx_vsrl_b(_1, _2); } // CHECK-LABEL: @vsrl_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrl.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrl.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsrl_h(v8i16 _1, v8i16 _2) { return __lsx_vsrl_h(_1, _2); } // CHECK-LABEL: @vsrl_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrl.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrl.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsrl_w(v4i32 _1, v4i32 _2) { return __lsx_vsrl_w(_1, _2); } // CHECK-LABEL: @vsrl_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrl.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrl.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsrl_d(v2i64 _1, v2i64 _2) { return __lsx_vsrl_d(_1, _2); } // CHECK-LABEL: @vsrli_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrli.b(<16 x i8> [[_1:%.*]], i32 1) -// 
CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrli.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vsrli_b(v16i8 _1) { return __lsx_vsrli_b(_1, 1); } // CHECK-LABEL: @vsrli_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrli.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrli.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vsrli_h(v8i16 _1) { return __lsx_vsrli_h(_1, 1); } // CHECK-LABEL: @vsrli_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrli.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrli.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vsrli_w(v4i32 _1) { return __lsx_vsrli_w(_1, 1); } // CHECK-LABEL: @vsrli_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrli.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrli.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vsrli_d(v2i64 _1) { return __lsx_vsrli_d(_1, 1); } // CHECK-LABEL: @vsrlr_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlr.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlr.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsrlr_b(v16i8 _1, v16i8 _2) { return __lsx_vsrlr_b(_1, _2); } // CHECK-LABEL: @vsrlr_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlr.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlr.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsrlr_h(v8i16 _1, v8i16 _2) { return __lsx_vsrlr_h(_1, _2); } // CHECK-LABEL: @vsrlr_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlr.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 
[[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlr.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsrlr_w(v4i32 _1, v4i32 _2) { return __lsx_vsrlr_w(_1, _2); } // CHECK-LABEL: @vsrlr_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlr.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlr.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsrlr_d(v2i64 _1, v2i64 _2) { return __lsx_vsrlr_d(_1, _2); } // CHECK-LABEL: @vsrlri_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlri.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlri.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vsrlri_b(v16i8 _1) { return __lsx_vsrlri_b(_1, 1); } // CHECK-LABEL: @vsrlri_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlri.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlri.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vsrlri_h(v8i16 _1) { return __lsx_vsrlri_h(_1, 1); } // CHECK-LABEL: @vsrlri_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlri.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlri.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vsrlri_w(v4i32 _1) { return __lsx_vsrlri_w(_1, 1); } // CHECK-LABEL: @vsrlri_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlri.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlri.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vsrlri_d(v2i64 _1) { return __lsx_vsrlri_d(_1, 1); } // CHECK-LABEL: @vbitclr_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitclr.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 
[[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitclr.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vbitclr_b(v16u8 _1, v16u8 _2) { return __lsx_vbitclr_b(_1, _2); } // CHECK-LABEL: @vbitclr_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitclr.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitclr.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vbitclr_h(v8u16 _1, v8u16 _2) { return __lsx_vbitclr_h(_1, _2); } // CHECK-LABEL: @vbitclr_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitclr.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitclr.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vbitclr_w(v4u32 _1, v4u32 _2) { return __lsx_vbitclr_w(_1, _2); } // CHECK-LABEL: @vbitclr_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitclr.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitclr.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vbitclr_d(v2u64 _1, v2u64 _2) { return __lsx_vbitclr_d(_1, _2); } // CHECK-LABEL: @vbitclri_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitclri.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitclri.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16u8 vbitclri_b(v16u8 _1) { return __lsx_vbitclri_b(_1, 1); } // CHECK-LABEL: @vbitclri_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitclri.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitclri.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8u16 vbitclri_h(v8u16 _1) { return __lsx_vbitclri_h(_1, 1); } // CHECK-LABEL: @vbitclri_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitclri.w(<4 x i32> 
[[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitclri.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4u32 vbitclri_w(v4u32 _1) { return __lsx_vbitclri_w(_1, 1); } // CHECK-LABEL: @vbitclri_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitclri.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitclri.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2u64 vbitclri_d(v2u64 _1) { return __lsx_vbitclri_d(_1, 1); } // CHECK-LABEL: @vbitset_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitset.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitset.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vbitset_b(v16u8 _1, v16u8 _2) { return __lsx_vbitset_b(_1, _2); } // CHECK-LABEL: @vbitset_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitset.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitset.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vbitset_h(v8u16 _1, v8u16 _2) { return __lsx_vbitset_h(_1, _2); } // CHECK-LABEL: @vbitset_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitset.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitset.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vbitset_w(v4u32 _1, v4u32 _2) { return __lsx_vbitset_w(_1, _2); } // CHECK-LABEL: @vbitset_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitset.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitset.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vbitset_d(v2u64 _1, 
v2u64 _2) { return __lsx_vbitset_d(_1, _2); } // CHECK-LABEL: @vbitseti_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitseti.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitseti.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16u8 vbitseti_b(v16u8 _1) { return __lsx_vbitseti_b(_1, 1); } // CHECK-LABEL: @vbitseti_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitseti.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitseti.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8u16 vbitseti_h(v8u16 _1) { return __lsx_vbitseti_h(_1, 1); } // CHECK-LABEL: @vbitseti_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitseti.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitseti.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4u32 vbitseti_w(v4u32 _1) { return __lsx_vbitseti_w(_1, 1); } // CHECK-LABEL: @vbitseti_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitseti.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitseti.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2u64 vbitseti_d(v2u64 _1) { return __lsx_vbitseti_d(_1, 1); } // CHECK-LABEL: @vbitrev_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitrev.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitrev.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vbitrev_b(v16u8 _1, v16u8 _2) { return __lsx_vbitrev_b(_1, _2); } // CHECK-LABEL: @vbitrev_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitrev.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitrev.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vbitrev_h(v8u16 _1, v8u16 _2) { return 
__lsx_vbitrev_h(_1, _2); } // CHECK-LABEL: @vbitrev_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitrev.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitrev.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vbitrev_w(v4u32 _1, v4u32 _2) { return __lsx_vbitrev_w(_1, _2); } // CHECK-LABEL: @vbitrev_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitrev.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitrev.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vbitrev_d(v2u64 _1, v2u64 _2) { return __lsx_vbitrev_d(_1, _2); } // CHECK-LABEL: @vbitrevi_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitrevi.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitrevi.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16u8 vbitrevi_b(v16u8 _1) { return __lsx_vbitrevi_b(_1, 1); } // CHECK-LABEL: @vbitrevi_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitrevi.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitrevi.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8u16 vbitrevi_h(v8u16 _1) { return __lsx_vbitrevi_h(_1, 1); } // CHECK-LABEL: @vbitrevi_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitrevi.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitrevi.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4u32 vbitrevi_w(v4u32 _1) { return __lsx_vbitrevi_w(_1, 1); } // CHECK-LABEL: @vbitrevi_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitrevi.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitrevi.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2u64 vbitrevi_d(v2u64 _1) { return __lsx_vbitrevi_d(_1, 1); } // 
CHECK-LABEL: @vadd_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vadd.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vadd.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vadd_b(v16i8 _1, v16i8 _2) { return __lsx_vadd_b(_1, _2); } // CHECK-LABEL: @vadd_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vadd.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vadd.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vadd_h(v8i16 _1, v8i16 _2) { return __lsx_vadd_h(_1, _2); } // CHECK-LABEL: @vadd_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vadd.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vadd.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vadd_w(v4i32 _1, v4i32 _2) { return __lsx_vadd_w(_1, _2); } // CHECK-LABEL: @vadd_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vadd.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vadd.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vadd_d(v2i64 _1, v2i64 _2) { return __lsx_vadd_d(_1, _2); } // CHECK-LABEL: @vaddi_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vaddi.bu(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vaddi.bu(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vaddi_bu(v16i8 _1) { return __lsx_vaddi_bu(_1, 1); } // CHECK-LABEL: @vaddi_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddi.hu(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddi.hu(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] 
to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vaddi_hu(v8i16 _1) { return __lsx_vaddi_hu(_1, 1); } // CHECK-LABEL: @vaddi_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddi.wu(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddi.wu(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vaddi_wu(v4i32 _1) { return __lsx_vaddi_wu(_1, 1); } // CHECK-LABEL: @vaddi_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddi.du(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddi.du(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vaddi_du(v2i64 _1) { return __lsx_vaddi_du(_1, 1); } // CHECK-LABEL: @vsub_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsub.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsub.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsub_b(v16i8 _1, v16i8 _2) { return __lsx_vsub_b(_1, _2); } // CHECK-LABEL: @vsub_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsub.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsub.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsub_h(v8i16 _1, v8i16 _2) { return __lsx_vsub_h(_1, _2); } // CHECK-LABEL: @vsub_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsub.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsub.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsub_w(v4i32 _1, v4i32 _2) { return __lsx_vsub_w(_1, _2); } // CHECK-LABEL: @vsub_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsub.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> 
@llvm.loongarch.lsx.vsub.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsub_d(v2i64 _1, v2i64 _2) { return __lsx_vsub_d(_1, _2); } // CHECK-LABEL: @vsubi_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsubi.bu(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsubi.bu(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vsubi_bu(v16i8 _1) { return __lsx_vsubi_bu(_1, 1); } // CHECK-LABEL: @vsubi_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubi.hu(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubi.hu(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vsubi_hu(v8i16 _1) { return __lsx_vsubi_hu(_1, 1); } // CHECK-LABEL: @vsubi_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubi.wu(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubi.wu(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vsubi_wu(v4i32 _1) { return __lsx_vsubi_wu(_1, 1); } // CHECK-LABEL: @vsubi_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubi.du(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubi.du(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vsubi_du(v2i64 _1) { return __lsx_vsubi_du(_1, 1); } // CHECK-LABEL: @vmax_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmax.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmax.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vmax_b(v16i8 _1, v16i8 _2) { return __lsx_vmax_b(_1, _2); } // CHECK-LABEL: @vmax_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmax.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmax.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = 
bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vmax_h(v8i16 _1, v8i16 _2) { return __lsx_vmax_h(_1, _2); } // CHECK-LABEL: @vmax_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmax.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmax.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vmax_w(v4i32 _1, v4i32 _2) { return __lsx_vmax_w(_1, _2); } // CHECK-LABEL: @vmax_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmax.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmax.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmax_d(v2i64 _1, v2i64 _2) { return __lsx_vmax_d(_1, _2); } // CHECK-LABEL: @vmaxi_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmaxi.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmaxi.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vmaxi_b(v16i8 _1) { return __lsx_vmaxi_b(_1, 1); } // CHECK-LABEL: @vmaxi_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaxi.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaxi.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vmaxi_h(v8i16 _1) { return __lsx_vmaxi_h(_1, 1); } // CHECK-LABEL: @vmaxi_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaxi.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaxi.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vmaxi_w(v4i32 _1) { return __lsx_vmaxi_w(_1, 1); } // CHECK-LABEL: @vmaxi_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaxi.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaxi.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vmaxi_d(v2i64 _1) { return 
__lsx_vmaxi_d(_1, 1); } // CHECK-LABEL: @vmax_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmax.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmax.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vmax_bu(v16u8 _1, v16u8 _2) { return __lsx_vmax_bu(_1, _2); } // CHECK-LABEL: @vmax_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmax.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmax.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vmax_hu(v8u16 _1, v8u16 _2) { return __lsx_vmax_hu(_1, _2); } // CHECK-LABEL: @vmax_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmax.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmax.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vmax_wu(v4u32 _1, v4u32 _2) { return __lsx_vmax_wu(_1, _2); } // CHECK-LABEL: @vmax_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmax.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmax.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vmax_du(v2u64 _1, v2u64 _2) { return __lsx_vmax_du(_1, _2); } // CHECK-LABEL: @vmaxi_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmaxi.bu(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmaxi.bu(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16u8 vmaxi_bu(v16u8 _1) { return __lsx_vmaxi_bu(_1, 1); } // CHECK-LABEL: @vmaxi_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaxi.hu(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaxi.hu(<8 x i16> [[TMP0]], i32 1) +// 
CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8u16 vmaxi_hu(v8u16 _1) { return __lsx_vmaxi_hu(_1, 1); } // CHECK-LABEL: @vmaxi_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaxi.wu(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaxi.wu(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4u32 vmaxi_wu(v4u32 _1) { return __lsx_vmaxi_wu(_1, 1); } // CHECK-LABEL: @vmaxi_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaxi.du(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaxi.du(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2u64 vmaxi_du(v2u64 _1) { return __lsx_vmaxi_du(_1, 1); } // CHECK-LABEL: @vmin_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmin.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmin.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vmin_b(v16i8 _1, v16i8 _2) { return __lsx_vmin_b(_1, _2); } // CHECK-LABEL: @vmin_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmin.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmin.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vmin_h(v8i16 _1, v8i16 _2) { return __lsx_vmin_h(_1, _2); } // CHECK-LABEL: @vmin_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmin.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmin.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vmin_w(v4i32 _1, v4i32 _2) { return __lsx_vmin_w(_1, _2); } // CHECK-LABEL: @vmin_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmin.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: 
[[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmin.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmin_d(v2i64 _1, v2i64 _2) { return __lsx_vmin_d(_1, _2); } // CHECK-LABEL: @vmini_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmini.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmini.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vmini_b(v16i8 _1) { return __lsx_vmini_b(_1, 1); } // CHECK-LABEL: @vmini_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmini.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmini.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vmini_h(v8i16 _1) { return __lsx_vmini_h(_1, 1); } // CHECK-LABEL: @vmini_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmini.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmini.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vmini_w(v4i32 _1) { return __lsx_vmini_w(_1, 1); } // CHECK-LABEL: @vmini_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmini.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmini.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vmini_d(v2i64 _1) { return __lsx_vmini_d(_1, 1); } // CHECK-LABEL: @vmin_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmin.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmin.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vmin_bu(v16u8 _1, v16u8 _2) { return __lsx_vmin_bu(_1, _2); } // CHECK-LABEL: @vmin_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmin.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmin.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// 
CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vmin_hu(v8u16 _1, v8u16 _2) { return __lsx_vmin_hu(_1, _2); } // CHECK-LABEL: @vmin_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmin.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmin.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vmin_wu(v4u32 _1, v4u32 _2) { return __lsx_vmin_wu(_1, _2); } // CHECK-LABEL: @vmin_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmin.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmin.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vmin_du(v2u64 _1, v2u64 _2) { return __lsx_vmin_du(_1, _2); } // CHECK-LABEL: @vmini_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmini.bu(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmini.bu(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16u8 vmini_bu(v16u8 _1) { return __lsx_vmini_bu(_1, 1); } // CHECK-LABEL: @vmini_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmini.hu(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmini.hu(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8u16 vmini_hu(v8u16 _1) { return __lsx_vmini_hu(_1, 1); } // CHECK-LABEL: @vmini_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmini.wu(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmini.wu(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4u32 vmini_wu(v4u32 _1) { return __lsx_vmini_wu(_1, 1); } // CHECK-LABEL: @vmini_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmini.du(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmini.du(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret 
i128 [[TMP2]] // v2u64 vmini_du(v2u64 _1) { return __lsx_vmini_du(_1, 1); } // CHECK-LABEL: @vseq_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vseq.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vseq.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vseq_b(v16i8 _1, v16i8 _2) { return __lsx_vseq_b(_1, _2); } // CHECK-LABEL: @vseq_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vseq.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vseq.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vseq_h(v8i16 _1, v8i16 _2) { return __lsx_vseq_h(_1, _2); } // CHECK-LABEL: @vseq_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vseq.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vseq.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vseq_w(v4i32 _1, v4i32 _2) { return __lsx_vseq_w(_1, _2); } // CHECK-LABEL: @vseq_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vseq.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vseq.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vseq_d(v2i64 _1, v2i64 _2) { return __lsx_vseq_d(_1, _2); } // CHECK-LABEL: @vseqi_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vseqi.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vseqi.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vseqi_b(v16i8 _1) { return __lsx_vseqi_b(_1, 1); } // CHECK-LABEL: @vseqi_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vseqi.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vseqi.h(<8 x i16> 
[[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vseqi_h(v8i16 _1) { return __lsx_vseqi_h(_1, 1); } // CHECK-LABEL: @vseqi_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vseqi.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vseqi.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vseqi_w(v4i32 _1) { return __lsx_vseqi_w(_1, 1); } // CHECK-LABEL: @vseqi_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vseqi.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vseqi.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vseqi_d(v2i64 _1) { return __lsx_vseqi_d(_1, 1); } // CHECK-LABEL: @vslti_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslti.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslti.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vslti_b(v16i8 _1) { return __lsx_vslti_b(_1, 1); } // CHECK-LABEL: @vslt_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslt.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslt.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vslt_b(v16i8 _1, v16i8 _2) { return __lsx_vslt_b(_1, _2); } // CHECK-LABEL: @vslt_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslt.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslt.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vslt_h(v8i16 _1, v8i16 _2) { return __lsx_vslt_h(_1, _2); } // CHECK-LABEL: @vslt_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslt.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslt.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// 
CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vslt_w(v4i32 _1, v4i32 _2) { return __lsx_vslt_w(_1, _2); } // CHECK-LABEL: @vslt_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslt.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslt.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vslt_d(v2i64 _1, v2i64 _2) { return __lsx_vslt_d(_1, _2); } // CHECK-LABEL: @vslti_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslti.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslti.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vslti_h(v8i16 _1) { return __lsx_vslti_h(_1, 1); } // CHECK-LABEL: @vslti_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslti.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslti.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vslti_w(v4i32 _1) { return __lsx_vslti_w(_1, 1); } // CHECK-LABEL: @vslti_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslti.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslti.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vslti_d(v2i64 _1) { return __lsx_vslti_d(_1, 1); } // CHECK-LABEL: @vslt_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslt.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslt.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vslt_bu(v16u8 _1, v16u8 _2) { return __lsx_vslt_bu(_1, _2); } // CHECK-LABEL: @vslt_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslt.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslt.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// 
CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vslt_hu(v8u16 _1, v8u16 _2) { return __lsx_vslt_hu(_1, _2); } // CHECK-LABEL: @vslt_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslt.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslt.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vslt_wu(v4u32 _1, v4u32 _2) { return __lsx_vslt_wu(_1, _2); } // CHECK-LABEL: @vslt_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslt.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslt.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vslt_du(v2u64 _1, v2u64 _2) { return __lsx_vslt_du(_1, _2); } // CHECK-LABEL: @vslti_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslti.bu(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslti.bu(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vslti_bu(v16u8 _1) { return __lsx_vslti_bu(_1, 1); } // CHECK-LABEL: @vslti_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslti.hu(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslti.hu(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vslti_hu(v8u16 _1) { return __lsx_vslti_hu(_1, 1); } // CHECK-LABEL: @vslti_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslti.wu(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslti.wu(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vslti_wu(v4u32 _1) { return __lsx_vslti_wu(_1, 1); } // CHECK-LABEL: @vslti_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslti.du(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslti.du(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret 
i128 [[TMP2]] // v2i64 vslti_du(v2u64 _1) { return __lsx_vslti_du(_1, 1); } // CHECK-LABEL: @vsle_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsle.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsle.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsle_b(v16i8 _1, v16i8 _2) { return __lsx_vsle_b(_1, _2); } // CHECK-LABEL: @vsle_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsle.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsle.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsle_h(v8i16 _1, v8i16 _2) { return __lsx_vsle_h(_1, _2); } // CHECK-LABEL: @vsle_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsle.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsle.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsle_w(v4i32 _1, v4i32 _2) { return __lsx_vsle_w(_1, _2); } // CHECK-LABEL: @vsle_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsle.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsle.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsle_d(v2i64 _1, v2i64 _2) { return __lsx_vsle_d(_1, _2); } // CHECK-LABEL: @vslei_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslei.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslei.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vslei_b(v16i8 _1) { return __lsx_vslei_b(_1, 1); } // CHECK-LABEL: @vslei_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslei.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslei.h(<8 x i16> 
[[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vslei_h(v8i16 _1) { return __lsx_vslei_h(_1, 1); } // CHECK-LABEL: @vslei_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslei.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslei.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vslei_w(v4i32 _1) { return __lsx_vslei_w(_1, 1); } // CHECK-LABEL: @vslei_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslei.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslei.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vslei_d(v2i64 _1) { return __lsx_vslei_d(_1, 1); } // CHECK-LABEL: @vsle_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsle.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsle.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsle_bu(v16u8 _1, v16u8 _2) { return __lsx_vsle_bu(_1, _2); } // CHECK-LABEL: @vsle_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsle.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsle.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsle_hu(v8u16 _1, v8u16 _2) { return __lsx_vsle_hu(_1, _2); } // CHECK-LABEL: @vsle_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsle.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsle.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsle_wu(v4u32 _1, v4u32 _2) { return __lsx_vsle_wu(_1, _2); } // CHECK-LABEL: @vsle_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsle.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x 
i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsle.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsle_du(v2u64 _1, v2u64 _2) { return __lsx_vsle_du(_1, _2); } // CHECK-LABEL: @vslei_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslei.bu(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslei.bu(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vslei_bu(v16u8 _1) { return __lsx_vslei_bu(_1, 1); } // CHECK-LABEL: @vslei_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslei.hu(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslei.hu(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vslei_hu(v8u16 _1) { return __lsx_vslei_hu(_1, 1); } // CHECK-LABEL: @vslei_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslei.wu(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslei.wu(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vslei_wu(v4u32 _1) { return __lsx_vslei_wu(_1, 1); } // CHECK-LABEL: @vslei_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslei.du(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslei.du(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vslei_du(v2u64 _1) { return __lsx_vslei_du(_1, 1); } // CHECK-LABEL: @vsat_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsat.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsat.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vsat_b(v16i8 _1) { return __lsx_vsat_b(_1, 1); } // CHECK-LABEL: @vsat_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsat.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsat.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vsat_h(v8i16 _1) { return __lsx_vsat_h(_1, 1); } // CHECK-LABEL: @vsat_w( // 
CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsat.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsat.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vsat_w(v4i32 _1) { return __lsx_vsat_w(_1, 1); } // CHECK-LABEL: @vsat_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsat.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsat.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vsat_d(v2i64 _1) { return __lsx_vsat_d(_1, 1); } // CHECK-LABEL: @vsat_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsat.bu(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsat.bu(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16u8 vsat_bu(v16u8 _1) { return __lsx_vsat_bu(_1, 1); } // CHECK-LABEL: @vsat_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsat.hu(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsat.hu(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8u16 vsat_hu(v8u16 _1) { return __lsx_vsat_hu(_1, 1); } // CHECK-LABEL: @vsat_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsat.wu(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsat.wu(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4u32 vsat_wu(v4u32 _1) { return __lsx_vsat_wu(_1, 1); } // CHECK-LABEL: @vsat_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsat.du(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsat.du(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2u64 vsat_du(v2u64 _1) { return __lsx_vsat_du(_1, 1); } // CHECK-LABEL: @vadda_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vadda.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: 
[[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vadda.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vadda_b(v16i8 _1, v16i8 _2) { return __lsx_vadda_b(_1, _2); } // CHECK-LABEL: @vadda_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vadda.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vadda.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vadda_h(v8i16 _1, v8i16 _2) { return __lsx_vadda_h(_1, _2); } // CHECK-LABEL: @vadda_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vadda.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vadda.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vadda_w(v4i32 _1, v4i32 _2) { return __lsx_vadda_w(_1, _2); } // CHECK-LABEL: @vadda_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vadda.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vadda.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vadda_d(v2i64 _1, v2i64 _2) { return __lsx_vadda_d(_1, _2); } // CHECK-LABEL: @vsadd_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsadd.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsadd.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsadd_b(v16i8 _1, v16i8 _2) { return __lsx_vsadd_b(_1, _2); } // CHECK-LABEL: @vsadd_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsadd.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsadd.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsadd_h(v8i16 _1, v8i16 _2) { return __lsx_vsadd_h(_1, _2); } // CHECK-LABEL: @vsadd_w( // 
CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsadd.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsadd.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsadd_w(v4i32 _1, v4i32 _2) { return __lsx_vsadd_w(_1, _2); } // CHECK-LABEL: @vsadd_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsadd.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsadd.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsadd_d(v2i64 _1, v2i64 _2) { return __lsx_vsadd_d(_1, _2); } // CHECK-LABEL: @vsadd_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsadd.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsadd.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vsadd_bu(v16u8 _1, v16u8 _2) { return __lsx_vsadd_bu(_1, _2); } // CHECK-LABEL: @vsadd_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsadd.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsadd.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vsadd_hu(v8u16 _1, v8u16 _2) { return __lsx_vsadd_hu(_1, _2); } // CHECK-LABEL: @vsadd_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsadd.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsadd.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vsadd_wu(v4u32 _1, v4u32 _2) { return __lsx_vsadd_wu(_1, _2); } // CHECK-LABEL: @vsadd_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsadd.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = 
bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsadd.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vsadd_du(v2u64 _1, v2u64 _2) { return __lsx_vsadd_du(_1, _2); } // CHECK-LABEL: @vavg_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavg.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavg.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vavg_b(v16i8 _1, v16i8 _2) { return __lsx_vavg_b(_1, _2); } // CHECK-LABEL: @vavg_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavg.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavg.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vavg_h(v8i16 _1, v8i16 _2) { return __lsx_vavg_h(_1, _2); } // CHECK-LABEL: @vavg_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavg.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavg.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vavg_w(v4i32 _1, v4i32 _2) { return __lsx_vavg_w(_1, _2); } // CHECK-LABEL: @vavg_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavg.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavg.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vavg_d(v2i64 _1, v2i64 _2) { return __lsx_vavg_d(_1, _2); } // CHECK-LABEL: @vavg_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavg.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavg.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vavg_bu(v16u8 _1, v16u8 _2) { return 
__lsx_vavg_bu(_1, _2); } // CHECK-LABEL: @vavg_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavg.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavg.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vavg_hu(v8u16 _1, v8u16 _2) { return __lsx_vavg_hu(_1, _2); } // CHECK-LABEL: @vavg_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavg.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavg.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vavg_wu(v4u32 _1, v4u32 _2) { return __lsx_vavg_wu(_1, _2); } // CHECK-LABEL: @vavg_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavg.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavg.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vavg_du(v2u64 _1, v2u64 _2) { return __lsx_vavg_du(_1, _2); } // CHECK-LABEL: @vavgr_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavgr.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavgr.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vavgr_b(v16i8 _1, v16i8 _2) { return __lsx_vavgr_b(_1, _2); } // CHECK-LABEL: @vavgr_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavgr.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavgr.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vavgr_h(v8i16 _1, v8i16 _2) { return __lsx_vavgr_h(_1, _2); } // CHECK-LABEL: @vavgr_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavgr.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x 
i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavgr.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vavgr_w(v4i32 _1, v4i32 _2) { return __lsx_vavgr_w(_1, _2); } // CHECK-LABEL: @vavgr_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavgr.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavgr.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vavgr_d(v2i64 _1, v2i64 _2) { return __lsx_vavgr_d(_1, _2); } // CHECK-LABEL: @vavgr_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavgr.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavgr.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vavgr_bu(v16u8 _1, v16u8 _2) { return __lsx_vavgr_bu(_1, _2); } // CHECK-LABEL: @vavgr_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavgr.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavgr.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vavgr_hu(v8u16 _1, v8u16 _2) { return __lsx_vavgr_hu(_1, _2); } // CHECK-LABEL: @vavgr_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavgr.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavgr.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vavgr_wu(v4u32 _1, v4u32 _2) { return __lsx_vavgr_wu(_1, _2); } // CHECK-LABEL: @vavgr_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavgr.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavgr.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret 
i128 [[TMP3]] // v2u64 vavgr_du(v2u64 _1, v2u64 _2) { return __lsx_vavgr_du(_1, _2); } // CHECK-LABEL: @vssub_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssub.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssub.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vssub_b(v16i8 _1, v16i8 _2) { return __lsx_vssub_b(_1, _2); } // CHECK-LABEL: @vssub_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssub.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssub.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vssub_h(v8i16 _1, v8i16 _2) { return __lsx_vssub_h(_1, _2); } // CHECK-LABEL: @vssub_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssub.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssub.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vssub_w(v4i32 _1, v4i32 _2) { return __lsx_vssub_w(_1, _2); } // CHECK-LABEL: @vssub_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssub.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssub.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vssub_d(v2i64 _1, v2i64 _2) { return __lsx_vssub_d(_1, _2); } // CHECK-LABEL: @vssub_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssub.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssub.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vssub_bu(v16u8 _1, v16u8 _2) { return __lsx_vssub_bu(_1, _2); } // CHECK-LABEL: @vssub_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssub.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] 
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssub.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vssub_hu(v8u16 _1, v8u16 _2) { return __lsx_vssub_hu(_1, _2); } // CHECK-LABEL: @vssub_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssub.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssub.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vssub_wu(v4u32 _1, v4u32 _2) { return __lsx_vssub_wu(_1, _2); } // CHECK-LABEL: @vssub_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssub.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssub.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vssub_du(v2u64 _1, v2u64 _2) { return __lsx_vssub_du(_1, _2); } // CHECK-LABEL: @vabsd_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vabsd.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vabsd.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vabsd_b(v16i8 _1, v16i8 _2) { return __lsx_vabsd_b(_1, _2); } // CHECK-LABEL: @vabsd_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vabsd.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vabsd.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vabsd_h(v8i16 _1, v8i16 _2) { return __lsx_vabsd_h(_1, _2); } // CHECK-LABEL: @vabsd_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vabsd.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vabsd.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: 
[[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vabsd_w(v4i32 _1, v4i32 _2) { return __lsx_vabsd_w(_1, _2); } // CHECK-LABEL: @vabsd_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vabsd.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vabsd.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vabsd_d(v2i64 _1, v2i64 _2) { return __lsx_vabsd_d(_1, _2); } // CHECK-LABEL: @vabsd_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vabsd.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vabsd.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vabsd_bu(v16u8 _1, v16u8 _2) { return __lsx_vabsd_bu(_1, _2); } // CHECK-LABEL: @vabsd_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vabsd.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vabsd.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vabsd_hu(v8u16 _1, v8u16 _2) { return __lsx_vabsd_hu(_1, _2); } // CHECK-LABEL: @vabsd_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vabsd.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vabsd.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vabsd_wu(v4u32 _1, v4u32 _2) { return __lsx_vabsd_wu(_1, _2); } // CHECK-LABEL: @vabsd_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vabsd.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vabsd.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vabsd_du(v2u64 _1, v2u64 _2) { return __lsx_vabsd_du(_1, _2); } // CHECK-LABEL: @vmul_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmul.b(<16 
x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmul.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vmul_b(v16i8 _1, v16i8 _2) { return __lsx_vmul_b(_1, _2); } // CHECK-LABEL: @vmul_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmul.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmul.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vmul_h(v8i16 _1, v8i16 _2) { return __lsx_vmul_h(_1, _2); } // CHECK-LABEL: @vmul_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmul.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmul.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vmul_w(v4i32 _1, v4i32 _2) { return __lsx_vmul_w(_1, _2); } // CHECK-LABEL: @vmul_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmul.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmul.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmul_d(v2i64 _1, v2i64 _2) { return __lsx_vmul_d(_1, _2); } // CHECK-LABEL: @vmadd_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmadd.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmadd.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v16i8 vmadd_b(v16i8 _1, v16i8 _2, v16i8 _3) { return __lsx_vmadd_b(_1, _2, _3); } // CHECK-LABEL: @vmadd_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmadd.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: 
[[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmadd.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v8i16 vmadd_h(v8i16 _1, v8i16 _2, v8i16 _3) { return __lsx_vmadd_h(_1, _2, _3); } // CHECK-LABEL: @vmadd_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmadd.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmadd.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v4i32 vmadd_w(v4i32 _1, v4i32 _2, v4i32 _3) { return __lsx_vmadd_w(_1, _2, _3); } // CHECK-LABEL: @vmadd_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmadd.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmadd.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2i64 vmadd_d(v2i64 _1, v2i64 _2, v2i64 _3) { return __lsx_vmadd_d(_1, _2, _3); } // CHECK-LABEL: @vmsub_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmsub.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmsub.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v16i8 vmsub_b(v16i8 _1, v16i8 _2, v16i8 _3) { return __lsx_vmsub_b(_1, _2, _3); } // CHECK-LABEL: @vmsub_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmsub.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmsub.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v8i16 vmsub_h(v8i16 _1, v8i16 _2, v8i16 _3) { return 
__lsx_vmsub_h(_1, _2, _3); } // CHECK-LABEL: @vmsub_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmsub.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmsub.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v4i32 vmsub_w(v4i32 _1, v4i32 _2, v4i32 _3) { return __lsx_vmsub_w(_1, _2, _3); } // CHECK-LABEL: @vmsub_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmsub.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmsub.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2i64 vmsub_d(v2i64 _1, v2i64 _2, v2i64 _3) { return __lsx_vmsub_d(_1, _2, _3); } // CHECK-LABEL: @vdiv_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vdiv.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vdiv.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vdiv_b(v16i8 _1, v16i8 _2) { return __lsx_vdiv_b(_1, _2); } // CHECK-LABEL: @vdiv_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vdiv.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vdiv.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vdiv_h(v8i16 _1, v8i16 _2) { return __lsx_vdiv_h(_1, _2); } // CHECK-LABEL: @vdiv_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vdiv.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vdiv.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vdiv_w(v4i32 _1, v4i32 _2) { return __lsx_vdiv_w(_1, _2); } // CHECK-LABEL: @vdiv_d( // 
CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vdiv.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vdiv.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vdiv_d(v2i64 _1, v2i64 _2) { return __lsx_vdiv_d(_1, _2); } // CHECK-LABEL: @vdiv_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vdiv.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vdiv.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vdiv_bu(v16u8 _1, v16u8 _2) { return __lsx_vdiv_bu(_1, _2); } // CHECK-LABEL: @vdiv_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vdiv.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vdiv.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vdiv_hu(v8u16 _1, v8u16 _2) { return __lsx_vdiv_hu(_1, _2); } // CHECK-LABEL: @vdiv_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vdiv.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vdiv.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vdiv_wu(v4u32 _1, v4u32 _2) { return __lsx_vdiv_wu(_1, _2); } // CHECK-LABEL: @vdiv_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vdiv.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vdiv.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vdiv_du(v2u64 _1, v2u64 _2) { return __lsx_vdiv_du(_1, _2); } // CHECK-LABEL: @vhaddw_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhaddw.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 
[[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhaddw.h.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vhaddw_h_b(v16i8 _1, v16i8 _2) { return __lsx_vhaddw_h_b(_1, _2); } // CHECK-LABEL: @vhaddw_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhaddw.w.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhaddw.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vhaddw_w_h(v8i16 _1, v8i16 _2) { return __lsx_vhaddw_w_h(_1, _2); } // CHECK-LABEL: @vhaddw_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhaddw.d.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhaddw.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vhaddw_d_w(v4i32 _1, v4i32 _2) { return __lsx_vhaddw_d_w(_1, _2); } // CHECK-LABEL: @vhaddw_hu_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhaddw.hu.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhaddw.hu.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vhaddw_hu_bu(v16u8 _1, v16u8 _2) { return __lsx_vhaddw_hu_bu(_1, _2); } // CHECK-LABEL: @vhaddw_wu_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhaddw.wu.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhaddw.wu.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vhaddw_wu_hu(v8u16 _1, v8u16 _2) { return __lsx_vhaddw_wu_hu(_1, _2); } // CHECK-LABEL: @vhaddw_du_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhaddw.du.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhaddw.du.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> 
[[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vhaddw_du_wu(v4u32 _1, v4u32 _2) { return __lsx_vhaddw_du_wu(_1, _2); } // CHECK-LABEL: @vhsubw_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhsubw.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhsubw.h.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vhsubw_h_b(v16i8 _1, v16i8 _2) { return __lsx_vhsubw_h_b(_1, _2); } // CHECK-LABEL: @vhsubw_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhsubw.w.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhsubw.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vhsubw_w_h(v8i16 _1, v8i16 _2) { return __lsx_vhsubw_w_h(_1, _2); } // CHECK-LABEL: @vhsubw_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.d.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vhsubw_d_w(v4i32 _1, v4i32 _2) { return __lsx_vhsubw_d_w(_1, _2); } // CHECK-LABEL: @vhsubw_hu_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhsubw.hu.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhsubw.hu.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vhsubw_hu_bu(v16u8 _1, v16u8 _2) { return __lsx_vhsubw_hu_bu(_1, _2); } // CHECK-LABEL: @vhsubw_wu_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhsubw.wu.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhsubw.wu.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vhsubw_wu_hu(v8u16 _1, v8u16 _2) { return __lsx_vhsubw_wu_hu(_1, _2); } // CHECK-LABEL: @vhsubw_du_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: 
[[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.du.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.du.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vhsubw_du_wu(v4u32 _1, v4u32 _2) { return __lsx_vhsubw_du_wu(_1, _2); } // CHECK-LABEL: @vmod_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmod.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmod.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vmod_b(v16i8 _1, v16i8 _2) { return __lsx_vmod_b(_1, _2); } // CHECK-LABEL: @vmod_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmod.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmod.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vmod_h(v8i16 _1, v8i16 _2) { return __lsx_vmod_h(_1, _2); } // CHECK-LABEL: @vmod_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmod.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmod.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vmod_w(v4i32 _1, v4i32 _2) { return __lsx_vmod_w(_1, _2); } // CHECK-LABEL: @vmod_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmod.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmod.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmod_d(v2i64 _1, v2i64 _2) { return __lsx_vmod_d(_1, _2); } // CHECK-LABEL: @vmod_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmod.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// 
CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmod.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vmod_bu(v16u8 _1, v16u8 _2) { return __lsx_vmod_bu(_1, _2); } // CHECK-LABEL: @vmod_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmod.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmod.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vmod_hu(v8u16 _1, v8u16 _2) { return __lsx_vmod_hu(_1, _2); } // CHECK-LABEL: @vmod_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmod.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmod.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vmod_wu(v4u32 _1, v4u32 _2) { return __lsx_vmod_wu(_1, _2); } // CHECK-LABEL: @vmod_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmod.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmod.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vmod_du(v2u64 _1, v2u64 _2) { return __lsx_vmod_du(_1, _2); } // CHECK-LABEL: @vreplve_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vreplve.b(<16 x i8> [[_1:%.*]], i32 [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vreplve.b(<16 x i8> [[TMP0]], i32 [[_2:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vreplve_b(v16i8 _1, int _2) { return __lsx_vreplve_b(_1, _2); } // CHECK-LABEL: @vreplve_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vreplve.h(<8 x i16> [[_1:%.*]], i32 [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vreplve.h(<8 x i16> [[TMP0]], i32 [[_2:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vreplve_h(v8i16 _1, int _2) { return __lsx_vreplve_h(_1, _2); } // CHECK-LABEL: @vreplve_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vreplve.w(<4 x i32> [[_1:%.*]], i32 [[_2:%.*]]) -// 
CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vreplve.w(<4 x i32> [[TMP0]], i32 [[_2:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vreplve_w(v4i32 _1, int _2) { return __lsx_vreplve_w(_1, _2); } // CHECK-LABEL: @vreplve_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vreplve.d(<2 x i64> [[_1:%.*]], i32 [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vreplve.d(<2 x i64> [[TMP0]], i32 [[_2:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vreplve_d(v2i64 _1, int _2) { return __lsx_vreplve_d(_1, _2); } // CHECK-LABEL: @vreplvei_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vreplvei.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vreplvei.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vreplvei_b(v16i8 _1) { return __lsx_vreplvei_b(_1, 1); } // CHECK-LABEL: @vreplvei_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vreplvei.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vreplvei.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vreplvei_h(v8i16 _1) { return __lsx_vreplvei_h(_1, 1); } // CHECK-LABEL: @vreplvei_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vreplvei.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vreplvei.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vreplvei_w(v4i32 _1) { return __lsx_vreplvei_w(_1, 1); } // CHECK-LABEL: @vreplvei_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vreplvei.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vreplvei.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vreplvei_d(v2i64 _1) { return __lsx_vreplvei_d(_1, 1); } // CHECK-LABEL: @vpickev_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpickev.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// 
CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpickev.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vpickev_b(v16i8 _1, v16i8 _2) { return __lsx_vpickev_b(_1, _2); } // CHECK-LABEL: @vpickev_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpickev.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpickev.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vpickev_h(v8i16 _1, v8i16 _2) { return __lsx_vpickev_h(_1, _2); } // CHECK-LABEL: @vpickev_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpickev.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpickev.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vpickev_w(v4i32 _1, v4i32 _2) { return __lsx_vpickev_w(_1, _2); } // CHECK-LABEL: @vpickev_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpickev.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpickev.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vpickev_d(v2i64 _1, v2i64 _2) { return __lsx_vpickev_d(_1, _2); } // CHECK-LABEL: @vpickod_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpickod.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpickod.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vpickod_b(v16i8 _1, v16i8 _2) { return __lsx_vpickod_b(_1, _2); } // CHECK-LABEL: @vpickod_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpickod.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpickod.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vpickod_h(v8i16 _1, v8i16 
_2) { return __lsx_vpickod_h(_1, _2); } // CHECK-LABEL: @vpickod_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpickod.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpickod.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vpickod_w(v4i32 _1, v4i32 _2) { return __lsx_vpickod_w(_1, _2); } // CHECK-LABEL: @vpickod_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpickod.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpickod.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vpickod_d(v2i64 _1, v2i64 _2) { return __lsx_vpickod_d(_1, _2); } // CHECK-LABEL: @vilvh_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vilvh.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vilvh.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vilvh_b(v16i8 _1, v16i8 _2) { return __lsx_vilvh_b(_1, _2); } // CHECK-LABEL: @vilvh_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vilvh.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vilvh.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vilvh_h(v8i16 _1, v8i16 _2) { return __lsx_vilvh_h(_1, _2); } // CHECK-LABEL: @vilvh_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vilvh.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vilvh.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vilvh_w(v4i32 _1, v4i32 _2) { return __lsx_vilvh_w(_1, _2); } // CHECK-LABEL: @vilvh_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vilvh.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = 
bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vilvh.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vilvh_d(v2i64 _1, v2i64 _2) { return __lsx_vilvh_d(_1, _2); } // CHECK-LABEL: @vilvl_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vilvl.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vilvl.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vilvl_b(v16i8 _1, v16i8 _2) { return __lsx_vilvl_b(_1, _2); } // CHECK-LABEL: @vilvl_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vilvl.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vilvl.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vilvl_h(v8i16 _1, v8i16 _2) { return __lsx_vilvl_h(_1, _2); } // CHECK-LABEL: @vilvl_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vilvl.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vilvl.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vilvl_w(v4i32 _1, v4i32 _2) { return __lsx_vilvl_w(_1, _2); } // CHECK-LABEL: @vilvl_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vilvl.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vilvl.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vilvl_d(v2i64 _1, v2i64 _2) { return __lsx_vilvl_d(_1, _2); } // CHECK-LABEL: @vpackev_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpackev.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpackev.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to 
i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vpackev_b(v16i8 _1, v16i8 _2) { return __lsx_vpackev_b(_1, _2); } // CHECK-LABEL: @vpackev_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpackev.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpackev.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vpackev_h(v8i16 _1, v8i16 _2) { return __lsx_vpackev_h(_1, _2); } // CHECK-LABEL: @vpackev_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpackev.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpackev.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vpackev_w(v4i32 _1, v4i32 _2) { return __lsx_vpackev_w(_1, _2); } // CHECK-LABEL: @vpackev_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpackev.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpackev.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vpackev_d(v2i64 _1, v2i64 _2) { return __lsx_vpackev_d(_1, _2); } // CHECK-LABEL: @vpackod_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpackod.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpackod.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vpackod_b(v16i8 _1, v16i8 _2) { return __lsx_vpackod_b(_1, _2); } // CHECK-LABEL: @vpackod_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpackod.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpackod.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vpackod_h(v8i16 _1, v8i16 _2) { return __lsx_vpackod_h(_1, _2); } // CHECK-LABEL: @vpackod_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpackod.w(<4 x i32> 
[[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpackod.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vpackod_w(v4i32 _1, v4i32 _2) { return __lsx_vpackod_w(_1, _2); } // CHECK-LABEL: @vpackod_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpackod.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpackod.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vpackod_d(v2i64 _1, v2i64 _2) { return __lsx_vpackod_d(_1, _2); } // CHECK-LABEL: @vshuf_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vshuf.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vshuf.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v8i16 vshuf_h(v8i16 _1, v8i16 _2, v8i16 _3) { return __lsx_vshuf_h(_1, _2, _3); } // CHECK-LABEL: @vshuf_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vshuf.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vshuf.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v4i32 vshuf_w(v4i32 _1, v4i32 _2, v4i32 _3) { return __lsx_vshuf_w(_1, _2, _3); } // CHECK-LABEL: @vshuf_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vshuf.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vshuf.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2i64 vshuf_d(v2i64 _1, v2i64 _2, v2i64 _3) { return __lsx_vshuf_d(_1, _2, _3); } // CHECK-LABEL: @vand_v( // 
CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vand.v(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vand.v(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vand_v(v16u8 _1, v16u8 _2) { return __lsx_vand_v(_1, _2); } // CHECK-LABEL: @vandi_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vandi.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vandi.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16u8 vandi_b(v16u8 _1) { return __lsx_vandi_b(_1, 1); } // CHECK-LABEL: @vor_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vor.v(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vor.v(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vor_v(v16u8 _1, v16u8 _2) { return __lsx_vor_v(_1, _2); } // CHECK-LABEL: @vori_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vori.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vori.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16u8 vori_b(v16u8 _1) { return __lsx_vori_b(_1, 1); } // CHECK-LABEL: @vnor_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vnor.v(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vnor.v(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vnor_v(v16u8 _1, v16u8 _2) { return __lsx_vnor_v(_1, _2); } // CHECK-LABEL: @vnori_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vnori.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vnori.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16u8 vnori_b(v16u8 _1) { return __lsx_vnori_b(_1, 1); } // CHECK-LABEL: @vxor_v( // CHECK-NEXT: entry: 
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vxor.v(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vxor.v(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vxor_v(v16u8 _1, v16u8 _2) { return __lsx_vxor_v(_1, _2); } // CHECK-LABEL: @vxori_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vxori.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vxori.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16u8 vxori_b(v16u8 _1) { return __lsx_vxori_b(_1, 1); } // CHECK-LABEL: @vbitsel_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitsel.v(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitsel.v(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v16u8 vbitsel_v(v16u8 _1, v16u8 _2, v16u8 _3) { return __lsx_vbitsel_v(_1, _2, _3); } // CHECK-LABEL: @vbitseli_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitseli.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitseli.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vbitseli_b(v16u8 _1, v16u8 _2) { return __lsx_vbitseli_b(_1, _2, 1); } // CHECK-LABEL: @vshuf4i_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vshuf4i.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vshuf4i.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vshuf4i_b(v16i8 _1) { return __lsx_vshuf4i_b(_1, 1); } // CHECK-LABEL: @vshuf4i_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vshuf4i.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vshuf4i.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: 
[[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vshuf4i_h(v8i16 _1) { return __lsx_vshuf4i_h(_1, 1); } // CHECK-LABEL: @vshuf4i_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vshuf4i.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vshuf4i.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vshuf4i_w(v4i32 _1) { return __lsx_vshuf4i_w(_1, 1); } // CHECK-LABEL: @vreplgr2vr_b( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vreplgr2vr.b(i32 [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to i128 +// CHECK-NEXT: ret i128 [[TMP1]] // v16i8 vreplgr2vr_b(int _1) { return __lsx_vreplgr2vr_b(_1); } // CHECK-LABEL: @vreplgr2vr_h( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vreplgr2vr.h(i32 [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to i128 +// CHECK-NEXT: ret i128 [[TMP1]] // v8i16 vreplgr2vr_h(int _1) { return __lsx_vreplgr2vr_h(_1); } // CHECK-LABEL: @vreplgr2vr_w( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vreplgr2vr.w(i32 [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +// CHECK-NEXT: ret i128 [[TMP1]] // v4i32 vreplgr2vr_w(int _1) { return __lsx_vreplgr2vr_w(_1); } // CHECK-LABEL: @vreplgr2vr_d( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vreplgr2vr.d(i64 [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to i128 +// CHECK-NEXT: ret i128 [[TMP1]] // v2i64 vreplgr2vr_d(long _1) { return __lsx_vreplgr2vr_d(_1); } // CHECK-LABEL: @vpcnt_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpcnt.b(<16 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpcnt.b(<16 x i8> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vpcnt_b(v16i8 _1) { return __lsx_vpcnt_b(_1); } // CHECK-LABEL: @vpcnt_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpcnt.h(<8 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpcnt.h(<8 x i16> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vpcnt_h(v8i16 _1) { return __lsx_vpcnt_h(_1); } // CHECK-LABEL: @vpcnt_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpcnt.w(<4 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpcnt.w(<4 x i32> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = 
bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vpcnt_w(v4i32 _1) { return __lsx_vpcnt_w(_1); } // CHECK-LABEL: @vpcnt_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpcnt.d(<2 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpcnt.d(<2 x i64> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vpcnt_d(v2i64 _1) { return __lsx_vpcnt_d(_1); } // CHECK-LABEL: @vclo_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vclo.b(<16 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vclo.b(<16 x i8> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vclo_b(v16i8 _1) { return __lsx_vclo_b(_1); } // CHECK-LABEL: @vclo_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vclo.h(<8 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vclo.h(<8 x i16> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vclo_h(v8i16 _1) { return __lsx_vclo_h(_1); } // CHECK-LABEL: @vclo_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vclo.w(<4 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vclo.w(<4 x i32> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vclo_w(v4i32 _1) { return __lsx_vclo_w(_1); } // CHECK-LABEL: @vclo_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vclo.d(<2 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vclo.d(<2 x i64> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vclo_d(v2i64 _1) { return __lsx_vclo_d(_1); } // CHECK-LABEL: @vclz_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vclz.b(<16 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vclz.b(<16 x i8> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vclz_b(v16i8 _1) { return __lsx_vclz_b(_1); } // CHECK-LABEL: @vclz_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vclz.h(<8 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> 
@llvm.loongarch.lsx.vclz.h(<8 x i16> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vclz_h(v8i16 _1) { return __lsx_vclz_h(_1); } // CHECK-LABEL: @vclz_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vclz.w(<4 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vclz.w(<4 x i32> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vclz_w(v4i32 _1) { return __lsx_vclz_w(_1); } // CHECK-LABEL: @vclz_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vclz.d(<2 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vclz.d(<2 x i64> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vclz_d(v2i64 _1) { return __lsx_vclz_d(_1); } // CHECK-LABEL: @vpickve2gr_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: ret i32 [[TMP1]] // int vpickve2gr_b(v16i8 _1) { return __lsx_vpickve2gr_b(_1, 1); } // CHECK-LABEL: @vpickve2gr_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: ret i32 [[TMP1]] // int vpickve2gr_h(v8i16 _1) { return __lsx_vpickve2gr_h(_1, 1); } // CHECK-LABEL: @vpickve2gr_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: ret i32 [[TMP1]] // int vpickve2gr_w(v4i32 _1) { return __lsx_vpickve2gr_w(_1, 1); } // CHECK-LABEL: @vpickve2gr_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret i64 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: ret i64 [[TMP1]] // long vpickve2gr_d(v2i64 _1) { return __lsx_vpickve2gr_d(_1, 1); } // CHECK-LABEL: @vpickve2gr_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.bu(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.bu(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: ret i32 [[TMP1]] // unsigned int 
vpickve2gr_bu(v16i8 _1) { return __lsx_vpickve2gr_bu(_1, 1); } // CHECK-LABEL: @vpickve2gr_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.hu(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.hu(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: ret i32 [[TMP1]] // unsigned int vpickve2gr_hu(v8i16 _1) { return __lsx_vpickve2gr_hu(_1, 1); } // CHECK-LABEL: @vpickve2gr_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.wu(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.wu(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: ret i32 [[TMP1]] // unsigned int vpickve2gr_wu(v4i32 _1) { return __lsx_vpickve2gr_wu(_1, 1); } // CHECK-LABEL: @vpickve2gr_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret i64 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: ret i64 [[TMP1]] // unsigned long int vpickve2gr_du(v2i64 _1) { return __lsx_vpickve2gr_du(_1, 1); } // CHECK-LABEL: @vinsgr2vr_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vinsgr2vr.b(<16 x i8> [[_1:%.*]], i32 1, i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vinsgr2vr.b(<16 x i8> [[TMP0]], i32 1, i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vinsgr2vr_b(v16i8 _1) { return __lsx_vinsgr2vr_b(_1, 1, 1); } // CHECK-LABEL: @vinsgr2vr_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vinsgr2vr.h(<8 x i16> [[_1:%.*]], i32 1, i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vinsgr2vr.h(<8 x i16> [[TMP0]], i32 1, i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vinsgr2vr_h(v8i16 _1) { return __lsx_vinsgr2vr_h(_1, 1, 1); } // CHECK-LABEL: @vinsgr2vr_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vinsgr2vr.w(<4 x i32> [[_1:%.*]], i32 1, i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vinsgr2vr.w(<4 x i32> [[TMP0]], i32 1, i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vinsgr2vr_w(v4i32 _1) { return __lsx_vinsgr2vr_w(_1, 1, 1); } // CHECK-LABEL: @vinsgr2vr_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vinsgr2vr.d(<2 x i64> [[_1:%.*]], i64 1, i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: 
[[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vinsgr2vr.d(<2 x i64> [[TMP0]], i64 1, i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vinsgr2vr_d(v2i64 _1) { return __lsx_vinsgr2vr_d(_1, 1, 1); } // CHECK-LABEL: @vfadd_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfadd.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfadd.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4f32 vfadd_s(v4f32 _1, v4f32 _2) { return __lsx_vfadd_s(_1, _2); } // CHECK-LABEL: @vfadd_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfadd.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfadd.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2f64 vfadd_d(v2f64 _1, v2f64 _2) { return __lsx_vfadd_d(_1, _2); } // CHECK-LABEL: @vfsub_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfsub.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfsub.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4f32 vfsub_s(v4f32 _1, v4f32 _2) { return __lsx_vfsub_s(_1, _2); } // CHECK-LABEL: @vfsub_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfsub.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfsub.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2f64 vfsub_d(v2f64 _1, v2f64 _2) { return __lsx_vfsub_d(_1, _2); } // CHECK-LABEL: @vfmul_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmul.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmul.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128 +// CHECK-NEXT: 
ret i128 [[TMP3]] // v4f32 vfmul_s(v4f32 _1, v4f32 _2) { return __lsx_vfmul_s(_1, _2); } // CHECK-LABEL: @vfmul_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmul.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmul.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2f64 vfmul_d(v2f64 _1, v2f64 _2) { return __lsx_vfmul_d(_1, _2); } // CHECK-LABEL: @vfdiv_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfdiv.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfdiv.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4f32 vfdiv_s(v4f32 _1, v4f32 _2) { return __lsx_vfdiv_s(_1, _2); } // CHECK-LABEL: @vfdiv_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfdiv.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfdiv.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2f64 vfdiv_d(v2f64 _1, v2f64 _2) { return __lsx_vfdiv_d(_1, _2); } // CHECK-LABEL: @vfcvt_h_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vfcvt.h.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vfcvt.h.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vfcvt_h_s(v4f32 _1, v4f32 _2) { return __lsx_vfcvt_h_s(_1, _2); } // CHECK-LABEL: @vfcvt_s_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfcvt.s.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfcvt.s.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4f32 vfcvt_s_d(v2f64 _1, v2f64 _2) { return __lsx_vfcvt_s_d(_1, _2); } // CHECK-LABEL: @vfmin_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] 
= tail call <4 x float> @llvm.loongarch.lsx.vfmin.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmin.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4f32 vfmin_s(v4f32 _1, v4f32 _2) { return __lsx_vfmin_s(_1, _2); } // CHECK-LABEL: @vfmin_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmin.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmin.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2f64 vfmin_d(v2f64 _1, v2f64 _2) { return __lsx_vfmin_d(_1, _2); } // CHECK-LABEL: @vfmina_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmina.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmina.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4f32 vfmina_s(v4f32 _1, v4f32 _2) { return __lsx_vfmina_s(_1, _2); } // CHECK-LABEL: @vfmina_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmina.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmina.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2f64 vfmina_d(v2f64 _1, v2f64 _2) { return __lsx_vfmina_d(_1, _2); } // CHECK-LABEL: @vfmax_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmax.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmax.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4f32 vfmax_s(v4f32 _1, v4f32 _2) { return __lsx_vfmax_s(_1, _2); } // CHECK-LABEL: @vfmax_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmax.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] 
= bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmax.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2f64 vfmax_d(v2f64 _1, v2f64 _2) { return __lsx_vfmax_d(_1, _2); } // CHECK-LABEL: @vfmaxa_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmaxa.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmaxa.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4f32 vfmaxa_s(v4f32 _1, v4f32 _2) { return __lsx_vfmaxa_s(_1, _2); } // CHECK-LABEL: @vfmaxa_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmaxa.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmaxa.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2f64 vfmaxa_d(v2f64 _1, v2f64 _2) { return __lsx_vfmaxa_d(_1, _2); } // CHECK-LABEL: @vfclass_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfclass.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfclass.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vfclass_s(v4f32 _1) { return __lsx_vfclass_s(_1); } // CHECK-LABEL: @vfclass_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfclass.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfclass.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vfclass_d(v2f64 _1) { return __lsx_vfclass_d(_1); } // CHECK-LABEL: @vfsqrt_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfsqrt.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfsqrt.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4f32 vfsqrt_s(v4f32 _1) { return __lsx_vfsqrt_s(_1); } // CHECK-LABEL: @vfsqrt_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfsqrt.d(<2 x double> 
[[_1:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfsqrt.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2f64 vfsqrt_d(v2f64 _1) { return __lsx_vfsqrt_d(_1); } // CHECK-LABEL: @vfrecip_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrecip.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrecip.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4f32 vfrecip_s(v4f32 _1) { return __lsx_vfrecip_s(_1); } // CHECK-LABEL: @vfrecip_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrecip.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrecip.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2f64 vfrecip_d(v2f64 _1) { return __lsx_vfrecip_d(_1); } // CHECK-LABEL: @vfrint_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrint.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrint.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4f32 vfrint_s(v4f32 _1) { return __lsx_vfrint_s(_1); } // CHECK-LABEL: @vfrint_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrint.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrint.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2f64 vfrint_d(v2f64 _1) { return __lsx_vfrint_d(_1); } // CHECK-LABEL: @vfrsqrt_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrsqrt.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrsqrt.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4f32 vfrsqrt_s(v4f32 _1) { return __lsx_vfrsqrt_s(_1); } // CHECK-LABEL: @vfrsqrt_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrsqrt.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrsqrt.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x 
double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2f64 vfrsqrt_d(v2f64 _1) { return __lsx_vfrsqrt_d(_1); } // CHECK-LABEL: @vflogb_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vflogb.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vflogb.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4f32 vflogb_s(v4f32 _1) { return __lsx_vflogb_s(_1); } // CHECK-LABEL: @vflogb_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vflogb.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vflogb.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2f64 vflogb_d(v2f64 _1) { return __lsx_vflogb_d(_1); } // CHECK-LABEL: @vfcvth_s_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfcvth.s.h(<8 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfcvth.s.h(<8 x i16> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4f32 vfcvth_s_h(v8i16 _1) { return __lsx_vfcvth_s_h(_1); } // CHECK-LABEL: @vfcvth_d_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfcvth.d.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfcvth.d.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2f64 vfcvth_d_s(v4f32 _1) { return __lsx_vfcvth_d_s(_1); } // CHECK-LABEL: @vfcvtl_s_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfcvtl.s.h(<8 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfcvtl.s.h(<8 x i16> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4f32 vfcvtl_s_h(v8i16 _1) { return __lsx_vfcvtl_s_h(_1); } // CHECK-LABEL: @vfcvtl_d_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfcvtl.d.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfcvtl.d.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2f64 vfcvtl_d_s(v4f32 _1) { return __lsx_vfcvtl_d_s(_1); } // CHECK-LABEL: @vftint_w_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftint.w.s(<4 x float> [[_1:%.*]]) -// 
CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftint.w.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vftint_w_s(v4f32 _1) { return __lsx_vftint_w_s(_1); } // CHECK-LABEL: @vftint_l_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftint.l.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftint.l.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vftint_l_d(v2f64 _1) { return __lsx_vftint_l_d(_1); } // CHECK-LABEL: @vftint_wu_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftint.wu.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftint.wu.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4u32 vftint_wu_s(v4f32 _1) { return __lsx_vftint_wu_s(_1); } // CHECK-LABEL: @vftint_lu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftint.lu.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftint.lu.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2u64 vftint_lu_d(v2f64 _1) { return __lsx_vftint_lu_d(_1); } // CHECK-LABEL: @vftintrz_w_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrz.w.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrz.w.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vftintrz_w_s(v4f32 _1) { return __lsx_vftintrz_w_s(_1); } // CHECK-LABEL: @vftintrz_l_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrz.l.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrz.l.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vftintrz_l_d(v2f64 _1) { return __lsx_vftintrz_l_d(_1); } // CHECK-LABEL: @vftintrz_wu_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrz.wu.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrz.wu.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> 
[[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4u32 vftintrz_wu_s(v4f32 _1) { return __lsx_vftintrz_wu_s(_1); } // CHECK-LABEL: @vftintrz_lu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrz.lu.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrz.lu.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2u64 vftintrz_lu_d(v2f64 _1) { return __lsx_vftintrz_lu_d(_1); } // CHECK-LABEL: @vffint_s_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vffint.s.w(<4 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vffint.s.w(<4 x i32> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4f32 vffint_s_w(v4i32 _1) { return __lsx_vffint_s_w(_1); } // CHECK-LABEL: @vffint_d_l( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffint.d.l(<2 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffint.d.l(<2 x i64> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2f64 vffint_d_l(v2i64 _1) { return __lsx_vffint_d_l(_1); } // CHECK-LABEL: @vffint_s_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vffint.s.wu(<4 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vffint.s.wu(<4 x i32> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4f32 vffint_s_wu(v4u32 _1) { return __lsx_vffint_s_wu(_1); } // CHECK-LABEL: @vffint_d_lu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffint.d.lu(<2 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffint.d.lu(<2 x i64> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2f64 vffint_d_lu(v2u64 _1) { return __lsx_vffint_d_lu(_1); } // CHECK-LABEL: @vandn_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vandn.v(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vandn.v(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vandn_v(v16u8 _1, v16u8 _2) { return __lsx_vandn_v(_1, _2); } // CHECK-LABEL: @vneg_b( // CHECK-NEXT: 
entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vneg.b(<16 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vneg.b(<16 x i8> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vneg_b(v16i8 _1) { return __lsx_vneg_b(_1); } // CHECK-LABEL: @vneg_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vneg.h(<8 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vneg.h(<8 x i16> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vneg_h(v8i16 _1) { return __lsx_vneg_h(_1); } // CHECK-LABEL: @vneg_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vneg.w(<4 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vneg.w(<4 x i32> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vneg_w(v4i32 _1) { return __lsx_vneg_w(_1); } // CHECK-LABEL: @vneg_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vneg.d(<2 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vneg.d(<2 x i64> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vneg_d(v2i64 _1) { return __lsx_vneg_d(_1); } // CHECK-LABEL: @vmuh_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmuh.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmuh.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vmuh_b(v16i8 _1, v16i8 _2) { return __lsx_vmuh_b(_1, _2); } // CHECK-LABEL: @vmuh_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmuh.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmuh.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vmuh_h(v8i16 _1, v8i16 _2) { return __lsx_vmuh_h(_1, _2); } // CHECK-LABEL: @vmuh_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmuh.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast 
i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmuh.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vmuh_w(v4i32 _1, v4i32 _2) { return __lsx_vmuh_w(_1, _2); } // CHECK-LABEL: @vmuh_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmuh.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmuh.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmuh_d(v2i64 _1, v2i64 _2) { return __lsx_vmuh_d(_1, _2); } // CHECK-LABEL: @vmuh_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmuh.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmuh.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vmuh_bu(v16u8 _1, v16u8 _2) { return __lsx_vmuh_bu(_1, _2); } // CHECK-LABEL: @vmuh_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmuh.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmuh.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vmuh_hu(v8u16 _1, v8u16 _2) { return __lsx_vmuh_hu(_1, _2); } // CHECK-LABEL: @vmuh_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmuh.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmuh.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vmuh_wu(v4u32 _1, v4u32 _2) { return __lsx_vmuh_wu(_1, _2); } // CHECK-LABEL: @vmuh_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmuh.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmuh.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: 
ret i128 [[TMP3]] // v2u64 vmuh_du(v2u64 _1, v2u64 _2) { return __lsx_vmuh_du(_1, _2); } // CHECK-LABEL: @vsllwil_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsllwil.h.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsllwil.h.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vsllwil_h_b(v16i8 _1) { return __lsx_vsllwil_h_b(_1, 1); } // CHECK-LABEL: @vsllwil_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsllwil.w.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsllwil.w.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vsllwil_w_h(v8i16 _1) { return __lsx_vsllwil_w_h(_1, 1); } // CHECK-LABEL: @vsllwil_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsllwil.d.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsllwil.d.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vsllwil_d_w(v4i32 _1) { return __lsx_vsllwil_d_w(_1, 1); } // CHECK-LABEL: @vsllwil_hu_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsllwil.hu.bu(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsllwil.hu.bu(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8u16 vsllwil_hu_bu(v16u8 _1) { return __lsx_vsllwil_hu_bu(_1, 1); } // CHECK-LABEL: @vsllwil_wu_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsllwil.wu.hu(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsllwil.wu.hu(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4u32 vsllwil_wu_hu(v8u16 _1) { return __lsx_vsllwil_wu_hu(_1, 1); } // CHECK-LABEL: @vsllwil_du_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsllwil.du.wu(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsllwil.du.wu(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2u64 vsllwil_du_wu(v4u32 _1) { return __lsx_vsllwil_du_wu(_1, 1); } // CHECK-LABEL: @vsran_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x 
i8> @llvm.loongarch.lsx.vsran.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsran.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsran_b_h(v8i16 _1, v8i16 _2) { return __lsx_vsran_b_h(_1, _2); } // CHECK-LABEL: @vsran_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsran.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsran.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsran_h_w(v4i32 _1, v4i32 _2) { return __lsx_vsran_h_w(_1, _2); } // CHECK-LABEL: @vsran_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsran.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsran.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsran_w_d(v2i64 _1, v2i64 _2) { return __lsx_vsran_w_d(_1, _2); } // CHECK-LABEL: @vssran_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssran.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssran.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vssran_b_h(v8i16 _1, v8i16 _2) { return __lsx_vssran_b_h(_1, _2); } // CHECK-LABEL: @vssran_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssran.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssran.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vssran_h_w(v4i32 _1, v4i32 _2) { return __lsx_vssran_h_w(_1, _2); } // CHECK-LABEL: @vssran_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssran.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 
[[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssran.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vssran_w_d(v2i64 _1, v2i64 _2) { return __lsx_vssran_w_d(_1, _2); } // CHECK-LABEL: @vssran_bu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssran.bu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssran.bu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vssran_bu_h(v8u16 _1, v8u16 _2) { return __lsx_vssran_bu_h(_1, _2); } // CHECK-LABEL: @vssran_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssran.hu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssran.hu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vssran_hu_w(v4u32 _1, v4u32 _2) { return __lsx_vssran_hu_w(_1, _2); } // CHECK-LABEL: @vssran_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssran.wu.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssran.wu.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vssran_wu_d(v2u64 _1, v2u64 _2) { return __lsx_vssran_wu_d(_1, _2); } // CHECK-LABEL: @vsrarn_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrarn.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrarn.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsrarn_b_h(v8i16 _1, v8i16 _2) { return __lsx_vsrarn_b_h(_1, _2); } // CHECK-LABEL: @vsrarn_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrarn.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrarn.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to 
i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsrarn_h_w(v4i32 _1, v4i32 _2) { return __lsx_vsrarn_h_w(_1, _2); } // CHECK-LABEL: @vsrarn_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrarn.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrarn.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsrarn_w_d(v2i64 _1, v2i64 _2) { return __lsx_vsrarn_w_d(_1, _2); } // CHECK-LABEL: @vssrarn_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarn.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarn.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vssrarn_b_h(v8i16 _1, v8i16 _2) { return __lsx_vssrarn_b_h(_1, _2); } // CHECK-LABEL: @vssrarn_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrarn.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrarn.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vssrarn_h_w(v4i32 _1, v4i32 _2) { return __lsx_vssrarn_h_w(_1, _2); } // CHECK-LABEL: @vssrarn_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrarn.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrarn.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vssrarn_w_d(v2i64 _1, v2i64 _2) { return __lsx_vssrarn_w_d(_1, _2); } // CHECK-LABEL: @vssrarn_bu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarn.bu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarn.bu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vssrarn_bu_h(v8u16 _1, v8u16 _2) { return __lsx_vssrarn_bu_h(_1, _2); } // CHECK-LABEL: @vssrarn_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail 
call <8 x i16> @llvm.loongarch.lsx.vssrarn.hu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrarn.hu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vssrarn_hu_w(v4u32 _1, v4u32 _2) { return __lsx_vssrarn_hu_w(_1, _2); } // CHECK-LABEL: @vssrarn_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrarn.wu.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrarn.wu.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vssrarn_wu_d(v2u64 _1, v2u64 _2) { return __lsx_vssrarn_wu_d(_1, _2); } // CHECK-LABEL: @vsrln_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrln.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrln.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsrln_b_h(v8i16 _1, v8i16 _2) { return __lsx_vsrln_b_h(_1, _2); } // CHECK-LABEL: @vsrln_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrln.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrln.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsrln_h_w(v4i32 _1, v4i32 _2) { return __lsx_vsrln_h_w(_1, _2); } // CHECK-LABEL: @vsrln_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrln.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrln.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsrln_w_d(v2i64 _1, v2i64 _2) { return __lsx_vsrln_w_d(_1, _2); } // CHECK-LABEL: @vssrln_bu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrln.bu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: 
[[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrln.bu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vssrln_bu_h(v8u16 _1, v8u16 _2) { return __lsx_vssrln_bu_h(_1, _2); } // CHECK-LABEL: @vssrln_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrln.hu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrln.hu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vssrln_hu_w(v4u32 _1, v4u32 _2) { return __lsx_vssrln_hu_w(_1, _2); } // CHECK-LABEL: @vssrln_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrln.wu.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrln.wu.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vssrln_wu_d(v2u64 _1, v2u64 _2) { return __lsx_vssrln_wu_d(_1, _2); } // CHECK-LABEL: @vsrlrn_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlrn.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlrn.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsrlrn_b_h(v8i16 _1, v8i16 _2) { return __lsx_vsrlrn_b_h(_1, _2); } // CHECK-LABEL: @vsrlrn_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlrn.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlrn.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsrlrn_h_w(v4i32 _1, v4i32 _2) { return __lsx_vsrlrn_h_w(_1, _2); } // CHECK-LABEL: @vsrlrn_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlrn.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlrn.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = 
bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsrlrn_w_d(v2i64 _1, v2i64 _2) { return __lsx_vsrlrn_w_d(_1, _2); } // CHECK-LABEL: @vssrlrn_bu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrn.bu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrn.bu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vssrlrn_bu_h(v8u16 _1, v8u16 _2) { return __lsx_vssrlrn_bu_h(_1, _2); } // CHECK-LABEL: @vssrlrn_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrn.hu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrn.hu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vssrlrn_hu_w(v4u32 _1, v4u32 _2) { return __lsx_vssrlrn_hu_w(_1, _2); } // CHECK-LABEL: @vssrlrn_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlrn.wu.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlrn.wu.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vssrlrn_wu_d(v2u64 _1, v2u64 _2) { return __lsx_vssrlrn_wu_d(_1, _2); } // CHECK-LABEL: @vfrstpi_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vfrstpi.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vfrstpi.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vfrstpi_b(v16i8 _1, v16i8 _2) { return __lsx_vfrstpi_b(_1, _2, 1); } // CHECK-LABEL: @vfrstpi_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vfrstpi.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vfrstpi.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vfrstpi_h(v8i16 _1, v8i16 _2) { return __lsx_vfrstpi_h(_1, _2, 1); } // CHECK-LABEL: @vfrstp_b( // 
CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vfrstp.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vfrstp.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v16i8 vfrstp_b(v16i8 _1, v16i8 _2, v16i8 _3) { return __lsx_vfrstp_b(_1, _2, _3); } // CHECK-LABEL: @vfrstp_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vfrstp.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vfrstp.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v8i16 vfrstp_h(v8i16 _1, v8i16 _2, v8i16 _3) { return __lsx_vfrstp_h(_1, _2, _3); } // CHECK-LABEL: @vshuf4i_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vshuf4i.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vshuf4i.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vshuf4i_d(v2i64 _1, v2i64 _2) { return __lsx_vshuf4i_d(_1, _2, 1); } // CHECK-LABEL: @vbsrl_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbsrl.v(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbsrl.v(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vbsrl_v(v16i8 _1) { return __lsx_vbsrl_v(_1, 1); } // CHECK-LABEL: @vbsll_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbsll.v(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbsll.v(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vbsll_v(v16i8 _1) { return __lsx_vbsll_v(_1, 1); } // CHECK-LABEL: @vextrins_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vextrins.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast 
i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vextrins.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vextrins_b(v16i8 _1, v16i8 _2) { return __lsx_vextrins_b(_1, _2, 1); } // CHECK-LABEL: @vextrins_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vextrins.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vextrins.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vextrins_h(v8i16 _1, v8i16 _2) { return __lsx_vextrins_h(_1, _2, 1); } // CHECK-LABEL: @vextrins_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vextrins.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vextrins.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vextrins_w(v4i32 _1, v4i32 _2) { return __lsx_vextrins_w(_1, _2, 1); } // CHECK-LABEL: @vextrins_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vextrins.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vextrins.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vextrins_d(v2i64 _1, v2i64 _2) { return __lsx_vextrins_d(_1, _2, 1); } // CHECK-LABEL: @vmskltz_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmskltz.b(<16 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmskltz.b(<16 x i8> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vmskltz_b(v16i8 _1) { return __lsx_vmskltz_b(_1); } // CHECK-LABEL: @vmskltz_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmskltz.h(<8 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmskltz.h(<8 x i16> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vmskltz_h(v8i16 _1) { return __lsx_vmskltz_h(_1); } // CHECK-LABEL: @vmskltz_w( // 
CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmskltz.w(<4 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmskltz.w(<4 x i32> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vmskltz_w(v4i32 _1) { return __lsx_vmskltz_w(_1); } // CHECK-LABEL: @vmskltz_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmskltz.d(<2 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmskltz.d(<2 x i64> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vmskltz_d(v2i64 _1) { return __lsx_vmskltz_d(_1); } // CHECK-LABEL: @vsigncov_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsigncov.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsigncov.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsigncov_b(v16i8 _1, v16i8 _2) { return __lsx_vsigncov_b(_1, _2); } // CHECK-LABEL: @vsigncov_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsigncov.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsigncov.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsigncov_h(v8i16 _1, v8i16 _2) { return __lsx_vsigncov_h(_1, _2); } // CHECK-LABEL: @vsigncov_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsigncov.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsigncov.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsigncov_w(v4i32 _1, v4i32 _2) { return __lsx_vsigncov_w(_1, _2); } // CHECK-LABEL: @vsigncov_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsigncov.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsigncov.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x 
i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsigncov_d(v2i64 _1, v2i64 _2) { return __lsx_vsigncov_d(_1, _2); } // CHECK-LABEL: @vfmadd_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmadd.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]], <4 x float> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmadd.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v4f32 vfmadd_s(v4f32 _1, v4f32 _2, v4f32 _3) { return __lsx_vfmadd_s(_1, _2, _3); } // CHECK-LABEL: @vfmadd_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmadd.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]], <2 x double> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmadd.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2f64 vfmadd_d(v2f64 _1, v2f64 _2, v2f64 _3) { return __lsx_vfmadd_d(_1, _2, _3); } // CHECK-LABEL: @vfmsub_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmsub.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]], <4 x float> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmsub.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v4f32 vfmsub_s(v4f32 _1, v4f32 _2, v4f32 _3) { return __lsx_vfmsub_s(_1, _2, _3); } // CHECK-LABEL: @vfmsub_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmsub.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]], <2 x double> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmsub.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2f64 vfmsub_d(v2f64 _1, v2f64 _2, v2f64 _3) { return __lsx_vfmsub_d(_1, _2, _3); } // CHECK-LABEL: @vfnmadd_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfnmadd.s(<4 x float> 
[[_1:%.*]], <4 x float> [[_2:%.*]], <4 x float> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfnmadd.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v4f32 vfnmadd_s(v4f32 _1, v4f32 _2, v4f32 _3) { return __lsx_vfnmadd_s(_1, _2, _3); } // CHECK-LABEL: @vfnmadd_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfnmadd.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]], <2 x double> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfnmadd.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2f64 vfnmadd_d(v2f64 _1, v2f64 _2, v2f64 _3) { return __lsx_vfnmadd_d(_1, _2, _3); } // CHECK-LABEL: @vfnmsub_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfnmsub.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]], <4 x float> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfnmsub.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v4f32 vfnmsub_s(v4f32 _1, v4f32 _2, v4f32 _3) { return __lsx_vfnmsub_s(_1, _2, _3); } // CHECK-LABEL: @vfnmsub_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfnmsub.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]], <2 x double> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfnmsub.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2f64 vfnmsub_d(v2f64 _1, v2f64 _2, v2f64 _3) { return __lsx_vfnmsub_d(_1, _2, _3); } // CHECK-LABEL: @vftintrne_w_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrne.w.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrne.w.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: 
[[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vftintrne_w_s(v4f32 _1) { return __lsx_vftintrne_w_s(_1); } // CHECK-LABEL: @vftintrne_l_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrne.l.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrne.l.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vftintrne_l_d(v2f64 _1) { return __lsx_vftintrne_l_d(_1); } // CHECK-LABEL: @vftintrp_w_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrp.w.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrp.w.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vftintrp_w_s(v4f32 _1) { return __lsx_vftintrp_w_s(_1); } // CHECK-LABEL: @vftintrp_l_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrp.l.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrp.l.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vftintrp_l_d(v2f64 _1) { return __lsx_vftintrp_l_d(_1); } // CHECK-LABEL: @vftintrm_w_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrm.w.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrm.w.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vftintrm_w_s(v4f32 _1) { return __lsx_vftintrm_w_s(_1); } // CHECK-LABEL: @vftintrm_l_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrm.l.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrm.l.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vftintrm_l_d(v2f64 _1) { return __lsx_vftintrm_l_d(_1); } // CHECK-LABEL: @vftint_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftint.w.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftint.w.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vftint_w_d(v2f64 _1, 
v2f64 _2) { return __lsx_vftint_w_d(_1, _2); } // CHECK-LABEL: @vffint_s_l( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vffint.s.l(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vffint.s.l(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4f32 vffint_s_l(v2i64 _1, v2i64 _2) { return __lsx_vffint_s_l(_1, _2); } // CHECK-LABEL: @vftintrz_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrz.w.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrz.w.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vftintrz_w_d(v2f64 _1, v2f64 _2) { return __lsx_vftintrz_w_d(_1, _2); } // CHECK-LABEL: @vftintrp_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrp.w.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrp.w.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vftintrp_w_d(v2f64 _1, v2f64 _2) { return __lsx_vftintrp_w_d(_1, _2); } // CHECK-LABEL: @vftintrm_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrm.w.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrm.w.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vftintrm_w_d(v2f64 _1, v2f64 _2) { return __lsx_vftintrm_w_d(_1, _2); } // CHECK-LABEL: @vftintrne_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrne.w.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrne.w.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vftintrne_w_d(v2f64 _1, v2f64 _2) { return __lsx_vftintrne_w_d(_1, _2); } // CHECK-LABEL: @vftintl_l_s( // CHECK-NEXT: entry: -// 
CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintl.l.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintl.l.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vftintl_l_s(v4f32 _1) { return __lsx_vftintl_l_s(_1); } // CHECK-LABEL: @vftinth_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftinth.l.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftinth.l.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vftinth_l_s(v4f32 _1) { return __lsx_vftinth_l_s(_1); } // CHECK-LABEL: @vffinth_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffinth.d.w(<4 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffinth.d.w(<4 x i32> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2f64 vffinth_d_w(v4i32 _1) { return __lsx_vffinth_d_w(_1); } // CHECK-LABEL: @vffintl_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffintl.d.w(<4 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffintl.d.w(<4 x i32> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2f64 vffintl_d_w(v4i32 _1) { return __lsx_vffintl_d_w(_1); } // CHECK-LABEL: @vftintrzl_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrzl.l.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrzl.l.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vftintrzl_l_s(v4f32 _1) { return __lsx_vftintrzl_l_s(_1); } // CHECK-LABEL: @vftintrzh_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrzh.l.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrzh.l.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vftintrzh_l_s(v4f32 _1) { return __lsx_vftintrzh_l_s(_1); } // CHECK-LABEL: @vftintrpl_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrpl.l.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] 
= tail call <2 x i64> @llvm.loongarch.lsx.vftintrpl.l.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vftintrpl_l_s(v4f32 _1) { return __lsx_vftintrpl_l_s(_1); } // CHECK-LABEL: @vftintrph_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrph.l.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrph.l.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vftintrph_l_s(v4f32 _1) { return __lsx_vftintrph_l_s(_1); } // CHECK-LABEL: @vftintrml_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrml.l.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrml.l.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vftintrml_l_s(v4f32 _1) { return __lsx_vftintrml_l_s(_1); } // CHECK-LABEL: @vftintrmh_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrmh.l.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrmh.l.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vftintrmh_l_s(v4f32 _1) { return __lsx_vftintrmh_l_s(_1); } // CHECK-LABEL: @vftintrnel_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrnel.l.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrnel.l.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vftintrnel_l_s(v4f32 _1) { return __lsx_vftintrnel_l_s(_1); } // CHECK-LABEL: @vftintrneh_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrneh.l.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrneh.l.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vftintrneh_l_s(v4f32 _1) { return __lsx_vftintrneh_l_s(_1); } // CHECK-LABEL: @vfrintrne_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrintrne.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrintrne.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to 
i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vfrintrne_s(v4f32 _1) { return __lsx_vfrintrne_s(_1); } // CHECK-LABEL: @vfrintrne_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrne.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[TMP0]] to <2 x i64> -// CHECK-NEXT: ret <2 x i64> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrne.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vfrintrne_d(v2f64 _1) { return __lsx_vfrintrne_d(_1); } // CHECK-LABEL: @vfrintrz_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrintrz.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrintrz.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vfrintrz_s(v4f32 _1) { return __lsx_vfrintrz_s(_1); } // CHECK-LABEL: @vfrintrz_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrz.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[TMP0]] to <2 x i64> -// CHECK-NEXT: ret <2 x i64> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrz.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vfrintrz_d(v2f64 _1) { return __lsx_vfrintrz_d(_1); } // CHECK-LABEL: @vfrintrp_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrintrp.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrintrp.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vfrintrp_s(v4f32 _1) { return __lsx_vfrintrp_s(_1); } // CHECK-LABEL: @vfrintrp_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrp.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[TMP0]] to <2 x i64> -// CHECK-NEXT: ret <2 x i64> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrp.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vfrintrp_d(v2f64 _1) { return __lsx_vfrintrp_d(_1); } // CHECK-LABEL: @vfrintrm_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrintrm.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 
[[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrintrm.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vfrintrm_s(v4f32 _1) { return __lsx_vfrintrm_s(_1); } // CHECK-LABEL: @vfrintrm_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrm.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[TMP0]] to <2 x i64> -// CHECK-NEXT: ret <2 x i64> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrm.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vfrintrm_d(v2f64 _1) { return __lsx_vfrintrm_d(_1); } // CHECK-LABEL: @vstelm_b( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.b(<16 x i8> [[_1:%.*]], ptr [[_2:%.*]], i32 1, i32 1) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.b(<16 x i8> [[TMP0]], ptr [[_2:%.*]], i32 1, i32 1) // CHECK-NEXT: ret void // void vstelm_b(v16i8 _1, void *_2) { return __lsx_vstelm_b(_1, _2, 1, 1); } // CHECK-LABEL: @vstelm_h( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.h(<8 x i16> [[_1:%.*]], ptr [[_2:%.*]], i32 2, i32 1) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.h(<8 x i16> [[TMP0]], ptr [[_2:%.*]], i32 2, i32 1) // CHECK-NEXT: ret void // void vstelm_h(v8i16 _1, void *_2) { return __lsx_vstelm_h(_1, _2, 2, 1); } // CHECK-LABEL: @vstelm_w( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.w(<4 x i32> [[_1:%.*]], ptr [[_2:%.*]], i32 4, i32 1) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.w(<4 x i32> [[TMP0]], ptr [[_2:%.*]], i32 4, i32 1) // CHECK-NEXT: ret void // void vstelm_w(v4i32 _1, void *_2) { return __lsx_vstelm_w(_1, _2, 4, 1); } // CHECK-LABEL: @vstelm_d( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.d(<2 x i64> [[_1:%.*]], ptr [[_2:%.*]], i32 8, i32 1) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.d(<2 x i64> [[TMP0]], ptr [[_2:%.*]], i32 8, i32 1) // CHECK-NEXT: ret void // void vstelm_d(v2i64 _1, void *_2) { return __lsx_vstelm_d(_1, _2, 8, 1); } // CHECK-LABEL: @vaddwev_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.d.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vaddwev_d_w(v4i32 _1, v4i32 _2) { return __lsx_vaddwev_d_w(_1, _2); } // CHECK-LABEL: @vaddwev_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwev.w.h(<8 x i16> [[_1:%.*]], <8 x 
i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwev.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vaddwev_w_h(v8i16 _1, v8i16 _2) { return __lsx_vaddwev_w_h(_1, _2); } // CHECK-LABEL: @vaddwev_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwev.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwev.h.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vaddwev_h_b(v16i8 _1, v16i8 _2) { return __lsx_vaddwev_h_b(_1, _2); } // CHECK-LABEL: @vaddwod_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.d.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vaddwod_d_w(v4i32 _1, v4i32 _2) { return __lsx_vaddwod_d_w(_1, _2); } // CHECK-LABEL: @vaddwod_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwod.w.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwod.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vaddwod_w_h(v8i16 _1, v8i16 _2) { return __lsx_vaddwod_w_h(_1, _2); } // CHECK-LABEL: @vaddwod_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwod.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwod.h.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vaddwod_h_b(v16i8 _1, v16i8 _2) { return __lsx_vaddwod_h_b(_1, _2); } // CHECK-LABEL: @vaddwev_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.d.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x 
i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.d.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vaddwev_d_wu(v4u32 _1, v4u32 _2) { return __lsx_vaddwev_d_wu(_1, _2); } // CHECK-LABEL: @vaddwev_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwev.w.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwev.w.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vaddwev_w_hu(v8u16 _1, v8u16 _2) { return __lsx_vaddwev_w_hu(_1, _2); } // CHECK-LABEL: @vaddwev_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwev.h.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwev.h.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vaddwev_h_bu(v16u8 _1, v16u8 _2) { return __lsx_vaddwev_h_bu(_1, _2); } // CHECK-LABEL: @vaddwod_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.d.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.d.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vaddwod_d_wu(v4u32 _1, v4u32 _2) { return __lsx_vaddwod_d_wu(_1, _2); } // CHECK-LABEL: @vaddwod_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwod.w.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwod.w.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vaddwod_w_hu(v8u16 _1, v8u16 _2) { return __lsx_vaddwod_w_hu(_1, _2); } // CHECK-LABEL: @vaddwod_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwod.h.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwod.h.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> 
[[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vaddwod_h_bu(v16u8 _1, v16u8 _2) { return __lsx_vaddwod_h_bu(_1, _2); } // CHECK-LABEL: @vaddwev_d_wu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.d.wu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.d.wu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vaddwev_d_wu_w(v4u32 _1, v4i32 _2) { return __lsx_vaddwev_d_wu_w(_1, _2); } // CHECK-LABEL: @vaddwev_w_hu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwev.w.hu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwev.w.hu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vaddwev_w_hu_h(v8u16 _1, v8i16 _2) { return __lsx_vaddwev_w_hu_h(_1, _2); } // CHECK-LABEL: @vaddwev_h_bu_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwev.h.bu.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwev.h.bu.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vaddwev_h_bu_b(v16u8 _1, v16i8 _2) { return __lsx_vaddwev_h_bu_b(_1, _2); } // CHECK-LABEL: @vaddwod_d_wu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.d.wu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.d.wu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vaddwod_d_wu_w(v4u32 _1, v4i32 _2) { return __lsx_vaddwod_d_wu_w(_1, _2); } // CHECK-LABEL: @vaddwod_w_hu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwod.w.hu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwod.w.hu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vaddwod_w_hu_h(v8u16 _1, v8i16 _2) { return __lsx_vaddwod_w_hu_h(_1, _2); 
} // CHECK-LABEL: @vaddwod_h_bu_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwod.h.bu.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwod.h.bu.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vaddwod_h_bu_b(v16u8 _1, v16i8 _2) { return __lsx_vaddwod_h_bu_b(_1, _2); } // CHECK-LABEL: @vsubwev_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.d.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsubwev_d_w(v4i32 _1, v4i32 _2) { return __lsx_vsubwev_d_w(_1, _2); } // CHECK-LABEL: @vsubwev_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwev.w.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwev.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsubwev_w_h(v8i16 _1, v8i16 _2) { return __lsx_vsubwev_w_h(_1, _2); } // CHECK-LABEL: @vsubwev_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwev.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwev.h.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsubwev_h_b(v16i8 _1, v16i8 _2) { return __lsx_vsubwev_h_b(_1, _2); } // CHECK-LABEL: @vsubwod_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.d.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsubwod_d_w(v4i32 _1, v4i32 _2) { return __lsx_vsubwod_d_w(_1, _2); } // CHECK-LABEL: @vsubwod_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwod.w.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// 
CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwod.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsubwod_w_h(v8i16 _1, v8i16 _2) { return __lsx_vsubwod_w_h(_1, _2); } // CHECK-LABEL: @vsubwod_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwod.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwod.h.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsubwod_h_b(v16i8 _1, v16i8 _2) { return __lsx_vsubwod_h_b(_1, _2); } // CHECK-LABEL: @vsubwev_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.d.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.d.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsubwev_d_wu(v4u32 _1, v4u32 _2) { return __lsx_vsubwev_d_wu(_1, _2); } // CHECK-LABEL: @vsubwev_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwev.w.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwev.w.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsubwev_w_hu(v8u16 _1, v8u16 _2) { return __lsx_vsubwev_w_hu(_1, _2); } // CHECK-LABEL: @vsubwev_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwev.h.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwev.h.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsubwev_h_bu(v16u8 _1, v16u8 _2) { return __lsx_vsubwev_h_bu(_1, _2); } // CHECK-LABEL: @vsubwod_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.d.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> 
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.d.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsubwod_d_wu(v4u32 _1, v4u32 _2) { return __lsx_vsubwod_d_wu(_1, _2); } // CHECK-LABEL: @vsubwod_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwod.w.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwod.w.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsubwod_w_hu(v8u16 _1, v8u16 _2) { return __lsx_vsubwod_w_hu(_1, _2); } // CHECK-LABEL: @vsubwod_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwod.h.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwod.h.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsubwod_h_bu(v16u8 _1, v16u8 _2) { return __lsx_vsubwod_h_bu(_1, _2); } // CHECK-LABEL: @vaddwev_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vaddwev_q_d(v2i64 _1, v2i64 _2) { return __lsx_vaddwev_q_d(_1, _2); } // CHECK-LABEL: @vaddwod_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vaddwod_q_d(v2i64 _1, v2i64 _2) { return __lsx_vaddwod_q_d(_1, _2); } // CHECK-LABEL: @vaddwev_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 
+// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vaddwev_q_du(v2u64 _1, v2u64 _2) { return __lsx_vaddwev_q_du(_1, _2); } // CHECK-LABEL: @vaddwod_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vaddwod_q_du(v2u64 _1, v2u64 _2) { return __lsx_vaddwod_q_du(_1, _2); } // CHECK-LABEL: @vsubwev_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsubwev_q_d(v2i64 _1, v2i64 _2) { return __lsx_vsubwev_q_d(_1, _2); } // CHECK-LABEL: @vsubwod_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsubwod_q_d(v2i64 _1, v2i64 _2) { return __lsx_vsubwod_q_d(_1, _2); } // CHECK-LABEL: @vsubwev_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsubwev_q_du(v2u64 _1, v2u64 _2) { return __lsx_vsubwev_q_du(_1, _2); } // CHECK-LABEL: @vsubwod_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsubwod_q_du(v2u64 _1, v2u64 _2) { return __lsx_vsubwod_q_du(_1, _2); } // CHECK-LABEL: @vaddwev_q_du_d( // CHECK-NEXT: entry: -// CHECK-NEXT: 
[[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.q.du.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.q.du.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vaddwev_q_du_d(v2u64 _1, v2i64 _2) { return __lsx_vaddwev_q_du_d(_1, _2); } // CHECK-LABEL: @vaddwod_q_du_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.q.du.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.q.du.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vaddwod_q_du_d(v2u64 _1, v2i64 _2) { return __lsx_vaddwod_q_du_d(_1, _2); } // CHECK-LABEL: @vmulwev_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.d.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmulwev_d_w(v4i32 _1, v4i32 _2) { return __lsx_vmulwev_d_w(_1, _2); } // CHECK-LABEL: @vmulwev_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwev.w.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwev.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vmulwev_w_h(v8i16 _1, v8i16 _2) { return __lsx_vmulwev_w_h(_1, _2); } // CHECK-LABEL: @vmulwev_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwev.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwev.h.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vmulwev_h_b(v16i8 _1, v16i8 _2) { return __lsx_vmulwev_h_b(_1, _2); } // CHECK-LABEL: @vmulwod_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.d.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: 
[[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmulwod_d_w(v4i32 _1, v4i32 _2) { return __lsx_vmulwod_d_w(_1, _2); } // CHECK-LABEL: @vmulwod_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwod.w.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwod.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vmulwod_w_h(v8i16 _1, v8i16 _2) { return __lsx_vmulwod_w_h(_1, _2); } // CHECK-LABEL: @vmulwod_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwod.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwod.h.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vmulwod_h_b(v16i8 _1, v16i8 _2) { return __lsx_vmulwod_h_b(_1, _2); } // CHECK-LABEL: @vmulwev_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.d.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.d.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmulwev_d_wu(v4u32 _1, v4u32 _2) { return __lsx_vmulwev_d_wu(_1, _2); } // CHECK-LABEL: @vmulwev_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwev.w.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwev.w.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vmulwev_w_hu(v8u16 _1, v8u16 _2) { return __lsx_vmulwev_w_hu(_1, _2); } // CHECK-LABEL: @vmulwev_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwev.h.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> 
@llvm.loongarch.lsx.vmulwev.h.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vmulwev_h_bu(v16u8 _1, v16u8 _2) { return __lsx_vmulwev_h_bu(_1, _2); } // CHECK-LABEL: @vmulwod_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.d.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.d.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmulwod_d_wu(v4u32 _1, v4u32 _2) { return __lsx_vmulwod_d_wu(_1, _2); } // CHECK-LABEL: @vmulwod_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwod.w.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwod.w.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vmulwod_w_hu(v8u16 _1, v8u16 _2) { return __lsx_vmulwod_w_hu(_1, _2); } // CHECK-LABEL: @vmulwod_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwod.h.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwod.h.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vmulwod_h_bu(v16u8 _1, v16u8 _2) { return __lsx_vmulwod_h_bu(_1, _2); } // CHECK-LABEL: @vmulwev_d_wu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.d.wu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.d.wu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmulwev_d_wu_w(v4u32 _1, v4i32 _2) { return __lsx_vmulwev_d_wu_w(_1, _2); } // CHECK-LABEL: @vmulwev_w_hu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwev.w.hu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwev.w.hu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 
[[TMP3]] // v4i32 vmulwev_w_hu_h(v8u16 _1, v8i16 _2) { return __lsx_vmulwev_w_hu_h(_1, _2); } // CHECK-LABEL: @vmulwev_h_bu_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwev.h.bu.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwev.h.bu.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vmulwev_h_bu_b(v16u8 _1, v16i8 _2) { return __lsx_vmulwev_h_bu_b(_1, _2); } // CHECK-LABEL: @vmulwod_d_wu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.d.wu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.d.wu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmulwod_d_wu_w(v4u32 _1, v4i32 _2) { return __lsx_vmulwod_d_wu_w(_1, _2); } // CHECK-LABEL: @vmulwod_w_hu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwod.w.hu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwod.w.hu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vmulwod_w_hu_h(v8u16 _1, v8i16 _2) { return __lsx_vmulwod_w_hu_h(_1, _2); } // CHECK-LABEL: @vmulwod_h_bu_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwod.h.bu.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwod.h.bu.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vmulwod_h_bu_b(v16u8 _1, v16i8 _2) { return __lsx_vmulwod_h_bu_b(_1, _2); } // CHECK-LABEL: @vmulwev_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmulwev_q_d(v2i64 _1, v2i64 _2) { return __lsx_vmulwev_q_d(_1, _2); } // CHECK-LABEL: @vmulwod_q_d( // CHECK-NEXT: 
entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmulwod_q_d(v2i64 _1, v2i64 _2) { return __lsx_vmulwod_q_d(_1, _2); } // CHECK-LABEL: @vmulwev_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmulwev_q_du(v2u64 _1, v2u64 _2) { return __lsx_vmulwev_q_du(_1, _2); } // CHECK-LABEL: @vmulwod_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmulwod_q_du(v2u64 _1, v2u64 _2) { return __lsx_vmulwod_q_du(_1, _2); } // CHECK-LABEL: @vmulwev_q_du_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.q.du.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.q.du.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmulwev_q_du_d(v2u64 _1, v2i64 _2) { return __lsx_vmulwev_q_du_d(_1, _2); } // CHECK-LABEL: @vmulwod_q_du_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.q.du.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.q.du.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmulwod_q_du_d(v2u64 _1, v2i64 _2) { return __lsx_vmulwod_q_du_d(_1, _2); } // CHECK-LABEL: @vhaddw_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhaddw.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> 
[[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhaddw.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vhaddw_q_d(v2i64 _1, v2i64 _2) { return __lsx_vhaddw_q_d(_1, _2); } // CHECK-LABEL: @vhaddw_qu_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhaddw.qu.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhaddw.qu.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vhaddw_qu_du(v2u64 _1, v2u64 _2) { return __lsx_vhaddw_qu_du(_1, _2); } // CHECK-LABEL: @vhsubw_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vhsubw_q_d(v2i64 _1, v2i64 _2) { return __lsx_vhsubw_q_d(_1, _2); } // CHECK-LABEL: @vhsubw_qu_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.qu.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.qu.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vhsubw_qu_du(v2u64 _1, v2u64 _2) { return __lsx_vhsubw_qu_du(_1, _2); } // CHECK-LABEL: @vmaddwev_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.d.w(<2 x i64> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.d.w(<2 x i64> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2i64 vmaddwev_d_w(v2i64 _1, v4i32 _2, v4i32 _3) { return __lsx_vmaddwev_d_w(_1, _2, _3); } // CHECK-LABEL: @vmaddwev_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwev.w.h(<4 x i32> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = 
bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwev.w.h(<4 x i32> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v4i32 vmaddwev_w_h(v4i32 _1, v8i16 _2, v8i16 _3) { return __lsx_vmaddwev_w_h(_1, _2, _3); } // CHECK-LABEL: @vmaddwev_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwev.h.b(<8 x i16> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwev.h.b(<8 x i16> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v8i16 vmaddwev_h_b(v8i16 _1, v16i8 _2, v16i8 _3) { return __lsx_vmaddwev_h_b(_1, _2, _3); } // CHECK-LABEL: @vmaddwev_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.d.wu(<2 x i64> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.d.wu(<2 x i64> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2u64 vmaddwev_d_wu(v2u64 _1, v4u32 _2, v4u32 _3) { return __lsx_vmaddwev_d_wu(_1, _2, _3); } // CHECK-LABEL: @vmaddwev_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwev.w.hu(<4 x i32> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwev.w.hu(<4 x i32> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v4u32 vmaddwev_w_hu(v4u32 _1, v8u16 _2, v8u16 _3) { return __lsx_vmaddwev_w_hu(_1, _2, _3); } // CHECK-LABEL: @vmaddwev_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwev.h.bu(<8 x i16> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwev.h.bu(<8 x i16> [[TMP0]], <16 x i8> [[TMP1]], 
<16 x i8> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v8u16 vmaddwev_h_bu(v8u16 _1, v16u8 _2, v16u8 _3) { return __lsx_vmaddwev_h_bu(_1, _2, _3); } // CHECK-LABEL: @vmaddwod_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.d.w(<2 x i64> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.d.w(<2 x i64> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2i64 vmaddwod_d_w(v2i64 _1, v4i32 _2, v4i32 _3) { return __lsx_vmaddwod_d_w(_1, _2, _3); } // CHECK-LABEL: @vmaddwod_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwod.w.h(<4 x i32> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwod.w.h(<4 x i32> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v4i32 vmaddwod_w_h(v4i32 _1, v8i16 _2, v8i16 _3) { return __lsx_vmaddwod_w_h(_1, _2, _3); } // CHECK-LABEL: @vmaddwod_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwod.h.b(<8 x i16> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwod.h.b(<8 x i16> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v8i16 vmaddwod_h_b(v8i16 _1, v16i8 _2, v16i8 _3) { return __lsx_vmaddwod_h_b(_1, _2, _3); } // CHECK-LABEL: @vmaddwod_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.d.wu(<2 x i64> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.d.wu(<2 x i64> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2u64 vmaddwod_d_wu(v2u64 _1, v4u32 _2, v4u32 _3) { return __lsx_vmaddwod_d_wu(_1, _2, _3); } // CHECK-LABEL: @vmaddwod_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> 
@llvm.loongarch.lsx.vmaddwod.w.hu(<4 x i32> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwod.w.hu(<4 x i32> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v4u32 vmaddwod_w_hu(v4u32 _1, v8u16 _2, v8u16 _3) { return __lsx_vmaddwod_w_hu(_1, _2, _3); } // CHECK-LABEL: @vmaddwod_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwod.h.bu(<8 x i16> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwod.h.bu(<8 x i16> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v8u16 vmaddwod_h_bu(v8u16 _1, v16u8 _2, v16u8 _3) { return __lsx_vmaddwod_h_bu(_1, _2, _3); } // CHECK-LABEL: @vmaddwev_d_wu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.d.wu.w(<2 x i64> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.d.wu.w(<2 x i64> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2i64 vmaddwev_d_wu_w(v2i64 _1, v4u32 _2, v4i32 _3) { return __lsx_vmaddwev_d_wu_w(_1, _2, _3); } // CHECK-LABEL: @vmaddwev_w_hu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwev.w.hu.h(<4 x i32> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwev.w.hu.h(<4 x i32> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v4i32 vmaddwev_w_hu_h(v4i32 _1, v8u16 _2, v8i16 _3) { return __lsx_vmaddwev_w_hu_h(_1, _2, _3); } // CHECK-LABEL: @vmaddwev_h_bu_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwev.h.bu.b(<8 x i16> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x 
i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwev.h.bu.b(<8 x i16> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v8i16 vmaddwev_h_bu_b(v8i16 _1, v16u8 _2, v16i8 _3) { return __lsx_vmaddwev_h_bu_b(_1, _2, _3); } // CHECK-LABEL: @vmaddwod_d_wu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.d.wu.w(<2 x i64> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.d.wu.w(<2 x i64> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2i64 vmaddwod_d_wu_w(v2i64 _1, v4u32 _2, v4i32 _3) { return __lsx_vmaddwod_d_wu_w(_1, _2, _3); } // CHECK-LABEL: @vmaddwod_w_hu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwod.w.hu.h(<4 x i32> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwod.w.hu.h(<4 x i32> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v4i32 vmaddwod_w_hu_h(v4i32 _1, v8u16 _2, v8i16 _3) { return __lsx_vmaddwod_w_hu_h(_1, _2, _3); } // CHECK-LABEL: @vmaddwod_h_bu_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwod.h.bu.b(<8 x i16> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwod.h.bu.b(<8 x i16> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v8i16 vmaddwod_h_bu_b(v8i16 _1, v16u8 _2, v16i8 _3) { return __lsx_vmaddwod_h_bu_b(_1, _2, _3); } // CHECK-LABEL: @vmaddwev_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> 
[[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2i64 vmaddwev_q_d(v2i64 _1, v2i64 _2, v2i64 _3) { return __lsx_vmaddwev_q_d(_1, _2, _3); } // CHECK-LABEL: @vmaddwod_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2i64 vmaddwod_q_d(v2i64 _1, v2i64 _2, v2i64 _3) { return __lsx_vmaddwod_q_d(_1, _2, _3); } // CHECK-LABEL: @vmaddwev_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2u64 vmaddwev_q_du(v2u64 _1, v2u64 _2, v2u64 _3) { return __lsx_vmaddwev_q_du(_1, _2, _3); } // CHECK-LABEL: @vmaddwod_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2u64 vmaddwod_q_du(v2u64 _1, v2u64 _2, v2u64 _3) { return __lsx_vmaddwod_q_du(_1, _2, _3); } // CHECK-LABEL: @vmaddwev_q_du_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.q.du.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.q.du.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2i64 vmaddwev_q_du_d(v2i64 _1, v2u64 _2, v2i64 _3) { return __lsx_vmaddwev_q_du_d(_1, _2, _3); } // CHECK-LABEL: @vmaddwod_q_du_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.q.du.d(<2 x i64> 
[[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.q.du.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2i64 vmaddwod_q_du_d(v2i64 _1, v2u64 _2, v2i64 _3) { return __lsx_vmaddwod_q_du_d(_1, _2, _3); } // CHECK-LABEL: @vrotr_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vrotr.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vrotr.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vrotr_b(v16i8 _1, v16i8 _2) { return __lsx_vrotr_b(_1, _2); } // CHECK-LABEL: @vrotr_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vrotr.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vrotr.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vrotr_h(v8i16 _1, v8i16 _2) { return __lsx_vrotr_h(_1, _2); } // CHECK-LABEL: @vrotr_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vrotr.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vrotr.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vrotr_w(v4i32 _1, v4i32 _2) { return __lsx_vrotr_w(_1, _2); } // CHECK-LABEL: @vrotr_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vrotr.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vrotr.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vrotr_d(v2i64 _1, v2i64 _2) { return __lsx_vrotr_d(_1, _2); } // CHECK-LABEL: @vadd_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vadd.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// 
CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vadd.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vadd_q(v2i64 _1, v2i64 _2) { return __lsx_vadd_q(_1, _2); } // CHECK-LABEL: @vsub_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsub.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsub.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsub_q(v2i64 _1, v2i64 _2) { return __lsx_vsub_q(_1, _2); } // CHECK-LABEL: @vldrepl_b( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vldrepl.b(ptr [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to i128 +// CHECK-NEXT: ret i128 [[TMP1]] // v16i8 vldrepl_b(void *_1) { return __lsx_vldrepl_b(_1, 1); } // CHECK-LABEL: @vldrepl_h( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vldrepl.h(ptr [[_1:%.*]], i32 2) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to i128 +// CHECK-NEXT: ret i128 [[TMP1]] // v8i16 vldrepl_h(void *_1) { return __lsx_vldrepl_h(_1, 2); } // CHECK-LABEL: @vldrepl_w( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vldrepl.w(ptr [[_1:%.*]], i32 4) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +// CHECK-NEXT: ret i128 [[TMP1]] // v4i32 vldrepl_w(void *_1) { return __lsx_vldrepl_w(_1, 4); } // CHECK-LABEL: @vldrepl_d( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vldrepl.d(ptr [[_1:%.*]], i32 8) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to i128 +// CHECK-NEXT: ret i128 [[TMP1]] // v2i64 vldrepl_d(void *_1) { return __lsx_vldrepl_d(_1, 8); } // CHECK-LABEL: @vmskgez_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmskgez.b(<16 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmskgez.b(<16 x i8> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vmskgez_b(v16i8 _1) { return __lsx_vmskgez_b(_1); } // CHECK-LABEL: @vmsknz_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmsknz.b(<16 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmsknz.b(<16 x i8> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vmsknz_b(v16i8 _1) { return __lsx_vmsknz_b(_1); } // CHECK-LABEL: @vexth_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: 
[[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vexth.h.b(<16 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vexth.h.b(<16 x i8> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vexth_h_b(v16i8 _1) { return __lsx_vexth_h_b(_1); } // CHECK-LABEL: @vexth_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vexth.w.h(<8 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vexth.w.h(<8 x i16> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vexth_w_h(v8i16 _1) { return __lsx_vexth_w_h(_1); } // CHECK-LABEL: @vexth_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.d.w(<4 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.d.w(<4 x i32> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vexth_d_w(v4i32 _1) { return __lsx_vexth_d_w(_1); } // CHECK-LABEL: @vexth_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.q.d(<2 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.q.d(<2 x i64> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vexth_q_d(v2i64 _1) { return __lsx_vexth_q_d(_1); } // CHECK-LABEL: @vexth_hu_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vexth.hu.bu(<16 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vexth.hu.bu(<16 x i8> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8u16 vexth_hu_bu(v16u8 _1) { return __lsx_vexth_hu_bu(_1); } // CHECK-LABEL: @vexth_wu_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vexth.wu.hu(<8 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vexth.wu.hu(<8 x i16> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4u32 vexth_wu_hu(v8u16 _1) { return __lsx_vexth_wu_hu(_1); } // CHECK-LABEL: @vexth_du_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.du.wu(<4 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.du.wu(<4 x i32> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> 
[[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2u64 vexth_du_wu(v4u32 _1) { return __lsx_vexth_du_wu(_1); } // CHECK-LABEL: @vexth_qu_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.qu.du(<2 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.qu.du(<2 x i64> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2u64 vexth_qu_du(v2u64 _1) { return __lsx_vexth_qu_du(_1); } // CHECK-LABEL: @vrotri_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vrotri.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vrotri.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vrotri_b(v16i8 _1) { return __lsx_vrotri_b(_1, 1); } // CHECK-LABEL: @vrotri_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vrotri.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vrotri.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vrotri_h(v8i16 _1) { return __lsx_vrotri_h(_1, 1); } // CHECK-LABEL: @vrotri_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vrotri.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vrotri.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vrotri_w(v4i32 _1) { return __lsx_vrotri_w(_1, 1); } // CHECK-LABEL: @vrotri_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vrotri.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vrotri.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vrotri_d(v2i64 _1) { return __lsx_vrotri_d(_1, 1); } // CHECK-LABEL: @vextl_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vextl.q.d(<2 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vextl.q.d(<2 x i64> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vextl_q_d(v2i64 _1) { return __lsx_vextl_q_d(_1); } // CHECK-LABEL: @vsrlni_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlni.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret 
<16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlni.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsrlni_b_h(v16i8 _1, v16i8 _2) { return __lsx_vsrlni_b_h(_1, _2, 1); } // CHECK-LABEL: @vsrlni_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlni.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlni.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsrlni_h_w(v8i16 _1, v8i16 _2) { return __lsx_vsrlni_h_w(_1, _2, 1); } // CHECK-LABEL: @vsrlni_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlni.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlni.w.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsrlni_w_d(v4i32 _1, v4i32 _2) { return __lsx_vsrlni_w_d(_1, _2, 1); } // CHECK-LABEL: @vsrlni_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlni.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlni.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsrlni_d_q(v2i64 _1, v2i64 _2) { return __lsx_vsrlni_d_q(_1, _2, 1); } // CHECK-LABEL: @vsrlrni_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlrni.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlrni.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsrlrni_b_h(v16i8 _1, v16i8 _2) { return __lsx_vsrlrni_b_h(_1, _2, 1); } // CHECK-LABEL: @vsrlrni_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlrni.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 
[[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlrni.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsrlrni_h_w(v8i16 _1, v8i16 _2) { return __lsx_vsrlrni_h_w(_1, _2, 1); } // CHECK-LABEL: @vsrlrni_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlrni.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlrni.w.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsrlrni_w_d(v4i32 _1, v4i32 _2) { return __lsx_vsrlrni_w_d(_1, _2, 1); } // CHECK-LABEL: @vsrlrni_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlrni.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlrni.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsrlrni_d_q(v2i64 _1, v2i64 _2) { return __lsx_vsrlrni_d_q(_1, _2, 1); } // CHECK-LABEL: @vssrlni_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlni.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlni.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vssrlni_b_h(v16i8 _1, v16i8 _2) { return __lsx_vssrlni_b_h(_1, _2, 1); } // CHECK-LABEL: @vssrlni_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlni.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlni.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vssrlni_h_w(v8i16 _1, v8i16 _2) { return __lsx_vssrlni_h_w(_1, _2, 1); } // CHECK-LABEL: @vssrlni_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlni.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlni.w.d(<4 x 
i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vssrlni_w_d(v4i32 _1, v4i32 _2) { return __lsx_vssrlni_w_d(_1, _2, 1); } // CHECK-LABEL: @vssrlni_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlni.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlni.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vssrlni_d_q(v2i64 _1, v2i64 _2) { return __lsx_vssrlni_d_q(_1, _2, 1); } // CHECK-LABEL: @vssrlni_bu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlni.bu.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlni.bu.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vssrlni_bu_h(v16u8 _1, v16i8 _2) { return __lsx_vssrlni_bu_h(_1, _2, 1); } // CHECK-LABEL: @vssrlni_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlni.hu.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlni.hu.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vssrlni_hu_w(v8u16 _1, v8i16 _2) { return __lsx_vssrlni_hu_w(_1, _2, 1); } // CHECK-LABEL: @vssrlni_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlni.wu.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlni.wu.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vssrlni_wu_d(v4u32 _1, v4i32 _2) { return __lsx_vssrlni_wu_d(_1, _2, 1); } // CHECK-LABEL: @vssrlni_du_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlni.du.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlni.du.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] 
to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vssrlni_du_q(v2u64 _1, v2i64 _2) { return __lsx_vssrlni_du_q(_1, _2, 1); } // CHECK-LABEL: @vssrlrni_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrni.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrni.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vssrlrni_b_h(v16i8 _1, v16i8 _2) { return __lsx_vssrlrni_b_h(_1, _2, 1); } // CHECK-LABEL: @vssrlrni_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrni.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrni.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vssrlrni_h_w(v8i16 _1, v8i16 _2) { return __lsx_vssrlrni_h_w(_1, _2, 1); } // CHECK-LABEL: @vssrlrni_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlrni.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlrni.w.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vssrlrni_w_d(v4i32 _1, v4i32 _2) { return __lsx_vssrlrni_w_d(_1, _2, 1); } // CHECK-LABEL: @vssrlrni_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlrni.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlrni.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vssrlrni_d_q(v2i64 _1, v2i64 _2) { return __lsx_vssrlrni_d_q(_1, _2, 1); } // CHECK-LABEL: @vssrlrni_bu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrni.bu.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrni.bu.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vssrlrni_bu_h(v16u8 _1, v16i8 _2) { 
return __lsx_vssrlrni_bu_h(_1, _2, 1); } // CHECK-LABEL: @vssrlrni_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrni.hu.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrni.hu.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vssrlrni_hu_w(v8u16 _1, v8i16 _2) { return __lsx_vssrlrni_hu_w(_1, _2, 1); } // CHECK-LABEL: @vssrlrni_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlrni.wu.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlrni.wu.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vssrlrni_wu_d(v4u32 _1, v4i32 _2) { return __lsx_vssrlrni_wu_d(_1, _2, 1); } // CHECK-LABEL: @vssrlrni_du_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlrni.du.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlrni.du.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vssrlrni_du_q(v2u64 _1, v2i64 _2) { return __lsx_vssrlrni_du_q(_1, _2, 1); } // CHECK-LABEL: @vsrani_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrani.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrani.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsrani_b_h(v16i8 _1, v16i8 _2) { return __lsx_vsrani_b_h(_1, _2, 1); } // CHECK-LABEL: @vsrani_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrani.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrani.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsrani_h_w(v8i16 _1, v8i16 _2) { return __lsx_vsrani_h_w(_1, _2, 1); } // CHECK-LABEL: @vsrani_w_d( // CHECK-NEXT: entry: -// 
CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrani.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrani.w.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsrani_w_d(v4i32 _1, v4i32 _2) { return __lsx_vsrani_w_d(_1, _2, 1); } // CHECK-LABEL: @vsrani_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrani.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrani.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsrani_d_q(v2i64 _1, v2i64 _2) { return __lsx_vsrani_d_q(_1, _2, 1); } // CHECK-LABEL: @vsrarni_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrarni.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrarni.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsrarni_b_h(v16i8 _1, v16i8 _2) { return __lsx_vsrarni_b_h(_1, _2, 1); } // CHECK-LABEL: @vsrarni_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrarni.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrarni.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsrarni_h_w(v8i16 _1, v8i16 _2) { return __lsx_vsrarni_h_w(_1, _2, 1); } // CHECK-LABEL: @vsrarni_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrarni.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrarni.w.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsrarni_w_d(v4i32 _1, v4i32 _2) { return __lsx_vsrarni_w_d(_1, _2, 1); } // CHECK-LABEL: @vsrarni_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrarni.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 
1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrarni.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsrarni_d_q(v2i64 _1, v2i64 _2) { return __lsx_vsrarni_d_q(_1, _2, 1); } // CHECK-LABEL: @vssrani_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrani.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrani.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vssrani_b_h(v16i8 _1, v16i8 _2) { return __lsx_vssrani_b_h(_1, _2, 1); } // CHECK-LABEL: @vssrani_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrani.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrani.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vssrani_h_w(v8i16 _1, v8i16 _2) { return __lsx_vssrani_h_w(_1, _2, 1); } // CHECK-LABEL: @vssrani_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrani.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrani.w.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vssrani_w_d(v4i32 _1, v4i32 _2) { return __lsx_vssrani_w_d(_1, _2, 1); } // CHECK-LABEL: @vssrani_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrani.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrani.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vssrani_d_q(v2i64 _1, v2i64 _2) { return __lsx_vssrani_d_q(_1, _2, 1); } // CHECK-LABEL: @vssrani_bu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrani.bu.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> 
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrani.bu.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vssrani_bu_h(v16u8 _1, v16i8 _2) { return __lsx_vssrani_bu_h(_1, _2, 1); } // CHECK-LABEL: @vssrani_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrani.hu.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrani.hu.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vssrani_hu_w(v8u16 _1, v8i16 _2) { return __lsx_vssrani_hu_w(_1, _2, 1); } // CHECK-LABEL: @vssrani_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrani.wu.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrani.wu.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vssrani_wu_d(v4u32 _1, v4i32 _2) { return __lsx_vssrani_wu_d(_1, _2, 1); } // CHECK-LABEL: @vssrani_du_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrani.du.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrani.du.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vssrani_du_q(v2u64 _1, v2i64 _2) { return __lsx_vssrani_du_q(_1, _2, 1); } // CHECK-LABEL: @vssrarni_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarni.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarni.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vssrarni_b_h(v16i8 _1, v16i8 _2) { return __lsx_vssrarni_b_h(_1, _2, 1); } // CHECK-LABEL: @vssrarni_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrarni.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: 
[[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrarni.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vssrarni_h_w(v8i16 _1, v8i16 _2) { return __lsx_vssrarni_h_w(_1, _2, 1); } // CHECK-LABEL: @vssrarni_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrarni.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrarni.w.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vssrarni_w_d(v4i32 _1, v4i32 _2) { return __lsx_vssrarni_w_d(_1, _2, 1); } // CHECK-LABEL: @vssrarni_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrarni.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrarni.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vssrarni_d_q(v2i64 _1, v2i64 _2) { return __lsx_vssrarni_d_q(_1, _2, 1); } // CHECK-LABEL: @vssrarni_bu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarni.bu.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarni.bu.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vssrarni_bu_h(v16u8 _1, v16i8 _2) { return __lsx_vssrarni_bu_h(_1, _2, 1); } // CHECK-LABEL: @vssrarni_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrarni.hu.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrarni.hu.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vssrarni_hu_w(v8u16 _1, v8i16 _2) { return __lsx_vssrarni_hu_w(_1, _2, 1); } // CHECK-LABEL: @vssrarni_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrarni.wu.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrarni.wu.d(<4 x i32> 
[[TMP0]], <4 x i32> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vssrarni_wu_d(v4u32 _1, v4i32 _2) { return __lsx_vssrarni_wu_d(_1, _2, 1); } // CHECK-LABEL: @vssrarni_du_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrarni.du.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrarni.du.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vssrarni_du_q(v2u64 _1, v2i64 _2) { return __lsx_vssrarni_du_q(_1, _2, 1); } // CHECK-LABEL: @vpermi_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpermi.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpermi.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vpermi_w(v4i32 _1, v4i32 _2) { return __lsx_vpermi_w(_1, _2, 1); } // CHECK-LABEL: @vld( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vld(ptr [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to i128 +// CHECK-NEXT: ret i128 [[TMP1]] // v16i8 vld(void *_1) { return __lsx_vld(_1, 1); } // CHECK-LABEL: @vst( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vst(<16 x i8> [[_1:%.*]], ptr [[_2:%.*]], i32 1) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vst(<16 x i8> [[TMP0]], ptr [[_2:%.*]], i32 1) // CHECK-NEXT: ret void // void vst(v16i8 _1, void *_2) { return __lsx_vst(_1, _2, 1); } // CHECK-LABEL: @vssrlrn_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrn.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrn.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vssrlrn_b_h(v8i16 _1, v8i16 _2) { return __lsx_vssrlrn_b_h(_1, _2); } // CHECK-LABEL: @vssrlrn_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrn.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrn.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to 
i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vssrlrn_h_w(v4i32 _1, v4i32 _2) { return __lsx_vssrlrn_h_w(_1, _2); } // CHECK-LABEL: @vssrlrn_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlrn.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlrn.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vssrlrn_w_d(v2i64 _1, v2i64 _2) { return __lsx_vssrlrn_w_d(_1, _2); } // CHECK-LABEL: @vssrln_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrln.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrln.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vssrln_b_h(v8i16 _1, v8i16 _2) { return __lsx_vssrln_b_h(_1, _2); } // CHECK-LABEL: @vssrln_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrln.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrln.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vssrln_h_w(v4i32 _1, v4i32 _2) { return __lsx_vssrln_h_w(_1, _2); } // CHECK-LABEL: @vssrln_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrln.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrln.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vssrln_w_d(v2i64 _1, v2i64 _2) { return __lsx_vssrln_w_d(_1, _2); } // CHECK-LABEL: @vorn_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vorn.v(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vorn.v(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vorn_v(v16i8 _1, v16i8 _2) { return __lsx_vorn_v(_1, _2); } // CHECK-LABEL: @vldi( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vldi(i32 1) 
-// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to i128 +// CHECK-NEXT: ret i128 [[TMP1]] // v2i64 vldi() { return __lsx_vldi(1); } // CHECK-LABEL: @vshuf_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vshuf.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vshuf.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v16i8 vshuf_b(v16i8 _1, v16i8 _2, v16i8 _3) { return __lsx_vshuf_b(_1, _2, _3); @@ -4086,366 +5844,516 @@ v16i8 vshuf_b(v16i8 _1, v16i8 _2, v16i8 _3) { // CHECK-LABEL: @vldx( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vldx(ptr [[_1:%.*]], i64 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to i128 +// CHECK-NEXT: ret i128 [[TMP1]] // v16i8 vldx(void *_1) { return __lsx_vldx(_1, 1); } // CHECK-LABEL: @vstx( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstx(<16 x i8> [[_1:%.*]], ptr [[_2:%.*]], i64 1) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstx(<16 x i8> [[TMP0]], ptr [[_2:%.*]], i64 1) // CHECK-NEXT: ret void // void vstx(v16i8 _1, void *_2) { return __lsx_vstx(_1, _2, 1); } // CHECK-LABEL: @vextl_qu_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vextl.qu.du(<2 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vextl.qu.du(<2 x i64> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2u64 vextl_qu_du(v2u64 _1) { return __lsx_vextl_qu_du(_1); } // CHECK-LABEL: @bnz_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.b(<16 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.b(<16 x i8> [[TMP0]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int bnz_b(v16u8 _1) { return __lsx_bnz_b(_1); } // CHECK-LABEL: @bnz_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.d(<2 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.d(<2 x i64> [[TMP0]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int bnz_d(v2u64 _1) { return __lsx_bnz_d(_1); } // CHECK-LABEL: @bnz_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.h(<8 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.h(<8 x i16> [[TMP0]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int bnz_h(v8u16 
_1) { return __lsx_bnz_h(_1); } // CHECK-LABEL: @bnz_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.v(<16 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.v(<16 x i8> [[TMP0]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int bnz_v(v16u8 _1) { return __lsx_bnz_v(_1); } // CHECK-LABEL: @bnz_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.w(<4 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.w(<4 x i32> [[TMP0]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int bnz_w(v4u32 _1) { return __lsx_bnz_w(_1); } // CHECK-LABEL: @bz_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.b(<16 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.b(<16 x i8> [[TMP0]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int bz_b(v16u8 _1) { return __lsx_bz_b(_1); } // CHECK-LABEL: @bz_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.d(<2 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.d(<2 x i64> [[TMP0]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int bz_d(v2u64 _1) { return __lsx_bz_d(_1); } // CHECK-LABEL: @bz_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.h(<8 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.h(<8 x i16> [[TMP0]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int bz_h(v8u16 _1) { return __lsx_bz_h(_1); } // CHECK-LABEL: @bz_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.v(<16 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.v(<16 x i8> [[TMP0]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int bz_v(v16u8 _1) { return __lsx_bz_v(_1); } // CHECK-LABEL: @bz_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.w(<4 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.w(<4 x i32> [[TMP0]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int bz_w(v4u32 _1) { return __lsx_bz_w(_1); } // CHECK-LABEL: @vfcmp_caf_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.caf.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.caf.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: 
ret i128 [[TMP3]] // v2i64 vfcmp_caf_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_caf_d(_1, _2); } // CHECK-LABEL: @vfcmp_caf_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.caf.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.caf.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_caf_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_caf_s(_1, _2); } // CHECK-LABEL: @vfcmp_ceq_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.ceq.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.ceq.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_ceq_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_ceq_d(_1, _2); } // CHECK-LABEL: @vfcmp_ceq_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.ceq.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.ceq.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_ceq_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_ceq_s(_1, _2); } // CHECK-LABEL: @vfcmp_cle_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cle.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cle.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_cle_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_cle_d(_1, _2); } // CHECK-LABEL: @vfcmp_cle_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cle.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cle.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_cle_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_cle_s(_1, _2); } // CHECK-LABEL: @vfcmp_clt_d( // 
CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.clt.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.clt.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_clt_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_clt_d(_1, _2); } // CHECK-LABEL: @vfcmp_clt_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.clt.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.clt.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_clt_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_clt_s(_1, _2); } // CHECK-LABEL: @vfcmp_cne_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cne.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cne.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_cne_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_cne_d(_1, _2); } // CHECK-LABEL: @vfcmp_cne_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cne.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cne.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_cne_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_cne_s(_1, _2); } // CHECK-LABEL: @vfcmp_cor_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cor.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cor.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_cor_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_cor_d(_1, _2); } // CHECK-LABEL: @vfcmp_cor_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cor.s(<4 x float> [[_1:%.*]], 
<4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cor.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_cor_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_cor_s(_1, _2); } // CHECK-LABEL: @vfcmp_cueq_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cueq.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cueq.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_cueq_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_cueq_d(_1, _2); } // CHECK-LABEL: @vfcmp_cueq_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cueq.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cueq.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_cueq_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_cueq_s(_1, _2); } // CHECK-LABEL: @vfcmp_cule_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cule.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cule.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_cule_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_cule_d(_1, _2); } // CHECK-LABEL: @vfcmp_cule_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cule.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cule.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_cule_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_cule_s(_1, _2); } // CHECK-LABEL: @vfcmp_cult_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cult.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 
[[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cult.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_cult_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_cult_d(_1, _2); } // CHECK-LABEL: @vfcmp_cult_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cult.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cult.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_cult_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_cult_s(_1, _2); } // CHECK-LABEL: @vfcmp_cun_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cun.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cun.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_cun_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_cun_d(_1, _2); } // CHECK-LABEL: @vfcmp_cune_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cune.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cune.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_cune_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_cune_d(_1, _2); } // CHECK-LABEL: @vfcmp_cune_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cune.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cune.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_cune_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_cune_s(_1, _2); } // CHECK-LABEL: @vfcmp_cun_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cun.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> 
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cun.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_cun_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_cun_s(_1, _2); } // CHECK-LABEL: @vfcmp_saf_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.saf.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.saf.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_saf_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_saf_d(_1, _2); } // CHECK-LABEL: @vfcmp_saf_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.saf.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.saf.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_saf_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_saf_s(_1, _2); } // CHECK-LABEL: @vfcmp_seq_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.seq.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.seq.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_seq_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_seq_d(_1, _2); } // CHECK-LABEL: @vfcmp_seq_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.seq.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.seq.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_seq_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_seq_s(_1, _2); } // CHECK-LABEL: @vfcmp_sle_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sle.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sle.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) 
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_sle_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_sle_d(_1, _2); } // CHECK-LABEL: @vfcmp_sle_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sle.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sle.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_sle_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_sle_s(_1, _2); } // CHECK-LABEL: @vfcmp_slt_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.slt.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.slt.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_slt_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_slt_d(_1, _2); } // CHECK-LABEL: @vfcmp_slt_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.slt.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.slt.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_slt_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_slt_s(_1, _2); } // CHECK-LABEL: @vfcmp_sne_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sne.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sne.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_sne_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_sne_d(_1, _2); } // CHECK-LABEL: @vfcmp_sne_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sne.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sne.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_sne_s(v4f32 _1, 
v4f32 _2) { return __lsx_vfcmp_sne_s(_1, _2); } // CHECK-LABEL: @vfcmp_sor_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sor.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sor.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_sor_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_sor_d(_1, _2); } // CHECK-LABEL: @vfcmp_sor_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sor.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sor.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_sor_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_sor_s(_1, _2); } // CHECK-LABEL: @vfcmp_sueq_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sueq.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sueq.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_sueq_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_sueq_d(_1, _2); } // CHECK-LABEL: @vfcmp_sueq_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sueq.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sueq.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_sueq_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_sueq_s(_1, _2); } // CHECK-LABEL: @vfcmp_sule_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sule.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sule.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_sule_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_sule_d(_1, _2); } // CHECK-LABEL: @vfcmp_sule_s( // CHECK-NEXT: entry: -// 
CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sule.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sule.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_sule_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_sule_s(_1, _2); }
// CHECK-LABEL: @vfcmp_sult_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sult.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sult.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_sult_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_sult_d(_1, _2); }
// CHECK-LABEL: @vfcmp_sult_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sult.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sult.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_sult_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_sult_s(_1, _2); }
// CHECK-LABEL: @vfcmp_sun_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sun.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sun.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_sun_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_sun_d(_1, _2); }
// CHECK-LABEL: @vfcmp_sune_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sune.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sune.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_sune_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_sune_d(_1, _2); }
// CHECK-LABEL: @vfcmp_sune_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sune.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sune.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_sune_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_sune_s(_1, _2); }
// CHECK-LABEL: @vfcmp_sun_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sun.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sun.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_sun_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_sun_s(_1, _2); }
// CHECK-LABEL: @vrepli_b(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vrepli.b(i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v16i8 vrepli_b() { return __lsx_vrepli_b(1); }
// CHECK-LABEL: @vrepli_d(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vrepli.d(i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v2i64 vrepli_d() { return __lsx_vrepli_d(1); }
// CHECK-LABEL: @vrepli_h(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vrepli.h(i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v8i16 vrepli_h() { return __lsx_vrepli_h(1); }
// CHECK-LABEL: @vrepli_w(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vrepli.w(i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v4i32 vrepli_w() { return __lsx_vrepli_w(1); }
diff --git a/clang/test/CodeGen/LoongArch/lsx/builtin.c b/clang/test/CodeGen/LoongArch/lsx/builtin.c
index ef5a390e1838c..05a3d13a7fb9a 100644
--- a/clang/test/CodeGen/LoongArch/lsx/builtin.c
+++ b/clang/test/CodeGen/LoongArch/lsx/builtin.c
@@ -29,3319 +29,4547 @@ typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
// CHECK-LABEL: @vsll_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsll.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsll.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsll_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vsll_b(_1, _2); }
// CHECK-LABEL: @vsll_h(
// CHECK-NEXT: entry:
-//
CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsll.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsll.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsll_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vsll_h(_1, _2); } // CHECK-LABEL: @vsll_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsll.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsll.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsll_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vsll_w(_1, _2); } // CHECK-LABEL: @vsll_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsll.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsll.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsll_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vsll_d(_1, _2); } // CHECK-LABEL: @vslli_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslli.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslli.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vslli_b(v16i8 _1) { return __builtin_lsx_vslli_b(_1, 1); } // CHECK-LABEL: @vslli_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslli.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslli.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vslli_h(v8i16 _1) { return __builtin_lsx_vslli_h(_1, 1); } // CHECK-LABEL: @vslli_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslli.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslli.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vslli_w(v4i32 _1) { return __builtin_lsx_vslli_w(_1, 1); } // 
CHECK-LABEL: @vslli_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslli.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslli.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vslli_d(v2i64 _1) { return __builtin_lsx_vslli_d(_1, 1); } // CHECK-LABEL: @vsra_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsra.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsra.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsra_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vsra_b(_1, _2); } // CHECK-LABEL: @vsra_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsra.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsra.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsra_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vsra_h(_1, _2); } // CHECK-LABEL: @vsra_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsra.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsra.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsra_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vsra_w(_1, _2); } // CHECK-LABEL: @vsra_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsra.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsra.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsra_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vsra_d(_1, _2); } // CHECK-LABEL: @vsrai_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrai.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrai.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: 
[[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vsrai_b(v16i8 _1) { return __builtin_lsx_vsrai_b(_1, 1); } // CHECK-LABEL: @vsrai_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrai.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrai.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vsrai_h(v8i16 _1) { return __builtin_lsx_vsrai_h(_1, 1); } // CHECK-LABEL: @vsrai_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrai.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrai.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vsrai_w(v4i32 _1) { return __builtin_lsx_vsrai_w(_1, 1); } // CHECK-LABEL: @vsrai_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrai.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrai.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vsrai_d(v2i64 _1) { return __builtin_lsx_vsrai_d(_1, 1); } // CHECK-LABEL: @vsrar_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrar.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrar.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsrar_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vsrar_b(_1, _2); } // CHECK-LABEL: @vsrar_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrar.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrar.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsrar_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vsrar_h(_1, _2); } // CHECK-LABEL: @vsrar_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrar.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrar.w(<4 x i32> 
[[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsrar_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vsrar_w(_1, _2); } // CHECK-LABEL: @vsrar_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrar.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrar.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsrar_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vsrar_d(_1, _2); } // CHECK-LABEL: @vsrari_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrari.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrari.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vsrari_b(v16i8 _1) { return __builtin_lsx_vsrari_b(_1, 1); } // CHECK-LABEL: @vsrari_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrari.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrari.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vsrari_h(v8i16 _1) { return __builtin_lsx_vsrari_h(_1, 1); } // CHECK-LABEL: @vsrari_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrari.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrari.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vsrari_w(v4i32 _1) { return __builtin_lsx_vsrari_w(_1, 1); } // CHECK-LABEL: @vsrari_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrari.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrari.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vsrari_d(v2i64 _1) { return __builtin_lsx_vsrari_d(_1, 1); } // CHECK-LABEL: @vsrl_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrl.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrl.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: 
[[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsrl_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vsrl_b(_1, _2); } // CHECK-LABEL: @vsrl_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrl.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrl.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsrl_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vsrl_h(_1, _2); } // CHECK-LABEL: @vsrl_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrl.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrl.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsrl_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vsrl_w(_1, _2); } // CHECK-LABEL: @vsrl_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrl.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrl.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsrl_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vsrl_d(_1, _2); } // CHECK-LABEL: @vsrli_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrli.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrli.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vsrli_b(v16i8 _1) { return __builtin_lsx_vsrli_b(_1, 1); } // CHECK-LABEL: @vsrli_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrli.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrli.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vsrli_h(v8i16 _1) { return __builtin_lsx_vsrli_h(_1, 1); } // CHECK-LABEL: @vsrli_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrli.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> 
@llvm.loongarch.lsx.vsrli.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vsrli_w(v4i32 _1) { return __builtin_lsx_vsrli_w(_1, 1); } // CHECK-LABEL: @vsrli_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrli.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrli.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vsrli_d(v2i64 _1) { return __builtin_lsx_vsrli_d(_1, 1); } // CHECK-LABEL: @vsrlr_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlr.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlr.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsrlr_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vsrlr_b(_1, _2); } // CHECK-LABEL: @vsrlr_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlr.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlr.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsrlr_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vsrlr_h(_1, _2); } // CHECK-LABEL: @vsrlr_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlr.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlr.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsrlr_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vsrlr_w(_1, _2); } // CHECK-LABEL: @vsrlr_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlr.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlr.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsrlr_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vsrlr_d(_1, _2); } // CHECK-LABEL: @vsrlri_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlri.b(<16 x i8> [[_1:%.*]], i32 
1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlri.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vsrlri_b(v16i8 _1) { return __builtin_lsx_vsrlri_b(_1, 1); } // CHECK-LABEL: @vsrlri_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlri.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlri.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vsrlri_h(v8i16 _1) { return __builtin_lsx_vsrlri_h(_1, 1); } // CHECK-LABEL: @vsrlri_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlri.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlri.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vsrlri_w(v4i32 _1) { return __builtin_lsx_vsrlri_w(_1, 1); } // CHECK-LABEL: @vsrlri_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlri.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlri.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vsrlri_d(v2i64 _1) { return __builtin_lsx_vsrlri_d(_1, 1); } // CHECK-LABEL: @vbitclr_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitclr.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitclr.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vbitclr_b(v16u8 _1, v16u8 _2) { return __builtin_lsx_vbitclr_b(_1, _2); } // CHECK-LABEL: @vbitclr_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitclr.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitclr.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vbitclr_h(v8u16 _1, v8u16 _2) { return __builtin_lsx_vbitclr_h(_1, _2); } // CHECK-LABEL: @vbitclr_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitclr.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) 
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitclr.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vbitclr_w(v4u32 _1, v4u32 _2) { return __builtin_lsx_vbitclr_w(_1, _2); }
// CHECK-LABEL: @vbitclr_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitclr.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitclr.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vbitclr_d(v2u64 _1, v2u64 _2) { return __builtin_lsx_vbitclr_d(_1, _2); }
// CHECK-LABEL: @vbitclri_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitclri.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitclri.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16u8 vbitclri_b(v16u8 _1) { return __builtin_lsx_vbitclri_b(_1, 1); }
// CHECK-LABEL: @vbitclri_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitclri.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitclri.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8u16 vbitclri_h(v8u16 _1) { return __builtin_lsx_vbitclri_h(_1, 1); }
// CHECK-LABEL: @vbitclri_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitclri.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitclri.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4u32 vbitclri_w(v4u32 _1) { return __builtin_lsx_vbitclri_w(_1, 1); }
// CHECK-LABEL: @vbitclri_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitclri.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitclri.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2u64 vbitclri_d(v2u64 _1) { return __builtin_lsx_vbitclri_d(_1, 1); }
// CHECK-LABEL: @vbitset_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitset.b(<16 x i8> [[_1:%.*]], <16
x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitset.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vbitset_b(v16u8 _1, v16u8 _2) { return __builtin_lsx_vbitset_b(_1, _2); } // CHECK-LABEL: @vbitset_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitset.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitset.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vbitset_h(v8u16 _1, v8u16 _2) { return __builtin_lsx_vbitset_h(_1, _2); } // CHECK-LABEL: @vbitset_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitset.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitset.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vbitset_w(v4u32 _1, v4u32 _2) { return __builtin_lsx_vbitset_w(_1, _2); } // CHECK-LABEL: @vbitset_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitset.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitset.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vbitset_d(v2u64 _1, v2u64 _2) { return __builtin_lsx_vbitset_d(_1, _2); } // CHECK-LABEL: @vbitseti_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitseti.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitseti.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16u8 vbitseti_b(v16u8 _1) { return __builtin_lsx_vbitseti_b(_1, 1); } // CHECK-LABEL: @vbitseti_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitseti.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitseti.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 
[[TMP2]] // v8u16 vbitseti_h(v8u16 _1) { return __builtin_lsx_vbitseti_h(_1, 1); } // CHECK-LABEL: @vbitseti_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitseti.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitseti.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4u32 vbitseti_w(v4u32 _1) { return __builtin_lsx_vbitseti_w(_1, 1); } // CHECK-LABEL: @vbitseti_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitseti.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitseti.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2u64 vbitseti_d(v2u64 _1) { return __builtin_lsx_vbitseti_d(_1, 1); } // CHECK-LABEL: @vbitrev_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitrev.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitrev.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vbitrev_b(v16u8 _1, v16u8 _2) { return __builtin_lsx_vbitrev_b(_1, _2); } // CHECK-LABEL: @vbitrev_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitrev.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitrev.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vbitrev_h(v8u16 _1, v8u16 _2) { return __builtin_lsx_vbitrev_h(_1, _2); } // CHECK-LABEL: @vbitrev_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitrev.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitrev.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vbitrev_w(v4u32 _1, v4u32 _2) { return __builtin_lsx_vbitrev_w(_1, _2); } // CHECK-LABEL: @vbitrev_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitrev.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 
[[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitrev.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vbitrev_d(v2u64 _1, v2u64 _2) { return __builtin_lsx_vbitrev_d(_1, _2); } // CHECK-LABEL: @vbitrevi_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitrevi.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitrevi.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16u8 vbitrevi_b(v16u8 _1) { return __builtin_lsx_vbitrevi_b(_1, 1); } // CHECK-LABEL: @vbitrevi_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitrevi.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitrevi.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8u16 vbitrevi_h(v8u16 _1) { return __builtin_lsx_vbitrevi_h(_1, 1); } // CHECK-LABEL: @vbitrevi_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitrevi.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitrevi.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4u32 vbitrevi_w(v4u32 _1) { return __builtin_lsx_vbitrevi_w(_1, 1); } // CHECK-LABEL: @vbitrevi_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitrevi.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitrevi.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2u64 vbitrevi_d(v2u64 _1) { return __builtin_lsx_vbitrevi_d(_1, 1); } // CHECK-LABEL: @vadd_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vadd.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vadd.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vadd_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vadd_b(_1, _2); } // CHECK-LABEL: @vadd_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vadd.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 
[[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vadd.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vadd_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vadd_h(_1, _2); } // CHECK-LABEL: @vadd_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vadd.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vadd.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vadd_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vadd_w(_1, _2); } // CHECK-LABEL: @vadd_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vadd.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vadd.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vadd_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vadd_d(_1, _2); } // CHECK-LABEL: @vaddi_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vaddi.bu(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vaddi.bu(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vaddi_bu(v16i8 _1) { return __builtin_lsx_vaddi_bu(_1, 1); } // CHECK-LABEL: @vaddi_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddi.hu(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddi.hu(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vaddi_hu(v8i16 _1) { return __builtin_lsx_vaddi_hu(_1, 1); } // CHECK-LABEL: @vaddi_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddi.wu(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddi.wu(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vaddi_wu(v4i32 _1) { return __builtin_lsx_vaddi_wu(_1, 1); } // CHECK-LABEL: @vaddi_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddi.du(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// 
CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddi.du(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vaddi_du(v2i64 _1) { return __builtin_lsx_vaddi_du(_1, 1); } // CHECK-LABEL: @vsub_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsub.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsub.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsub_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vsub_b(_1, _2); } // CHECK-LABEL: @vsub_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsub.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsub.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsub_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vsub_h(_1, _2); } // CHECK-LABEL: @vsub_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsub.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsub.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsub_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vsub_w(_1, _2); } // CHECK-LABEL: @vsub_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsub.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsub.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsub_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vsub_d(_1, _2); } // CHECK-LABEL: @vsubi_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsubi.bu(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsubi.bu(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vsubi_bu(v16i8 _1) { return __builtin_lsx_vsubi_bu(_1, 1); } // CHECK-LABEL: @vsubi_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> 
@llvm.loongarch.lsx.vsubi.hu(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubi.hu(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vsubi_hu(v8i16 _1) { return __builtin_lsx_vsubi_hu(_1, 1); } // CHECK-LABEL: @vsubi_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubi.wu(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubi.wu(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vsubi_wu(v4i32 _1) { return __builtin_lsx_vsubi_wu(_1, 1); } // CHECK-LABEL: @vsubi_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubi.du(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubi.du(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vsubi_du(v2i64 _1) { return __builtin_lsx_vsubi_du(_1, 1); } // CHECK-LABEL: @vmax_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmax.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmax.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vmax_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vmax_b(_1, _2); } // CHECK-LABEL: @vmax_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmax.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmax.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vmax_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vmax_h(_1, _2); } // CHECK-LABEL: @vmax_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmax.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmax.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vmax_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vmax_w(_1, _2); } // CHECK-LABEL: @vmax_d( // CHECK-NEXT: entry: 
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmax.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmax.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmax_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vmax_d(_1, _2); } // CHECK-LABEL: @vmaxi_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmaxi.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmaxi.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vmaxi_b(v16i8 _1) { return __builtin_lsx_vmaxi_b(_1, 1); } // CHECK-LABEL: @vmaxi_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaxi.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaxi.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vmaxi_h(v8i16 _1) { return __builtin_lsx_vmaxi_h(_1, 1); } // CHECK-LABEL: @vmaxi_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaxi.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaxi.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vmaxi_w(v4i32 _1) { return __builtin_lsx_vmaxi_w(_1, 1); } // CHECK-LABEL: @vmaxi_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaxi.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaxi.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vmaxi_d(v2i64 _1) { return __builtin_lsx_vmaxi_d(_1, 1); } // CHECK-LABEL: @vmax_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmax.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmax.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vmax_bu(v16u8 _1, v16u8 _2) { return __builtin_lsx_vmax_bu(_1, _2); } // CHECK-LABEL: @vmax_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> 
@llvm.loongarch.lsx.vmax.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmax.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vmax_hu(v8u16 _1, v8u16 _2) { return __builtin_lsx_vmax_hu(_1, _2); } // CHECK-LABEL: @vmax_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmax.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmax.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vmax_wu(v4u32 _1, v4u32 _2) { return __builtin_lsx_vmax_wu(_1, _2); } // CHECK-LABEL: @vmax_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmax.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmax.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vmax_du(v2u64 _1, v2u64 _2) { return __builtin_lsx_vmax_du(_1, _2); } // CHECK-LABEL: @vmaxi_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmaxi.bu(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmaxi.bu(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16u8 vmaxi_bu(v16u8 _1) { return __builtin_lsx_vmaxi_bu(_1, 1); } // CHECK-LABEL: @vmaxi_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaxi.hu(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaxi.hu(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8u16 vmaxi_hu(v8u16 _1) { return __builtin_lsx_vmaxi_hu(_1, 1); } // CHECK-LABEL: @vmaxi_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaxi.wu(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaxi.wu(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4u32 vmaxi_wu(v4u32 _1) { return __builtin_lsx_vmaxi_wu(_1, 1); } // CHECK-LABEL: @vmaxi_du( // 
CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaxi.du(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaxi.du(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2u64 vmaxi_du(v2u64 _1) { return __builtin_lsx_vmaxi_du(_1, 1); } // CHECK-LABEL: @vmin_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmin.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmin.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vmin_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vmin_b(_1, _2); } // CHECK-LABEL: @vmin_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmin.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmin.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vmin_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vmin_h(_1, _2); } // CHECK-LABEL: @vmin_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmin.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmin.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vmin_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vmin_w(_1, _2); } // CHECK-LABEL: @vmin_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmin.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmin.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmin_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vmin_d(_1, _2); } // CHECK-LABEL: @vmini_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmini.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmini.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x 
i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vmini_b(v16i8 _1) { return __builtin_lsx_vmini_b(_1, 1); } // CHECK-LABEL: @vmini_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmini.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmini.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vmini_h(v8i16 _1) { return __builtin_lsx_vmini_h(_1, 1); } // CHECK-LABEL: @vmini_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmini.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmini.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vmini_w(v4i32 _1) { return __builtin_lsx_vmini_w(_1, 1); } // CHECK-LABEL: @vmini_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmini.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmini.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vmini_d(v2i64 _1) { return __builtin_lsx_vmini_d(_1, 1); } // CHECK-LABEL: @vmin_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmin.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmin.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vmin_bu(v16u8 _1, v16u8 _2) { return __builtin_lsx_vmin_bu(_1, _2); } // CHECK-LABEL: @vmin_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmin.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmin.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vmin_hu(v8u16 _1, v8u16 _2) { return __builtin_lsx_vmin_hu(_1, _2); } // CHECK-LABEL: @vmin_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmin.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmin.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// 
CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vmin_wu(v4u32 _1, v4u32 _2) { return __builtin_lsx_vmin_wu(_1, _2); } // CHECK-LABEL: @vmin_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmin.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmin.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vmin_du(v2u64 _1, v2u64 _2) { return __builtin_lsx_vmin_du(_1, _2); } // CHECK-LABEL: @vmini_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmini.bu(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmini.bu(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16u8 vmini_bu(v16u8 _1) { return __builtin_lsx_vmini_bu(_1, 1); } // CHECK-LABEL: @vmini_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmini.hu(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmini.hu(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8u16 vmini_hu(v8u16 _1) { return __builtin_lsx_vmini_hu(_1, 1); } // CHECK-LABEL: @vmini_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmini.wu(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmini.wu(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4u32 vmini_wu(v4u32 _1) { return __builtin_lsx_vmini_wu(_1, 1); } // CHECK-LABEL: @vmini_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmini.du(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmini.du(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2u64 vmini_du(v2u64 _1) { return __builtin_lsx_vmini_du(_1, 1); } // CHECK-LABEL: @vseq_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vseq.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vseq.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> 
[[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vseq_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vseq_b(_1, _2); } // CHECK-LABEL: @vseq_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vseq.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vseq.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vseq_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vseq_h(_1, _2); } // CHECK-LABEL: @vseq_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vseq.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vseq.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vseq_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vseq_w(_1, _2); } // CHECK-LABEL: @vseq_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vseq.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vseq.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vseq_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vseq_d(_1, _2); } // CHECK-LABEL: @vseqi_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vseqi.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vseqi.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vseqi_b(v16i8 _1) { return __builtin_lsx_vseqi_b(_1, 1); } // CHECK-LABEL: @vseqi_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vseqi.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vseqi.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vseqi_h(v8i16 _1) { return __builtin_lsx_vseqi_h(_1, 1); } // CHECK-LABEL: @vseqi_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vseqi.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vseqi.w(<4 x i32> [[TMP0]], i32 1) +// 
CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vseqi_w(v4i32 _1) { return __builtin_lsx_vseqi_w(_1, 1); } // CHECK-LABEL: @vseqi_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vseqi.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vseqi.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vseqi_d(v2i64 _1) { return __builtin_lsx_vseqi_d(_1, 1); } // CHECK-LABEL: @vslti_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslti.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslti.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vslti_b(v16i8 _1) { return __builtin_lsx_vslti_b(_1, 1); } // CHECK-LABEL: @vslt_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslt.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslt.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vslt_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vslt_b(_1, _2); } // CHECK-LABEL: @vslt_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslt.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslt.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vslt_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vslt_h(_1, _2); } // CHECK-LABEL: @vslt_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslt.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslt.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vslt_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vslt_w(_1, _2); } // CHECK-LABEL: @vslt_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslt.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 
[[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslt.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vslt_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vslt_d(_1, _2); } // CHECK-LABEL: @vslti_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslti.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslti.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vslti_h(v8i16 _1) { return __builtin_lsx_vslti_h(_1, 1); } // CHECK-LABEL: @vslti_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslti.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslti.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vslti_w(v4i32 _1) { return __builtin_lsx_vslti_w(_1, 1); } // CHECK-LABEL: @vslti_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslti.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslti.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vslti_d(v2i64 _1) { return __builtin_lsx_vslti_d(_1, 1); } // CHECK-LABEL: @vslt_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslt.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslt.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vslt_bu(v16u8 _1, v16u8 _2) { return __builtin_lsx_vslt_bu(_1, _2); } // CHECK-LABEL: @vslt_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslt.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslt.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vslt_hu(v8u16 _1, v8u16 _2) { return __builtin_lsx_vslt_hu(_1, _2); } // CHECK-LABEL: @vslt_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslt.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x 
i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslt.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vslt_wu(v4u32 _1, v4u32 _2) { return __builtin_lsx_vslt_wu(_1, _2); } // CHECK-LABEL: @vslt_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslt.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslt.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vslt_du(v2u64 _1, v2u64 _2) { return __builtin_lsx_vslt_du(_1, _2); } // CHECK-LABEL: @vslti_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslti.bu(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslti.bu(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vslti_bu(v16u8 _1) { return __builtin_lsx_vslti_bu(_1, 1); } // CHECK-LABEL: @vslti_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslti.hu(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslti.hu(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vslti_hu(v8u16 _1) { return __builtin_lsx_vslti_hu(_1, 1); } // CHECK-LABEL: @vslti_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslti.wu(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslti.wu(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vslti_wu(v4u32 _1) { return __builtin_lsx_vslti_wu(_1, 1); } // CHECK-LABEL: @vslti_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslti.du(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslti.du(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vslti_du(v2u64 _1) { return __builtin_lsx_vslti_du(_1, 1); } // CHECK-LABEL: @vsle_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsle.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast 
i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsle.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsle_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vsle_b(_1, _2); } // CHECK-LABEL: @vsle_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsle.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsle.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsle_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vsle_h(_1, _2); } // CHECK-LABEL: @vsle_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsle.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsle.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsle_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vsle_w(_1, _2); } // CHECK-LABEL: @vsle_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsle.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsle.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsle_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vsle_d(_1, _2); } // CHECK-LABEL: @vslei_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslei.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslei.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vslei_b(v16i8 _1) { return __builtin_lsx_vslei_b(_1, 1); } // CHECK-LABEL: @vslei_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslei.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslei.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vslei_h(v8i16 _1) { return __builtin_lsx_vslei_h(_1, 1); } // CHECK-LABEL: @vslei_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslei.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret 
<4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslei.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vslei_w(v4i32 _1) { return __builtin_lsx_vslei_w(_1, 1); } // CHECK-LABEL: @vslei_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslei.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslei.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vslei_d(v2i64 _1) { return __builtin_lsx_vslei_d(_1, 1); } // CHECK-LABEL: @vsle_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsle.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsle.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsle_bu(v16u8 _1, v16u8 _2) { return __builtin_lsx_vsle_bu(_1, _2); } // CHECK-LABEL: @vsle_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsle.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsle.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsle_hu(v8u16 _1, v8u16 _2) { return __builtin_lsx_vsle_hu(_1, _2); } // CHECK-LABEL: @vsle_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsle.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsle.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsle_wu(v4u32 _1, v4u32 _2) { return __builtin_lsx_vsle_wu(_1, _2); } // CHECK-LABEL: @vsle_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsle.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsle.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsle_du(v2u64 _1, v2u64 _2) { return __builtin_lsx_vsle_du(_1, _2); } // 
CHECK-LABEL: @vslei_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslei.bu(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslei.bu(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vslei_bu(v16u8 _1) { return __builtin_lsx_vslei_bu(_1, 1); } // CHECK-LABEL: @vslei_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslei.hu(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslei.hu(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vslei_hu(v8u16 _1) { return __builtin_lsx_vslei_hu(_1, 1); } // CHECK-LABEL: @vslei_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslei.wu(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslei.wu(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vslei_wu(v4u32 _1) { return __builtin_lsx_vslei_wu(_1, 1); } // CHECK-LABEL: @vslei_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslei.du(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslei.du(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vslei_du(v2u64 _1) { return __builtin_lsx_vslei_du(_1, 1); } // CHECK-LABEL: @vsat_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsat.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsat.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vsat_b(v16i8 _1) { return __builtin_lsx_vsat_b(_1, 1); } // CHECK-LABEL: @vsat_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsat.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsat.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vsat_h(v8i16 _1) { return __builtin_lsx_vsat_h(_1, 1); } // CHECK-LABEL: @vsat_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsat.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: 
[[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsat.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vsat_w(v4i32 _1) { return __builtin_lsx_vsat_w(_1, 1); } // CHECK-LABEL: @vsat_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsat.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsat.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vsat_d(v2i64 _1) { return __builtin_lsx_vsat_d(_1, 1); } // CHECK-LABEL: @vsat_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsat.bu(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsat.bu(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16u8 vsat_bu(v16u8 _1) { return __builtin_lsx_vsat_bu(_1, 1); } // CHECK-LABEL: @vsat_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsat.hu(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsat.hu(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8u16 vsat_hu(v8u16 _1) { return __builtin_lsx_vsat_hu(_1, 1); } // CHECK-LABEL: @vsat_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsat.wu(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsat.wu(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4u32 vsat_wu(v4u32 _1) { return __builtin_lsx_vsat_wu(_1, 1); } // CHECK-LABEL: @vsat_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsat.du(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsat.du(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2u64 vsat_du(v2u64 _1) { return __builtin_lsx_vsat_du(_1, 1); } // CHECK-LABEL: @vadda_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vadda.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vadda.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // 
v16i8 vadda_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vadda_b(_1, _2); } // CHECK-LABEL: @vadda_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vadda.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vadda.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vadda_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vadda_h(_1, _2); } // CHECK-LABEL: @vadda_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vadda.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vadda.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vadda_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vadda_w(_1, _2); } // CHECK-LABEL: @vadda_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vadda.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vadda.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vadda_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vadda_d(_1, _2); } // CHECK-LABEL: @vsadd_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsadd.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsadd.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsadd_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vsadd_b(_1, _2); } // CHECK-LABEL: @vsadd_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsadd.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsadd.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsadd_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vsadd_h(_1, _2); } // CHECK-LABEL: @vsadd_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsadd.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: 
ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsadd.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsadd_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vsadd_w(_1, _2); } // CHECK-LABEL: @vsadd_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsadd.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsadd.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsadd_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vsadd_d(_1, _2); } // CHECK-LABEL: @vsadd_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsadd.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsadd.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vsadd_bu(v16u8 _1, v16u8 _2) { return __builtin_lsx_vsadd_bu(_1, _2); } // CHECK-LABEL: @vsadd_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsadd.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsadd.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vsadd_hu(v8u16 _1, v8u16 _2) { return __builtin_lsx_vsadd_hu(_1, _2); } // CHECK-LABEL: @vsadd_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsadd.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsadd.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vsadd_wu(v4u32 _1, v4u32 _2) { return __builtin_lsx_vsadd_wu(_1, _2); } // CHECK-LABEL: @vsadd_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsadd.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> 
@llvm.loongarch.lsx.vsadd.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vsadd_du(v2u64 _1, v2u64 _2) { return __builtin_lsx_vsadd_du(_1, _2); } // CHECK-LABEL: @vavg_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavg.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavg.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vavg_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vavg_b(_1, _2); } // CHECK-LABEL: @vavg_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavg.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavg.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vavg_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vavg_h(_1, _2); } // CHECK-LABEL: @vavg_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavg.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavg.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vavg_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vavg_w(_1, _2); } // CHECK-LABEL: @vavg_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavg.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavg.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vavg_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vavg_d(_1, _2); } // CHECK-LABEL: @vavg_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavg.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavg.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vavg_bu(v16u8 _1, v16u8 _2) { return __builtin_lsx_vavg_bu(_1, _2); } // CHECK-LABEL: @vavg_hu( // 
CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavg.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavg.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vavg_hu(v8u16 _1, v8u16 _2) { return __builtin_lsx_vavg_hu(_1, _2); } // CHECK-LABEL: @vavg_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavg.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavg.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vavg_wu(v4u32 _1, v4u32 _2) { return __builtin_lsx_vavg_wu(_1, _2); } // CHECK-LABEL: @vavg_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavg.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavg.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vavg_du(v2u64 _1, v2u64 _2) { return __builtin_lsx_vavg_du(_1, _2); } // CHECK-LABEL: @vavgr_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavgr.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavgr.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vavgr_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vavgr_b(_1, _2); } // CHECK-LABEL: @vavgr_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavgr.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavgr.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vavgr_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vavgr_h(_1, _2); } // CHECK-LABEL: @vavgr_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavgr.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// 
CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavgr.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vavgr_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vavgr_w(_1, _2); } // CHECK-LABEL: @vavgr_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavgr.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavgr.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vavgr_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vavgr_d(_1, _2); } // CHECK-LABEL: @vavgr_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavgr.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavgr.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vavgr_bu(v16u8 _1, v16u8 _2) { return __builtin_lsx_vavgr_bu(_1, _2); } // CHECK-LABEL: @vavgr_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavgr.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavgr.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vavgr_hu(v8u16 _1, v8u16 _2) { return __builtin_lsx_vavgr_hu(_1, _2); } // CHECK-LABEL: @vavgr_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavgr.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavgr.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vavgr_wu(v4u32 _1, v4u32 _2) { return __builtin_lsx_vavgr_wu(_1, _2); } // CHECK-LABEL: @vavgr_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavgr.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavgr.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> 
[[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vavgr_du(v2u64 _1, v2u64 _2) { return __builtin_lsx_vavgr_du(_1, _2); } // CHECK-LABEL: @vssub_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssub.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssub.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vssub_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vssub_b(_1, _2); } // CHECK-LABEL: @vssub_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssub.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssub.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vssub_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vssub_h(_1, _2); } // CHECK-LABEL: @vssub_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssub.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssub.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vssub_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vssub_w(_1, _2); } // CHECK-LABEL: @vssub_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssub.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssub.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vssub_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vssub_d(_1, _2); } // CHECK-LABEL: @vssub_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssub.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssub.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vssub_bu(v16u8 _1, v16u8 _2) { return __builtin_lsx_vssub_bu(_1, _2); } // CHECK-LABEL: @vssub_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> 
@llvm.loongarch.lsx.vssub.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssub.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vssub_hu(v8u16 _1, v8u16 _2) { return __builtin_lsx_vssub_hu(_1, _2); } // CHECK-LABEL: @vssub_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssub.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssub.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vssub_wu(v4u32 _1, v4u32 _2) { return __builtin_lsx_vssub_wu(_1, _2); } // CHECK-LABEL: @vssub_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssub.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssub.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vssub_du(v2u64 _1, v2u64 _2) { return __builtin_lsx_vssub_du(_1, _2); } // CHECK-LABEL: @vabsd_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vabsd.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vabsd.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vabsd_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vabsd_b(_1, _2); } // CHECK-LABEL: @vabsd_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vabsd.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vabsd.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vabsd_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vabsd_h(_1, _2); } // CHECK-LABEL: @vabsd_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vabsd.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to 
<4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vabsd.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vabsd_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vabsd_w(_1, _2); } // CHECK-LABEL: @vabsd_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vabsd.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vabsd.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vabsd_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vabsd_d(_1, _2); } // CHECK-LABEL: @vabsd_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vabsd.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vabsd.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vabsd_bu(v16u8 _1, v16u8 _2) { return __builtin_lsx_vabsd_bu(_1, _2); } // CHECK-LABEL: @vabsd_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vabsd.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vabsd.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vabsd_hu(v8u16 _1, v8u16 _2) { return __builtin_lsx_vabsd_hu(_1, _2); } // CHECK-LABEL: @vabsd_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vabsd.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vabsd.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vabsd_wu(v4u32 _1, v4u32 _2) { return __builtin_lsx_vabsd_wu(_1, _2); } // CHECK-LABEL: @vabsd_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vabsd.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vabsd.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 
vabsd_du(v2u64 _1, v2u64 _2) { return __builtin_lsx_vabsd_du(_1, _2); } // CHECK-LABEL: @vmul_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmul.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmul.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vmul_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vmul_b(_1, _2); } // CHECK-LABEL: @vmul_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmul.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmul.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vmul_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vmul_h(_1, _2); } // CHECK-LABEL: @vmul_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmul.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmul.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vmul_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vmul_w(_1, _2); } // CHECK-LABEL: @vmul_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmul.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmul.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmul_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vmul_d(_1, _2); } // CHECK-LABEL: @vmadd_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmadd.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmadd.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v16i8 vmadd_b(v16i8 _1, v16i8 _2, v16i8 _3) { return __builtin_lsx_vmadd_b(_1, _2, _3); } // CHECK-LABEL: @vmadd_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = 
tail call <8 x i16> @llvm.loongarch.lsx.vmadd.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmadd.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v8i16 vmadd_h(v8i16 _1, v8i16 _2, v8i16 _3) { return __builtin_lsx_vmadd_h(_1, _2, _3); } // CHECK-LABEL: @vmadd_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmadd.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmadd.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v4i32 vmadd_w(v4i32 _1, v4i32 _2, v4i32 _3) { return __builtin_lsx_vmadd_w(_1, _2, _3); } // CHECK-LABEL: @vmadd_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmadd.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmadd.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2i64 vmadd_d(v2i64 _1, v2i64 _2, v2i64 _3) { return __builtin_lsx_vmadd_d(_1, _2, _3); } // CHECK-LABEL: @vmsub_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmsub.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmsub.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v16i8 vmsub_b(v16i8 _1, v16i8 _2, v16i8 _3) { return __builtin_lsx_vmsub_b(_1, _2, _3); } // CHECK-LABEL: @vmsub_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmsub.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: 
[[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmsub.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v8i16 vmsub_h(v8i16 _1, v8i16 _2, v8i16 _3) { return __builtin_lsx_vmsub_h(_1, _2, _3); } // CHECK-LABEL: @vmsub_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmsub.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmsub.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v4i32 vmsub_w(v4i32 _1, v4i32 _2, v4i32 _3) { return __builtin_lsx_vmsub_w(_1, _2, _3); } // CHECK-LABEL: @vmsub_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmsub.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmsub.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2i64 vmsub_d(v2i64 _1, v2i64 _2, v2i64 _3) { return __builtin_lsx_vmsub_d(_1, _2, _3); } // CHECK-LABEL: @vdiv_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vdiv.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vdiv.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vdiv_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vdiv_b(_1, _2); } // CHECK-LABEL: @vdiv_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vdiv.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vdiv.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vdiv_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vdiv_h(_1, _2); } // CHECK-LABEL: @vdiv_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vdiv.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x 
i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vdiv.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vdiv_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vdiv_w(_1, _2); } // CHECK-LABEL: @vdiv_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vdiv.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vdiv.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vdiv_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vdiv_d(_1, _2); } // CHECK-LABEL: @vdiv_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vdiv.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vdiv.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vdiv_bu(v16u8 _1, v16u8 _2) { return __builtin_lsx_vdiv_bu(_1, _2); } // CHECK-LABEL: @vdiv_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vdiv.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vdiv.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vdiv_hu(v8u16 _1, v8u16 _2) { return __builtin_lsx_vdiv_hu(_1, _2); } // CHECK-LABEL: @vdiv_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vdiv.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vdiv.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vdiv_wu(v4u32 _1, v4u32 _2) { return __builtin_lsx_vdiv_wu(_1, _2); } // CHECK-LABEL: @vdiv_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vdiv.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vdiv.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vdiv_du(v2u64 _1, v2u64 _2) { return 
__builtin_lsx_vdiv_du(_1, _2); } // CHECK-LABEL: @vhaddw_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhaddw.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhaddw.h.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vhaddw_h_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vhaddw_h_b(_1, _2); } // CHECK-LABEL: @vhaddw_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhaddw.w.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhaddw.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vhaddw_w_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vhaddw_w_h(_1, _2); } // CHECK-LABEL: @vhaddw_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhaddw.d.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhaddw.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vhaddw_d_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vhaddw_d_w(_1, _2); } // CHECK-LABEL: @vhaddw_hu_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhaddw.hu.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhaddw.hu.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vhaddw_hu_bu(v16u8 _1, v16u8 _2) { return __builtin_lsx_vhaddw_hu_bu(_1, _2); } // CHECK-LABEL: @vhaddw_wu_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhaddw.wu.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhaddw.wu.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vhaddw_wu_hu(v8u16 _1, v8u16 _2) { return __builtin_lsx_vhaddw_wu_hu(_1, _2); } // CHECK-LABEL: @vhaddw_du_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> 
@llvm.loongarch.lsx.vhaddw.du.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhaddw.du.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vhaddw_du_wu(v4u32 _1, v4u32 _2) { return __builtin_lsx_vhaddw_du_wu(_1, _2); } // CHECK-LABEL: @vhsubw_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhsubw.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhsubw.h.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vhsubw_h_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vhsubw_h_b(_1, _2); } // CHECK-LABEL: @vhsubw_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhsubw.w.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhsubw.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vhsubw_w_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vhsubw_w_h(_1, _2); } // CHECK-LABEL: @vhsubw_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.d.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vhsubw_d_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vhsubw_d_w(_1, _2); } // CHECK-LABEL: @vhsubw_hu_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhsubw.hu.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhsubw.hu.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vhsubw_hu_bu(v16u8 _1, v16u8 _2) { return __builtin_lsx_vhsubw_hu_bu(_1, _2); } // CHECK-LABEL: @vhsubw_wu_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhsubw.wu.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 
[[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhsubw.wu.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vhsubw_wu_hu(v8u16 _1, v8u16 _2) { return __builtin_lsx_vhsubw_wu_hu(_1, _2); } // CHECK-LABEL: @vhsubw_du_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.du.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.du.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vhsubw_du_wu(v4u32 _1, v4u32 _2) { return __builtin_lsx_vhsubw_du_wu(_1, _2); } // CHECK-LABEL: @vmod_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmod.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmod.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vmod_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vmod_b(_1, _2); } // CHECK-LABEL: @vmod_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmod.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmod.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vmod_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vmod_h(_1, _2); } // CHECK-LABEL: @vmod_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmod.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmod.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vmod_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vmod_w(_1, _2); } // CHECK-LABEL: @vmod_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmod.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmod.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: 
[[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmod_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vmod_d(_1, _2); } // CHECK-LABEL: @vmod_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmod.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmod.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vmod_bu(v16u8 _1, v16u8 _2) { return __builtin_lsx_vmod_bu(_1, _2); } // CHECK-LABEL: @vmod_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmod.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmod.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vmod_hu(v8u16 _1, v8u16 _2) { return __builtin_lsx_vmod_hu(_1, _2); } // CHECK-LABEL: @vmod_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmod.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmod.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vmod_wu(v4u32 _1, v4u32 _2) { return __builtin_lsx_vmod_wu(_1, _2); } // CHECK-LABEL: @vmod_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmod.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmod.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vmod_du(v2u64 _1, v2u64 _2) { return __builtin_lsx_vmod_du(_1, _2); } // CHECK-LABEL: @vreplve_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vreplve.b(<16 x i8> [[_1:%.*]], i32 [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vreplve.b(<16 x i8> [[TMP0]], i32 [[_2:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vreplve_b(v16i8 _1, int _2) { return __builtin_lsx_vreplve_b(_1, _2); } // CHECK-LABEL: @vreplve_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vreplve.h(<8 x i16> [[_1:%.*]], i32 [[_2:%.*]]) -// 
CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vreplve.h(<8 x i16> [[TMP0]], i32 [[_2:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vreplve_h(v8i16 _1, int _2) { return __builtin_lsx_vreplve_h(_1, _2); } // CHECK-LABEL: @vreplve_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vreplve.w(<4 x i32> [[_1:%.*]], i32 [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vreplve.w(<4 x i32> [[TMP0]], i32 [[_2:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vreplve_w(v4i32 _1, int _2) { return __builtin_lsx_vreplve_w(_1, _2); } // CHECK-LABEL: @vreplve_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vreplve.d(<2 x i64> [[_1:%.*]], i32 [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vreplve.d(<2 x i64> [[TMP0]], i32 [[_2:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vreplve_d(v2i64 _1, int _2) { return __builtin_lsx_vreplve_d(_1, _2); } // CHECK-LABEL: @vreplvei_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vreplvei.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vreplvei.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vreplvei_b(v16i8 _1) { return __builtin_lsx_vreplvei_b(_1, 1); } // CHECK-LABEL: @vreplvei_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vreplvei.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vreplvei.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vreplvei_h(v8i16 _1) { return __builtin_lsx_vreplvei_h(_1, 1); } // CHECK-LABEL: @vreplvei_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vreplvei.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vreplvei.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vreplvei_w(v4i32 _1) { return __builtin_lsx_vreplvei_w(_1, 1); } // CHECK-LABEL: @vreplvei_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vreplvei.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = 
tail call <2 x i64> @llvm.loongarch.lsx.vreplvei.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vreplvei_d(v2i64 _1) { return __builtin_lsx_vreplvei_d(_1, 1); } // CHECK-LABEL: @vpickev_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpickev.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpickev.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vpickev_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vpickev_b(_1, _2); } // CHECK-LABEL: @vpickev_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpickev.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpickev.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vpickev_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vpickev_h(_1, _2); } // CHECK-LABEL: @vpickev_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpickev.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpickev.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vpickev_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vpickev_w(_1, _2); } // CHECK-LABEL: @vpickev_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpickev.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpickev.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vpickev_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vpickev_d(_1, _2); } // CHECK-LABEL: @vpickod_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpickod.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpickod.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vpickod_b(v16i8 _1, v16i8 _2) { 
return __builtin_lsx_vpickod_b(_1, _2); } // CHECK-LABEL: @vpickod_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpickod.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpickod.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vpickod_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vpickod_h(_1, _2); } // CHECK-LABEL: @vpickod_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpickod.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpickod.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vpickod_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vpickod_w(_1, _2); } // CHECK-LABEL: @vpickod_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpickod.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpickod.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vpickod_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vpickod_d(_1, _2); } // CHECK-LABEL: @vilvh_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vilvh.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vilvh.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vilvh_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vilvh_b(_1, _2); } // CHECK-LABEL: @vilvh_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vilvh.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vilvh.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vilvh_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vilvh_h(_1, _2); } // CHECK-LABEL: @vilvh_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vilvh.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret 
<4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vilvh.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vilvh_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vilvh_w(_1, _2); } // CHECK-LABEL: @vilvh_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vilvh.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vilvh.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vilvh_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vilvh_d(_1, _2); } // CHECK-LABEL: @vilvl_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vilvl.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vilvl.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vilvl_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vilvl_b(_1, _2); } // CHECK-LABEL: @vilvl_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vilvl.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vilvl.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vilvl_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vilvl_h(_1, _2); } // CHECK-LABEL: @vilvl_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vilvl.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vilvl.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vilvl_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vilvl_w(_1, _2); } // CHECK-LABEL: @vilvl_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vilvl.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vilvl.d(<2 x i64> 
[[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vilvl_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vilvl_d(_1, _2); } // CHECK-LABEL: @vpackev_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpackev.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpackev.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vpackev_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vpackev_b(_1, _2); } // CHECK-LABEL: @vpackev_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpackev.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpackev.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vpackev_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vpackev_h(_1, _2); } // CHECK-LABEL: @vpackev_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpackev.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpackev.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vpackev_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vpackev_w(_1, _2); } // CHECK-LABEL: @vpackev_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpackev.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpackev.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vpackev_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vpackev_d(_1, _2); } // CHECK-LABEL: @vpackod_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpackod.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpackod.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vpackod_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vpackod_b(_1, _2); } // 
CHECK-LABEL: @vpackod_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpackod.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpackod.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vpackod_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vpackod_h(_1, _2); } // CHECK-LABEL: @vpackod_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpackod.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpackod.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vpackod_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vpackod_w(_1, _2); } // CHECK-LABEL: @vpackod_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpackod.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpackod.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vpackod_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vpackod_d(_1, _2); } // CHECK-LABEL: @vshuf_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vshuf.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vshuf.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v8i16 vshuf_h(v8i16 _1, v8i16 _2, v8i16 _3) { return __builtin_lsx_vshuf_h(_1, _2, _3); } // CHECK-LABEL: @vshuf_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vshuf.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vshuf.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v4i32 vshuf_w(v4i32 _1, v4i32 _2, v4i32 _3) { return 
__builtin_lsx_vshuf_w(_1, _2, _3); } // CHECK-LABEL: @vshuf_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vshuf.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vshuf.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2i64 vshuf_d(v2i64 _1, v2i64 _2, v2i64 _3) { return __builtin_lsx_vshuf_d(_1, _2, _3); } // CHECK-LABEL: @vand_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vand.v(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vand.v(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vand_v(v16u8 _1, v16u8 _2) { return __builtin_lsx_vand_v(_1, _2); } // CHECK-LABEL: @vandi_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vandi.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vandi.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16u8 vandi_b(v16u8 _1) { return __builtin_lsx_vandi_b(_1, 1); } // CHECK-LABEL: @vor_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vor.v(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vor.v(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vor_v(v16u8 _1, v16u8 _2) { return __builtin_lsx_vor_v(_1, _2); } // CHECK-LABEL: @vori_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vori.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vori.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16u8 vori_b(v16u8 _1) { return __builtin_lsx_vori_b(_1, 1); } // CHECK-LABEL: @vnor_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vnor.v(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] 
to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vnor.v(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vnor_v(v16u8 _1, v16u8 _2) { return __builtin_lsx_vnor_v(_1, _2); } // CHECK-LABEL: @vnori_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vnori.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vnori.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16u8 vnori_b(v16u8 _1) { return __builtin_lsx_vnori_b(_1, 1); } // CHECK-LABEL: @vxor_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vxor.v(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vxor.v(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vxor_v(v16u8 _1, v16u8 _2) { return __builtin_lsx_vxor_v(_1, _2); } // CHECK-LABEL: @vxori_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vxori.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vxori.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16u8 vxori_b(v16u8 _1) { return __builtin_lsx_vxori_b(_1, 1); } // CHECK-LABEL: @vbitsel_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitsel.v(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitsel.v(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v16u8 vbitsel_v(v16u8 _1, v16u8 _2, v16u8 _3) { return __builtin_lsx_vbitsel_v(_1, _2, _3); } // CHECK-LABEL: @vbitseli_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitseli.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitseli.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vbitseli_b(v16u8 _1, v16u8 _2) { return __builtin_lsx_vbitseli_b(_1, _2, 1); } // CHECK-LABEL: 
@vshuf4i_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vshuf4i.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vshuf4i.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vshuf4i_b(v16i8 _1) { return __builtin_lsx_vshuf4i_b(_1, 1); } // CHECK-LABEL: @vshuf4i_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vshuf4i.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vshuf4i.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vshuf4i_h(v8i16 _1) { return __builtin_lsx_vshuf4i_h(_1, 1); } // CHECK-LABEL: @vshuf4i_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vshuf4i.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vshuf4i.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vshuf4i_w(v4i32 _1) { return __builtin_lsx_vshuf4i_w(_1, 1); } // CHECK-LABEL: @vreplgr2vr_b( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vreplgr2vr.b(i32 [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to i128 +// CHECK-NEXT: ret i128 [[TMP1]] // v16i8 vreplgr2vr_b(int _1) { return __builtin_lsx_vreplgr2vr_b(_1); } // CHECK-LABEL: @vreplgr2vr_h( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vreplgr2vr.h(i32 [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to i128 +// CHECK-NEXT: ret i128 [[TMP1]] // v8i16 vreplgr2vr_h(int _1) { return __builtin_lsx_vreplgr2vr_h(_1); } // CHECK-LABEL: @vreplgr2vr_w( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vreplgr2vr.w(i32 [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +// CHECK-NEXT: ret i128 [[TMP1]] // v4i32 vreplgr2vr_w(int _1) { return __builtin_lsx_vreplgr2vr_w(_1); } // CHECK-LABEL: @vreplgr2vr_d( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vreplgr2vr.d(i64 [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to i128 +// CHECK-NEXT: ret i128 [[TMP1]] // v2i64 vreplgr2vr_d(long _1) { return __builtin_lsx_vreplgr2vr_d(_1); } // CHECK-LABEL: @vpcnt_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpcnt.b(<16 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpcnt.b(<16 x i8> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret 
i128 [[TMP2]] // v16i8 vpcnt_b(v16i8 _1) { return __builtin_lsx_vpcnt_b(_1); } // CHECK-LABEL: @vpcnt_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpcnt.h(<8 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpcnt.h(<8 x i16> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vpcnt_h(v8i16 _1) { return __builtin_lsx_vpcnt_h(_1); } // CHECK-LABEL: @vpcnt_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpcnt.w(<4 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpcnt.w(<4 x i32> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vpcnt_w(v4i32 _1) { return __builtin_lsx_vpcnt_w(_1); } // CHECK-LABEL: @vpcnt_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpcnt.d(<2 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpcnt.d(<2 x i64> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vpcnt_d(v2i64 _1) { return __builtin_lsx_vpcnt_d(_1); } // CHECK-LABEL: @vclo_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vclo.b(<16 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vclo.b(<16 x i8> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vclo_b(v16i8 _1) { return __builtin_lsx_vclo_b(_1); } // CHECK-LABEL: @vclo_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vclo.h(<8 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vclo.h(<8 x i16> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vclo_h(v8i16 _1) { return __builtin_lsx_vclo_h(_1); } // CHECK-LABEL: @vclo_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vclo.w(<4 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vclo.w(<4 x i32> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vclo_w(v4i32 _1) { return __builtin_lsx_vclo_w(_1); } // CHECK-LABEL: @vclo_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vclo.d(<2 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> 
@llvm.loongarch.lsx.vclo.d(<2 x i64> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vclo_d(v2i64 _1) { return __builtin_lsx_vclo_d(_1); } // CHECK-LABEL: @vclz_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vclz.b(<16 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vclz.b(<16 x i8> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vclz_b(v16i8 _1) { return __builtin_lsx_vclz_b(_1); } // CHECK-LABEL: @vclz_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vclz.h(<8 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vclz.h(<8 x i16> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vclz_h(v8i16 _1) { return __builtin_lsx_vclz_h(_1); } // CHECK-LABEL: @vclz_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vclz.w(<4 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vclz.w(<4 x i32> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vclz_w(v4i32 _1) { return __builtin_lsx_vclz_w(_1); } // CHECK-LABEL: @vclz_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vclz.d(<2 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vclz.d(<2 x i64> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vclz_d(v2i64 _1) { return __builtin_lsx_vclz_d(_1); } // CHECK-LABEL: @vpickve2gr_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: ret i32 [[TMP1]] // int vpickve2gr_b(v16i8 _1) { return __builtin_lsx_vpickve2gr_b(_1, 1); } // CHECK-LABEL: @vpickve2gr_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: ret i32 [[TMP1]] // int vpickve2gr_h(v8i16 _1) { return __builtin_lsx_vpickve2gr_h(_1, 1); } // CHECK-LABEL: @vpickve2gr_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: 
[[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: ret i32 [[TMP1]] // int vpickve2gr_w(v4i32 _1) { return __builtin_lsx_vpickve2gr_w(_1, 1); } // CHECK-LABEL: @vpickve2gr_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret i64 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: ret i64 [[TMP1]] // long vpickve2gr_d(v2i64 _1) { return __builtin_lsx_vpickve2gr_d(_1, 1); } // CHECK-LABEL: @vpickve2gr_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.bu(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.bu(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: ret i32 [[TMP1]] // unsigned int vpickve2gr_bu(v16i8 _1) { return __builtin_lsx_vpickve2gr_bu(_1, 1); } // CHECK-LABEL: @vpickve2gr_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.hu(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.hu(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: ret i32 [[TMP1]] // unsigned int vpickve2gr_hu(v8i16 _1) { return __builtin_lsx_vpickve2gr_hu(_1, 1); } // CHECK-LABEL: @vpickve2gr_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.wu(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.wu(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: ret i32 [[TMP1]] // unsigned int vpickve2gr_wu(v4i32 _1) { return __builtin_lsx_vpickve2gr_wu(_1, 1); } // CHECK-LABEL: @vpickve2gr_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret i64 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: ret i64 [[TMP1]] // unsigned long int vpickve2gr_du(v2i64 _1) { return __builtin_lsx_vpickve2gr_du(_1, 1); } // CHECK-LABEL: @vinsgr2vr_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vinsgr2vr.b(<16 x i8> [[_1:%.*]], i32 1, i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vinsgr2vr.b(<16 x i8> [[TMP0]], i32 1, i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vinsgr2vr_b(v16i8 _1) { return __builtin_lsx_vinsgr2vr_b(_1, 1, 1); } // CHECK-LABEL: @vinsgr2vr_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vinsgr2vr.h(<8 x i16> [[_1:%.*]], i32 1, i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: 
[[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vinsgr2vr.h(<8 x i16> [[TMP0]], i32 1, i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vinsgr2vr_h(v8i16 _1) { return __builtin_lsx_vinsgr2vr_h(_1, 1, 1); } // CHECK-LABEL: @vinsgr2vr_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vinsgr2vr.w(<4 x i32> [[_1:%.*]], i32 1, i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vinsgr2vr.w(<4 x i32> [[TMP0]], i32 1, i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vinsgr2vr_w(v4i32 _1) { return __builtin_lsx_vinsgr2vr_w(_1, 1, 1); } // CHECK-LABEL: @vinsgr2vr_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vinsgr2vr.d(<2 x i64> [[_1:%.*]], i64 1, i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vinsgr2vr.d(<2 x i64> [[TMP0]], i64 1, i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vinsgr2vr_d(v2i64 _1) { return __builtin_lsx_vinsgr2vr_d(_1, 1, 1); } // CHECK-LABEL: @vfadd_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfadd.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfadd.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4f32 vfadd_s(v4f32 _1, v4f32 _2) { return __builtin_lsx_vfadd_s(_1, _2); } // CHECK-LABEL: @vfadd_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfadd.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfadd.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2f64 vfadd_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vfadd_d(_1, _2); } // CHECK-LABEL: @vfsub_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfsub.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfsub.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4f32 vfsub_s(v4f32 _1, v4f32 _2) { return __builtin_lsx_vfsub_s(_1, _2); } // CHECK-LABEL: @vfsub_d( // CHECK-NEXT: entry: -// CHECK-NEXT: 
[[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfsub.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfsub.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2f64 vfsub_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vfsub_d(_1, _2); } // CHECK-LABEL: @vfmul_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmul.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmul.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4f32 vfmul_s(v4f32 _1, v4f32 _2) { return __builtin_lsx_vfmul_s(_1, _2); } // CHECK-LABEL: @vfmul_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmul.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmul.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2f64 vfmul_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vfmul_d(_1, _2); } // CHECK-LABEL: @vfdiv_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfdiv.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfdiv.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4f32 vfdiv_s(v4f32 _1, v4f32 _2) { return __builtin_lsx_vfdiv_s(_1, _2); } // CHECK-LABEL: @vfdiv_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfdiv.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfdiv.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2f64 vfdiv_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vfdiv_d(_1, _2); } // CHECK-LABEL: @vfcvt_h_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vfcvt.h.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret 
<8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vfcvt.h.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vfcvt_h_s(v4f32 _1, v4f32 _2) { return __builtin_lsx_vfcvt_h_s(_1, _2); } // CHECK-LABEL: @vfcvt_s_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfcvt.s.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfcvt.s.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4f32 vfcvt_s_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vfcvt_s_d(_1, _2); } // CHECK-LABEL: @vfmin_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmin.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmin.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4f32 vfmin_s(v4f32 _1, v4f32 _2) { return __builtin_lsx_vfmin_s(_1, _2); } // CHECK-LABEL: @vfmin_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmin.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmin.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2f64 vfmin_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vfmin_d(_1, _2); } // CHECK-LABEL: @vfmina_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmina.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmina.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4f32 vfmina_s(v4f32 _1, v4f32 _2) { return __builtin_lsx_vfmina_s(_1, _2); } // CHECK-LABEL: @vfmina_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmina.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] 
= bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmina.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2f64 vfmina_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vfmina_d(_1, _2); } // CHECK-LABEL: @vfmax_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmax.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmax.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4f32 vfmax_s(v4f32 _1, v4f32 _2) { return __builtin_lsx_vfmax_s(_1, _2); } // CHECK-LABEL: @vfmax_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmax.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmax.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2f64 vfmax_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vfmax_d(_1, _2); } // CHECK-LABEL: @vfmaxa_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmaxa.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmaxa.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4f32 vfmaxa_s(v4f32 _1, v4f32 _2) { return __builtin_lsx_vfmaxa_s(_1, _2); } // CHECK-LABEL: @vfmaxa_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmaxa.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmaxa.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2f64 vfmaxa_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vfmaxa_d(_1, _2); } // CHECK-LABEL: @vfclass_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfclass.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfclass.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// 
CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vfclass_s(v4f32 _1) { return __builtin_lsx_vfclass_s(_1); } // CHECK-LABEL: @vfclass_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfclass.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfclass.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vfclass_d(v2f64 _1) { return __builtin_lsx_vfclass_d(_1); } // CHECK-LABEL: @vfsqrt_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfsqrt.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfsqrt.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4f32 vfsqrt_s(v4f32 _1) { return __builtin_lsx_vfsqrt_s(_1); } // CHECK-LABEL: @vfsqrt_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfsqrt.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfsqrt.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2f64 vfsqrt_d(v2f64 _1) { return __builtin_lsx_vfsqrt_d(_1); } // CHECK-LABEL: @vfrecip_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrecip.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrecip.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4f32 vfrecip_s(v4f32 _1) { return __builtin_lsx_vfrecip_s(_1); } // CHECK-LABEL: @vfrecip_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrecip.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrecip.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2f64 vfrecip_d(v2f64 _1) { return __builtin_lsx_vfrecip_d(_1); } // CHECK-LABEL: @vfrint_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrint.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrint.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4f32 vfrint_s(v4f32 _1) { return __builtin_lsx_vfrint_s(_1); } // CHECK-LABEL: @vfrint_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrint.d(<2 x double> 
[[_1:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrint.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2f64 vfrint_d(v2f64 _1) { return __builtin_lsx_vfrint_d(_1); } // CHECK-LABEL: @vfrsqrt_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrsqrt.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrsqrt.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4f32 vfrsqrt_s(v4f32 _1) { return __builtin_lsx_vfrsqrt_s(_1); } // CHECK-LABEL: @vfrsqrt_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrsqrt.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrsqrt.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2f64 vfrsqrt_d(v2f64 _1) { return __builtin_lsx_vfrsqrt_d(_1); } // CHECK-LABEL: @vflogb_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vflogb.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vflogb.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4f32 vflogb_s(v4f32 _1) { return __builtin_lsx_vflogb_s(_1); } // CHECK-LABEL: @vflogb_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vflogb.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vflogb.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2f64 vflogb_d(v2f64 _1) { return __builtin_lsx_vflogb_d(_1); } // CHECK-LABEL: @vfcvth_s_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfcvth.s.h(<8 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfcvth.s.h(<8 x i16> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4f32 vfcvth_s_h(v8i16 _1) { return __builtin_lsx_vfcvth_s_h(_1); } // CHECK-LABEL: @vfcvth_d_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfcvth.d.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfcvth.d.s(<4 x float> [[TMP0]]) 
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2f64 vfcvth_d_s(v4f32 _1) { return __builtin_lsx_vfcvth_d_s(_1); } // CHECK-LABEL: @vfcvtl_s_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfcvtl.s.h(<8 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfcvtl.s.h(<8 x i16> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4f32 vfcvtl_s_h(v8i16 _1) { return __builtin_lsx_vfcvtl_s_h(_1); } // CHECK-LABEL: @vfcvtl_d_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfcvtl.d.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfcvtl.d.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2f64 vfcvtl_d_s(v4f32 _1) { return __builtin_lsx_vfcvtl_d_s(_1); } // CHECK-LABEL: @vftint_w_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftint.w.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftint.w.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vftint_w_s(v4f32 _1) { return __builtin_lsx_vftint_w_s(_1); } // CHECK-LABEL: @vftint_l_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftint.l.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftint.l.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vftint_l_d(v2f64 _1) { return __builtin_lsx_vftint_l_d(_1); } // CHECK-LABEL: @vftint_wu_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftint.wu.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftint.wu.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4u32 vftint_wu_s(v4f32 _1) { return __builtin_lsx_vftint_wu_s(_1); } // CHECK-LABEL: @vftint_lu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftint.lu.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftint.lu.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2u64 vftint_lu_d(v2f64 _1) { return __builtin_lsx_vftint_lu_d(_1); } // CHECK-LABEL: @vftintrz_w_s( // CHECK-NEXT: entry: -// 
CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrz.w.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrz.w.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vftintrz_w_s(v4f32 _1) { return __builtin_lsx_vftintrz_w_s(_1); } // CHECK-LABEL: @vftintrz_l_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrz.l.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrz.l.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vftintrz_l_d(v2f64 _1) { return __builtin_lsx_vftintrz_l_d(_1); } // CHECK-LABEL: @vftintrz_wu_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrz.wu.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrz.wu.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4u32 vftintrz_wu_s(v4f32 _1) { return __builtin_lsx_vftintrz_wu_s(_1); } // CHECK-LABEL: @vftintrz_lu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrz.lu.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrz.lu.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2u64 vftintrz_lu_d(v2f64 _1) { return __builtin_lsx_vftintrz_lu_d(_1); } // CHECK-LABEL: @vffint_s_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vffint.s.w(<4 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vffint.s.w(<4 x i32> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4f32 vffint_s_w(v4i32 _1) { return __builtin_lsx_vffint_s_w(_1); } // CHECK-LABEL: @vffint_d_l( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffint.d.l(<2 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffint.d.l(<2 x i64> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2f64 vffint_d_l(v2i64 _1) { return __builtin_lsx_vffint_d_l(_1); } // CHECK-LABEL: @vffint_s_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vffint.s.wu(<4 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 
[[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vffint.s.wu(<4 x i32> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4f32 vffint_s_wu(v4u32 _1) { return __builtin_lsx_vffint_s_wu(_1); } // CHECK-LABEL: @vffint_d_lu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffint.d.lu(<2 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffint.d.lu(<2 x i64> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2f64 vffint_d_lu(v2u64 _1) { return __builtin_lsx_vffint_d_lu(_1); } // CHECK-LABEL: @vandn_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vandn.v(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vandn.v(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vandn_v(v16u8 _1, v16u8 _2) { return __builtin_lsx_vandn_v(_1, _2); } // CHECK-LABEL: @vneg_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vneg.b(<16 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vneg.b(<16 x i8> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vneg_b(v16i8 _1) { return __builtin_lsx_vneg_b(_1); } // CHECK-LABEL: @vneg_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vneg.h(<8 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vneg.h(<8 x i16> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vneg_h(v8i16 _1) { return __builtin_lsx_vneg_h(_1); } // CHECK-LABEL: @vneg_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vneg.w(<4 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vneg.w(<4 x i32> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vneg_w(v4i32 _1) { return __builtin_lsx_vneg_w(_1); } // CHECK-LABEL: @vneg_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vneg.d(<2 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vneg.d(<2 x i64> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] 
// v2i64 vneg_d(v2i64 _1) { return __builtin_lsx_vneg_d(_1); } // CHECK-LABEL: @vmuh_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmuh.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmuh.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vmuh_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vmuh_b(_1, _2); } // CHECK-LABEL: @vmuh_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmuh.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmuh.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vmuh_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vmuh_h(_1, _2); } // CHECK-LABEL: @vmuh_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmuh.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmuh.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vmuh_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vmuh_w(_1, _2); } // CHECK-LABEL: @vmuh_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmuh.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmuh.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmuh_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vmuh_d(_1, _2); } // CHECK-LABEL: @vmuh_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmuh.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmuh.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vmuh_bu(v16u8 _1, v16u8 _2) { return __builtin_lsx_vmuh_bu(_1, _2); } // CHECK-LABEL: @vmuh_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmuh.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// 
CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmuh.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vmuh_hu(v8u16 _1, v8u16 _2) { return __builtin_lsx_vmuh_hu(_1, _2); } // CHECK-LABEL: @vmuh_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmuh.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmuh.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vmuh_wu(v4u32 _1, v4u32 _2) { return __builtin_lsx_vmuh_wu(_1, _2); } // CHECK-LABEL: @vmuh_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmuh.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmuh.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vmuh_du(v2u64 _1, v2u64 _2) { return __builtin_lsx_vmuh_du(_1, _2); } // CHECK-LABEL: @vsllwil_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsllwil.h.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsllwil.h.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vsllwil_h_b(v16i8 _1) { return __builtin_lsx_vsllwil_h_b(_1, 1); } // CHECK-LABEL: @vsllwil_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsllwil.w.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsllwil.w.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vsllwil_w_h(v8i16 _1) { return __builtin_lsx_vsllwil_w_h(_1, 1); } // CHECK-LABEL: @vsllwil_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsllwil.d.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsllwil.d.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vsllwil_d_w(v4i32 _1) { return __builtin_lsx_vsllwil_d_w(_1, 1); } // CHECK-LABEL: @vsllwil_hu_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x 
i16> @llvm.loongarch.lsx.vsllwil.hu.bu(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsllwil.hu.bu(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8u16 vsllwil_hu_bu(v16u8 _1) { return __builtin_lsx_vsllwil_hu_bu(_1, 1); } // CHECK-LABEL: @vsllwil_wu_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsllwil.wu.hu(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsllwil.wu.hu(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4u32 vsllwil_wu_hu(v8u16 _1) { return __builtin_lsx_vsllwil_wu_hu(_1, 1); } // CHECK-LABEL: @vsllwil_du_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsllwil.du.wu(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsllwil.du.wu(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2u64 vsllwil_du_wu(v4u32 _1) { return __builtin_lsx_vsllwil_du_wu(_1, 1); } // CHECK-LABEL: @vsran_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsran.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsran.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsran_b_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vsran_b_h(_1, _2); } // CHECK-LABEL: @vsran_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsran.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsran.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsran_h_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vsran_h_w(_1, _2); } // CHECK-LABEL: @vsran_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsran.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsran.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 
vsran_w_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vsran_w_d(_1, _2); } // CHECK-LABEL: @vssran_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssran.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssran.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vssran_b_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vssran_b_h(_1, _2); } // CHECK-LABEL: @vssran_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssran.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssran.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vssran_h_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vssran_h_w(_1, _2); } // CHECK-LABEL: @vssran_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssran.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssran.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vssran_w_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vssran_w_d(_1, _2); } // CHECK-LABEL: @vssran_bu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssran.bu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssran.bu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vssran_bu_h(v8u16 _1, v8u16 _2) { return __builtin_lsx_vssran_bu_h(_1, _2); } // CHECK-LABEL: @vssran_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssran.hu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssran.hu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vssran_hu_w(v4u32 _1, v4u32 _2) { return __builtin_lsx_vssran_hu_w(_1, _2); } // CHECK-LABEL: @vssran_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> 
@llvm.loongarch.lsx.vssran.wu.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssran.wu.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vssran_wu_d(v2u64 _1, v2u64 _2) { return __builtin_lsx_vssran_wu_d(_1, _2); } // CHECK-LABEL: @vsrarn_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrarn.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrarn.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsrarn_b_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vsrarn_b_h(_1, _2); } // CHECK-LABEL: @vsrarn_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrarn.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrarn.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsrarn_h_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vsrarn_h_w(_1, _2); } // CHECK-LABEL: @vsrarn_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrarn.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrarn.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsrarn_w_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vsrarn_w_d(_1, _2); } // CHECK-LABEL: @vssrarn_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarn.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarn.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vssrarn_b_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vssrarn_b_h(_1, _2); } // CHECK-LABEL: @vssrarn_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrarn.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to 
<4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrarn.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vssrarn_h_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vssrarn_h_w(_1, _2); } // CHECK-LABEL: @vssrarn_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrarn.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrarn.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vssrarn_w_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vssrarn_w_d(_1, _2); } // CHECK-LABEL: @vssrarn_bu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarn.bu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarn.bu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vssrarn_bu_h(v8u16 _1, v8u16 _2) { return __builtin_lsx_vssrarn_bu_h(_1, _2); } // CHECK-LABEL: @vssrarn_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrarn.hu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrarn.hu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vssrarn_hu_w(v4u32 _1, v4u32 _2) { return __builtin_lsx_vssrarn_hu_w(_1, _2); } // CHECK-LABEL: @vssrarn_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrarn.wu.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrarn.wu.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vssrarn_wu_d(v2u64 _1, v2u64 _2) { return __builtin_lsx_vssrarn_wu_d(_1, _2); } // CHECK-LABEL: @vsrln_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrln.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> 
@llvm.loongarch.lsx.vsrln.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsrln_b_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vsrln_b_h(_1, _2); } // CHECK-LABEL: @vsrln_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrln.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrln.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsrln_h_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vsrln_h_w(_1, _2); } // CHECK-LABEL: @vsrln_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrln.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrln.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsrln_w_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vsrln_w_d(_1, _2); } // CHECK-LABEL: @vssrln_bu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrln.bu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrln.bu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vssrln_bu_h(v8u16 _1, v8u16 _2) { return __builtin_lsx_vssrln_bu_h(_1, _2); } // CHECK-LABEL: @vssrln_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrln.hu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrln.hu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vssrln_hu_w(v4u32 _1, v4u32 _2) { return __builtin_lsx_vssrln_hu_w(_1, _2); } // CHECK-LABEL: @vssrln_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrln.wu.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrln.wu.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 
vssrln_wu_d(v2u64 _1, v2u64 _2) { return __builtin_lsx_vssrln_wu_d(_1, _2); } // CHECK-LABEL: @vsrlrn_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlrn.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlrn.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsrlrn_b_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vsrlrn_b_h(_1, _2); } // CHECK-LABEL: @vsrlrn_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlrn.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlrn.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsrlrn_h_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vsrlrn_h_w(_1, _2); } // CHECK-LABEL: @vsrlrn_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlrn.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlrn.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsrlrn_w_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vsrlrn_w_d(_1, _2); } // CHECK-LABEL: @vssrlrn_bu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrn.bu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrn.bu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vssrlrn_bu_h(v8u16 _1, v8u16 _2) { return __builtin_lsx_vssrlrn_bu_h(_1, _2); } // CHECK-LABEL: @vssrlrn_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrn.hu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrn.hu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vssrlrn_hu_w(v4u32 _1, v4u32 _2) { return __builtin_lsx_vssrlrn_hu_w(_1, _2); } // CHECK-LABEL: @vssrlrn_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail 
call <4 x i32> @llvm.loongarch.lsx.vssrlrn.wu.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlrn.wu.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vssrlrn_wu_d(v2u64 _1, v2u64 _2) { return __builtin_lsx_vssrlrn_wu_d(_1, _2); } // CHECK-LABEL: @vfrstpi_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vfrstpi.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vfrstpi.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vfrstpi_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vfrstpi_b(_1, _2, 1); } // CHECK-LABEL: @vfrstpi_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vfrstpi.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vfrstpi.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vfrstpi_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vfrstpi_h(_1, _2, 1); } // CHECK-LABEL: @vfrstp_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vfrstp.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vfrstp.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v16i8 vfrstp_b(v16i8 _1, v16i8 _2, v16i8 _3) { return __builtin_lsx_vfrstp_b(_1, _2, _3); } // CHECK-LABEL: @vfrstp_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vfrstp.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vfrstp.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v8i16 vfrstp_h(v8i16 _1, v8i16 _2, v8i16 _3) { return __builtin_lsx_vfrstp_h(_1, _2, _3); } // 
CHECK-LABEL: @vshuf4i_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vshuf4i.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vshuf4i.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vshuf4i_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vshuf4i_d(_1, _2, 1); } // CHECK-LABEL: @vbsrl_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbsrl.v(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbsrl.v(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vbsrl_v(v16i8 _1) { return __builtin_lsx_vbsrl_v(_1, 1); } // CHECK-LABEL: @vbsll_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbsll.v(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbsll.v(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vbsll_v(v16i8 _1) { return __builtin_lsx_vbsll_v(_1, 1); } // CHECK-LABEL: @vextrins_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vextrins.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vextrins.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vextrins_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vextrins_b(_1, _2, 1); } // CHECK-LABEL: @vextrins_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vextrins.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vextrins.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vextrins_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vextrins_h(_1, _2, 1); } // CHECK-LABEL: @vextrins_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vextrins.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = 
tail call <4 x i32> @llvm.loongarch.lsx.vextrins.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vextrins_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vextrins_w(_1, _2, 1); } // CHECK-LABEL: @vextrins_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vextrins.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vextrins.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vextrins_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vextrins_d(_1, _2, 1); } // CHECK-LABEL: @vmskltz_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmskltz.b(<16 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmskltz.b(<16 x i8> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vmskltz_b(v16i8 _1) { return __builtin_lsx_vmskltz_b(_1); } // CHECK-LABEL: @vmskltz_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmskltz.h(<8 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmskltz.h(<8 x i16> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vmskltz_h(v8i16 _1) { return __builtin_lsx_vmskltz_h(_1); } // CHECK-LABEL: @vmskltz_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmskltz.w(<4 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmskltz.w(<4 x i32> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vmskltz_w(v4i32 _1) { return __builtin_lsx_vmskltz_w(_1); } // CHECK-LABEL: @vmskltz_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmskltz.d(<2 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmskltz.d(<2 x i64> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vmskltz_d(v2i64 _1) { return __builtin_lsx_vmskltz_d(_1); } // CHECK-LABEL: @vsigncov_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsigncov.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> 
@llvm.loongarch.lsx.vsigncov.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsigncov_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vsigncov_b(_1, _2); } // CHECK-LABEL: @vsigncov_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsigncov.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsigncov.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsigncov_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vsigncov_h(_1, _2); } // CHECK-LABEL: @vsigncov_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsigncov.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsigncov.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsigncov_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vsigncov_w(_1, _2); } // CHECK-LABEL: @vsigncov_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsigncov.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsigncov.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsigncov_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vsigncov_d(_1, _2); } // CHECK-LABEL: @vfmadd_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmadd.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]], <4 x float> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmadd.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v4f32 vfmadd_s(v4f32 _1, v4f32 _2, v4f32 _3) { return __builtin_lsx_vfmadd_s(_1, _2, _3); } // CHECK-LABEL: @vfmadd_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmadd.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]], <2 x double> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 
[[_3_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmadd.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2f64 vfmadd_d(v2f64 _1, v2f64 _2, v2f64 _3) { return __builtin_lsx_vfmadd_d(_1, _2, _3); } // CHECK-LABEL: @vfmsub_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmsub.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]], <4 x float> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmsub.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v4f32 vfmsub_s(v4f32 _1, v4f32 _2, v4f32 _3) { return __builtin_lsx_vfmsub_s(_1, _2, _3); } // CHECK-LABEL: @vfmsub_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmsub.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]], <2 x double> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmsub.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2f64 vfmsub_d(v2f64 _1, v2f64 _2, v2f64 _3) { return __builtin_lsx_vfmsub_d(_1, _2, _3); } // CHECK-LABEL: @vfnmadd_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfnmadd.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]], <4 x float> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfnmadd.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v4f32 vfnmadd_s(v4f32 _1, v4f32 _2, v4f32 _3) { return __builtin_lsx_vfnmadd_s(_1, _2, _3); } // CHECK-LABEL: @vfnmadd_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfnmadd.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]], <2 x double> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfnmadd.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x double> 
[[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2f64 vfnmadd_d(v2f64 _1, v2f64 _2, v2f64 _3) { return __builtin_lsx_vfnmadd_d(_1, _2, _3); } // CHECK-LABEL: @vfnmsub_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfnmsub.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]], <4 x float> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfnmsub.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v4f32 vfnmsub_s(v4f32 _1, v4f32 _2, v4f32 _3) { return __builtin_lsx_vfnmsub_s(_1, _2, _3); } // CHECK-LABEL: @vfnmsub_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfnmsub.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]], <2 x double> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfnmsub.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2f64 vfnmsub_d(v2f64 _1, v2f64 _2, v2f64 _3) { return __builtin_lsx_vfnmsub_d(_1, _2, _3); } // CHECK-LABEL: @vftintrne_w_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrne.w.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrne.w.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vftintrne_w_s(v4f32 _1) { return __builtin_lsx_vftintrne_w_s(_1); } // CHECK-LABEL: @vftintrne_l_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrne.l.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrne.l.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vftintrne_l_d(v2f64 _1) { return __builtin_lsx_vftintrne_l_d(_1); } // CHECK-LABEL: @vftintrp_w_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrp.w.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrp.w.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vftintrp_w_s(v4f32 _1) { return __builtin_lsx_vftintrp_w_s(_1); } // CHECK-LABEL: @vftintrp_l_d( // CHECK-NEXT: entry: -// 
CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrp.l.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrp.l.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vftintrp_l_d(v2f64 _1) { return __builtin_lsx_vftintrp_l_d(_1); } // CHECK-LABEL: @vftintrm_w_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrm.w.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrm.w.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vftintrm_w_s(v4f32 _1) { return __builtin_lsx_vftintrm_w_s(_1); } // CHECK-LABEL: @vftintrm_l_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrm.l.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrm.l.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vftintrm_l_d(v2f64 _1) { return __builtin_lsx_vftintrm_l_d(_1); } // CHECK-LABEL: @vftint_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftint.w.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftint.w.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vftint_w_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vftint_w_d(_1, _2); } // CHECK-LABEL: @vffint_s_l( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vffint.s.l(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vffint.s.l(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4f32 vffint_s_l(v2i64 _1, v2i64 _2) { return __builtin_lsx_vffint_s_l(_1, _2); } // CHECK-LABEL: @vftintrz_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrz.w.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrz.w.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x 
i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vftintrz_w_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vftintrz_w_d(_1, _2); } // CHECK-LABEL: @vftintrp_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrp.w.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrp.w.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vftintrp_w_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vftintrp_w_d(_1, _2); } // CHECK-LABEL: @vftintrm_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrm.w.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrm.w.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vftintrm_w_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vftintrm_w_d(_1, _2); } // CHECK-LABEL: @vftintrne_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrne.w.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrne.w.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vftintrne_w_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vftintrne_w_d(_1, _2); } // CHECK-LABEL: @vftintl_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintl.l.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintl.l.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vftintl_l_s(v4f32 _1) { return __builtin_lsx_vftintl_l_s(_1); } // CHECK-LABEL: @vftinth_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftinth.l.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftinth.l.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vftinth_l_s(v4f32 _1) { return __builtin_lsx_vftinth_l_s(_1); } // CHECK-LABEL: @vffinth_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffinth.d.w(<4 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x double> 
[[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffinth.d.w(<4 x i32> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2f64 vffinth_d_w(v4i32 _1) { return __builtin_lsx_vffinth_d_w(_1); } // CHECK-LABEL: @vffintl_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffintl.d.w(<4 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffintl.d.w(<4 x i32> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2f64 vffintl_d_w(v4i32 _1) { return __builtin_lsx_vffintl_d_w(_1); } // CHECK-LABEL: @vftintrzl_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrzl.l.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrzl.l.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vftintrzl_l_s(v4f32 _1) { return __builtin_lsx_vftintrzl_l_s(_1); } // CHECK-LABEL: @vftintrzh_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrzh.l.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrzh.l.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vftintrzh_l_s(v4f32 _1) { return __builtin_lsx_vftintrzh_l_s(_1); } // CHECK-LABEL: @vftintrpl_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrpl.l.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrpl.l.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vftintrpl_l_s(v4f32 _1) { return __builtin_lsx_vftintrpl_l_s(_1); } // CHECK-LABEL: @vftintrph_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrph.l.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrph.l.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vftintrph_l_s(v4f32 _1) { return __builtin_lsx_vftintrph_l_s(_1); } // CHECK-LABEL: @vftintrml_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrml.l.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrml.l.s(<4 x float> 
[[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vftintrml_l_s(v4f32 _1) { return __builtin_lsx_vftintrml_l_s(_1); } // CHECK-LABEL: @vftintrmh_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrmh.l.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrmh.l.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vftintrmh_l_s(v4f32 _1) { return __builtin_lsx_vftintrmh_l_s(_1); } // CHECK-LABEL: @vftintrnel_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrnel.l.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrnel.l.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vftintrnel_l_s(v4f32 _1) { return __builtin_lsx_vftintrnel_l_s(_1); } // CHECK-LABEL: @vftintrneh_l_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrneh.l.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrneh.l.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vftintrneh_l_s(v4f32 _1) { return __builtin_lsx_vftintrneh_l_s(_1); } // CHECK-LABEL: @vfrintrne_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrintrne.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrintrne.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vfrintrne_s(v4f32 _1) { return __builtin_lsx_vfrintrne_s(_1); } // CHECK-LABEL: @vfrintrne_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrne.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[TMP0]] to <2 x i64> -// CHECK-NEXT: ret <2 x i64> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrne.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vfrintrne_d(v2f64 _1) { return __builtin_lsx_vfrintrne_d(_1); } // CHECK-LABEL: @vfrintrz_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrintrz.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x 
float> @llvm.loongarch.lsx.vfrintrz.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vfrintrz_s(v4f32 _1) { return __builtin_lsx_vfrintrz_s(_1); } // CHECK-LABEL: @vfrintrz_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrz.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[TMP0]] to <2 x i64> -// CHECK-NEXT: ret <2 x i64> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrz.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vfrintrz_d(v2f64 _1) { return __builtin_lsx_vfrintrz_d(_1); } // CHECK-LABEL: @vfrintrp_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrintrp.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrintrp.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vfrintrp_s(v4f32 _1) { return __builtin_lsx_vfrintrp_s(_1); } // CHECK-LABEL: @vfrintrp_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrp.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[TMP0]] to <2 x i64> -// CHECK-NEXT: ret <2 x i64> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrp.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vfrintrp_d(v2f64 _1) { return __builtin_lsx_vfrintrp_d(_1); } // CHECK-LABEL: @vfrintrm_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrintrm.s(<4 x float> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrintrm.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vfrintrm_s(v4f32 _1) { return __builtin_lsx_vfrintrm_s(_1); } // CHECK-LABEL: @vfrintrm_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrm.d(<2 x double> [[_1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[TMP0]] to <2 x i64> -// CHECK-NEXT: ret <2 x i64> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrm.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vfrintrm_d(v2f64 _1) { return __builtin_lsx_vfrintrm_d(_1); } // CHECK-LABEL: @vstelm_b( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.b(<16 x i8> [[_1:%.*]], ptr [[_2:%.*]], i32 1, i32 1) +// 
CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.b(<16 x i8> [[TMP0]], ptr [[_2:%.*]], i32 1, i32 1) // CHECK-NEXT: ret void // void vstelm_b(v16i8 _1, void *_2) { @@ -3349,7 +4577,8 @@ void vstelm_b(v16i8 _1, void *_2) { } // CHECK-LABEL: @vstelm_h( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.h(<8 x i16> [[_1:%.*]], ptr [[_2:%.*]], i32 2, i32 1) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.h(<8 x i16> [[TMP0]], ptr [[_2:%.*]], i32 2, i32 1) // CHECK-NEXT: ret void // void vstelm_h(v8i16 _1, void *_2) { @@ -3357,7 +4586,8 @@ void vstelm_h(v8i16 _1, void *_2) { } // CHECK-LABEL: @vstelm_w( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.w(<4 x i32> [[_1:%.*]], ptr [[_2:%.*]], i32 4, i32 1) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.w(<4 x i32> [[TMP0]], ptr [[_2:%.*]], i32 4, i32 1) // CHECK-NEXT: ret void // void vstelm_w(v4i32 _1, void *_2) { @@ -3365,7 +4595,8 @@ void vstelm_w(v4i32 _1, void *_2) { } // CHECK-LABEL: @vstelm_d( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.d(<2 x i64> [[_1:%.*]], ptr [[_2:%.*]], i32 8, i32 1) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.d(<2 x i64> [[TMP0]], ptr [[_2:%.*]], i32 8, i32 1) // CHECK-NEXT: ret void // void vstelm_d(v2i64 _1, void *_2) { @@ -3373,1286 +4604,1785 @@ void vstelm_d(v2i64 _1, void *_2) { } // CHECK-LABEL: @vaddwev_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.d.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vaddwev_d_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vaddwev_d_w(_1, _2); } // CHECK-LABEL: @vaddwev_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwev.w.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwev.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vaddwev_w_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vaddwev_w_h(_1, _2); } // CHECK-LABEL: @vaddwev_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwev.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwev.h.b(<16 x i8> [[TMP0]], <16 x i8> 
[[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vaddwev_h_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vaddwev_h_b(_1, _2); } // CHECK-LABEL: @vaddwod_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.d.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vaddwod_d_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vaddwod_d_w(_1, _2); } // CHECK-LABEL: @vaddwod_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwod.w.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwod.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vaddwod_w_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vaddwod_w_h(_1, _2); } // CHECK-LABEL: @vaddwod_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwod.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwod.h.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vaddwod_h_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vaddwod_h_b(_1, _2); } // CHECK-LABEL: @vaddwev_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.d.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.d.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vaddwev_d_wu(v4u32 _1, v4u32 _2) { return __builtin_lsx_vaddwev_d_wu(_1, _2); } // CHECK-LABEL: @vaddwev_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwev.w.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwev.w.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vaddwev_w_hu(v8u16 _1, v8u16 _2) { 
return __builtin_lsx_vaddwev_w_hu(_1, _2); } // CHECK-LABEL: @vaddwev_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwev.h.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwev.h.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vaddwev_h_bu(v16u8 _1, v16u8 _2) { return __builtin_lsx_vaddwev_h_bu(_1, _2); } // CHECK-LABEL: @vaddwod_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.d.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.d.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vaddwod_d_wu(v4u32 _1, v4u32 _2) { return __builtin_lsx_vaddwod_d_wu(_1, _2); } // CHECK-LABEL: @vaddwod_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwod.w.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwod.w.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vaddwod_w_hu(v8u16 _1, v8u16 _2) { return __builtin_lsx_vaddwod_w_hu(_1, _2); } // CHECK-LABEL: @vaddwod_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwod.h.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwod.h.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vaddwod_h_bu(v16u8 _1, v16u8 _2) { return __builtin_lsx_vaddwod_h_bu(_1, _2); } // CHECK-LABEL: @vaddwev_d_wu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.d.wu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.d.wu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vaddwev_d_wu_w(v4u32 _1, v4i32 _2) { return __builtin_lsx_vaddwev_d_wu_w(_1, _2); } // CHECK-LABEL: @vaddwev_w_hu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] 
= tail call <4 x i32> @llvm.loongarch.lsx.vaddwev.w.hu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwev.w.hu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vaddwev_w_hu_h(v8u16 _1, v8i16 _2) { return __builtin_lsx_vaddwev_w_hu_h(_1, _2); } // CHECK-LABEL: @vaddwev_h_bu_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwev.h.bu.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwev.h.bu.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vaddwev_h_bu_b(v16u8 _1, v16i8 _2) { return __builtin_lsx_vaddwev_h_bu_b(_1, _2); } // CHECK-LABEL: @vaddwod_d_wu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.d.wu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.d.wu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vaddwod_d_wu_w(v4u32 _1, v4i32 _2) { return __builtin_lsx_vaddwod_d_wu_w(_1, _2); } // CHECK-LABEL: @vaddwod_w_hu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwod.w.hu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwod.w.hu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vaddwod_w_hu_h(v8u16 _1, v8i16 _2) { return __builtin_lsx_vaddwod_w_hu_h(_1, _2); } // CHECK-LABEL: @vaddwod_h_bu_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwod.h.bu.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwod.h.bu.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vaddwod_h_bu_b(v16u8 _1, v16i8 _2) { return __builtin_lsx_vaddwod_h_bu_b(_1, _2); } // CHECK-LABEL: @vsubwev_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.d.w(<4 x i32> [[_1:%.*]], <4 x i32> 
[[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsubwev_d_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vsubwev_d_w(_1, _2); } // CHECK-LABEL: @vsubwev_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwev.w.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwev.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsubwev_w_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vsubwev_w_h(_1, _2); } // CHECK-LABEL: @vsubwev_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwev.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwev.h.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsubwev_h_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vsubwev_h_b(_1, _2); } // CHECK-LABEL: @vsubwod_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.d.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsubwod_d_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vsubwod_d_w(_1, _2); } // CHECK-LABEL: @vsubwod_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwod.w.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwod.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsubwod_w_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vsubwod_w_h(_1, _2); } // CHECK-LABEL: @vsubwod_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwod.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast 
i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwod.h.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsubwod_h_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vsubwod_h_b(_1, _2); } // CHECK-LABEL: @vsubwev_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.d.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.d.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsubwev_d_wu(v4u32 _1, v4u32 _2) { return __builtin_lsx_vsubwev_d_wu(_1, _2); } // CHECK-LABEL: @vsubwev_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwev.w.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwev.w.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsubwev_w_hu(v8u16 _1, v8u16 _2) { return __builtin_lsx_vsubwev_w_hu(_1, _2); } // CHECK-LABEL: @vsubwev_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwev.h.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwev.h.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsubwev_h_bu(v16u8 _1, v16u8 _2) { return __builtin_lsx_vsubwev_h_bu(_1, _2); } // CHECK-LABEL: @vsubwod_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.d.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.d.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsubwod_d_wu(v4u32 _1, v4u32 _2) { return __builtin_lsx_vsubwod_d_wu(_1, _2); } // CHECK-LABEL: @vsubwod_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwod.w.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwod.w.hu(<8 x i16> [[TMP0]], <8 
x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsubwod_w_hu(v8u16 _1, v8u16 _2) { return __builtin_lsx_vsubwod_w_hu(_1, _2); } // CHECK-LABEL: @vsubwod_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwod.h.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwod.h.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsubwod_h_bu(v16u8 _1, v16u8 _2) { return __builtin_lsx_vsubwod_h_bu(_1, _2); } // CHECK-LABEL: @vaddwev_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vaddwev_q_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vaddwev_q_d(_1, _2); } // CHECK-LABEL: @vaddwod_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vaddwod_q_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vaddwod_q_d(_1, _2); } // CHECK-LABEL: @vaddwev_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vaddwev_q_du(v2u64 _1, v2u64 _2) { return __builtin_lsx_vaddwev_q_du(_1, _2); } // CHECK-LABEL: @vaddwod_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vaddwod_q_du(v2u64 _1, 
v2u64 _2) { return __builtin_lsx_vaddwod_q_du(_1, _2); } // CHECK-LABEL: @vsubwev_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsubwev_q_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vsubwev_q_d(_1, _2); } // CHECK-LABEL: @vsubwod_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsubwod_q_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vsubwod_q_d(_1, _2); } // CHECK-LABEL: @vsubwev_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsubwev_q_du(v2u64 _1, v2u64 _2) { return __builtin_lsx_vsubwev_q_du(_1, _2); } // CHECK-LABEL: @vsubwod_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsubwod_q_du(v2u64 _1, v2u64 _2) { return __builtin_lsx_vsubwod_q_du(_1, _2); } // CHECK-LABEL: @vaddwev_q_du_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.q.du.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.q.du.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vaddwev_q_du_d(v2u64 _1, v2i64 _2) { return __builtin_lsx_vaddwev_q_du_d(_1, _2); } // CHECK-LABEL: @vaddwod_q_du_d( // CHECK-NEXT: entry: -// CHECK-NEXT: 
[[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.q.du.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.q.du.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vaddwod_q_du_d(v2u64 _1, v2i64 _2) { return __builtin_lsx_vaddwod_q_du_d(_1, _2); } // CHECK-LABEL: @vmulwev_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.d.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmulwev_d_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vmulwev_d_w(_1, _2); } // CHECK-LABEL: @vmulwev_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwev.w.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwev.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vmulwev_w_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vmulwev_w_h(_1, _2); } // CHECK-LABEL: @vmulwev_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwev.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwev.h.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vmulwev_h_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vmulwev_h_b(_1, _2); } // CHECK-LABEL: @vmulwod_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.d.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmulwod_d_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vmulwod_d_w(_1, _2); } // CHECK-LABEL: @vmulwod_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwod.w.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] 
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwod.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vmulwod_w_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vmulwod_w_h(_1, _2); } // CHECK-LABEL: @vmulwod_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwod.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwod.h.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vmulwod_h_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vmulwod_h_b(_1, _2); } // CHECK-LABEL: @vmulwev_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.d.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.d.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmulwev_d_wu(v4u32 _1, v4u32 _2) { return __builtin_lsx_vmulwev_d_wu(_1, _2); } // CHECK-LABEL: @vmulwev_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwev.w.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwev.w.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vmulwev_w_hu(v8u16 _1, v8u16 _2) { return __builtin_lsx_vmulwev_w_hu(_1, _2); } // CHECK-LABEL: @vmulwev_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwev.h.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwev.h.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vmulwev_h_bu(v16u8 _1, v16u8 _2) { return __builtin_lsx_vmulwev_h_bu(_1, _2); } // CHECK-LABEL: @vmulwod_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.d.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x 
i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.d.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmulwod_d_wu(v4u32 _1, v4u32 _2) { return __builtin_lsx_vmulwod_d_wu(_1, _2); } // CHECK-LABEL: @vmulwod_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwod.w.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwod.w.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vmulwod_w_hu(v8u16 _1, v8u16 _2) { return __builtin_lsx_vmulwod_w_hu(_1, _2); } // CHECK-LABEL: @vmulwod_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwod.h.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwod.h.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vmulwod_h_bu(v16u8 _1, v16u8 _2) { return __builtin_lsx_vmulwod_h_bu(_1, _2); } // CHECK-LABEL: @vmulwev_d_wu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.d.wu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.d.wu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmulwev_d_wu_w(v4u32 _1, v4i32 _2) { return __builtin_lsx_vmulwev_d_wu_w(_1, _2); } // CHECK-LABEL: @vmulwev_w_hu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwev.w.hu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwev.w.hu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vmulwev_w_hu_h(v8u16 _1, v8i16 _2) { return __builtin_lsx_vmulwev_w_hu_h(_1, _2); } // CHECK-LABEL: @vmulwev_h_bu_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwev.h.bu.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwev.h.bu.b(<16 x i8> [[TMP0]], <16 
x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vmulwev_h_bu_b(v16u8 _1, v16i8 _2) { return __builtin_lsx_vmulwev_h_bu_b(_1, _2); } // CHECK-LABEL: @vmulwod_d_wu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.d.wu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.d.wu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmulwod_d_wu_w(v4u32 _1, v4i32 _2) { return __builtin_lsx_vmulwod_d_wu_w(_1, _2); } // CHECK-LABEL: @vmulwod_w_hu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwod.w.hu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwod.w.hu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vmulwod_w_hu_h(v8u16 _1, v8i16 _2) { return __builtin_lsx_vmulwod_w_hu_h(_1, _2); } // CHECK-LABEL: @vmulwod_h_bu_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwod.h.bu.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwod.h.bu.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vmulwod_h_bu_b(v16u8 _1, v16i8 _2) { return __builtin_lsx_vmulwod_h_bu_b(_1, _2); } // CHECK-LABEL: @vmulwev_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmulwev_q_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vmulwev_q_d(_1, _2); } // CHECK-LABEL: @vmulwod_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 
[[TMP3]] // v2i64 vmulwod_q_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vmulwod_q_d(_1, _2); } // CHECK-LABEL: @vmulwev_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmulwev_q_du(v2u64 _1, v2u64 _2) { return __builtin_lsx_vmulwev_q_du(_1, _2); } // CHECK-LABEL: @vmulwod_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmulwod_q_du(v2u64 _1, v2u64 _2) { return __builtin_lsx_vmulwod_q_du(_1, _2); } // CHECK-LABEL: @vmulwev_q_du_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.q.du.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.q.du.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmulwev_q_du_d(v2u64 _1, v2i64 _2) { return __builtin_lsx_vmulwev_q_du_d(_1, _2); } // CHECK-LABEL: @vmulwod_q_du_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.q.du.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.q.du.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vmulwod_q_du_d(v2u64 _1, v2i64 _2) { return __builtin_lsx_vmulwod_q_du_d(_1, _2); } // CHECK-LABEL: @vhaddw_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhaddw.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhaddw.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vhaddw_q_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vhaddw_q_d(_1, _2); } // CHECK-LABEL: @vhaddw_qu_du( // 
CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhaddw.qu.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhaddw.qu.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vhaddw_qu_du(v2u64 _1, v2u64 _2) { return __builtin_lsx_vhaddw_qu_du(_1, _2); } // CHECK-LABEL: @vhsubw_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vhsubw_q_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vhsubw_q_d(_1, _2); } // CHECK-LABEL: @vhsubw_qu_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.qu.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.qu.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vhsubw_qu_du(v2u64 _1, v2u64 _2) { return __builtin_lsx_vhsubw_qu_du(_1, _2); } // CHECK-LABEL: @vmaddwev_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.d.w(<2 x i64> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.d.w(<2 x i64> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2i64 vmaddwev_d_w(v2i64 _1, v4i32 _2, v4i32 _3) { return __builtin_lsx_vmaddwev_d_w(_1, _2, _3); } // CHECK-LABEL: @vmaddwev_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwev.w.h(<4 x i32> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwev.w.h(<4 x i32> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v4i32 
vmaddwev_w_h(v4i32 _1, v8i16 _2, v8i16 _3) { return __builtin_lsx_vmaddwev_w_h(_1, _2, _3); } // CHECK-LABEL: @vmaddwev_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwev.h.b(<8 x i16> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwev.h.b(<8 x i16> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v8i16 vmaddwev_h_b(v8i16 _1, v16i8 _2, v16i8 _3) { return __builtin_lsx_vmaddwev_h_b(_1, _2, _3); } // CHECK-LABEL: @vmaddwev_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.d.wu(<2 x i64> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.d.wu(<2 x i64> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2u64 vmaddwev_d_wu(v2u64 _1, v4u32 _2, v4u32 _3) { return __builtin_lsx_vmaddwev_d_wu(_1, _2, _3); } // CHECK-LABEL: @vmaddwev_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwev.w.hu(<4 x i32> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwev.w.hu(<4 x i32> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v4u32 vmaddwev_w_hu(v4u32 _1, v8u16 _2, v8u16 _3) { return __builtin_lsx_vmaddwev_w_hu(_1, _2, _3); } // CHECK-LABEL: @vmaddwev_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwev.h.bu(<8 x i16> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwev.h.bu(<8 x i16> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v8u16 vmaddwev_h_bu(v8u16 _1, v16u8 _2, v16u8 _3) { return __builtin_lsx_vmaddwev_h_bu(_1, _2, _3); } // CHECK-LABEL: @vmaddwod_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.d.w(<2 x i64> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x 
i32> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.d.w(<2 x i64> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2i64 vmaddwod_d_w(v2i64 _1, v4i32 _2, v4i32 _3) { return __builtin_lsx_vmaddwod_d_w(_1, _2, _3); } // CHECK-LABEL: @vmaddwod_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwod.w.h(<4 x i32> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwod.w.h(<4 x i32> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v4i32 vmaddwod_w_h(v4i32 _1, v8i16 _2, v8i16 _3) { return __builtin_lsx_vmaddwod_w_h(_1, _2, _3); } // CHECK-LABEL: @vmaddwod_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwod.h.b(<8 x i16> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwod.h.b(<8 x i16> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v8i16 vmaddwod_h_b(v8i16 _1, v16i8 _2, v16i8 _3) { return __builtin_lsx_vmaddwod_h_b(_1, _2, _3); } // CHECK-LABEL: @vmaddwod_d_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.d.wu(<2 x i64> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.d.wu(<2 x i64> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2u64 vmaddwod_d_wu(v2u64 _1, v4u32 _2, v4u32 _3) { return __builtin_lsx_vmaddwod_d_wu(_1, _2, _3); } // CHECK-LABEL: @vmaddwod_w_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwod.w.hu(<4 x i32> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16> +// 
CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwod.w.hu(<4 x i32> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v4u32 vmaddwod_w_hu(v4u32 _1, v8u16 _2, v8u16 _3) { return __builtin_lsx_vmaddwod_w_hu(_1, _2, _3); } // CHECK-LABEL: @vmaddwod_h_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwod.h.bu(<8 x i16> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwod.h.bu(<8 x i16> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v8u16 vmaddwod_h_bu(v8u16 _1, v16u8 _2, v16u8 _3) { return __builtin_lsx_vmaddwod_h_bu(_1, _2, _3); } // CHECK-LABEL: @vmaddwev_d_wu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.d.wu.w(<2 x i64> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.d.wu.w(<2 x i64> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2i64 vmaddwev_d_wu_w(v2i64 _1, v4u32 _2, v4i32 _3) { return __builtin_lsx_vmaddwev_d_wu_w(_1, _2, _3); } // CHECK-LABEL: @vmaddwev_w_hu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwev.w.hu.h(<4 x i32> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwev.w.hu.h(<4 x i32> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v4i32 vmaddwev_w_hu_h(v4i32 _1, v8u16 _2, v8i16 _3) { return __builtin_lsx_vmaddwev_w_hu_h(_1, _2, _3); } // CHECK-LABEL: @vmaddwev_h_bu_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwev.h.bu.b(<8 x i16> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwev.h.bu.b(<8 x i16> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v8i16 
vmaddwev_h_bu_b(v8i16 _1, v16u8 _2, v16i8 _3) { return __builtin_lsx_vmaddwev_h_bu_b(_1, _2, _3); } // CHECK-LABEL: @vmaddwod_d_wu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.d.wu.w(<2 x i64> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.d.wu.w(<2 x i64> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2i64 vmaddwod_d_wu_w(v2i64 _1, v4u32 _2, v4i32 _3) { return __builtin_lsx_vmaddwod_d_wu_w(_1, _2, _3); } // CHECK-LABEL: @vmaddwod_w_hu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwod.w.hu.h(<4 x i32> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwod.w.hu.h(<4 x i32> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v4i32 vmaddwod_w_hu_h(v4i32 _1, v8u16 _2, v8i16 _3) { return __builtin_lsx_vmaddwod_w_hu_h(_1, _2, _3); } // CHECK-LABEL: @vmaddwod_h_bu_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwod.h.bu.b(<8 x i16> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwod.h.bu.b(<8 x i16> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v8i16 vmaddwod_h_bu_b(v8i16 _1, v16u8 _2, v16i8 _3) { return __builtin_lsx_vmaddwod_h_bu_b(_1, _2, _3); } // CHECK-LABEL: @vmaddwev_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2i64 vmaddwev_q_d(v2i64 _1, v2i64 _2, v2i64 _3) { return __builtin_lsx_vmaddwev_q_d(_1, _2, _3); } // CHECK-LABEL: @vmaddwod_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.q.d(<2 x i64> 
[[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2i64 vmaddwod_q_d(v2i64 _1, v2i64 _2, v2i64 _3) { return __builtin_lsx_vmaddwod_q_d(_1, _2, _3); } // CHECK-LABEL: @vmaddwev_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2u64 vmaddwev_q_du(v2u64 _1, v2u64 _2, v2u64 _3) { return __builtin_lsx_vmaddwev_q_du(_1, _2, _3); } // CHECK-LABEL: @vmaddwod_q_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2u64 vmaddwod_q_du(v2u64 _1, v2u64 _2, v2u64 _3) { return __builtin_lsx_vmaddwod_q_du(_1, _2, _3); } // CHECK-LABEL: @vmaddwev_q_du_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.q.du.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.q.du.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2i64 vmaddwev_q_du_d(v2i64 _1, v2u64 _2, v2i64 _3) { return __builtin_lsx_vmaddwev_q_du_d(_1, _2, _3); } // CHECK-LABEL: @vmaddwod_q_du_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.q.du.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: 
[[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.q.du.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v2i64 vmaddwod_q_du_d(v2i64 _1, v2u64 _2, v2i64 _3) { return __builtin_lsx_vmaddwod_q_du_d(_1, _2, _3); } // CHECK-LABEL: @vrotr_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vrotr.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vrotr.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vrotr_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vrotr_b(_1, _2); } // CHECK-LABEL: @vrotr_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vrotr.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vrotr.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vrotr_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vrotr_h(_1, _2); } // CHECK-LABEL: @vrotr_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vrotr.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vrotr.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vrotr_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vrotr_w(_1, _2); } // CHECK-LABEL: @vrotr_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vrotr.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vrotr.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vrotr_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vrotr_d(_1, _2); } // CHECK-LABEL: @vadd_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vadd.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vadd.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = 
bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vadd_q(v2i64 _1, v2i64 _2) { return __builtin_lsx_vadd_q(_1, _2); } // CHECK-LABEL: @vsub_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsub.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsub.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsub_q(v2i64 _1, v2i64 _2) { return __builtin_lsx_vsub_q(_1, _2); } // CHECK-LABEL: @vldrepl_b( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vldrepl.b(ptr [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to i128 +// CHECK-NEXT: ret i128 [[TMP1]] // v16i8 vldrepl_b(void *_1) { return __builtin_lsx_vldrepl_b(_1, 1); } // CHECK-LABEL: @vldrepl_h( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vldrepl.h(ptr [[_1:%.*]], i32 2) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to i128 +// CHECK-NEXT: ret i128 [[TMP1]] // v8i16 vldrepl_h(void *_1) { return __builtin_lsx_vldrepl_h(_1, 2); } // CHECK-LABEL: @vldrepl_w( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vldrepl.w(ptr [[_1:%.*]], i32 4) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +// CHECK-NEXT: ret i128 [[TMP1]] // v4i32 vldrepl_w(void *_1) { return __builtin_lsx_vldrepl_w(_1, 4); } // CHECK-LABEL: @vldrepl_d( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vldrepl.d(ptr [[_1:%.*]], i32 8) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to i128 +// CHECK-NEXT: ret i128 [[TMP1]] // v2i64 vldrepl_d(void *_1) { return __builtin_lsx_vldrepl_d(_1, 8); } // CHECK-LABEL: @vmskgez_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmskgez.b(<16 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmskgez.b(<16 x i8> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vmskgez_b(v16i8 _1) { return __builtin_lsx_vmskgez_b(_1); } // CHECK-LABEL: @vmsknz_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmsknz.b(<16 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmsknz.b(<16 x i8> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vmsknz_b(v16i8 _1) { return __builtin_lsx_vmsknz_b(_1); } // CHECK-LABEL: @vexth_h_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vexth.h.b(<16 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast 
i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vexth.h.b(<16 x i8> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vexth_h_b(v16i8 _1) { return __builtin_lsx_vexth_h_b(_1); } // CHECK-LABEL: @vexth_w_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vexth.w.h(<8 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vexth.w.h(<8 x i16> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vexth_w_h(v8i16 _1) { return __builtin_lsx_vexth_w_h(_1); } // CHECK-LABEL: @vexth_d_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.d.w(<4 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.d.w(<4 x i32> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vexth_d_w(v4i32 _1) { return __builtin_lsx_vexth_d_w(_1); } // CHECK-LABEL: @vexth_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.q.d(<2 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.q.d(<2 x i64> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vexth_q_d(v2i64 _1) { return __builtin_lsx_vexth_q_d(_1); } // CHECK-LABEL: @vexth_hu_bu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vexth.hu.bu(<16 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vexth.hu.bu(<16 x i8> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8u16 vexth_hu_bu(v16u8 _1) { return __builtin_lsx_vexth_hu_bu(_1); } // CHECK-LABEL: @vexth_wu_hu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vexth.wu.hu(<8 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vexth.wu.hu(<8 x i16> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4u32 vexth_wu_hu(v8u16 _1) { return __builtin_lsx_vexth_wu_hu(_1); } // CHECK-LABEL: @vexth_du_wu( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.du.wu(<4 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.du.wu(<4 x i32> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2u64 vexth_du_wu(v4u32 _1) { return 
__builtin_lsx_vexth_du_wu(_1); } // CHECK-LABEL: @vexth_qu_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.qu.du(<2 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.qu.du(<2 x i64> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2u64 vexth_qu_du(v2u64 _1) { return __builtin_lsx_vexth_qu_du(_1); } // CHECK-LABEL: @vrotri_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vrotri.b(<16 x i8> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vrotri.b(<16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v16i8 vrotri_b(v16i8 _1) { return __builtin_lsx_vrotri_b(_1, 1); } // CHECK-LABEL: @vrotri_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vrotri.h(<8 x i16> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vrotri.h(<8 x i16> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v8i16 vrotri_h(v8i16 _1) { return __builtin_lsx_vrotri_h(_1, 1); } // CHECK-LABEL: @vrotri_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vrotri.w(<4 x i32> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vrotri.w(<4 x i32> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v4i32 vrotri_w(v4i32 _1) { return __builtin_lsx_vrotri_w(_1, 1); } // CHECK-LABEL: @vrotri_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vrotri.d(<2 x i64> [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vrotri.d(<2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vrotri_d(v2i64 _1) { return __builtin_lsx_vrotri_d(_1, 1); } // CHECK-LABEL: @vextl_q_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vextl.q.d(<2 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vextl.q.d(<2 x i64> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2i64 vextl_q_d(v2i64 _1) { return __builtin_lsx_vextl_q_d(_1); } // CHECK-LABEL: @vsrlni_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlni.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: 
[[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlni.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsrlni_b_h(v16i8 _1, v16i8 _2) { return __builtin_lsx_vsrlni_b_h(_1, _2, 1); } // CHECK-LABEL: @vsrlni_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlni.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlni.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsrlni_h_w(v8i16 _1, v8i16 _2) { return __builtin_lsx_vsrlni_h_w(_1, _2, 1); } // CHECK-LABEL: @vsrlni_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlni.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlni.w.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsrlni_w_d(v4i32 _1, v4i32 _2) { return __builtin_lsx_vsrlni_w_d(_1, _2, 1); } // CHECK-LABEL: @vsrlni_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlni.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlni.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsrlni_d_q(v2i64 _1, v2i64 _2) { return __builtin_lsx_vsrlni_d_q(_1, _2, 1); } // CHECK-LABEL: @vsrlrni_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlrni.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlrni.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsrlrni_b_h(v16i8 _1, v16i8 _2) { return __builtin_lsx_vsrlrni_b_h(_1, _2, 1); } // CHECK-LABEL: @vsrlrni_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlrni.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast 
i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlrni.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsrlrni_h_w(v8i16 _1, v8i16 _2) { return __builtin_lsx_vsrlrni_h_w(_1, _2, 1); } // CHECK-LABEL: @vsrlrni_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlrni.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlrni.w.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsrlrni_w_d(v4i32 _1, v4i32 _2) { return __builtin_lsx_vsrlrni_w_d(_1, _2, 1); } // CHECK-LABEL: @vsrlrni_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlrni.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlrni.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsrlrni_d_q(v2i64 _1, v2i64 _2) { return __builtin_lsx_vsrlrni_d_q(_1, _2, 1); } // CHECK-LABEL: @vssrlni_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlni.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlni.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vssrlni_b_h(v16i8 _1, v16i8 _2) { return __builtin_lsx_vssrlni_b_h(_1, _2, 1); } // CHECK-LABEL: @vssrlni_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlni.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlni.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vssrlni_h_w(v8i16 _1, v8i16 _2) { return __builtin_lsx_vssrlni_h_w(_1, _2, 1); } // CHECK-LABEL: @vssrlni_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlni.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 
x i32> @llvm.loongarch.lsx.vssrlni.w.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vssrlni_w_d(v4i32 _1, v4i32 _2) { return __builtin_lsx_vssrlni_w_d(_1, _2, 1); } // CHECK-LABEL: @vssrlni_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlni.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlni.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vssrlni_d_q(v2i64 _1, v2i64 _2) { return __builtin_lsx_vssrlni_d_q(_1, _2, 1); } // CHECK-LABEL: @vssrlni_bu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlni.bu.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlni.bu.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vssrlni_bu_h(v16u8 _1, v16i8 _2) { return __builtin_lsx_vssrlni_bu_h(_1, _2, 1); } // CHECK-LABEL: @vssrlni_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlni.hu.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlni.hu.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vssrlni_hu_w(v8u16 _1, v8i16 _2) { return __builtin_lsx_vssrlni_hu_w(_1, _2, 1); } // CHECK-LABEL: @vssrlni_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlni.wu.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlni.wu.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vssrlni_wu_d(v4u32 _1, v4i32 _2) { return __builtin_lsx_vssrlni_wu_d(_1, _2, 1); } // CHECK-LABEL: @vssrlni_du_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlni.du.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlni.du.q(<2 x i64> [[TMP0]], <2 
x i64> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vssrlni_du_q(v2u64 _1, v2i64 _2) { return __builtin_lsx_vssrlni_du_q(_1, _2, 1); } // CHECK-LABEL: @vssrlrni_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrni.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrni.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vssrlrni_b_h(v16i8 _1, v16i8 _2) { return __builtin_lsx_vssrlrni_b_h(_1, _2, 1); } // CHECK-LABEL: @vssrlrni_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrni.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrni.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vssrlrni_h_w(v8i16 _1, v8i16 _2) { return __builtin_lsx_vssrlrni_h_w(_1, _2, 1); } // CHECK-LABEL: @vssrlrni_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlrni.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlrni.w.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vssrlrni_w_d(v4i32 _1, v4i32 _2) { return __builtin_lsx_vssrlrni_w_d(_1, _2, 1); } // CHECK-LABEL: @vssrlrni_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlrni.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlrni.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vssrlrni_d_q(v2i64 _1, v2i64 _2) { return __builtin_lsx_vssrlrni_d_q(_1, _2, 1); } // CHECK-LABEL: @vssrlrni_bu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrni.bu.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrni.bu.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: 
[[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vssrlrni_bu_h(v16u8 _1, v16i8 _2) { return __builtin_lsx_vssrlrni_bu_h(_1, _2, 1); } // CHECK-LABEL: @vssrlrni_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrni.hu.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrni.hu.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vssrlrni_hu_w(v8u16 _1, v8i16 _2) { return __builtin_lsx_vssrlrni_hu_w(_1, _2, 1); } // CHECK-LABEL: @vssrlrni_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlrni.wu.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlrni.wu.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vssrlrni_wu_d(v4u32 _1, v4i32 _2) { return __builtin_lsx_vssrlrni_wu_d(_1, _2, 1); } // CHECK-LABEL: @vssrlrni_du_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlrni.du.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlrni.du.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vssrlrni_du_q(v2u64 _1, v2i64 _2) { return __builtin_lsx_vssrlrni_du_q(_1, _2, 1); } // CHECK-LABEL: @vsrani_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrani.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrani.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsrani_b_h(v16i8 _1, v16i8 _2) { return __builtin_lsx_vsrani_b_h(_1, _2, 1); } // CHECK-LABEL: @vsrani_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrani.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrani.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// 
CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsrani_h_w(v8i16 _1, v8i16 _2) { return __builtin_lsx_vsrani_h_w(_1, _2, 1); } // CHECK-LABEL: @vsrani_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrani.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrani.w.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsrani_w_d(v4i32 _1, v4i32 _2) { return __builtin_lsx_vsrani_w_d(_1, _2, 1); } // CHECK-LABEL: @vsrani_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrani.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrani.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsrani_d_q(v2i64 _1, v2i64 _2) { return __builtin_lsx_vsrani_d_q(_1, _2, 1); } // CHECK-LABEL: @vsrarni_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrarni.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrarni.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vsrarni_b_h(v16i8 _1, v16i8 _2) { return __builtin_lsx_vsrarni_b_h(_1, _2, 1); } // CHECK-LABEL: @vsrarni_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrarni.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrarni.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vsrarni_h_w(v8i16 _1, v8i16 _2) { return __builtin_lsx_vsrarni_h_w(_1, _2, 1); } // CHECK-LABEL: @vsrarni_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrarni.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrarni.w.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vsrarni_w_d(v4i32 _1, v4i32 _2) { return 
__builtin_lsx_vsrarni_w_d(_1, _2, 1); } // CHECK-LABEL: @vsrarni_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrarni.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrarni.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vsrarni_d_q(v2i64 _1, v2i64 _2) { return __builtin_lsx_vsrarni_d_q(_1, _2, 1); } // CHECK-LABEL: @vssrani_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrani.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrani.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vssrani_b_h(v16i8 _1, v16i8 _2) { return __builtin_lsx_vssrani_b_h(_1, _2, 1); } // CHECK-LABEL: @vssrani_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrani.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrani.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vssrani_h_w(v8i16 _1, v8i16 _2) { return __builtin_lsx_vssrani_h_w(_1, _2, 1); } // CHECK-LABEL: @vssrani_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrani.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrani.w.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vssrani_w_d(v4i32 _1, v4i32 _2) { return __builtin_lsx_vssrani_w_d(_1, _2, 1); } // CHECK-LABEL: @vssrani_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrani.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrani.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vssrani_d_q(v2i64 _1, v2i64 _2) { return __builtin_lsx_vssrani_d_q(_1, _2, 1); } // CHECK-LABEL: @vssrani_bu_h( // 
CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrani.bu.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrani.bu.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vssrani_bu_h(v16u8 _1, v16i8 _2) { return __builtin_lsx_vssrani_bu_h(_1, _2, 1); } // CHECK-LABEL: @vssrani_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrani.hu.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrani.hu.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vssrani_hu_w(v8u16 _1, v8i16 _2) { return __builtin_lsx_vssrani_hu_w(_1, _2, 1); } // CHECK-LABEL: @vssrani_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrani.wu.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrani.wu.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vssrani_wu_d(v4u32 _1, v4i32 _2) { return __builtin_lsx_vssrani_wu_d(_1, _2, 1); } // CHECK-LABEL: @vssrani_du_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrani.du.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrani.du.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vssrani_du_q(v2u64 _1, v2i64 _2) { return __builtin_lsx_vssrani_du_q(_1, _2, 1); } // CHECK-LABEL: @vssrarni_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarni.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarni.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vssrarni_b_h(v16i8 _1, v16i8 _2) { return __builtin_lsx_vssrarni_b_h(_1, _2, 1); } // CHECK-LABEL: @vssrarni_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail 
call <8 x i16> @llvm.loongarch.lsx.vssrarni.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrarni.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vssrarni_h_w(v8i16 _1, v8i16 _2) { return __builtin_lsx_vssrarni_h_w(_1, _2, 1); } // CHECK-LABEL: @vssrarni_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrarni.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrarni.w.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vssrarni_w_d(v4i32 _1, v4i32 _2) { return __builtin_lsx_vssrarni_w_d(_1, _2, 1); } // CHECK-LABEL: @vssrarni_d_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrarni.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrarni.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vssrarni_d_q(v2i64 _1, v2i64 _2) { return __builtin_lsx_vssrarni_d_q(_1, _2, 1); } // CHECK-LABEL: @vssrarni_bu_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarni.bu.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarni.bu.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16u8 vssrarni_bu_h(v16u8 _1, v16i8 _2) { return __builtin_lsx_vssrarni_bu_h(_1, _2, 1); } // CHECK-LABEL: @vssrarni_hu_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrarni.hu.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrarni.hu.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8u16 vssrarni_hu_w(v8u16 _1, v8i16 _2) { return __builtin_lsx_vssrarni_hu_w(_1, _2, 1); } // CHECK-LABEL: @vssrarni_wu_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> 
@llvm.loongarch.lsx.vssrarni.wu.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrarni.wu.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4u32 vssrarni_wu_d(v4u32 _1, v4i32 _2) { return __builtin_lsx_vssrarni_wu_d(_1, _2, 1); } // CHECK-LABEL: @vssrarni_du_q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrarni.du.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrarni.du.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2u64 vssrarni_du_q(v2u64 _1, v2i64 _2) { return __builtin_lsx_vssrarni_du_q(_1, _2, 1); } // CHECK-LABEL: @vpermi_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpermi.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpermi.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vpermi_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vpermi_w(_1, _2, 1); @@ -4660,79 +6390,107 @@ v4i32 vpermi_w(v4i32 _1, v4i32 _2) { // CHECK-LABEL: @vld( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vld(ptr [[_1:%.*]], i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to i128 +// CHECK-NEXT: ret i128 [[TMP1]] // v16i8 vld(void *_1) { return __builtin_lsx_vld(_1, 1); } // CHECK-LABEL: @vst( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vst(<16 x i8> [[_1:%.*]], ptr [[_2:%.*]], i32 1) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vst(<16 x i8> [[TMP0]], ptr [[_2:%.*]], i32 1) // CHECK-NEXT: ret void // void vst(v16i8 _1, void *_2) { return __builtin_lsx_vst(_1, _2, 1); } // CHECK-LABEL: @vssrlrn_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrn.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrn.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vssrlrn_b_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vssrlrn_b_h(_1, _2); } // CHECK-LABEL: @vssrlrn_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: 
[[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrn.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrn.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vssrlrn_h_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vssrlrn_h_w(_1, _2); } // CHECK-LABEL: @vssrlrn_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlrn.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlrn.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vssrlrn_w_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vssrlrn_w_d(_1, _2); } // CHECK-LABEL: @vssrln_b_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrln.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrln.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vssrln_b_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vssrln_b_h(_1, _2); } // CHECK-LABEL: @vssrln_h_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrln.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]]) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrln.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v8i16 vssrln_h_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vssrln_h_w(_1, _2); } // CHECK-LABEL: @vssrln_w_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrln.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrln.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vssrln_w_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vssrln_w_d(_1, _2); } // CHECK-LABEL: @vorn_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vorn.v(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast 
i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vorn.v(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v16i8 vorn_v(v16i8 _1, v16i8 _2) { return __builtin_lsx_vorn_v(_1, _2); } // CHECK-LABEL: @vldi( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vldi(i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to i128 +// CHECK-NEXT: ret i128 [[TMP1]] // v2i64 vldi() { return __builtin_lsx_vldi(1); } // CHECK-LABEL: @vshuf_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vshuf.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]]) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vshuf.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +// CHECK-NEXT: ret i128 [[TMP4]] // v16i8 vshuf_b(v16i8 _1, v16i8 _2, v16i8 _3) { return __builtin_lsx_vshuf_b(_1, _2, _3); @@ -4740,429 +6498,575 @@ v16i8 vshuf_b(v16i8 _1, v16i8 _2, v16i8 _3) { // CHECK-LABEL: @vldx( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vldx(ptr [[_1:%.*]], i64 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to i128 +// CHECK-NEXT: ret i128 [[TMP1]] // v16i8 vldx(void *_1) { return __builtin_lsx_vldx(_1, 1); } // CHECK-LABEL: @vstx( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstx(<16 x i8> [[_1:%.*]], ptr [[_2:%.*]], i64 1) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstx(<16 x i8> [[TMP0]], ptr [[_2:%.*]], i64 1) // CHECK-NEXT: ret void // void vstx(v16i8 _1, void *_2) { return __builtin_lsx_vstx(_1, _2, 1); } // CHECK-LABEL: @vextl_qu_du( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vextl.qu.du(<2 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vextl.qu.du(<2 x i64> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] // v2u64 vextl_qu_du(v2u64 _1) { return __builtin_lsx_vextl_qu_du(_1); } // CHECK-LABEL: @bnz_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.b(<16 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.b(<16 x i8> [[TMP0]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int bnz_b(v16u8 _1) { return __builtin_lsx_bnz_b(_1); } // CHECK-LABEL: @bnz_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.d(<2 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] 
to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.d(<2 x i64> [[TMP0]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int bnz_d(v2u64 _1) { return __builtin_lsx_bnz_d(_1); } // CHECK-LABEL: @bnz_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.h(<8 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.h(<8 x i16> [[TMP0]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int bnz_h(v8u16 _1) { return __builtin_lsx_bnz_h(_1); } // CHECK-LABEL: @bnz_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.v(<16 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.v(<16 x i8> [[TMP0]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int bnz_v(v16u8 _1) { return __builtin_lsx_bnz_v(_1); } // CHECK-LABEL: @bnz_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.w(<4 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.w(<4 x i32> [[TMP0]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int bnz_w(v4u32 _1) { return __builtin_lsx_bnz_w(_1); } // CHECK-LABEL: @bz_b( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.b(<16 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.b(<16 x i8> [[TMP0]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int bz_b(v16u8 _1) { return __builtin_lsx_bz_b(_1); } // CHECK-LABEL: @bz_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.d(<2 x i64> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.d(<2 x i64> [[TMP0]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int bz_d(v2u64 _1) { return __builtin_lsx_bz_d(_1); } // CHECK-LABEL: @bz_h( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.h(<8 x i16> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.h(<8 x i16> [[TMP0]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int bz_h(v8u16 _1) { return __builtin_lsx_bz_h(_1); } // CHECK-LABEL: @bz_v( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.v(<16 x i8> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.v(<16 x i8> [[TMP0]]) +// CHECK-NEXT: ret i32 [[TMP1]] // int bz_v(v16u8 _1) { return __builtin_lsx_bz_v(_1); } // CHECK-LABEL: @bz_w( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.w(<4 x i32> [[_1:%.*]]) -// CHECK-NEXT: ret i32 [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.w(<4 x i32> [[TMP0]]) +// CHECK-NEXT: ret i32 [[TMP1]] 
// int bz_w(v4u32 _1) { return __builtin_lsx_bz_w(_1); } // CHECK-LABEL: @vfcmp_caf_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.caf.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.caf.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_caf_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vfcmp_caf_d(_1, _2); } // CHECK-LABEL: @vfcmp_caf_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.caf.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.caf.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_caf_s(v4f32 _1, v4f32 _2) { return __builtin_lsx_vfcmp_caf_s(_1, _2); } // CHECK-LABEL: @vfcmp_ceq_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.ceq.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.ceq.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_ceq_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vfcmp_ceq_d(_1, _2); } // CHECK-LABEL: @vfcmp_ceq_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.ceq.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.ceq.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_ceq_s(v4f32 _1, v4f32 _2) { return __builtin_lsx_vfcmp_ceq_s(_1, _2); } // CHECK-LABEL: @vfcmp_cle_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cle.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cle.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_cle_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vfcmp_cle_d(_1, _2); } // CHECK-LABEL: @vfcmp_cle_s( 
// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cle.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cle.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_cle_s(v4f32 _1, v4f32 _2) { return __builtin_lsx_vfcmp_cle_s(_1, _2); } // CHECK-LABEL: @vfcmp_clt_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.clt.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.clt.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_clt_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vfcmp_clt_d(_1, _2); } // CHECK-LABEL: @vfcmp_clt_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.clt.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.clt.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_clt_s(v4f32 _1, v4f32 _2) { return __builtin_lsx_vfcmp_clt_s(_1, _2); } // CHECK-LABEL: @vfcmp_cne_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cne.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cne.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_cne_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vfcmp_cne_d(_1, _2); } // CHECK-LABEL: @vfcmp_cne_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cne.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cne.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_cne_s(v4f32 _1, v4f32 _2) { return __builtin_lsx_vfcmp_cne_s(_1, _2); } // CHECK-LABEL: @vfcmp_cor_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> 
@llvm.loongarch.lsx.vfcmp.cor.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cor.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_cor_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vfcmp_cor_d(_1, _2); } // CHECK-LABEL: @vfcmp_cor_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cor.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cor.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_cor_s(v4f32 _1, v4f32 _2) { return __builtin_lsx_vfcmp_cor_s(_1, _2); } // CHECK-LABEL: @vfcmp_cueq_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cueq.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cueq.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_cueq_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vfcmp_cueq_d(_1, _2); } // CHECK-LABEL: @vfcmp_cueq_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cueq.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cueq.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_cueq_s(v4f32 _1, v4f32 _2) { return __builtin_lsx_vfcmp_cueq_s(_1, _2); } // CHECK-LABEL: @vfcmp_cule_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cule.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cule.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_cule_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vfcmp_cule_d(_1, _2); } // CHECK-LABEL: @vfcmp_cule_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cule.s(<4 x float> [[_1:%.*]], <4 x float> 
[[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cule.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_cule_s(v4f32 _1, v4f32 _2) { return __builtin_lsx_vfcmp_cule_s(_1, _2); } // CHECK-LABEL: @vfcmp_cult_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cult.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cult.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_cult_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vfcmp_cult_d(_1, _2); } // CHECK-LABEL: @vfcmp_cult_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cult.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cult.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_cult_s(v4f32 _1, v4f32 _2) { return __builtin_lsx_vfcmp_cult_s(_1, _2); } // CHECK-LABEL: @vfcmp_cun_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cun.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cun.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_cun_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vfcmp_cun_d(_1, _2); } // CHECK-LABEL: @vfcmp_cune_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cune.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cune.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_cune_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vfcmp_cune_d(_1, _2); } // CHECK-LABEL: @vfcmp_cune_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cune.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: 
[[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cune.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_cune_s(v4f32 _1, v4f32 _2) { return __builtin_lsx_vfcmp_cune_s(_1, _2); } // CHECK-LABEL: @vfcmp_cun_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cun.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cun.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_cun_s(v4f32 _1, v4f32 _2) { return __builtin_lsx_vfcmp_cun_s(_1, _2); } // CHECK-LABEL: @vfcmp_saf_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.saf.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.saf.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_saf_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vfcmp_saf_d(_1, _2); } // CHECK-LABEL: @vfcmp_saf_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.saf.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.saf.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_saf_s(v4f32 _1, v4f32 _2) { return __builtin_lsx_vfcmp_saf_s(_1, _2); } // CHECK-LABEL: @vfcmp_seq_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.seq.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.seq.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_seq_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vfcmp_seq_d(_1, _2); } // CHECK-LABEL: @vfcmp_seq_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.seq.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] 
= bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.seq.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_seq_s(v4f32 _1, v4f32 _2) { return __builtin_lsx_vfcmp_seq_s(_1, _2); } // CHECK-LABEL: @vfcmp_sle_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sle.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sle.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_sle_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vfcmp_sle_d(_1, _2); } // CHECK-LABEL: @vfcmp_sle_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sle.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sle.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_sle_s(v4f32 _1, v4f32 _2) { return __builtin_lsx_vfcmp_sle_s(_1, _2); } // CHECK-LABEL: @vfcmp_slt_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.slt.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.slt.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_slt_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vfcmp_slt_d(_1, _2); } // CHECK-LABEL: @vfcmp_slt_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.slt.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.slt.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_slt_s(v4f32 _1, v4f32 _2) { return __builtin_lsx_vfcmp_slt_s(_1, _2); } // CHECK-LABEL: @vfcmp_sne_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sne.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
<2 x i64> @llvm.loongarch.lsx.vfcmp.sne.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_sne_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vfcmp_sne_d(_1, _2); } // CHECK-LABEL: @vfcmp_sne_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sne.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sne.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_sne_s(v4f32 _1, v4f32 _2) { return __builtin_lsx_vfcmp_sne_s(_1, _2); } // CHECK-LABEL: @vfcmp_sor_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sor.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sor.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_sor_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vfcmp_sor_d(_1, _2); } // CHECK-LABEL: @vfcmp_sor_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sor.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sor.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_sor_s(v4f32 _1, v4f32 _2) { return __builtin_lsx_vfcmp_sor_s(_1, _2); } // CHECK-LABEL: @vfcmp_sueq_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sueq.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sueq.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_sueq_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vfcmp_sueq_d(_1, _2); } // CHECK-LABEL: @vfcmp_sueq_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sueq.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sueq.s(<4 x float> [[TMP0]], <4 x float> 
[[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_sueq_s(v4f32 _1, v4f32 _2) { return __builtin_lsx_vfcmp_sueq_s(_1, _2); } // CHECK-LABEL: @vfcmp_sule_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sule.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sule.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_sule_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vfcmp_sule_d(_1, _2); } // CHECK-LABEL: @vfcmp_sule_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sule.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sule.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_sule_s(v4f32 _1, v4f32 _2) { return __builtin_lsx_vfcmp_sule_s(_1, _2); } // CHECK-LABEL: @vfcmp_sult_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sult.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sult.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_sult_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vfcmp_sult_d(_1, _2); } // CHECK-LABEL: @vfcmp_sult_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sult.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sult.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_sult_s(v4f32 _1, v4f32 _2) { return __builtin_lsx_vfcmp_sult_s(_1, _2); } // CHECK-LABEL: @vfcmp_sun_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sun.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sun.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> 
[[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_sun_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vfcmp_sun_d(_1, _2); } // CHECK-LABEL: @vfcmp_sune_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sune.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]]) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sune.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v2i64 vfcmp_sune_d(v2f64 _1, v2f64 _2) { return __builtin_lsx_vfcmp_sune_d(_1, _2); } // CHECK-LABEL: @vfcmp_sune_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sune.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sune.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_sune_s(v4f32 _1, v4f32 _2) { return __builtin_lsx_vfcmp_sune_s(_1, _2); } // CHECK-LABEL: @vfcmp_sun_s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sun.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sun.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +// CHECK-NEXT: ret i128 [[TMP3]] // v4i32 vfcmp_sun_s(v4f32 _1, v4f32 _2) { return __builtin_lsx_vfcmp_sun_s(_1, _2); @@ -5170,24 +7074,28 @@ v4i32 vfcmp_sun_s(v4f32 _1, v4f32 _2) { // CHECK-LABEL: @vrepli_b( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vrepli.b(i32 1) -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to i128 +// CHECK-NEXT: ret i128 [[TMP1]] // v16i8 vrepli_b() { return __builtin_lsx_vrepli_b(1); } // CHECK-LABEL: @vrepli_d( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vrepli.d(i32 1) -// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to i128 +// CHECK-NEXT: ret i128 [[TMP1]] // v2i64 vrepli_d() { return __builtin_lsx_vrepli_d(1); } // CHECK-LABEL: @vrepli_h( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vrepli.h(i32 1) -// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to i128 +// CHECK-NEXT: ret i128 [[TMP1]] // v8i16 vrepli_h() { return __builtin_lsx_vrepli_h(1); } // CHECK-LABEL: @vrepli_w( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vrepli.w(i32 1) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +// CHECK-NEXT: ret i128 
[[TMP1]] // v4i32 vrepli_w() { return __builtin_lsx_vrepli_w(1); } From 9e1ad3cff6a855fdfdc1d91323e2021726da04ea Mon Sep 17 00:00:00 2001 From: Jim Lin Date: Sun, 31 Dec 2023 22:17:12 +0800 Subject: [PATCH 002/313] [RISCV] Remove blank lines at the end of testcases. NFC. --- llvm/test/CodeGen/RISCV/bittest.ll | 1 - llvm/test/CodeGen/RISCV/compress-inline-asm.ll | 1 - llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll | 1 - llvm/test/CodeGen/RISCV/div_minsize.ll | 1 - llvm/test/CodeGen/RISCV/double-select-icmp.ll | 1 - llvm/test/CodeGen/RISCV/float-imm.ll | 1 - llvm/test/CodeGen/RISCV/float-select-verify.ll | 1 - llvm/test/CodeGen/RISCV/fmax-fmin.ll | 1 - llvm/test/CodeGen/RISCV/half-select-icmp.ll | 1 - llvm/test/CodeGen/RISCV/init-array.ll | 1 - llvm/test/CodeGen/RISCV/neg-abs.ll | 1 - llvm/test/CodeGen/RISCV/pr63816.ll | 1 - llvm/test/CodeGen/RISCV/reduction-formation.ll | 1 - llvm/test/CodeGen/RISCV/rv32xtheadba.ll | 1 - llvm/test/CodeGen/RISCV/rv32xtheadbs.ll | 1 - llvm/test/CodeGen/RISCV/rv64-legal-i32/xaluo.ll | 1 - llvm/test/CodeGen/RISCV/rv64-patchpoint.ll | 1 - llvm/test/CodeGen/RISCV/rv64i-double-softfloat.ll | 1 - llvm/test/CodeGen/RISCV/rv64xtheadba.ll | 1 - llvm/test/CodeGen/RISCV/rv64xtheadbs.ll | 1 - llvm/test/CodeGen/RISCV/rvv/binop-splats.ll | 1 - .../CodeGen/RISCV/rvv/concat-vector-insert-elt.ll | 1 - .../CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll | 1 - .../RISCV/rvv/fixed-vectors-reduction-formation.ll | 2 -- .../RISCV/rvv/fixed-vectors-strided-load-store.ll | 1 - llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll | 1 - .../CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll | 1 - llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll | 1 - .../test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-x.ll | 1 - .../CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-xv.ll | 1 - .../CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-xvv.ll | 1 - .../CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-xvw.ll | 1 - llvm/test/CodeGen/RISCV/rvv/fptoui-sat.ll | 1 - llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv32.ll | 1 - llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv64.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vaesdf.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vaesdm.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vaesef.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vaesem.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vaesz.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vector-splice.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vexts-sdnode.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vmarith-sdnode.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vmax-sdnode.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vmin-sdnode.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vmul-sdnode.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vrem-sdnode.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vrsub-sdnode.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vsm4r.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vsplats-i1.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vtruncs-sdnode.ll | 1 - llvm/test/CodeGen/RISCV/rvv/xsfvcp-x.ll | 1 - llvm/test/CodeGen/RISCV/rvv/xsfvcp-xv.ll | 1 - llvm/test/CodeGen/RISCV/rvv/xsfvcp-xvv.ll | 1 - llvm/test/CodeGen/RISCV/rvv/xsfvcp-xvw.ll | 1 - llvm/test/CodeGen/RISCV/saverestore-scs.ll | 1 - llvm/test/CodeGen/RISCV/split-urem-by-constant.ll | 1 - llvm/test/CodeGen/RISCV/switch-width.ll | 1 - llvm/test/CodeGen/RISCV/unroll-loop-cse.ll | 1 - llvm/test/CodeGen/RISCV/xaluo.ll | 1 - llvm/test/CodeGen/RISCV/zbb-cmp-combine.ll | 1 - 
.../InterleavedAccess/RISCV/interleaved-accesses.ll | 12 ------------ 66 files changed, 78 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/bittest.ll b/llvm/test/CodeGen/RISCV/bittest.ll index f560a112dd92b..a05c518bbf3a0 100644 --- a/llvm/test/CodeGen/RISCV/bittest.ll +++ b/llvm/test/CodeGen/RISCV/bittest.ll @@ -3521,4 +3521,3 @@ define void @bit_64_1_nz_branch_i64(i64 %0) { 5: ret void } - diff --git a/llvm/test/CodeGen/RISCV/compress-inline-asm.ll b/llvm/test/CodeGen/RISCV/compress-inline-asm.ll index 848b1ad70c0e6..bb84ec6704a1b 100644 --- a/llvm/test/CodeGen/RISCV/compress-inline-asm.ll +++ b/llvm/test/CodeGen/RISCV/compress-inline-asm.ll @@ -12,4 +12,3 @@ define i32 @compress_test(i32 %a) { %2 = tail call i32 asm "add $0, $1, $2", "=r,r,r"(i32 %a, i32 %1) ret i32 %2 } - diff --git a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll index 02072b3e4e5ca..9bfd30daf7d4e 100644 --- a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll +++ b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll @@ -978,4 +978,3 @@ entry: declare i64 @llvm.cttz.i64(i64, i1 immarg) declare i32 @llvm.cttz.i32(i32, i1 immarg) declare i64 @llvm.ctlz.i64(i64, i1 immarg) - diff --git a/llvm/test/CodeGen/RISCV/div_minsize.ll b/llvm/test/CodeGen/RISCV/div_minsize.ll index 6bc3bc7134e73..601821b932a52 100644 --- a/llvm/test/CodeGen/RISCV/div_minsize.ll +++ b/llvm/test/CodeGen/RISCV/div_minsize.ll @@ -68,4 +68,3 @@ define i32 @testsize4(i32 %x) minsize nounwind { %div = udiv i32 %x, 33 ret i32 %div } - diff --git a/llvm/test/CodeGen/RISCV/double-select-icmp.ll b/llvm/test/CodeGen/RISCV/double-select-icmp.ll index cd28b589ff76c..d864ff51b4669 100644 --- a/llvm/test/CodeGen/RISCV/double-select-icmp.ll +++ b/llvm/test/CodeGen/RISCV/double-select-icmp.ll @@ -508,4 +508,3 @@ define double @select_icmp_sgt_zero(i32 signext %a) { %2 = select i1 %1, double 0.000000e+00, double 1.000000e+00 ret double %2 } - diff --git a/llvm/test/CodeGen/RISCV/float-imm.ll b/llvm/test/CodeGen/RISCV/float-imm.ll index a25955e2ef349..c38416d994ba5 100644 --- a/llvm/test/CodeGen/RISCV/float-imm.ll +++ b/llvm/test/CodeGen/RISCV/float-imm.ll @@ -73,4 +73,3 @@ define float @float_negative_zero(ptr %pf) nounwind { ; CHECKZFINX-NEXT: ret ret float -0.0 } - diff --git a/llvm/test/CodeGen/RISCV/float-select-verify.ll b/llvm/test/CodeGen/RISCV/float-select-verify.ll index 6f414f16163b6..b38560fa41364 100644 --- a/llvm/test/CodeGen/RISCV/float-select-verify.ll +++ b/llvm/test/CodeGen/RISCV/float-select-verify.ll @@ -90,4 +90,3 @@ declare void @foo(i64) declare void @bar(float) declare float @llvm.round.f32(float) - diff --git a/llvm/test/CodeGen/RISCV/fmax-fmin.ll b/llvm/test/CodeGen/RISCV/fmax-fmin.ll index aac5e5efc2cf5..b67093d72439c 100644 --- a/llvm/test/CodeGen/RISCV/fmax-fmin.ll +++ b/llvm/test/CodeGen/RISCV/fmax-fmin.ll @@ -304,4 +304,3 @@ declare float @llvm.maxnum.f32(float, float) declare double @llvm.maxnum.f64(double, double) declare float @llvm.minnum.f32(float, float) declare double @llvm.minnum.f64(double, double) - diff --git a/llvm/test/CodeGen/RISCV/half-select-icmp.ll b/llvm/test/CodeGen/RISCV/half-select-icmp.ll index a2da15e1fe165..0e4030049e919 100644 --- a/llvm/test/CodeGen/RISCV/half-select-icmp.ll +++ b/llvm/test/CodeGen/RISCV/half-select-icmp.ll @@ -537,4 +537,3 @@ define half @select_icmp_sgt_zero(i32 signext %a) { %2 = select i1 %1, half 0.000000e+00, half 1.000000e+00 ret half %2 } - diff --git a/llvm/test/CodeGen/RISCV/init-array.ll b/llvm/test/CodeGen/RISCV/init-array.ll index 
7935f320e803f..b514e6fe8f4e0 100644 --- a/llvm/test/CodeGen/RISCV/init-array.ll +++ b/llvm/test/CodeGen/RISCV/init-array.ll @@ -27,4 +27,3 @@ define internal void @_GLOBAL__I_a() section ".text.startup" { ;CTOR: .section .ctors ;CTOR-NOT: section .init_array - diff --git a/llvm/test/CodeGen/RISCV/neg-abs.ll b/llvm/test/CodeGen/RISCV/neg-abs.ll index 06be6cbd96410..6f301882b452c 100644 --- a/llvm/test/CodeGen/RISCV/neg-abs.ll +++ b/llvm/test/CodeGen/RISCV/neg-abs.ll @@ -256,4 +256,3 @@ define i64 @neg_abs64_multiuse(i64 %x, ptr %y) { %neg = sub nsw i64 0, %abs ret i64 %neg } - diff --git a/llvm/test/CodeGen/RISCV/pr63816.ll b/llvm/test/CodeGen/RISCV/pr63816.ll index 21730dfcf13bc..6eaec08abb909 100644 --- a/llvm/test/CodeGen/RISCV/pr63816.ll +++ b/llvm/test/CodeGen/RISCV/pr63816.ll @@ -80,4 +80,3 @@ define void @test(ptr %0, ptr %1) nounwind { store <8 x double> %V2, ptr %1 ret void } - diff --git a/llvm/test/CodeGen/RISCV/reduction-formation.ll b/llvm/test/CodeGen/RISCV/reduction-formation.ll index b2dea4237f5a5..6b4dc0cd3699e 100644 --- a/llvm/test/CodeGen/RISCV/reduction-formation.ll +++ b/llvm/test/CodeGen/RISCV/reduction-formation.ll @@ -100,4 +100,3 @@ define i32 @reduce_or_4xi32(<4 x i32> %v) { %or2 = or i32 %or1, %e3 ret i32 %or2 } - diff --git a/llvm/test/CodeGen/RISCV/rv32xtheadba.ll b/llvm/test/CodeGen/RISCV/rv32xtheadba.ll index 6d302fcfc78d3..3bf7704dd1836 100644 --- a/llvm/test/CodeGen/RISCV/rv32xtheadba.ll +++ b/llvm/test/CodeGen/RISCV/rv32xtheadba.ll @@ -320,4 +320,3 @@ define i32 @mul288(i32 %a) { %c = mul i32 %a, 288 ret i32 %c } - diff --git a/llvm/test/CodeGen/RISCV/rv32xtheadbs.ll b/llvm/test/CodeGen/RISCV/rv32xtheadbs.ll index 1c8727668f685..34301ba5aadc9 100644 --- a/llvm/test/CodeGen/RISCV/rv32xtheadbs.ll +++ b/llvm/test/CodeGen/RISCV/rv32xtheadbs.ll @@ -73,4 +73,3 @@ define i64 @th_tst_i64_cmp(i64 %a) nounwind { %zext = zext i1 %cmp to i64 ret i64 %zext } - diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/xaluo.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/xaluo.ll index 20eb4cc10f405..62ea25218e2d7 100644 --- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/xaluo.ll +++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/xaluo.ll @@ -1309,4 +1309,3 @@ declare {i32, i1} @llvm.smul.with.overflow.i32(i32, i32) nounwind readnone declare {i64, i1} @llvm.smul.with.overflow.i64(i64, i64) nounwind readnone declare {i32, i1} @llvm.umul.with.overflow.i32(i32, i32) nounwind readnone declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone - diff --git a/llvm/test/CodeGen/RISCV/rv64-patchpoint.ll b/llvm/test/CodeGen/RISCV/rv64-patchpoint.ll index 3fbd13f1f259b..d2a3bccfef7bb 100644 --- a/llvm/test/CodeGen/RISCV/rv64-patchpoint.ll +++ b/llvm/test/CodeGen/RISCV/rv64-patchpoint.ll @@ -21,4 +21,3 @@ entry: declare void @llvm.experimental.stackmap(i64, i32, ...) declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...) declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...) 
- diff --git a/llvm/test/CodeGen/RISCV/rv64i-double-softfloat.ll b/llvm/test/CodeGen/RISCV/rv64i-double-softfloat.ll index 70a4c1726ca37..25278caee64d7 100644 --- a/llvm/test/CodeGen/RISCV/rv64i-double-softfloat.ll +++ b/llvm/test/CodeGen/RISCV/rv64i-double-softfloat.ll @@ -55,4 +55,3 @@ entry: %conv = tail call i32 @llvm.experimental.constrained.fptosi.i32.f64(double %a, metadata !"fpexcept.strict") ret i32 %conv } - diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll index ed2b159d0ae7b..6f56babf28f5e 100644 --- a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll +++ b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll @@ -316,4 +316,3 @@ define i64 @mul288(i64 %a) { %c = mul i64 %a, 288 ret i64 %c } - diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadbs.ll b/llvm/test/CodeGen/RISCV/rv64xtheadbs.ll index f36c618af199b..4e20594f2b299 100644 --- a/llvm/test/CodeGen/RISCV/rv64xtheadbs.ll +++ b/llvm/test/CodeGen/RISCV/rv64xtheadbs.ll @@ -69,4 +69,3 @@ define i64 @th_tst_i64_cmp(i64 %a) nounwind { %zext = zext i1 %cmp to i64 ret i64 %zext } - diff --git a/llvm/test/CodeGen/RISCV/rvv/binop-splats.ll b/llvm/test/CodeGen/RISCV/rvv/binop-splats.ll index f9e6f5fe06df9..6875925adad83 100644 --- a/llvm/test/CodeGen/RISCV/rvv/binop-splats.ll +++ b/llvm/test/CodeGen/RISCV/rvv/binop-splats.ll @@ -619,4 +619,3 @@ define @nxv2f64(double %x, double %y) { %v = fadd %splat.x, %splat.y ret %v } - diff --git a/llvm/test/CodeGen/RISCV/rvv/concat-vector-insert-elt.ll b/llvm/test/CodeGen/RISCV/rvv/concat-vector-insert-elt.ll index 1e83b0b0dea86..bd65ed52be680 100644 --- a/llvm/test/CodeGen/RISCV/rvv/concat-vector-insert-elt.ll +++ b/llvm/test/CodeGen/RISCV/rvv/concat-vector-insert-elt.ll @@ -219,4 +219,3 @@ define void @v4xi64_concat_vector_insert_idx3(ptr %a, ptr %b, i64 %x) { store <4 x i64> %ins, ptr %a ret void } - diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll index 5d88bc02f1f45..ee8c322961c7b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll @@ -636,4 +636,3 @@ define <1 x double> @v2f64(double %x, double %y) { %v = fadd <1 x double> %splat.x, %splat.y ret <1 x double> %v } - diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll index fd4a54b468f15..2a0ec47a3de01 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll @@ -982,5 +982,3 @@ define float @reduce_fadd_4xi32_non_associative2(ptr %p) { %fadd2 = fadd fast float %fadd1, %e3 ret float %fadd2 } - - diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll index b359f71be0e67..72f2447e80c24 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll @@ -1006,4 +1006,3 @@ vector.body: ; preds = %vector.body, %entry for.cond.cleanup: ; preds = %vector.body ret void } - diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll index 32ef08101407d..65776339de07a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll @@ -1037,4 +1037,3 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x 
double> %va, <32 x double> % %v = call <32 x double> @llvm.vp.fma.v32f64(<32 x double> %va, <32 x double> %b, <32 x double> %c, <32 x i1> %m, i32 %evl) ret <32 x double> %v } - diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll index e05d6b1525eee..28ab179048ca1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll @@ -801,4 +801,3 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> % %v = call <32 x double> @llvm.vp.fmuladd.v32f64(<32 x double> %va, <32 x double> %b, <32 x double> %c, <32 x i1> %m, i32 %evl) ret <32 x double> %v } - diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll index 2453e5423e13f..d57c012fec1ea 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll @@ -923,4 +923,3 @@ define <2 x i64> @vwmulu_vx_v2i64_i64(ptr %x, ptr %y) { %g = mul <2 x i64> %e, %f ret <2 x i64> %g } - diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-x.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-x.ll index 4c5b5cfbd9d96..7a20dd1d32b7e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-x.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-x.ll @@ -1849,4 +1849,3 @@ entry: } declare <16 x float> @llvm.riscv.sf.vc.v.i.se.nxv16f32.iXLen.iXLen(iXLen, iXLen, iXLen, iXLen) - diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-xv.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-xv.ll index e845dffecff8e..b553a62ae496a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-xv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-xv.ll @@ -3669,4 +3669,3 @@ entry: } declare <16 x float> @llvm.riscv.sf.vc.v.fv.se.nxv16f32.nxv16f32.iXLen.f32(iXLen, <16 x float>, float, iXLen) - diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-xvv.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-xvv.ll index 2a9153fc0f82d..44ffffc7e59d5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-xvv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-xvv.ll @@ -3669,4 +3669,3 @@ entry: } declare <16 x float> @llvm.riscv.sf.vc.v.fvv.se.nxv16f32.nxv16f32.nxv16i32.f32.iXLen(iXLen, <16 x float>, <16 x i32>, float %rs1, iXLen) - diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-xvw.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-xvw.ll index ec34791e6572a..ea6b936843c2f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-xvw.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-xvw.ll @@ -2694,4 +2694,3 @@ entry: } declare <8 x double> @llvm.riscv.sf.vc.v.fvw.se.nxv8f64.nxv8f32.nxv8i32.f32.iXLen(iXLen, <8 x double>, <8 x i32>, float, iXLen) - diff --git a/llvm/test/CodeGen/RISCV/rvv/fptoui-sat.ll b/llvm/test/CodeGen/RISCV/rvv/fptoui-sat.ll index 242034f9826cb..52322f64416ce 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fptoui-sat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fptoui-sat.ll @@ -342,4 +342,3 @@ define @test_signed_v4f16_v4i64( %f) { %x = call @llvm.fptoui.sat.nxv4f16.nxv4i64( %f) ret %x } - diff --git a/llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv32.ll index d8b312e9def43..ef5ad1f651fa9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv32.ll @@ -726,4 +726,3 @@ define 
@trunc_nxv8i64_nxv8i1( %v) { %r = trunc %v to ret %r } - diff --git a/llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv64.ll index aae935e099fe4..aaac0c7b5a41a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv64.ll @@ -726,4 +726,3 @@ define @trunc_nxv8i64_nxv8i1( %v) { %r = trunc %v to ret %r } - diff --git a/llvm/test/CodeGen/RISCV/rvv/vaesdf.ll b/llvm/test/CodeGen/RISCV/rvv/vaesdf.ll index 1ad30fa264e0c..9d394a1ee3ff7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vaesdf.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vaesdf.ll @@ -123,4 +123,3 @@ entry: ret %a } - diff --git a/llvm/test/CodeGen/RISCV/rvv/vaesdm.ll b/llvm/test/CodeGen/RISCV/rvv/vaesdm.ll index b7a4a11663310..f21bdcac032f7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vaesdm.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vaesdm.ll @@ -123,4 +123,3 @@ entry: ret %a } - diff --git a/llvm/test/CodeGen/RISCV/rvv/vaesef.ll b/llvm/test/CodeGen/RISCV/rvv/vaesef.ll index bd8a7cb94ab0f..ee11786583d7f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vaesef.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vaesef.ll @@ -123,4 +123,3 @@ entry: ret %a } - diff --git a/llvm/test/CodeGen/RISCV/rvv/vaesem.ll b/llvm/test/CodeGen/RISCV/rvv/vaesem.ll index 21b5e5942e319..65486e119842b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vaesem.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vaesem.ll @@ -123,4 +123,3 @@ entry: ret %a } - diff --git a/llvm/test/CodeGen/RISCV/rvv/vaesz.ll b/llvm/test/CodeGen/RISCV/rvv/vaesz.ll index ee089f1e77cc3..2453119ce92d3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vaesz.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vaesz.ll @@ -63,4 +63,3 @@ entry: ret %a } - diff --git a/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll index a1b1c015369c8..f9c53c93472a6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll @@ -2003,4 +2003,3 @@ define @vandn_vx_swapped_nxv8i64(i64 %x, % %b = and %splat, %y ret %b } - diff --git a/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll index 4ff6e5660b25d..f076c3c621cdb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll @@ -1429,4 +1429,3 @@ define @vandn_vx_vp_nxv8i64(i64 %a, %b, @llvm.vp.and.nxv8i64( %b, %splat.not.a, %mask, i32 %evl) ret %x } - diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll index f68a15a0d0149..c98242437f629 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll @@ -2370,4 +2370,3 @@ define @splice_nxv8f64_offset_max( %a } attributes #0 = { vscale_range(2,0) } - diff --git a/llvm/test/CodeGen/RISCV/rvv/vexts-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vexts-sdnode.ll index 9f32efd5027d7..c2ac155304dd5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vexts-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vexts-sdnode.ll @@ -617,4 +617,3 @@ define @vzext_nxv8i32_nxv8i64( %va) { %evec = zext %va to ret %evec } - diff --git a/llvm/test/CodeGen/RISCV/rvv/vmarith-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vmarith-sdnode.ll index 1278f7790a063..d243c89958c54 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmarith-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmarith-sdnode.ll @@ -476,4 +476,3 @@ define @vmorn_vv_nxv16i1( %va, %va, %not ret %vc } - diff --git a/llvm/test/CodeGen/RISCV/rvv/vmax-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vmax-sdnode.ll index 
b4041f0b67839..1247f3d29c099 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmax-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmax-sdnode.ll @@ -889,4 +889,3 @@ define @vmax_vi_nxv8i64_0( %va) { %vc = select %cmp, %va, %splat ret %vc } - diff --git a/llvm/test/CodeGen/RISCV/rvv/vmin-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vmin-sdnode.ll index 5398803b24899..7405282a07b70 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmin-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmin-sdnode.ll @@ -889,4 +889,3 @@ define @vmin_vi_nxv8i64_0( %va) { %vc = select %cmp, %va, %splat ret %vc } - diff --git a/llvm/test/CodeGen/RISCV/rvv/vmul-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vmul-sdnode.ll index f1f7cdd47626c..3e14058210e51 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmul-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmul-sdnode.ll @@ -1004,4 +1004,3 @@ define @vmul_vi_mask_nxv8i32( %va, %va, %vs ret %vc } - diff --git a/llvm/test/CodeGen/RISCV/rvv/vrem-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vrem-sdnode.ll index 58874fe8c8fca..1eafb3bdfed2c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vrem-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vrem-sdnode.ll @@ -1299,4 +1299,3 @@ define @vrem_vi_nxv8i64_0( %va) { %vc = srem %va, %splat ret %vc } - diff --git a/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll index fc6af87e473ef..4ea5a6709db5c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll @@ -1154,4 +1154,3 @@ define @vrol_vx_nxv8i64( %a, i64 %b) { %x = call @llvm.fshl.nxv8i64( %a, %a, %b.splat) ret %x } - diff --git a/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll index 13a584a673287..b8a091b242591 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll @@ -1976,4 +1976,3 @@ define @vror_vi_rotl_nxv8i64( %a) { %x = call @llvm.fshl.nxv8i64( %a, %a, shufflevector( insertelement( poison, i64 1, i32 0), poison, zeroinitializer)) ret %x } - diff --git a/llvm/test/CodeGen/RISCV/rvv/vrsub-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vrsub-sdnode.ll index 43705631e545a..127d3ffd79315 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vrsub-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vrsub-sdnode.ll @@ -581,4 +581,3 @@ define @vrsub_vi_nxv8i64_0( %va) { %vc = sub %splat, %va ret %vc } - diff --git a/llvm/test/CodeGen/RISCV/rvv/vsm4r.ll b/llvm/test/CodeGen/RISCV/rvv/vsm4r.ll index cb836596fdfd1..bcea335deefa7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsm4r.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsm4r.ll @@ -123,4 +123,3 @@ entry: ret %a } - diff --git a/llvm/test/CodeGen/RISCV/rvv/vsplats-i1.ll b/llvm/test/CodeGen/RISCV/rvv/vsplats-i1.ll index 7b8c2ae3e29bc..f462f242f0685 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsplats-i1.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsplats-i1.ll @@ -209,4 +209,3 @@ define @splat_idx_nxv4i32( %v, i64 %idx) { %splat = shufflevector %ins, poison, zeroinitializer ret %splat } - diff --git a/llvm/test/CodeGen/RISCV/rvv/vtruncs-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vtruncs-sdnode.ll index 337761f69bf90..e86d6045df297 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vtruncs-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vtruncs-sdnode.ll @@ -313,4 +313,3 @@ define @vtrunc_nxv8i64_nxv8i32( %va) { %tvec = trunc %va to ret %tvec } - diff --git a/llvm/test/CodeGen/RISCV/rvv/xsfvcp-x.ll b/llvm/test/CodeGen/RISCV/rvv/xsfvcp-x.ll index e304b19e71c8f..e9e7fa077b52b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/xsfvcp-x.ll +++ b/llvm/test/CodeGen/RISCV/rvv/xsfvcp-x.ll @@ -2239,4 
+2239,3 @@ entry: } declare @llvm.riscv.sf.vc.v.i.se.nxv16f32.iXLen.iXLen(iXLen, iXLen, iXLen, iXLen) - diff --git a/llvm/test/CodeGen/RISCV/rvv/xsfvcp-xv.ll b/llvm/test/CodeGen/RISCV/rvv/xsfvcp-xv.ll index 354ca3ad973f7..23628a98feb7c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/xsfvcp-xv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/xsfvcp-xv.ll @@ -3669,4 +3669,3 @@ entry: } declare @llvm.riscv.sf.vc.v.fv.se.nxv16f32.nxv16f32.iXLen.f32(iXLen, , float, iXLen) - diff --git a/llvm/test/CodeGen/RISCV/rvv/xsfvcp-xvv.ll b/llvm/test/CodeGen/RISCV/rvv/xsfvcp-xvv.ll index 95fb51e3fa5b6..2c9100111fab6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/xsfvcp-xvv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/xsfvcp-xvv.ll @@ -3687,4 +3687,3 @@ entry: } declare @llvm.riscv.sf.vc.v.fvv.se.nxv16f32.nxv16f32.nxv16i32.f32.iXLen(iXLen, , , float %rs1, iXLen) - diff --git a/llvm/test/CodeGen/RISCV/rvv/xsfvcp-xvw.ll b/llvm/test/CodeGen/RISCV/rvv/xsfvcp-xvw.ll index 4c0833b2ff0cb..29b9238b8e9c0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/xsfvcp-xvw.ll +++ b/llvm/test/CodeGen/RISCV/rvv/xsfvcp-xvw.ll @@ -2694,4 +2694,3 @@ entry: } declare @llvm.riscv.sf.vc.v.fvw.se.nxv8f64.nxv8f32.nxv8i32.f32.iXLen(iXLen, , , float, iXLen) - diff --git a/llvm/test/CodeGen/RISCV/saverestore-scs.ll b/llvm/test/CodeGen/RISCV/saverestore-scs.ll index 6c3ee2c2f383f..6072943b38477 100644 --- a/llvm/test/CodeGen/RISCV/saverestore-scs.ll +++ b/llvm/test/CodeGen/RISCV/saverestore-scs.ll @@ -37,4 +37,3 @@ define void @callee_scs() nounwind shadowcallstack { store volatile [30 x i32] %val, ptr @var2 ret void } - diff --git a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll index 606dbf9a6c572..cdfb1ef0ab4d6 100644 --- a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll +++ b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll @@ -393,4 +393,3 @@ define iXLen2 @test_urem_12(iXLen2 %x) nounwind { %a = urem iXLen2 %x, 12 ret iXLen2 %a } - diff --git a/llvm/test/CodeGen/RISCV/switch-width.ll b/llvm/test/CodeGen/RISCV/switch-width.ll index 8eed5130fa068..d902bd3276a3c 100644 --- a/llvm/test/CodeGen/RISCV/switch-width.ll +++ b/llvm/test/CodeGen/RISCV/switch-width.ll @@ -314,4 +314,3 @@ return: %retval = phi i32 [ -1, %sw.default ], [ 0, %sw.bb0 ], [ 1, %sw.bb1 ] ret i32 %retval } - diff --git a/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll b/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll index 48a2569298087..2fd4572d23456 100644 --- a/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll +++ b/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll @@ -89,4 +89,3 @@ define signext i32 @unroll_loop_cse() { %26 = phi i32 [ 1, %0 ], [ 1, %4 ], [ 1, %8 ], [ 1, %12 ], [ 1, %16 ], [ %24, %20 ] ret i32 %26 } - diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll index 85d28122537ea..f878d17d5f1da 100644 --- a/llvm/test/CodeGen/RISCV/xaluo.ll +++ b/llvm/test/CodeGen/RISCV/xaluo.ll @@ -5927,4 +5927,3 @@ declare {i32, i1} @llvm.smul.with.overflow.i32(i32, i32) nounwind readnone declare {i64, i1} @llvm.smul.with.overflow.i64(i64, i64) nounwind readnone declare {i32, i1} @llvm.umul.with.overflow.i32(i32, i32) nounwind readnone declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone - diff --git a/llvm/test/CodeGen/RISCV/zbb-cmp-combine.ll b/llvm/test/CodeGen/RISCV/zbb-cmp-combine.ll index b94c50dd7e8c8..74bf1a8929cbc 100644 --- a/llvm/test/CodeGen/RISCV/zbb-cmp-combine.ll +++ b/llvm/test/CodeGen/RISCV/zbb-cmp-combine.ll @@ -315,4 +315,3 @@ define i1 @no_same_ops(i64 %c, i64 %a, i64 %b) { %res = or i1 %l0, %l1 ret i1 %res 
} - diff --git a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll index ea2d1ca722ca0..9ae9245eb15d8 100644 --- a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll +++ b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll @@ -364,15 +364,3 @@ define void @load_factor2_fp128(ptr %ptr) { %v1 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> poison, <2 x i32> ret void } - - - - - - - - - - - - From 120b0bfbf0bade430fa9b19d78025ccd1d6148d0 Mon Sep 17 00:00:00 2001 From: Hui Date: Tue, 2 Jan 2024 07:03:06 +0000 Subject: [PATCH 003/313] [libc++][ranges][abi-break] Fix `movable_box` overwriting memory of data that lives in the tail padding (#71314) fixes #70506 The detailed problem description is in #70506 The original proposed fix was to remove `[[no_unique_address]]` except when `_Tp` is empty. Edit: After the discussion in the comments below, the new fix here is to remove the `[[no_unique_address]]` from `movable_box` in the cases where we need to add our own assignment operator, which has contains the problematic `construct_at` --- libcxx/docs/ReleaseNotes/18.rst | 12 +++ libcxx/include/__config | 10 +++ libcxx/include/__ranges/chunk_by_view.h | 3 +- libcxx/include/__ranges/drop_while_view.h | 3 +- libcxx/include/__ranges/filter_view.h | 2 +- libcxx/include/__ranges/movable_box.h | 62 ++++++++++++--- libcxx/include/__ranges/repeat_view.h | 4 +- libcxx/include/__ranges/single_view.h | 4 +- libcxx/include/__ranges/take_while_view.h | 3 +- libcxx/include/__ranges/transform_view.h | 3 +- .../no_unique_address.compile.pass.cpp | 46 +++++++++++ .../range.move.wrap/empty_object.pass.cpp | 56 ++++++++++++++ .../no_unique_address.pass.cpp | 76 ++++++++++++------- .../no_unique_address.compile.pass.cpp | 23 ++++++ .../no_unique_address.compile.pass.cpp | 23 ++++++ .../range.lazy.split/ctor.range.pass.cpp | 40 +++++----- .../range.transform/general.pass.cpp | 17 +++++ ..._robust_against_no_unique_address.pass.cpp | 70 +++++++++++++++++ 18 files changed, 390 insertions(+), 67 deletions(-) create mode 100644 libcxx/test/libcxx/ranges/range.adaptors/range.chunk.by/no_unique_address.compile.pass.cpp create mode 100644 libcxx/test/libcxx/ranges/range.adaptors/range.move.wrap/empty_object.pass.cpp create mode 100644 libcxx/test/libcxx/ranges/range.factories/range.repeat.view/no_unique_address.compile.pass.cpp create mode 100644 libcxx/test/libcxx/ranges/range.factories/range.single.view/no_unique_address.compile.pass.cpp create mode 100644 libcxx/test/std/ranges/ranges_robust_against_no_unique_address.pass.cpp diff --git a/libcxx/docs/ReleaseNotes/18.rst b/libcxx/docs/ReleaseNotes/18.rst index fa60a581652d6..f5cda9aaa5dcb 100644 --- a/libcxx/docs/ReleaseNotes/18.rst +++ b/libcxx/docs/ReleaseNotes/18.rst @@ -170,6 +170,18 @@ ABI Affecting Changes to throw a different exception when attempting allocations that are too large (``std::bad_alloc`` vs ``std::length_error``). +- The layout of some views inside ``std::ranges`` that use the ``movable-box`` exposition-only type as an implementation + detail has changed in order to fix a bug which could result in overwriting user data following the ``movable-box`` + . + This was caused by incorrect usage of the ``[[no_unique_address]]`` attribute inside the implementation of ``movable-box``. 
+ This only affects the layout of the following views: ``take_while_view``, ``filter_view``, ``single_view``, ``drop_while_view``, + ``repeat_view``, ``transform_view``, ``chunk_by_view``. In order to avoid silent breakage, an ABI tag has been added to + these views such that their mangled name will be different starting in this version of libc++. + As a result, attempting to call a function that expects one of these views will fail to link until the code has been rebuilt + against a matching version of libc++. In practice, we believe it is unusual for these views to appear at ABI boundaries so this + should not be a major problem for most users. However it is probably worth auditing ranges-heavy code for ABI boundaries that + would contain these views, or for types that contain these views as members and which are passed across ABI boundaries. + Build System Changes -------------------- diff --git a/libcxx/include/__config b/libcxx/include/__config index adff13e714cb6..40e6da8bc03af 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -200,6 +200,16 @@ # define _LIBCPP_ABI_BAD_FUNCTION_CALL_KEY_FUNCTION # endif +// We had some bugs where we use [[no_unique_address]] together with construct_at, +// which causes UB as the call on construct_at could write to overlapping subobjects +// +// https://github.com/llvm/llvm-project/issues/70506 +// https://github.com/llvm/llvm-project/issues/70494 +// +// To fix the bug we had to change the ABI of some classes to remove [[no_unique_address]] under certain conditions. +// The below macro is used for all classes whose ABI have changed as part of fixing these bugs. +# define _LIBCPP_ABI_2023_OVERLAPPING_SUBOBJECT_FIX_TAG __attribute__((__abi_tag__("subobj_fix_2023"))) + // Changes the iterator type of select containers (see below) to a bounded iterator that keeps track of whether it's // within the bounds of the original container and asserts it on every dereference. 
// diff --git a/libcxx/include/__ranges/chunk_by_view.h b/libcxx/include/__ranges/chunk_by_view.h index e469986876469..3ecc018cac9d9 100644 --- a/libcxx/include/__ranges/chunk_by_view.h +++ b/libcxx/include/__ranges/chunk_by_view.h @@ -54,7 +54,8 @@ namespace ranges { template , iterator_t<_View>> _Pred> requires view<_View> && is_object_v<_Pred> -class chunk_by_view : public view_interface> { +class _LIBCPP_ABI_2023_OVERLAPPING_SUBOBJECT_FIX_TAG chunk_by_view + : public view_interface> { _LIBCPP_NO_UNIQUE_ADDRESS _View __base_ = _View(); _LIBCPP_NO_UNIQUE_ADDRESS __movable_box<_Pred> __pred_; diff --git a/libcxx/include/__ranges/drop_while_view.h b/libcxx/include/__ranges/drop_while_view.h index 677b5bc66d442..eb3783eb42f14 100644 --- a/libcxx/include/__ranges/drop_while_view.h +++ b/libcxx/include/__ranges/drop_while_view.h @@ -45,7 +45,8 @@ namespace ranges { template requires input_range<_View> && is_object_v<_Pred> && indirect_unary_predicate> -class drop_while_view : public view_interface> { +class _LIBCPP_ABI_2023_OVERLAPPING_SUBOBJECT_FIX_TAG drop_while_view + : public view_interface> { public: _LIBCPP_HIDE_FROM_ABI drop_while_view() requires default_initializable<_View> && default_initializable<_Pred> diff --git a/libcxx/include/__ranges/filter_view.h b/libcxx/include/__ranges/filter_view.h index 08d50ab011042..868ad128e8944 100644 --- a/libcxx/include/__ranges/filter_view.h +++ b/libcxx/include/__ranges/filter_view.h @@ -51,7 +51,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD namespace ranges { template > _Pred> requires view<_View> && is_object_v<_Pred> -class filter_view : public view_interface> { +class _LIBCPP_ABI_2023_OVERLAPPING_SUBOBJECT_FIX_TAG filter_view : public view_interface> { _LIBCPP_NO_UNIQUE_ADDRESS _View __base_ = _View(); _LIBCPP_NO_UNIQUE_ADDRESS __movable_box<_Pred> __pred_; diff --git a/libcxx/include/__ranges/movable_box.h b/libcxx/include/__ranges/movable_box.h index 6615533d37434..9b38877494eaa 100644 --- a/libcxx/include/__ranges/movable_box.h +++ b/libcxx/include/__ranges/movable_box.h @@ -134,6 +134,20 @@ concept __doesnt_need_empty_state = // 2. Otherwise, movable-box should store only a T if either T models movable or // is_nothrow_move_constructible_v is true. : movable<_Tp> || is_nothrow_move_constructible_v<_Tp>); + +// When _Tp doesn't have an assignment operator, we must implement __movable_box's assignment operator +// by doing destroy_at followed by construct_at. However, that implementation strategy leads to UB if the nested +// _Tp is potentially overlapping, as it is doing a non-transparent replacement of the sub-object, which means that +// we're not considered "nested" inside the movable-box anymore, and since we're not nested within it, [basic.life]/1.5 +// says that we essentially just reused the storage of the movable-box for a completely unrelated object and ended the +// movable-box's lifetime. +// https://github.com/llvm/llvm-project/issues/70494#issuecomment-1845646490 +// +// Hence, when the _Tp doesn't have an assignment operator, we can't risk making it a potentially-overlapping +// subobject because of the above, and we don't use [[no_unique_address]] in that case. +template +concept __can_use_no_unique_address = (copy_constructible<_Tp> ? 
copyable<_Tp> : movable<_Tp>); + # else template @@ -144,23 +158,45 @@ concept __doesnt_need_empty_state_for_move = movable<_Tp> || is_nothrow_move_con template concept __doesnt_need_empty_state = __doesnt_need_empty_state_for_copy<_Tp> && __doesnt_need_empty_state_for_move<_Tp>; + +template +concept __can_use_no_unique_address = copyable<_Tp>; # endif +template +struct __movable_box_holder { + _Tp __val_; + + template + _LIBCPP_HIDE_FROM_ABI constexpr explicit __movable_box_holder(in_place_t, _Args&&... __args) + : __val_(std::forward<_Args>(__args)...) {} +}; + +template + requires __can_use_no_unique_address<_Tp> +struct __movable_box_holder<_Tp> { + _LIBCPP_NO_UNIQUE_ADDRESS _Tp __val_; + + template + _LIBCPP_HIDE_FROM_ABI constexpr explicit __movable_box_holder(in_place_t, _Args&&... __args) + : __val_(std::forward<_Args>(__args)...) {} +}; + template <__movable_box_object _Tp> requires __doesnt_need_empty_state<_Tp> class __movable_box<_Tp> { - _LIBCPP_NO_UNIQUE_ADDRESS _Tp __val_; + _LIBCPP_NO_UNIQUE_ADDRESS __movable_box_holder<_Tp> __holder_; public: template requires is_constructible_v<_Tp, _Args...> - _LIBCPP_HIDE_FROM_ABI constexpr explicit __movable_box(in_place_t, _Args&&... __args) noexcept( + _LIBCPP_HIDE_FROM_ABI constexpr explicit __movable_box(in_place_t __inplace, _Args&&... __args) noexcept( is_nothrow_constructible_v<_Tp, _Args...>) - : __val_(std::forward<_Args>(__args)...) {} + : __holder_(__inplace, std::forward<_Args>(__args)...) {} _LIBCPP_HIDE_FROM_ABI constexpr __movable_box() noexcept(is_nothrow_default_constructible_v<_Tp>) requires default_initializable<_Tp> - : __val_() {} + : __holder_(in_place_t{}) {} _LIBCPP_HIDE_FROM_ABI __movable_box(__movable_box const&) = default; _LIBCPP_HIDE_FROM_ABI __movable_box(__movable_box&&) = default; @@ -176,27 +212,29 @@ class __movable_box<_Tp> { // Implementation of assignment operators in case we perform optimization (2) _LIBCPP_HIDE_FROM_ABI constexpr __movable_box& operator=(__movable_box const& __other) noexcept { static_assert(is_nothrow_copy_constructible_v<_Tp>); + static_assert(!__can_use_no_unique_address<_Tp>); if (this != std::addressof(__other)) { - std::destroy_at(std::addressof(__val_)); - std::construct_at(std::addressof(__val_), __other.__val_); + std::destroy_at(std::addressof(__holder_.__val_)); + std::construct_at(std::addressof(__holder_.__val_), __other.__holder_.__val_); } return *this; } _LIBCPP_HIDE_FROM_ABI constexpr __movable_box& operator=(__movable_box&& __other) noexcept { static_assert(is_nothrow_move_constructible_v<_Tp>); + static_assert(!__can_use_no_unique_address<_Tp>); if (this != std::addressof(__other)) { - std::destroy_at(std::addressof(__val_)); - std::construct_at(std::addressof(__val_), std::move(__other.__val_)); + std::destroy_at(std::addressof(__holder_.__val_)); + std::construct_at(std::addressof(__holder_.__val_), std::move(__other.__holder_.__val_)); } return *this; } - _LIBCPP_HIDE_FROM_ABI constexpr _Tp const& operator*() const noexcept { return __val_; } - _LIBCPP_HIDE_FROM_ABI constexpr _Tp& operator*() noexcept { return __val_; } + _LIBCPP_HIDE_FROM_ABI constexpr _Tp const& operator*() const noexcept { return __holder_.__val_; } + _LIBCPP_HIDE_FROM_ABI constexpr _Tp& operator*() noexcept { return __holder_.__val_; } - _LIBCPP_HIDE_FROM_ABI constexpr const _Tp* operator->() const noexcept { return std::addressof(__val_); } - _LIBCPP_HIDE_FROM_ABI constexpr _Tp* operator->() noexcept { return std::addressof(__val_); } + _LIBCPP_HIDE_FROM_ABI constexpr const _Tp* 
operator->() const noexcept { return std::addressof(__holder_.__val_); } + _LIBCPP_HIDE_FROM_ABI constexpr _Tp* operator->() noexcept { return std::addressof(__holder_.__val_); } _LIBCPP_HIDE_FROM_ABI constexpr bool __has_value() const noexcept { return true; } }; diff --git a/libcxx/include/__ranges/repeat_view.h b/libcxx/include/__ranges/repeat_view.h index 459a1e229613a..479eca96acb09 100644 --- a/libcxx/include/__ranges/repeat_view.h +++ b/libcxx/include/__ranges/repeat_view.h @@ -68,7 +68,7 @@ struct __fn; template requires(is_object_v<_Tp> && same_as<_Tp, remove_cv_t<_Tp>> && (__integer_like_with_usable_difference_type<_Bound> || same_as<_Bound, unreachable_sentinel_t>)) -class repeat_view : public view_interface> { +class _LIBCPP_ABI_2023_OVERLAPPING_SUBOBJECT_FIX_TAG repeat_view : public view_interface> { friend struct views::__take::__fn; friend struct views::__drop::__fn; class __iterator; @@ -119,7 +119,7 @@ class repeat_view : public view_interface> { } private: - __movable_box<_Tp> __value_; + _LIBCPP_NO_UNIQUE_ADDRESS __movable_box<_Tp> __value_; _LIBCPP_NO_UNIQUE_ADDRESS _Bound __bound_ = _Bound(); }; diff --git a/libcxx/include/__ranges/single_view.h b/libcxx/include/__ranges/single_view.h index b0b2c1d9f3c06..0ae2036a66a92 100644 --- a/libcxx/include/__ranges/single_view.h +++ b/libcxx/include/__ranges/single_view.h @@ -37,8 +37,8 @@ template template # endif requires is_object_v<_Tp> -class single_view : public view_interface> { - __movable_box<_Tp> __value_; +class _LIBCPP_ABI_2023_OVERLAPPING_SUBOBJECT_FIX_TAG single_view : public view_interface> { + _LIBCPP_NO_UNIQUE_ADDRESS __movable_box<_Tp> __value_; public: _LIBCPP_HIDE_FROM_ABI single_view() diff --git a/libcxx/include/__ranges/take_while_view.h b/libcxx/include/__ranges/take_while_view.h index a6f7f80ca76bf..4534210d97943 100644 --- a/libcxx/include/__ranges/take_while_view.h +++ b/libcxx/include/__ranges/take_while_view.h @@ -43,7 +43,8 @@ namespace ranges { template requires input_range<_View> && is_object_v<_Pred> && indirect_unary_predicate> -class take_while_view : public view_interface> { +class _LIBCPP_ABI_2023_OVERLAPPING_SUBOBJECT_FIX_TAG take_while_view + : public view_interface> { template class __sentinel; diff --git a/libcxx/include/__ranges/transform_view.h b/libcxx/include/__ranges/transform_view.h index 55c6ce587bd69..744f597ccef59 100644 --- a/libcxx/include/__ranges/transform_view.h +++ b/libcxx/include/__ranges/transform_view.h @@ -67,7 +67,8 @@ template template # endif requires __transform_view_constraints<_View, _Fn> -class transform_view : public view_interface> { +class _LIBCPP_ABI_2023_OVERLAPPING_SUBOBJECT_FIX_TAG transform_view + : public view_interface> { template class __iterator; template diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.chunk.by/no_unique_address.compile.pass.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.chunk.by/no_unique_address.compile.pass.cpp new file mode 100644 index 0000000000000..e696598f618ba --- /dev/null +++ b/libcxx/test/libcxx/ranges/range.adaptors/range.chunk.by/no_unique_address.compile.pass.cpp @@ -0,0 +1,46 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 +// XFAIL: msvc + +// This test ensures that we use `[[no_unique_address]]` in `chunk_by_view`. + +#include + +struct View : std::ranges::view_base { + int* begin() const; + int* end() const; +}; + +struct Pred { + template + bool operator()(const Args&...) const; +}; + +template +struct Test { + [[no_unique_address]] View view; + char c; +}; + +// [[no_unique_address]] applied to _View +struct ViewWithPadding : View { + alignas(128) char c; +}; + +static_assert(sizeof(Test>) == + sizeof(std::ranges::chunk_by_view)); + +// [[no_unique_address]] applied to movable-box +struct PredWithPadding : Pred { + alignas(128) char c; +}; + +static_assert(sizeof(Test>) == + sizeof(std::ranges::chunk_by_view)); diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.move.wrap/empty_object.pass.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.move.wrap/empty_object.pass.cpp new file mode 100644 index 0000000000000..9712635a0010b --- /dev/null +++ b/libcxx/test/libcxx/ranges/range.adaptors/range.move.wrap/empty_object.pass.cpp @@ -0,0 +1,56 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +// This test ensures that behaves correctly when it holds an empty type. + +#include + +#include +#include + +bool copied = false; +bool moved = false; + +struct Empty { + Empty() noexcept {} + Empty(Empty const&) noexcept { copied = true; } + Empty(Empty&&) noexcept { moved = true; } + Empty& operator=(Empty const&) = delete; + Empty& operator=(Empty&&) = delete; +}; + +using Box = std::ranges::__movable_box; + +struct Inherit : Box {}; + +struct Hold : Box { + [[no_unique_address]] Inherit member; +}; + +int main(int, char**) { + Hold box; + + Box& base = static_cast(box); + Box& member = static_cast(box.member); + + // Despite [[no_unique_address]], the two objects have the same type so they + // can't share the same address. + assert(&base != &member); + + // Make sure that we do perform the copy-construction, which wouldn't be the + // case if the two s had the same address. + base = member; + assert(copied); + + base = std::move(member); + assert(moved); + + return 0; +} diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.move.wrap/no_unique_address.pass.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.move.wrap/no_unique_address.pass.cpp index e21191f98dea3..4ec6c888f9c1c 100644 --- a/libcxx/test/libcxx/ranges/range.adaptors/range.move.wrap/no_unique_address.pass.cpp +++ b/libcxx/test/libcxx/ranges/range.adaptors/range.move.wrap/no_unique_address.pass.cpp @@ -8,49 +8,71 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 -// This test ensures that behaves correctly when it holds an empty type. 
+// clang-cl and cl currently don't support [[no_unique_address]] +// XFAIL: msvc #include #include #include -bool copied = false; -bool moved = false; +#include "test_macros.h" -struct Empty { - Empty() noexcept {} - Empty(Empty const&) noexcept { copied = true; } - Empty(Empty&&) noexcept { moved = true; } - Empty& operator=(Empty const&) = delete; - Empty& operator=(Empty&&) = delete; -}; +template +void test_no_unique_address() { + struct Test { + [[no_unique_address]] std::ranges::__movable_box box_; + bool b2; + }; -using Box = std::ranges::__movable_box; + if constexpr (ExpectNoUniqueAddress) { + static_assert(sizeof(Test) == sizeof(bool)); + } else { + static_assert(sizeof(Test) > sizeof(bool)); + } +} -struct Inherit : Box {}; +struct Copyable {}; -struct Hold : Box { - [[no_unique_address]] Inherit member; +struct NotCopyAssignable { + constexpr NotCopyAssignable() = default; + constexpr NotCopyAssignable(const NotCopyAssignable&) = default; + NotCopyAssignable& operator=(const NotCopyAssignable&) = delete; }; -int main(int, char**) { - Hold box; +struct NotMoveAssignable { + constexpr NotMoveAssignable() = default; + constexpr NotMoveAssignable(const NotMoveAssignable&) = default; + NotMoveAssignable& operator=(const NotMoveAssignable&) = default; + constexpr NotMoveAssignable(NotMoveAssignable&&) = default; + NotMoveAssignable& operator=(NotMoveAssignable&&) = delete; +}; - Box& base = static_cast(box); - Box& member = static_cast(box.member); +struct MoveOnly { + constexpr MoveOnly() = default; + constexpr MoveOnly(const MoveOnly&) = delete; + MoveOnly& operator=(const MoveOnly&) = delete; + constexpr MoveOnly(MoveOnly&&) = default; + MoveOnly& operator=(MoveOnly&&) = default; +}; - // Despite [[no_unique_address]], the two objects have the same type so they - // can't share the same address. - assert(&base != &member); +struct MoveOnlyNotAssignable { + constexpr MoveOnlyNotAssignable() = default; + constexpr MoveOnlyNotAssignable(const MoveOnlyNotAssignable&) = delete; + MoveOnlyNotAssignable& operator=(const MoveOnlyNotAssignable&) = delete; + constexpr MoveOnlyNotAssignable(MoveOnlyNotAssignable&&) = default; + MoveOnlyNotAssignable& operator=(MoveOnlyNotAssignable&&) = delete; +}; - // Make sure that we do perform the copy-construction, which wouldn't be the - // case if the two s had the same address. - base = member; - assert(copied); +int main(int, char**) { + test_no_unique_address(); + test_no_unique_address(); + test_no_unique_address(); - base = std::move(member); - assert(moved); +#if TEST_STD_VER >= 23 + test_no_unique_address(); + test_no_unique_address(); +#endif return 0; } diff --git a/libcxx/test/libcxx/ranges/range.factories/range.repeat.view/no_unique_address.compile.pass.cpp b/libcxx/test/libcxx/ranges/range.factories/range.repeat.view/no_unique_address.compile.pass.cpp new file mode 100644 index 0000000000000..6ab2ee173ca1b --- /dev/null +++ b/libcxx/test/libcxx/ranges/range.factories/range.repeat.view/no_unique_address.compile.pass.cpp @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 +// XFAIL: msvc + +// This test ensures that we use `[[no_unique_address]]` in `repeat_view`. + +#include + +struct Empty {}; + +struct Test { + [[no_unique_address]] std::ranges::repeat_view v; + bool b; +}; + +static_assert(sizeof(Test) == sizeof(bool)); diff --git a/libcxx/test/libcxx/ranges/range.factories/range.single.view/no_unique_address.compile.pass.cpp b/libcxx/test/libcxx/ranges/range.factories/range.single.view/no_unique_address.compile.pass.cpp new file mode 100644 index 0000000000000..10883b8385780 --- /dev/null +++ b/libcxx/test/libcxx/ranges/range.factories/range.single.view/no_unique_address.compile.pass.cpp @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: msvc + +// This test ensures that we use `[[no_unique_address]]` in `single_view`. + +#include + +struct Empty {}; + +struct Test { + [[no_unique_address]] std::ranges::single_view v; + bool b; +}; + +static_assert(sizeof(Test) == sizeof(bool)); diff --git a/libcxx/test/std/ranges/range.adaptors/range.lazy.split/ctor.range.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.lazy.split/ctor.range.pass.cpp index 91df304b79af7..8eeaa3dae36db 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.lazy.split/ctor.range.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.lazy.split/ctor.range.pass.cpp @@ -22,22 +22,21 @@ #include #include "test_convertible.h" +#include "test_macros.h" #include "types.h" struct ElementWithCounting { int* times_copied = nullptr; - int* times_moved = nullptr; + int* times_moved = nullptr; constexpr ElementWithCounting(int& copies_ctr, int& moves_ctr) : times_copied(&copies_ctr), times_moved(&moves_ctr) {} constexpr ElementWithCounting(const ElementWithCounting& rhs) - : times_copied(rhs.times_copied) - , times_moved(rhs.times_moved) { + : times_copied(rhs.times_copied), times_moved(rhs.times_moved) { ++(*times_copied); } constexpr ElementWithCounting(ElementWithCounting&& rhs) - : times_copied(rhs.times_copied) - , times_moved(rhs.times_moved) { + : times_copied(rhs.times_copied), times_moved(rhs.times_moved) { ++(*times_moved); } @@ -48,18 +47,15 @@ struct RangeWithCounting { using value_type = ElementWithCounting; int* times_copied = nullptr; - int* times_moved = nullptr; + int* times_moved = nullptr; constexpr RangeWithCounting(int& copies_ctr, int& moves_ctr) : times_copied(&copies_ctr), times_moved(&moves_ctr) {} constexpr RangeWithCounting(const RangeWithCounting& rhs) - : times_copied(rhs.times_copied) - , times_moved(rhs.times_moved) { + : times_copied(rhs.times_copied), times_moved(rhs.times_moved) { ++(*times_copied); } - constexpr RangeWithCounting(RangeWithCounting&& rhs) - : times_copied(rhs.times_copied) - , times_moved(rhs.times_moved) { + constexpr RangeWithCounting(RangeWithCounting&& rhs) : times_copied(rhs.times_copied), times_moved(rhs.times_moved) { ++(*times_moved); } @@ -67,10 +63,10 @@ struct RangeWithCounting { constexpr const ElementWithCounting* end() const { return 
nullptr; } constexpr RangeWithCounting& operator=(const RangeWithCounting&) = default; - constexpr RangeWithCounting& operator=(RangeWithCounting&&) = default; + constexpr RangeWithCounting& operator=(RangeWithCounting&&) = default; constexpr bool operator==(const RangeWithCounting&) const { return true; } }; -static_assert( std::ranges::forward_range); +static_assert(std::ranges::forward_range); static_assert(!std::ranges::view); struct StrView : std::ranges::view_base { @@ -86,9 +82,9 @@ struct StrView : std::ranges::view_base { constexpr std::string_view::const_iterator end() const { return buffer_.end(); } constexpr bool operator==(const StrView& rhs) const { return buffer_ == rhs.buffer_; } }; -static_assert( std::ranges::random_access_range); -static_assert( std::ranges::view); -static_assert( std::is_copy_constructible_v); +static_assert(std::ranges::random_access_range); +static_assert(std::ranges::view); +static_assert(std::is_copy_constructible_v); // SFINAE tests. @@ -131,7 +127,7 @@ constexpr bool test() { // Make sure the arguments are moved, not copied. { - using Range = RangeWithCounting; + using Range = RangeWithCounting; using Element = ElementWithCounting; using Pattern = std::ranges::single_view; @@ -144,10 +140,13 @@ constexpr bool test() { Element element(element_copied, element_moved); std::ranges::lazy_split_view v(range, element); - assert(range_copied == 0); // `ref_view` does neither copy... - assert(range_moved == 0); // ...nor move the element. + assert(range_copied == 0); // `ref_view` does neither copy... + assert(range_moved == 0); // ...nor move the element. assert(element_copied == 1); // The element is copied into the argument... +#ifndef TEST_COMPILER_GCC + //https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98995 assert(element_moved == 1); // ...and moved into the member variable. +#endif } // Arguments are rvalues. @@ -160,7 +159,10 @@ constexpr bool test() { assert(range_copied == 0); assert(range_moved == 1); // `owning_view` moves the given argument. assert(element_copied == 0); +#ifndef TEST_COMPILER_GCC + //https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98995 assert(element_moved == 1); +#endif } } diff --git a/libcxx/test/std/ranges/range.adaptors/range.transform/general.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.transform/general.pass.cpp index 32fed6f8caa7d..cfe394e891832 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.transform/general.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.transform/general.pass.cpp @@ -108,5 +108,22 @@ int main(int, char**) { } #endif + // GH issue #70506 + // movable_box::operator= overwrites underlying view + { + auto f = [l = 0.0L, b = false](int i) { + (void)l; + (void)b; + return i; + }; + + auto v1 = std::vector{1, 2, 3, 4} | std::views::transform(f); + auto v2 = std::vector{1, 2, 3, 4} | std::views::transform(f); + + v1 = std::move(v2); + int expected[] = {1, 2, 3, 4}; + assert(std::equal(v1.begin(), v1.end(), expected, expected + 4)); + } + return 0; } diff --git a/libcxx/test/std/ranges/ranges_robust_against_no_unique_address.pass.cpp b/libcxx/test/std/ranges/ranges_robust_against_no_unique_address.pass.cpp new file mode 100644 index 0000000000000..3b35271d649c3 --- /dev/null +++ b/libcxx/test/std/ranges/ranges_robust_against_no_unique_address.pass.cpp @@ -0,0 +1,70 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +// Test that views that use __movable_box do not overwrite overlapping subobjects. +// https://github.com/llvm/llvm-project/issues/70506 + +#include +#include + +#include "test_macros.h" + +struct Pred { + alignas(128) bool a{}; + + Pred() noexcept = default; + Pred(const Pred&) noexcept = default; + Pred(Pred&&) noexcept = default; + + Pred& operator=(const Pred&) = delete; + Pred& operator=(Pred&&) = delete; + + constexpr bool operator()(const auto&...) const { return true; } +}; + +struct View : std::ranges::view_base { + constexpr int* begin() const { return nullptr; } + constexpr int* end() const { return nullptr; } +}; + +template +struct S { + [[no_unique_address]] View view{}; + char c = 42; +}; + +template +constexpr void testOne() { + S s1; + assert(s1.c == 42); + s1.view = View{}; + assert(s1.c == 42); +} + +constexpr bool test() { + testOne>(); + testOne>(); + testOne>(); + testOne>(); + testOne>(); + +#if TEST_STD_VER >= 23 + testOne>(); + testOne>(); +#endif + return true; +} + +int main(int, char**) { + static_assert(test()); + test(); + + return 0; +} From b51f8f13edf3f7ab6407d2b7b46285ea675730b6 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Tue, 2 Jan 2024 08:06:44 +0100 Subject: [PATCH 004/313] [libc++][test] Removes Clang < 14 support. (#76658) --- .../rand.dist.uni/rand.dist.uni.int/eval.pass.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.uni/rand.dist.uni.int/eval.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.uni/rand.dist.uni.int/eval.pass.cpp index 669f90d5292df..3a2942b74596a 100644 --- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.uni/rand.dist.uni.int/eval.pass.cpp +++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.uni/rand.dist.uni.int/eval.pass.cpp @@ -26,12 +26,6 @@ #include "test_macros.h" -// The __int128 conversions to/from floating point crash on MinGW on x86_64. -// This is fixed in Clang 14 by https://reviews.llvm.org/D110413. -#if defined(__x86_64__) && defined(__MINGW32__) && defined(__clang_major__) && __clang_major__ < 14 - #define TEST_BUGGY_I128_FP -#endif - template T sqr(T x) { return x * x; @@ -127,7 +121,7 @@ int main(int, char**) test_statistics(); test_statistics(); -#if !defined(TEST_HAS_NO_INT128) && !defined(TEST_BUGGY_I128_FP) +#if !defined(TEST_HAS_NO_INT128) test_statistics<__int128_t, std::minstd_rand0>(); test_statistics<__uint128_t, std::minstd_rand0>(); From 4c2ad82b9dff537f7c8f0a2abb0d5dc7e6435741 Mon Sep 17 00:00:00 2001 From: Sameer Sahasrabuddhe Date: Tue, 2 Jan 2024 13:42:56 +0530 Subject: [PATCH 005/313] [AMDGPU] Do not preserve UniformityInfo (#76696) UniformityInfo has a transitive dependence on CycleInfo. A transform may change the CFG in trivial ways that do not affect uniformity, but that can leave cycles in a slightly inconsistent state. In the absence of updates to CycleInfo, it's cleaner to just invalidate both analyses. 
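
A hypothetical sketch (pass and function names invented for illustration, not
part of this patch) of what the guidance means for a legacy-PM transform that
consumes uniformity but may touch the CFG: it keeps its addRequired<> but
simply omits addPreserved<> for UniformityInfo, so the pass manager recomputes
it (and, transitively, CycleInfo) afterwards.

  #include "llvm/Analysis/UniformityAnalysis.h"
  #include "llvm/Pass.h"
  using namespace llvm;

  namespace {
  // Hypothetical example pass, for illustration only.
  struct ExampleCFGTweak : FunctionPass {
    static char ID;
    ExampleCFGTweak() : FunctionPass(ID) {}
    void getAnalysisUsage(AnalysisUsage &AU) const override {
      AU.addRequired<UniformityInfoWrapperPass>(); // the pass still consumes uniformity
      // Deliberately no AU.addPreserved<UniformityInfoWrapperPass>(): even a
      // "uniformity-neutral" CFG edit can leave the underlying CycleInfo
      // slightly inconsistent, so let both analyses be recomputed.
    }
    bool runOnFunction(Function &F) override {
      // ...a transform that may make trivial CFG edits...
      return true;
    }
  };
  } // namespace
  char ExampleCFGTweak::ID = 0;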
--- llvm/include/llvm/ADT/GenericUniformityImpl.h | 6 ++++++ llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp | 1 - llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp | 3 --- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h index b7d0a1228ebfc..d397b937d78cc 100644 --- a/llvm/include/llvm/ADT/GenericUniformityImpl.h +++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h @@ -33,6 +33,12 @@ /// the propagation of the impact of divergent control flow on the divergence of /// values (sync dependencies). /// +/// NOTE: In general, no interface exists for a transform to update +/// (Machine)UniformityInfo. Additionally, (Machine)CycleAnalysis is a +/// transitive dependence, but it also does not provide an interface for +/// updating itself. Given that, transforms should not preserve uniformity in +/// their getAnalysisUsage() callback. +/// //===----------------------------------------------------------------------===// #ifndef LLVM_ADT_GENERICUNIFORMITYIMPL_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp index 459400e3359ca..79e9312034da4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp @@ -85,7 +85,6 @@ class AMDGPURewriteUndefForPHILegacy : public FunctionPass { AU.addRequired(); AU.addPreserved(); - AU.addPreserved(); AU.setPreservesCFG(); } }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index 9bc3ba161c9eb..1bfb7c0edd80a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -109,9 +109,6 @@ void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const { // FIXME: preserve PostDominatorTreeWrapperPass } - // No divergent values are changed, only blocks and branch edges. - AU.addPreserved(); - // We preserve the non-critical-edgeness property AU.addPreservedID(BreakCriticalEdgesID); From 5c458ed490a01dcc82f9d063732cac4207786fd5 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 2 Jan 2024 09:30:50 +0100 Subject: [PATCH 006/313] [gold] Fix tests after #76553 (NFC) --- llvm/test/tools/gold/X86/devirt_vcall_vis_export_dynamic.ll | 2 +- llvm/test/tools/gold/X86/devirt_vcall_vis_public.ll | 2 +- llvm/test/tools/gold/X86/opt-level.ll | 2 +- .../tools/gold/X86/v1.16/devirt_vcall_vis_export_dynamic.ll | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/test/tools/gold/X86/devirt_vcall_vis_export_dynamic.ll b/llvm/test/tools/gold/X86/devirt_vcall_vis_export_dynamic.ll index 3d2badc18674a..1976f8f539fd6 100644 --- a/llvm/test/tools/gold/X86/devirt_vcall_vis_export_dynamic.ll +++ b/llvm/test/tools/gold/X86/devirt_vcall_vis_export_dynamic.ll @@ -116,7 +116,7 @@ target triple = "x86_64-grtev4-linux-gnu" ;; Prevent the vtables from being dead code eliminated. 
@llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D] -; CHECK-IR-LABEL: define dso_local i32 @_start +; CHECK-IR-LABEL: define dso_local {{(noundef )?}}i32 @_start define i32 @_start(ptr %obj, ptr %obj2, i32 %a) { entry: %vtable = load ptr, ptr %obj diff --git a/llvm/test/tools/gold/X86/devirt_vcall_vis_public.ll b/llvm/test/tools/gold/X86/devirt_vcall_vis_public.ll index bb94cd3e1889b..4e84a21ebfdfe 100644 --- a/llvm/test/tools/gold/X86/devirt_vcall_vis_public.ll +++ b/llvm/test/tools/gold/X86/devirt_vcall_vis_public.ll @@ -74,7 +74,7 @@ target triple = "x86_64-grtev4-linux-gnu" ; Prevent the vtables from being dead code eliminated. @llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D] -; CHECK-IR-LABEL: define dso_local i32 @_start +; CHECK-IR-LABEL: define dso_local {{(noundef )?}}i32 @_start define i32 @_start(ptr %obj, ptr %obj2, i32 %a) { entry: %vtable = load ptr, ptr %obj diff --git a/llvm/test/tools/gold/X86/opt-level.ll b/llvm/test/tools/gold/X86/opt-level.ll index 1ec2ea5e66090..65304c03f7048 100644 --- a/llvm/test/tools/gold/X86/opt-level.ll +++ b/llvm/test/tools/gold/X86/opt-level.ll @@ -24,7 +24,7 @@ define internal void @foo() { } ; CHECK-O0: define internal i32 @bar( -; CHECK-O1: define internal i32 @bar( +; CHECK-O1: define internal noundef i32 @bar( define internal i32 @bar(i1 %p) { br i1 %p, label %t, label %f diff --git a/llvm/test/tools/gold/X86/v1.16/devirt_vcall_vis_export_dynamic.ll b/llvm/test/tools/gold/X86/v1.16/devirt_vcall_vis_export_dynamic.ll index f927af61d56f9..82880a2483fa2 100644 --- a/llvm/test/tools/gold/X86/v1.16/devirt_vcall_vis_export_dynamic.ll +++ b/llvm/test/tools/gold/X86/v1.16/devirt_vcall_vis_export_dynamic.ll @@ -114,7 +114,7 @@ target triple = "x86_64-grtev4-linux-gnu" ;; Prevent the vtables from being dead code eliminated. @llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D] -; CHECK-IR-LABEL: define dso_local i32 @_start +; CHECK-IR-LABEL: define dso_local {{(noundef )?}}i32 @_start define i32 @_start(ptr %obj, ptr %obj2, i32 %a) { entry: %vtable = load ptr, ptr %obj From 75be7bb3fc6d28a7a97a0ca5c3231066b11bceba Mon Sep 17 00:00:00 2001 From: Kareem Ergawy Date: Tue, 2 Jan 2024 09:50:27 +0100 Subject: [PATCH 007/313] [flang][OpenMP][Offloading][AMDGPU] Add test for `target update` (#76355) Adds a new test for offloading `target update` directive to AMD GPUs. 
--- .../Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp | 11 ++-- .../test/offloading/fortran/target_update.f90 | 50 +++++++++++++++++++ 2 files changed, 56 insertions(+), 5 deletions(-) create mode 100644 openmp/libomptarget/test/offloading/fortran/target_update.f90 diff --git a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp index cd1cfb3b7686d..730858ffc67a7 100644 --- a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp +++ b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp @@ -239,11 +239,11 @@ void mlir::configureOpenMPToLLVMConversionLegality( target.addDynamicallyLegalOp< mlir::omp::AtomicReadOp, mlir::omp::AtomicWriteOp, mlir::omp::FlushOp, mlir::omp::ThreadprivateOp, mlir::omp::YieldOp, mlir::omp::EnterDataOp, - mlir::omp::ExitDataOp, mlir::omp::DataBoundsOp, mlir::omp::MapInfoOp>( - [&](Operation *op) { - return typeConverter.isLegal(op->getOperandTypes()) && - typeConverter.isLegal(op->getResultTypes()); - }); + mlir::omp::ExitDataOp, mlir::omp::UpdateDataOp, mlir::omp::DataBoundsOp, + mlir::omp::MapInfoOp>([&](Operation *op) { + return typeConverter.isLegal(op->getOperandTypes()) && + typeConverter.isLegal(op->getResultTypes()); + }); target.addDynamicallyLegalOp([&](Operation *op) { return typeConverter.isLegal(op->getOperandTypes()); }); @@ -282,6 +282,7 @@ void mlir::populateOpenMPToLLVMConversionPatterns(LLVMTypeConverter &converter, RegionLessOpConversion, RegionLessOpConversion, RegionLessOpConversion, + RegionLessOpConversion, RegionLessOpWithVarOperandsConversion>(converter); } diff --git a/openmp/libomptarget/test/offloading/fortran/target_update.f90 b/openmp/libomptarget/test/offloading/fortran/target_update.f90 new file mode 100644 index 0000000000000..fb35c5a36ab0e --- /dev/null +++ b/openmp/libomptarget/test/offloading/fortran/target_update.f90 @@ -0,0 +1,50 @@ +! Offloading test for the `target update` directive. + +! REQUIRES: flang, amdgcn-amd-amdhsa + +! RUN: %libomptarget-compile-fortran-run-and-check-generic +program target_update + implicit none + integer :: x(1) + integer :: host_id + integer :: device_id(1) + + INTERFACE + FUNCTION omp_get_device_num() BIND(C) + USE, INTRINSIC :: iso_c_binding, ONLY: C_INT + integer :: omp_get_device_num + END FUNCTION omp_get_device_num + END INTERFACE + + x(1) = 5 + host_id = omp_get_device_num() + +!$omp target enter data map(to:x, device_id) +!$omp target + x(1) = 42 +!$omp end target + + ! Test that without a `target update` directive, the target update to x is + ! not yet seen by the host. + ! CHECK: After first target regions and before target update: x = 5 + print *, "After first target regions and before target update: x =", x(1) + +!$omp target + x(1) = 84 + device_id(1) = omp_get_device_num() +!$omp end target +!$omp target update from(x, device_id) + + ! Test that after the `target update`, the host can see the new x value. + ! CHECK: After second target regions and target update: x = 84 + print *, "After second target regions and target update: x =", x(1) + + ! Make sure that offloading to the device actually happened. This way we + ! verify that we didn't take the fallback host execution path. + ! CHECK: Offloading succeeded! + if (host_id /= device_id(1)) then + print *, "Offloading succeeded!" + else + print *, "Offloading failed!" + end if +end program target_update From b238a0d989bd8047d9b9ce48ad401c13d981e187 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Tue, 2 Jan 2024 08:51:37 +0000 Subject: [PATCH 008/313] [mlir] Apply ClangTidy findings. 
- Remove redundant return - Use .empty() instead of size() == 0. --- mlir/lib/Analysis/Presburger/Matrix.cpp | 1 - mlir/lib/Analysis/Presburger/QuasiPolynomial.cpp | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Analysis/Presburger/Matrix.cpp b/mlir/lib/Analysis/Presburger/Matrix.cpp index 1f1188c115a8b..b68a7b7004bba 100644 --- a/mlir/lib/Analysis/Presburger/Matrix.cpp +++ b/mlir/lib/Analysis/Presburger/Matrix.cpp @@ -645,5 +645,4 @@ void FracMatrix::LLL(Fraction delta) { k = k > 1 ? k - 1 : 1; } } - return; } diff --git a/mlir/lib/Analysis/Presburger/QuasiPolynomial.cpp b/mlir/lib/Analysis/Presburger/QuasiPolynomial.cpp index 3ae4fb726215f..feed683a203cf 100644 --- a/mlir/lib/Analysis/Presburger/QuasiPolynomial.cpp +++ b/mlir/lib/Analysis/Presburger/QuasiPolynomial.cpp @@ -23,7 +23,7 @@ QuasiPolynomial::QuasiPolynomial( #ifndef NDEBUG // For each term which involves at least one affine function, for (const std::vector> &term : affine) { - if (term.size() == 0) + if (term.empty()) continue; // the number of elements in each affine function is // one more than the number of symbols. @@ -112,4 +112,4 @@ QuasiPolynomial QuasiPolynomial::simplify() { newAffine.push_back(affine[i]); } return QuasiPolynomial(getNumInputs(), newCoeffs, newAffine); -} \ No newline at end of file +} From baf8a39aaf8b61a38b5b2b5591deb765e42eb00b Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Tue, 2 Jan 2024 08:55:37 +0000 Subject: [PATCH 009/313] [mlir] Apply ClangTidy fix. Prefer to use .empty() instead of checking size(). --- mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index 45e0632db5ef2..f484eda3268db 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -628,7 +628,7 @@ template static LogicalResult verifyDeviceTypeCountMatch(Op op, OperandRange operands, ArrayAttr deviceTypes, llvm::StringRef keyword) { - if (operands.size() > 0 && deviceTypes.getValue().size() != operands.size()) + if (!operands.empty() && deviceTypes.getValue().size() != operands.size()) return op.emitOpError() << keyword << " operands count must match " << keyword << " device_type count"; return success(); From d714be978cf48bc85cb7eacf57c3548c0606a5e4 Mon Sep 17 00:00:00 2001 From: David Green Date: Tue, 2 Jan 2024 09:24:08 +0000 Subject: [PATCH 010/313] [AArch64] Check for exact size when finding 1's for CMLE. (#76452) This is a fix for the second half of #75822, where smaller constants can also be bitcast to larger types. We should be checking the size is what we expect it to be when matching ones. 
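
A worked illustration of the broken pattern (not part of the patch): for a
<4 x i32> compare whose right-hand side is really eight i16 ones — the case the
updated not_cmle16xi8 test exercises — isConstantSplat() reports
SplatValue == 1 with SplatBitSize == 16, but each 32-bit lane holds 0x00010001,
so lowering "x < splat" as if it were "x < 1" (CMLE #0) is incorrect.

  #include <cassert>
  #include <cstdint>
  int main() {
    uint32_t lane = 0x00010001u;           // one i32 lane of an <8 x i16> all-ones splat
    assert(lane == 65537u && lane != 1u);  // so "x < lane" is not "x <= 0"
    return 0;
  }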
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 9 ++++----- llvm/test/CodeGen/AArch64/neon-compare-instructions.ll | 3 ++- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index dffe69bdb900d..50658a855cfb3 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -13715,11 +13715,10 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, bool IsCnst = BVN && BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs); - bool IsSplatUniform = - SrcVT.getVectorElementType().getSizeInBits() >= SplatBitSize; - bool IsZero = IsCnst && SplatValue == 0 && IsSplatUniform; - bool IsOne = IsCnst && SplatValue == 1 && IsSplatUniform; - bool IsMinusOne = IsCnst && SplatValue.isAllOnes() && IsSplatUniform; + bool IsZero = IsCnst && SplatValue == 0; + bool IsOne = + IsCnst && SrcVT.getScalarSizeInBits() == SplatBitSize && SplatValue == 1; + bool IsMinusOne = IsCnst && SplatValue.isAllOnes(); if (SrcVT.getVectorElementType().isFloatingPoint()) { switch (CC) { diff --git a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll index af839d3b60836..765c81e26e13c 100644 --- a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll +++ b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll @@ -1792,7 +1792,8 @@ define <8 x i1> @not_cmle8xi8(<8 x i8> %0) { define <4 x i1> @not_cmle16xi8(<4 x i32> %0) { ; CHECK-SD-LABEL: not_cmle16xi8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: cmle v0.4s, v0.4s, #0 +; CHECK-SD-NEXT: movi v1.8h, #1 +; CHECK-SD-NEXT: cmgt v0.4s, v1.4s, v0.4s ; CHECK-SD-NEXT: xtn v0.4h, v0.4s ; CHECK-SD-NEXT: ret ; From 5055eeea5205d938320590236eeb782c92e40911 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Tue, 2 Jan 2024 09:43:30 +0000 Subject: [PATCH 011/313] [Clang][AArch64] Add missing SME functions to header file. 
(#75791) This includes: * __arm_in_streaming_mode() * __arm_has_sme() * __arm_za_disable() * __svundef_za() --- clang/include/clang/Basic/BuiltinsAArch64.def | 3 + clang/lib/CodeGen/CGBuiltin.cpp | 20 ++++++ .../acle_sme_state_funs.c | 72 +++++++++++++++++++ .../acle_sve2p1_qcvtn.c | 22 +++--- clang/utils/TableGen/SveEmitter.cpp | 19 +++++ 5 files changed, 127 insertions(+), 9 deletions(-) create mode 100644 clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c diff --git a/clang/include/clang/Basic/BuiltinsAArch64.def b/clang/include/clang/Basic/BuiltinsAArch64.def index 82a1ba3c82ad3..31ec84143f65c 100644 --- a/clang/include/clang/Basic/BuiltinsAArch64.def +++ b/clang/include/clang/Basic/BuiltinsAArch64.def @@ -68,6 +68,9 @@ TARGET_BUILTIN(__builtin_arm_ldg, "v*v*", "t", "mte") TARGET_BUILTIN(__builtin_arm_stg, "vv*", "t", "mte") TARGET_BUILTIN(__builtin_arm_subp, "Uiv*v*", "t", "mte") +// SME state function +BUILTIN(__builtin_arm_get_sme_state, "vULi*ULi*", "n") + // Memory Operations TARGET_BUILTIN(__builtin_arm_mops_memset_tag, "v*v*iz", "", "mte,mops") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 5081062da2862..f71dbf1729a1d 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -10430,6 +10430,26 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, return Builder.CreateCall(F, llvm::ConstantInt::get(Int32Ty, HintID)); } + if (BuiltinID == clang::AArch64::BI__builtin_arm_get_sme_state) { + // Create call to __arm_sme_state and store the results to the two pointers. + CallInst *CI = EmitRuntimeCall(CGM.CreateRuntimeFunction( + llvm::FunctionType::get(StructType::get(CGM.Int64Ty, CGM.Int64Ty), {}, + false), + "__arm_sme_state")); + auto Attrs = + AttributeList() + .addFnAttribute(getLLVMContext(), "aarch64_pstate_sm_compatible") + .addFnAttribute(getLLVMContext(), "aarch64_pstate_za_preserved"); + CI->setAttributes(Attrs); + CI->setCallingConv( + llvm::CallingConv:: + AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2); + Builder.CreateStore(Builder.CreateExtractValue(CI, 0), + EmitPointerWithAlignment(E->getArg(0))); + return Builder.CreateStore(Builder.CreateExtractValue(CI, 1), + EmitPointerWithAlignment(E->getArg(1))); + } + if (BuiltinID == clang::AArch64::BI__builtin_arm_rbit) { assert((getContext().getTypeSize(E->getType()) == 32) && "rbit of unusual size!"); diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c new file mode 100644 index 0000000000000..282819c8ca350 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c @@ -0,0 +1,72 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +// CHECK-LABEL: @test_in_streaming_mode( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR3:[0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0 +// CHECK-NEXT: [[AND_I:%.*]] = and i64 [[TMP1]], 1 
+// CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i64 [[AND_I]], 0 +// CHECK-NEXT: ret i1 [[TOBOOL_I]] +// +// CPP-CHECK-LABEL: @_Z22test_in_streaming_modev( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR3:[0-9]+]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[AND_I:%.*]] = and i64 [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i64 [[AND_I]], 0 +// CPP-CHECK-NEXT: ret i1 [[TOBOOL_I]] +// +bool test_in_streaming_mode(void) __arm_streaming_compatible { + return __arm_in_streaming_mode(); +} + +// CHECK-LABEL: @test_za_disable( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @__arm_za_disable() #[[ATTR4:[0-9]+]] +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_za_disablev( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @__arm_za_disable() #[[ATTR4:[0-9]+]] +// CPP-CHECK-NEXT: ret void +// +void test_za_disable(void) __arm_streaming_compatible { + __arm_za_disable(); +} + +// CHECK-LABEL: @test_has_sme( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR3]] +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0 +// CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp slt i64 [[TMP1]], 0 +// CHECK-NEXT: ret i1 [[TOBOOL_I]] +// +// CPP-CHECK-LABEL: @_Z12test_has_smev( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR3]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp slt i64 [[TMP1]], 0 +// CPP-CHECK-NEXT: ret i1 [[TOBOOL_I]] +// +bool test_has_sme(void) __arm_streaming_compatible { + return __arm_has_sme(); +} + +// CHECK-LABEL: @test_svundef_za( +// CHECK-NEXT: entry: +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svundef_zav( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: ret void +// +void test_svundef_za(void) __arm_streaming_compatible __arm_shared_za { + svundef_za(); +} + diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qcvtn.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qcvtn.c index be7ec719edea5..fb53ea456c816 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qcvtn.c +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qcvtn.c @@ -8,13 +8,11 @@ // RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S 
-disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -#include +#include #ifdef SVE_OVERLOADED_FORMS // A simple used,unused... macro, long enough to represent any SVE builtin. @@ -23,6 +21,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME2 +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // CHECK-LABEL: @test_qcvtn_s16_s32_x2( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) @@ -37,7 +41,7 @@ // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.sqcvtn.x2.nxv4i32( [[TMP0]], [[TMP1]]) // CPP-CHECK-NEXT: ret [[TMP2]] // -svint16_t test_qcvtn_s16_s32_x2(svint32x2_t zn) __arm_streaming_compatible { +svint16_t test_qcvtn_s16_s32_x2(svint32x2_t zn) ATTR { return SVE_ACLE_FUNC(svqcvtn_s16,_s32_x2,,)(zn); } @@ -55,7 +59,7 @@ svint16_t test_qcvtn_s16_s32_x2(svint32x2_t zn) __arm_streaming_compatible { // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.uqcvtn.x2.nxv4i32( [[TMP0]], [[TMP1]]) // CPP-CHECK-NEXT: ret [[TMP2]] // -svuint16_t test_qcvtn_u16_u32_x2(svuint32x2_t zn) __arm_streaming_compatible { +svuint16_t test_qcvtn_u16_u32_x2(svuint32x2_t zn) ATTR { return SVE_ACLE_FUNC(svqcvtn_u16,_u32_x2,,)(zn); } @@ -73,6 +77,6 @@ svuint16_t test_qcvtn_u16_u32_x2(svuint32x2_t zn) __arm_streaming_compatible { // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.sqcvtun.x2.nxv4i32( [[TMP0]], [[TMP1]]) // CPP-CHECK-NEXT: ret [[TMP2]] // -svuint16_t test_qcvtn_u16_s32_x2(svint32x2_t zn) __arm_streaming_compatible { +svuint16_t test_qcvtn_u16_s32_x2(svint32x2_t zn) ATTR { return SVE_ACLE_FUNC(svqcvtn_u16,_s32_x2,,)(zn); } diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp index 311c6b09dc790..6c302da106a2c 100644 --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -1603,6 +1603,25 @@ void SVEEmitter::createSMEHeader(raw_ostream &OS) { OS << "extern \"C\" {\n"; OS << "#endif\n\n"; + OS << "void __arm_za_disable(void) __arm_streaming_compatible;\n\n"; + + OS << "__ai bool __arm_has_sme(void) __arm_streaming_compatible {\n"; + OS << " uint64_t x0, x1;\n"; + OS << " __builtin_arm_get_sme_state(&x0, &x1);\n"; + OS << " return x0 & (1ULL << 63);\n"; + OS << "}\n\n"; + + OS << "__ai bool 
__arm_in_streaming_mode(void) __arm_streaming_compatible " + "{\n"; + OS << " uint64_t x0, x1;\n"; + OS << " __builtin_arm_get_sme_state(&x0, &x1);\n"; + OS << " return x0 & 1;\n"; + OS << "}\n\n"; + + OS << "__ai __attribute__((target(\"sme\"))) void svundef_za(void) " + "__arm_streaming_compatible __arm_shared_za " + "{ }\n\n"; + createCoreHeaderIntrinsics(OS, *this, ACLEKind::SME); OS << "#ifdef __cplusplus\n"; From 734ee0e01feeadd75bdbed35acc08f242623a212 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 2 Jan 2024 10:49:45 +0100 Subject: [PATCH 012/313] [LVI] Support using block values when handling conditions (#75311) Currently, LVI will only use conditions like "X < C" to constrain the value of X on the relevant edge. This patch extends it to handle conditions like "X < Y" by querying the known range of Y. This means that getValueFromCondition() and various related APIs can now return nullopt to indicate that they have pushed to the worklist, and need to be called again later. This behavior is currently controlled by a UseBlockValue option, and only enabled for actual edge value handling. All other places deriving constraints from conditions keep using the previous logic for now. This change was originally motivated as a fix for the regression reported in https://github.com/llvm/llvm-project/pull/73662#issuecomment-1849281758. Unfortunately, it doesn't actually fix it, because we run into another issue there (LVI currently is really bad at handling values used in loops). This change has some compile-time impact, but it's fairly small, in the 0.05% range. --- llvm/lib/Analysis/LazyValueInfo.cpp | 155 ++++++++++++------ .../cond-using-block-value.ll | 8 +- 2 files changed, 110 insertions(+), 53 deletions(-) diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index 89cc7ea15ec1d..f7d8771648227 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -434,6 +434,28 @@ class LazyValueInfoImpl { void solve(); + // For the following methods, if UseBlockValue is true, the function may + // push additional values to the worklist and return nullopt. If + // UseBlockValue is false, it will never return nullopt. + + std::optional + getValueFromSimpleICmpCondition(CmpInst::Predicate Pred, Value *RHS, + const APInt &Offset, Instruction *CxtI, + bool UseBlockValue); + + std::optional + getValueFromICmpCondition(Value *Val, ICmpInst *ICI, bool isTrueDest, + bool UseBlockValue); + + std::optional + getValueFromCondition(Value *Val, Value *Cond, bool IsTrueDest, + bool UseBlockValue, unsigned Depth = 0); + + std::optional getEdgeValueLocal(Value *Val, + BasicBlock *BBFrom, + BasicBlock *BBTo, + bool UseBlockValue); + public: /// This is the query interface to determine the lattice value for the /// specified Value* at the context instruction (if specified) or at the @@ -755,14 +777,10 @@ LazyValueInfoImpl::solveBlockValuePHINode(PHINode *PN, BasicBlock *BB) { return Result; } -static ValueLatticeElement getValueFromCondition(Value *Val, Value *Cond, - bool isTrueDest = true, - unsigned Depth = 0); - // If we can determine a constraint on the value given conditions assumed by // the program, intersect those constraints with BBLV void LazyValueInfoImpl::intersectAssumeOrGuardBlockValueConstantRange( - Value *Val, ValueLatticeElement &BBLV, Instruction *BBI) { + Value *Val, ValueLatticeElement &BBLV, Instruction *BBI) { BBI = BBI ? 
BBI : dyn_cast(Val); if (!BBI) return; @@ -779,17 +797,21 @@ void LazyValueInfoImpl::intersectAssumeOrGuardBlockValueConstantRange( if (I->getParent() != BB || !isValidAssumeForContext(I, BBI)) continue; - BBLV = intersect(BBLV, getValueFromCondition(Val, I->getArgOperand(0))); + BBLV = intersect(BBLV, *getValueFromCondition(Val, I->getArgOperand(0), + /*IsTrueDest*/ true, + /*UseBlockValue*/ false)); } // If guards are not used in the module, don't spend time looking for them if (GuardDecl && !GuardDecl->use_empty() && BBI->getIterator() != BB->begin()) { - for (Instruction &I : make_range(std::next(BBI->getIterator().getReverse()), - BB->rend())) { + for (Instruction &I : + make_range(std::next(BBI->getIterator().getReverse()), BB->rend())) { Value *Cond = nullptr; if (match(&I, m_Intrinsic(m_Value(Cond)))) - BBLV = intersect(BBLV, getValueFromCondition(Val, Cond)); + BBLV = intersect(BBLV, + *getValueFromCondition(Val, Cond, /*IsTrueDest*/ true, + /*UseBlockValue*/ false)); } } @@ -886,10 +908,14 @@ LazyValueInfoImpl::solveBlockValueSelect(SelectInst *SI, BasicBlock *BB) { // If the value is undef, a different value may be chosen in // the select condition. if (isGuaranteedNotToBeUndef(Cond, AC)) { - TrueVal = intersect(TrueVal, - getValueFromCondition(SI->getTrueValue(), Cond, true)); - FalseVal = intersect( - FalseVal, getValueFromCondition(SI->getFalseValue(), Cond, false)); + TrueVal = + intersect(TrueVal, *getValueFromCondition(SI->getTrueValue(), Cond, + /*IsTrueDest*/ true, + /*UseBlockValue*/ false)); + FalseVal = + intersect(FalseVal, *getValueFromCondition(SI->getFalseValue(), Cond, + /*IsTrueDest*/ false, + /*UseBlockValue*/ false)); } ValueLatticeElement Result = TrueVal; @@ -1068,15 +1094,26 @@ static bool matchICmpOperand(APInt &Offset, Value *LHS, Value *Val, } /// Get value range for a "(Val + Offset) Pred RHS" condition. 
-static ValueLatticeElement getValueFromSimpleICmpCondition( - CmpInst::Predicate Pred, Value *RHS, const APInt &Offset) { +std::optional +LazyValueInfoImpl::getValueFromSimpleICmpCondition(CmpInst::Predicate Pred, + Value *RHS, + const APInt &Offset, + Instruction *CxtI, + bool UseBlockValue) { ConstantRange RHSRange(RHS->getType()->getIntegerBitWidth(), /*isFullSet=*/true); - if (ConstantInt *CI = dyn_cast(RHS)) + if (ConstantInt *CI = dyn_cast(RHS)) { RHSRange = ConstantRange(CI->getValue()); - else if (Instruction *I = dyn_cast(RHS)) + } else if (UseBlockValue) { + std::optional R = + getBlockValue(RHS, CxtI->getParent(), CxtI); + if (!R) + return std::nullopt; + RHSRange = toConstantRange(*R, RHS->getType()); + } else if (Instruction *I = dyn_cast(RHS)) { if (auto *Ranges = I->getMetadata(LLVMContext::MD_range)) RHSRange = getConstantRangeFromMetadata(*Ranges); + } ConstantRange TrueValues = ConstantRange::makeAllowedICmpRegion(Pred, RHSRange); @@ -1103,8 +1140,8 @@ getRangeViaSLT(CmpInst::Predicate Pred, APInt RHS, return std::nullopt; } -static ValueLatticeElement getValueFromICmpCondition(Value *Val, ICmpInst *ICI, - bool isTrueDest) { +std::optional LazyValueInfoImpl::getValueFromICmpCondition( + Value *Val, ICmpInst *ICI, bool isTrueDest, bool UseBlockValue) { Value *LHS = ICI->getOperand(0); Value *RHS = ICI->getOperand(1); @@ -1128,11 +1165,13 @@ static ValueLatticeElement getValueFromICmpCondition(Value *Val, ICmpInst *ICI, unsigned BitWidth = Ty->getScalarSizeInBits(); APInt Offset(BitWidth, 0); if (matchICmpOperand(Offset, LHS, Val, EdgePred)) - return getValueFromSimpleICmpCondition(EdgePred, RHS, Offset); + return getValueFromSimpleICmpCondition(EdgePred, RHS, Offset, ICI, + UseBlockValue); CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(EdgePred); if (matchICmpOperand(Offset, RHS, Val, SwappedPred)) - return getValueFromSimpleICmpCondition(SwappedPred, LHS, Offset); + return getValueFromSimpleICmpCondition(SwappedPred, LHS, Offset, ICI, + UseBlockValue); const APInt *Mask, *C; if (match(LHS, m_And(m_Specific(Val), m_APInt(Mask))) && @@ -1212,10 +1251,12 @@ static ValueLatticeElement getValueFromOverflowCondition( return ValueLatticeElement::getRange(NWR); } -static ValueLatticeElement getValueFromCondition( - Value *Val, Value *Cond, bool IsTrueDest, unsigned Depth) { +std::optional +LazyValueInfoImpl::getValueFromCondition(Value *Val, Value *Cond, + bool IsTrueDest, bool UseBlockValue, + unsigned Depth) { if (ICmpInst *ICI = dyn_cast(Cond)) - return getValueFromICmpCondition(Val, ICI, IsTrueDest); + return getValueFromICmpCondition(Val, ICI, IsTrueDest, UseBlockValue); if (auto *EVI = dyn_cast(Cond)) if (auto *WO = dyn_cast(EVI->getAggregateOperand())) @@ -1227,7 +1268,7 @@ static ValueLatticeElement getValueFromCondition( Value *N; if (match(Cond, m_Not(m_Value(N)))) - return getValueFromCondition(Val, N, !IsTrueDest, Depth); + return getValueFromCondition(Val, N, !IsTrueDest, UseBlockValue, Depth); Value *L, *R; bool IsAnd; @@ -1238,19 +1279,23 @@ static ValueLatticeElement getValueFromCondition( else return ValueLatticeElement::getOverdefined(); - ValueLatticeElement LV = getValueFromCondition(Val, L, IsTrueDest, Depth); - ValueLatticeElement RV = getValueFromCondition(Val, R, IsTrueDest, Depth); + std::optional LV = + getValueFromCondition(Val, L, IsTrueDest, UseBlockValue, Depth); + std::optional RV = + getValueFromCondition(Val, R, IsTrueDest, UseBlockValue, Depth); + if (!LV || !RV) + return std::nullopt; // if (L && R) -> intersect L and R // if 
(!(L || R)) -> intersect !L and !R // if (L || R) -> union L and R // if (!(L && R)) -> union !L and !R if (IsTrueDest ^ IsAnd) { - LV.mergeIn(RV); - return LV; + LV->mergeIn(*RV); + return *LV; } - return intersect(LV, RV); + return intersect(*LV, *RV); } // Return true if Usr has Op as an operand, otherwise false. @@ -1302,8 +1347,9 @@ static ValueLatticeElement constantFoldUser(User *Usr, Value *Op, } /// Compute the value of Val on the edge BBFrom -> BBTo. -static ValueLatticeElement getEdgeValueLocal(Value *Val, BasicBlock *BBFrom, - BasicBlock *BBTo) { +std::optional +LazyValueInfoImpl::getEdgeValueLocal(Value *Val, BasicBlock *BBFrom, + BasicBlock *BBTo, bool UseBlockValue) { // TODO: Handle more complex conditionals. If (v == 0 || v2 < 1) is false, we // know that v != 0. if (BranchInst *BI = dyn_cast(BBFrom->getTerminator())) { @@ -1324,13 +1370,16 @@ static ValueLatticeElement getEdgeValueLocal(Value *Val, BasicBlock *BBFrom, // If the condition of the branch is an equality comparison, we may be // able to infer the value. - ValueLatticeElement Result = getValueFromCondition(Val, Condition, - isTrueDest); - if (!Result.isOverdefined()) + std::optional Result = + getValueFromCondition(Val, Condition, isTrueDest, UseBlockValue); + if (!Result) + return std::nullopt; + + if (!Result->isOverdefined()) return Result; if (User *Usr = dyn_cast(Val)) { - assert(Result.isOverdefined() && "Result isn't overdefined"); + assert(Result->isOverdefined() && "Result isn't overdefined"); // Check with isOperationFoldable() first to avoid linearly iterating // over the operands unnecessarily which can be expensive for // instructions with many operands. @@ -1356,8 +1405,8 @@ static ValueLatticeElement getEdgeValueLocal(Value *Val, BasicBlock *BBFrom, // br i1 %Condition, label %then, label %else for (unsigned i = 0; i < Usr->getNumOperands(); ++i) { Value *Op = Usr->getOperand(i); - ValueLatticeElement OpLatticeVal = - getValueFromCondition(Op, Condition, isTrueDest); + ValueLatticeElement OpLatticeVal = *getValueFromCondition( + Op, Condition, isTrueDest, /*UseBlockValue*/ false); if (std::optional OpConst = OpLatticeVal.asConstantInteger()) { Result = constantFoldUser(Usr, Op, *OpConst, DL); @@ -1367,7 +1416,7 @@ static ValueLatticeElement getEdgeValueLocal(Value *Val, BasicBlock *BBFrom, } } } - if (!Result.isOverdefined()) + if (!Result->isOverdefined()) return Result; } } @@ -1432,8 +1481,12 @@ LazyValueInfoImpl::getEdgeValue(Value *Val, BasicBlock *BBFrom, if (Constant *VC = dyn_cast(Val)) return ValueLatticeElement::get(VC); - ValueLatticeElement LocalResult = getEdgeValueLocal(Val, BBFrom, BBTo); - if (hasSingleValue(LocalResult)) + std::optional LocalResult = + getEdgeValueLocal(Val, BBFrom, BBTo, /*UseBlockValue*/ true); + if (!LocalResult) + return std::nullopt; + + if (hasSingleValue(*LocalResult)) // Can't get any more precise here return LocalResult; @@ -1453,7 +1506,7 @@ LazyValueInfoImpl::getEdgeValue(Value *Val, BasicBlock *BBFrom, // but then the result is not cached. 
intersectAssumeOrGuardBlockValueConstantRange(Val, InBlock, CxtI); - return intersect(LocalResult, InBlock); + return intersect(*LocalResult, InBlock); } ValueLatticeElement LazyValueInfoImpl::getValueInBlock(Value *V, BasicBlock *BB, @@ -1499,10 +1552,12 @@ getValueOnEdge(Value *V, BasicBlock *FromBB, BasicBlock *ToBB, std::optional Result = getEdgeValue(V, FromBB, ToBB, CxtI); - if (!Result) { + while (!Result) { + // As the worklist only explicitly tracks block values (but not edge values) + // we may have to call solve() multiple times, as the edge value calculation + // may request additional block values. solve(); Result = getEdgeValue(V, FromBB, ToBB, CxtI); - assert(Result && "More work to do after problem solved?"); } LLVM_DEBUG(dbgs() << " Result = " << *Result << "\n"); @@ -1528,13 +1583,17 @@ ValueLatticeElement LazyValueInfoImpl::getValueAtUse(const Use &U) { if (!isGuaranteedNotToBeUndef(SI->getCondition(), AC)) break; if (CurrU->getOperandNo() == 1) - CondVal = getValueFromCondition(V, SI->getCondition(), true); + CondVal = + *getValueFromCondition(V, SI->getCondition(), /*IsTrueDest*/ true, + /*UseBlockValue*/ false); else if (CurrU->getOperandNo() == 2) - CondVal = getValueFromCondition(V, SI->getCondition(), false); + CondVal = + *getValueFromCondition(V, SI->getCondition(), /*IsTrueDest*/ false, + /*UseBlockValue*/ false); } else if (auto *PHI = dyn_cast(CurrI)) { // TODO: Use non-local query? - CondVal = - getEdgeValueLocal(V, PHI->getIncomingBlock(*CurrU), PHI->getParent()); + CondVal = *getEdgeValueLocal(V, PHI->getIncomingBlock(*CurrU), + PHI->getParent(), /*UseBlockValue*/ false); } if (CondVal) VL = intersect(VL, *CondVal); diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/cond-using-block-value.ll b/llvm/test/Transforms/CorrelatedValuePropagation/cond-using-block-value.ll index d30b31d317a6d..252f6596cedc5 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/cond-using-block-value.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/cond-using-block-value.ll @@ -12,8 +12,7 @@ define void @test_icmp_from_implied_cond(i32 %a, i32 %b) { ; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[B]], [[A]] ; CHECK-NEXT: br i1 [[COND]], label [[L2:%.*]], label [[END]] ; CHECK: l2: -; CHECK-NEXT: [[B_CMP1:%.*]] = icmp ult i32 [[B]], 32 -; CHECK-NEXT: call void @use(i1 [[B_CMP1]]) +; CHECK-NEXT: call void @use(i1 true) ; CHECK-NEXT: [[B_CMP2:%.*]] = icmp ult i32 [[B]], 31 ; CHECK-NEXT: call void @use(i1 [[B_CMP2]]) ; CHECK-NEXT: ret void @@ -47,7 +46,7 @@ define i64 @test_sext_from_implied_cond(i32 %a, i32 %b) { ; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[B]], [[A]] ; CHECK-NEXT: br i1 [[COND]], label [[L2:%.*]], label [[END]] ; CHECK: l2: -; CHECK-NEXT: [[SEXT:%.*]] = sext i32 [[B]] to i64 +; CHECK-NEXT: [[SEXT:%.*]] = zext nneg i32 [[B]] to i64 ; CHECK-NEXT: ret i64 [[SEXT]] ; CHECK: end: ; CHECK-NEXT: ret i64 0 @@ -74,8 +73,7 @@ define void @test_icmp_from_implied_range(i16 %x, i32 %b) { ; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[B]], [[A]] ; CHECK-NEXT: br i1 [[COND]], label [[L1:%.*]], label [[END:%.*]] ; CHECK: l1: -; CHECK-NEXT: [[B_CMP1:%.*]] = icmp ult i32 [[B]], 65535 -; CHECK-NEXT: call void @use(i1 [[B_CMP1]]) +; CHECK-NEXT: call void @use(i1 true) ; CHECK-NEXT: [[B_CMP2:%.*]] = icmp ult i32 [[B]], 65534 ; CHECK-NEXT: call void @use(i1 [[B_CMP2]]) ; CHECK-NEXT: ret void From ac8b53fc9232733af4656028fa82fd44397559d0 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Tue, 2 Jan 2024 10:00:29 +0000 Subject: [PATCH 013/313] [mlir] Apply ClangTidy 
performance fix - Use '\n' instead of std::endl; https://clang.llvm.org/extra/clang-tidy/checks/performance/avoid-endl.html --- mlir/lib/ExecutionEngine/AsyncRuntime.cpp | 2 +- mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mlir/lib/ExecutionEngine/AsyncRuntime.cpp b/mlir/lib/ExecutionEngine/AsyncRuntime.cpp index 9d7e4000fe3be..24835c555b682 100644 --- a/mlir/lib/ExecutionEngine/AsyncRuntime.cpp +++ b/mlir/lib/ExecutionEngine/AsyncRuntime.cpp @@ -446,7 +446,7 @@ extern "C" int64_t mlirAsyncRuntimGetNumWorkerThreads() { extern "C" void mlirAsyncRuntimePrintCurrentThreadId() { static thread_local std::thread::id thisId = std::this_thread::get_id(); - std::cout << "Current thread id: " << thisId << std::endl; + std::cout << "Current thread id: " << thisId << '\n'; } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp b/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp index 2dcc5d22e2291..e7ac8f161875d 100644 --- a/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp +++ b/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp @@ -450,10 +450,10 @@ void _mlir_ciface_outSparseTensorWriterMetaData( assert(dimRank != 0); index_type *dimSizes = MEMREF_GET_PAYLOAD(dimSizesRef); std::ostream &file = *static_cast(p); - file << dimRank << " " << nse << std::endl; + file << dimRank << " " << nse << '\n'; for (index_type d = 0; d < dimRank - 1; d++) file << dimSizes[d] << " "; - file << dimSizes[dimRank - 1] << std::endl; + file << dimSizes[dimRank - 1] << '\n'; } #define IMPL_OUTNEXT(VNAME, V) \ @@ -468,7 +468,7 @@ void _mlir_ciface_outSparseTensorWriterMetaData( for (index_type d = 0; d < dimRank; d++) \ file << (dimCoords[d] + 1) << " "; \ V *value = MEMREF_GET_PAYLOAD(vref); \ - file << *value << std::endl; \ + file << *value << '\n'; \ } MLIR_SPARSETENSOR_FOREVERY_V(IMPL_OUTNEXT) #undef IMPL_OUTNEXT From aa6bb1697f2ef0881ae11cd5351d980fc98a4a14 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 2 Jan 2024 11:08:41 +0100 Subject: [PATCH 014/313] [CVP] Add test for #76705 (NFC) --- .../CorrelatedValuePropagation/basic.ll | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll b/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll index 9fcf7c320f62d..7b9375784dae8 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll @@ -1910,6 +1910,21 @@ exit: ret i1 false } +define i1 @binop_eval_order(i32 %x) { +; CHECK-LABEL: @binop_eval_order( +; CHECK-NEXT: [[A:%.*]] = add nuw nsw i32 [[X:%.*]], 1 +; CHECK-NEXT: [[B:%.*]] = add nuw nsw i32 [[A]], 1 +; CHECK-NEXT: [[C:%.*]] = add nuw nsw i32 [[A]], [[B]] +; CHECK-NEXT: [[D:%.*]] = icmp ugt i32 [[C]], 2 +; CHECK-NEXT: ret i1 [[D]] +; + %a = add nuw nsw i32 %x, 1 + %b = add nuw nsw i32 %a, 1 + %c = add nuw nsw i32 %a, %b + %d = icmp ugt i32 %c, 2 + ret i1 %d +} + declare i32 @llvm.uadd.sat.i32(i32, i32) declare i32 @llvm.usub.sat.i32(i32, i32) declare i32 @llvm.sadd.sat.i32(i32, i32) From d5db2cdb22ab302acbb6e1a066e791f25dc612de Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 2 Jan 2024 10:55:59 +0100 Subject: [PATCH 015/313] [LVI] Don't push both binop operands at once If one of the binop operands depends on the other, this may end up evaluating them in the wrong order, producing sub-optimal results. 
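For example, take the binop_eval_order test added in the previous
commit, where one operand of the final add depends on the other:

  define i1 @binop_eval_order(i32 %x) {
    %a = add nuw nsw i32 %x, 1
    %b = add nuw nsw i32 %a, 1
    %c = add nuw nsw i32 %a, %b
    %d = icmp ugt i32 %c, 2
    ret i1 %d
  }

The nuw flags imply %a >= 1, %b >= 2 and %c >= 3, so %d should fold to
true. However, solving %c pushes both %a and %b at once; %b is then
taken off the worklist first, finds %a still pending, and can fall back
to an overdefined range for it, leaving the cached result for %b (and
with it %c) less precise than necessary.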
Make sure that only one unevaluated operand gets pushed per iteration. Fixes https://github.com/llvm/llvm-project/issues/76705. --- llvm/lib/Analysis/LazyValueInfo.cpp | 6 ++++-- llvm/test/Transforms/CorrelatedValuePropagation/basic.ll | 3 +-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index f7d8771648227..c94e29cabc3fe 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -976,9 +976,11 @@ LazyValueInfoImpl::solveBlockValueBinaryOpImpl( // lets us pick up facts from expressions like "and i32 (call i32 // @foo()), 32" std::optional LHSRes = getRangeFor(I->getOperand(0), I, BB); + if (!LHSRes) + return std::nullopt; + std::optional RHSRes = getRangeFor(I->getOperand(1), I, BB); - if (!LHSRes || !RHSRes) - // More work to do before applying this transfer rule. + if (!RHSRes) return std::nullopt; const ConstantRange &LHSRange = *LHSRes; diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll b/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll index 7b9375784dae8..6227a5c822b10 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll @@ -1915,8 +1915,7 @@ define i1 @binop_eval_order(i32 %x) { ; CHECK-NEXT: [[A:%.*]] = add nuw nsw i32 [[X:%.*]], 1 ; CHECK-NEXT: [[B:%.*]] = add nuw nsw i32 [[A]], 1 ; CHECK-NEXT: [[C:%.*]] = add nuw nsw i32 [[A]], [[B]] -; CHECK-NEXT: [[D:%.*]] = icmp ugt i32 [[C]], 2 -; CHECK-NEXT: ret i1 [[D]] +; CHECK-NEXT: ret i1 true ; %a = add nuw nsw i32 %x, 1 %b = add nuw nsw i32 %a, 1 From a3e8e86fb6ad27fe070bb2c0f54a1c697c665f13 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 2 Jan 2024 11:28:48 +0100 Subject: [PATCH 016/313] [LVI] Don't push both sides of and/or at once Same as the change in d5db2cdb22ab302acbb6e1a066e791f25dc612de, but for condition handling. The same issue could occur here as well. --- llvm/lib/Analysis/LazyValueInfo.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index c94e29cabc3fe..360fc594ef7ca 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -1283,9 +1283,11 @@ LazyValueInfoImpl::getValueFromCondition(Value *Val, Value *Cond, std::optional LV = getValueFromCondition(Val, L, IsTrueDest, UseBlockValue, Depth); + if (!LV) + return std::nullopt; std::optional RV = getValueFromCondition(Val, R, IsTrueDest, UseBlockValue, Depth); - if (!LV || !RV) + if (!RV) return std::nullopt; // if (L && R) -> intersect L and R From 21a0335110a330846f738be37aeccb8685082faf Mon Sep 17 00:00:00 2001 From: OCHyams Date: Tue, 2 Jan 2024 10:51:00 +0000 Subject: [PATCH 017/313] [NFC][RemoveDIs] Fix typo in disabled test run line The disabled line should be checking FastISel but was incorrectly checking SelectionDAG due to a copy-paste error in #73496. 
--- llvm/test/DebugInfo/X86/sret.ll | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/DebugInfo/X86/sret.ll b/llvm/test/DebugInfo/X86/sret.ll index 087f136541d93..f06b097d07b98 100644 --- a/llvm/test/DebugInfo/X86/sret.ll +++ b/llvm/test/DebugInfo/X86/sret.ll @@ -12,9 +12,9 @@ ; RUN: llc -O0 -fast-isel=true -mtriple=x86_64-apple-darwin -filetype=obj -o - %s | llvm-dwarfdump -debug-info - | FileCheck -check-prefixes=CHECK,FASTISEL %s ; RUN: llc -O0 -fast-isel=false -mtriple=x86_64-apple-darwin -filetype=obj -o - %s | llvm-dwarfdump -debug-info - | FileCheck -check-prefixes=CHECK,SDAG %s -; RUN: llc --try-experimental-debuginfo-iterators -O0 -fast-isel=false -mtriple=x86_64-apple-darwin -filetype=obj -o - %s | llvm-dwarfdump -debug-info - | FileCheck -check-prefixes=CHECK,SDAG %s ;; FIXME: RemoveDIs - enable when FastISel support is added. -; run: llc --try-experimental-debuginfo-iterators -O0 -fast-isel=false -mtriple=x86_64-apple-darwin -filetype=obj -o - %s | llvm-dwarfdump -debug-info - | FileCheck -check-prefixes=CHECK,SDAG %s +; run: llc --try-experimental-debuginfo-iterators -O0 -fast-isel=true -mtriple=x86_64-apple-darwin -filetype=obj -o - %s | llvm-dwarfdump -debug-info - | FileCheck -check-prefixes=CHECK,FASTISEL %s +; RUN: llc --try-experimental-debuginfo-iterators -O0 -fast-isel=false -mtriple=x86_64-apple-darwin -filetype=obj -o - %s | llvm-dwarfdump -debug-info - | FileCheck -check-prefixes=CHECK,SDAG %s ; CHECK: _ZN1B9AInstanceEv ; CHECK: DW_TAG_variable From 9b7cf5bfb08b6e506216ef354dfd61adb15acbff Mon Sep 17 00:00:00 2001 From: David Green Date: Tue, 2 Jan 2024 11:09:18 +0000 Subject: [PATCH 018/313] [Flang] Allow Intrinsic simpification with min/maxloc dim and scalar result (#76194) This makes an adjustment to the existing fir minloc/maxloc generation code to handle functions with a dim=1 that produce a scalar result. This should allow us to get the same benefits as the existing generated minmax reductions. This is a recommit of #75820 with the typename added to the generated function. --- .../Transforms/SimplifyIntrinsics.cpp | 15 ++-- flang/test/Transforms/simplifyintrinsics.fir | 68 +++++++++++++++++-- 2 files changed, 70 insertions(+), 13 deletions(-) diff --git a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp index c89ee6d5e2039..f5ddcbbaecd21 100644 --- a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp +++ b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp @@ -1162,11 +1162,14 @@ void SimplifyIntrinsicsPass::simplifyMinMaxlocReduction( mlir::Operation::operand_range args = call.getArgs(); - mlir::Value back = args[6]; + mlir::SymbolRefAttr callee = call.getCalleeAttr(); + mlir::StringRef funcNameBase = callee.getLeafReference().getValue(); + bool isDim = funcNameBase.ends_with("Dim"); + mlir::Value back = args[isDim ? 7 : 6]; if (isTrueOrNotConstant(back)) return; - mlir::Value mask = args[5]; + mlir::Value mask = args[isDim ? 6 : 5]; mlir::Value maskDef = findMaskDef(mask); // maskDef is set to NULL when the defining op is not one we accept. 
@@ -1175,10 +1178,8 @@ void SimplifyIntrinsicsPass::simplifyMinMaxlocReduction( if (maskDef == NULL) return; - mlir::SymbolRefAttr callee = call.getCalleeAttr(); - mlir::StringRef funcNameBase = callee.getLeafReference().getValue(); unsigned rank = getDimCount(args[1]); - if (funcNameBase.ends_with("Dim") || !(rank > 0)) + if ((isDim && rank != 1) || !(rank > 0)) return; fir::FirOpBuilder builder{getSimplificationBuilder(call, kindMap)}; @@ -1219,6 +1220,8 @@ void SimplifyIntrinsicsPass::simplifyMinMaxlocReduction( llvm::raw_string_ostream nameOS(funcName); outType.print(nameOS); + if (isDim) + nameOS << '_' << inputType; nameOS << '_' << fmfString; auto typeGenerator = [rank](fir::FirOpBuilder &builder) { @@ -1234,7 +1237,7 @@ void SimplifyIntrinsicsPass::simplifyMinMaxlocReduction( mlir::func::FuncOp newFunc = getOrCreateFunction(builder, funcName, typeGenerator, bodyGenerator); builder.create(loc, newFunc, - mlir::ValueRange{args[0], args[1], args[5]}); + mlir::ValueRange{args[0], args[1], mask}); call->dropAllReferences(); call->erase(); } diff --git a/flang/test/Transforms/simplifyintrinsics.fir b/flang/test/Transforms/simplifyintrinsics.fir index 0bd6ac7c436ff..61cddd4f48df8 100644 --- a/flang/test/Transforms/simplifyintrinsics.fir +++ b/flang/test/Transforms/simplifyintrinsics.fir @@ -2115,13 +2115,13 @@ func.func @_QPtestminloc_doesntwork1d_back(%arg0: !fir.ref> { // CHECK-NOT: fir.call @_FortranAMinlocInteger4x1_i32_contract_simplified({{.*}}) fastmath : (!fir.ref>, !fir.box, !fir.box) -> () // ----- -// Check Minloc is not simplified when DIM arg is set +// Check Minloc is simplified when DIM arg is set so long as the result is scalar -func.func @_QPtestminloc_doesntwork1d_dim(%arg0: !fir.ref> {fir.bindc_name = "a"}) -> !fir.array<1xi32> { +func.func @_QPtestminloc_1d_dim(%arg0: !fir.ref> {fir.bindc_name = "a"}) -> !fir.array<1xi32> { %0 = fir.alloca !fir.box> %c10 = arith.constant 10 : index %c1 = arith.constant 1 : index - %1 = fir.alloca !fir.array<1xi32> {bindc_name = "testminloc_doesntwork1d_dim", uniq_name = "_QFtestminloc_doesntwork1d_dimEtestminloc_doesntwork1d_dim"} + %1 = fir.alloca !fir.array<1xi32> {bindc_name = "testminloc_1d_dim", uniq_name = "_QFtestminloc_1d_dimEtestminloc_1d_dim"} %2 = fir.shape %c1 : (index) -> !fir.shape<1> %3 = fir.array_load %1(%2) : (!fir.ref>, !fir.shape<1>) -> !fir.array<1xi32> %4 = fir.shape %c10 : (index) -> !fir.shape<1> @@ -2156,11 +2156,65 @@ func.func @_QPtestminloc_doesntwork1d_dim(%arg0: !fir.ref> {f %21 = fir.load %1 : !fir.ref> return %21 : !fir.array<1xi32> } -// CHECK-LABEL: func.func @_QPtestminloc_doesntwork1d_dim( +// CHECK-LABEL: func.func @_QPtestminloc_1d_dim( // CHECK-SAME: %[[ARR:.*]]: !fir.ref> {fir.bindc_name = "a"}) -> !fir.array<1xi32> { -// CHECK-NOT: fir.call @_FortranAMinlocDimx1_i32_contract_simplified({{.*}}) fastmath : (!fir.ref>, !fir.box, !fir.box) -> () -// CHECK: fir.call @_FortranAMinlocDim({{.*}}) fastmath : (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32, !fir.box, i1) -> none -// CHECK-NOT: fir.call @_FortranAMinlocDimx1_i32_contract_simplified({{.*}}) fastmath : (!fir.ref>, !fir.box, !fir.box) -> () +// CHECK: fir.call @_FortranAMinlocDimx1_i32_i32_contract_simplified({{.*}}) fastmath : (!fir.ref>, !fir.box, !fir.box) -> () + +// CHECK-LABEL: func.func private @_FortranAMinlocDimx1_i32_i32_contract_simplified(%arg0: !fir.ref>, %arg1: !fir.box, %arg2: !fir.box) attributes {llvm.linkage = #llvm.linkage} { +// CHECK-NEXT: %[[V0:.*]] = fir.alloca i32 +// CHECK-NEXT: %c0_i32 = arith.constant 0 : i32 +// 
CHECK-NEXT: %c1 = arith.constant 1 : index +// CHECK-NEXT: %[[V1:.*]] = fir.allocmem !fir.array<1xi32> +// CHECK-NEXT: %[[V2:.*]] = fir.shape %c1 : (index) -> !fir.shape<1> +// CHECK-NEXT: %[[V3:.*]] = fir.embox %[[V1]](%[[V2]]) : (!fir.heap>, !fir.shape<1>) -> !fir.box>> +// CHECK-NEXT: %c0 = arith.constant 0 : index +// CHECK-NEXT: %[[V4:.*]] = fir.coordinate_of %[[V3]], %c0 : (!fir.box>>, index) -> !fir.ref +// CHECK-NEXT: fir.store %c0_i32 to %[[V4]] : !fir.ref +// CHECK-NEXT: %c0_0 = arith.constant 0 : index +// CHECK-NEXT: %[[V5:.*]] = fir.convert %arg1 : (!fir.box) -> !fir.box> +// CHECK-NEXT: %c1_i32 = arith.constant 1 : i32 +// CHECK-NEXT: %c0_i32_1 = arith.constant 0 : i32 +// CHECK-NEXT: fir.store %c0_i32_1 to %[[V0]] : !fir.ref +// CHECK-NEXT: %c2147483647_i32 = arith.constant 2147483647 : i32 +// CHECK-NEXT: %c1_2 = arith.constant 1 : index +// CHECK-NEXT: %c0_3 = arith.constant 0 : index +// CHECK-NEXT: %[[V6:.*]]:3 = fir.box_dims %[[V5]], %c0_3 : (!fir.box>, index) -> (index, index, index) +// CHECK-NEXT: %[[V7:.*]] = arith.subi %[[V6]]#1, %c1_2 : index +// CHECK-NEXT: %[[V8:.*]] = fir.do_loop %arg3 = %c0_0 to %[[V7]] step %c1_2 iter_args(%arg4 = %c2147483647_i32) -> (i32) { +// CHECK-NEXT: fir.store %c1_i32 to %[[V0]] : !fir.ref +// CHECK-NEXT: %[[V12:.*]] = fir.coordinate_of %[[V5]], %arg3 : (!fir.box>, index) -> !fir.ref +// CHECK-NEXT: %[[V13:.*]] = fir.load %[[V12]] : !fir.ref +// CHECK-NEXT: %[[V14:.*]] = arith.cmpi slt, %[[V13]], %arg4 : i32 +// CHECK-NEXT: %[[V15:.*]] = fir.if %[[V14]] -> (i32) { +// CHECK-NEXT: %c1_i32_4 = arith.constant 1 : i32 +// CHECK-NEXT: %c0_5 = arith.constant 0 : index +// CHECK-NEXT: %[[V16:.*]] = fir.coordinate_of %[[V3]], %c0_5 : (!fir.box>>, index) -> !fir.ref +// CHECK-NEXT: %[[V17:.*]] = fir.convert %arg3 : (index) -> i32 +// CHECK-NEXT: %[[V18:.*]] = arith.addi %[[V17]], %c1_i32_4 : i32 +// CHECK-NEXT: fir.store %[[V18]] to %[[V16]] : !fir.ref +// CHECK-NEXT: fir.result %[[V13]] : i32 +// CHECK-NEXT: } else { +// CHECK-NEXT: fir.result %arg4 : i32 +// CHECK-NEXT: } +// CHECK-NEXT: fir.result %[[V15]] : i32 +// CHECK-NEXT: } +// CHECK-NEXT: %[[V9:.*]] = fir.load %[[V0]] : !fir.ref +// CHECK-NEXT: %[[V10:.*]] = arith.cmpi eq, %[[V9]], %c1_i32 : i32 +// CHECK-NEXT: fir.if %[[V10]] { +// CHECK-NEXT: %c2147483647_i32_4 = arith.constant 2147483647 : i32 +// CHECK-NEXT: %[[V12]] = arith.cmpi eq, %c2147483647_i32_4, %[[V8]] : i32 +// CHECK-NEXT: fir.if %[[V12]] { +// CHECK-NEXT: %c0_5 = arith.constant 0 : index +// CHECK-NEXT: %[[V13]] = fir.coordinate_of %[[V3]], %c0_5 : (!fir.box>>, index) -> !fir.ref +// CHECK-NEXT: fir.store %c1_i32 to %[[V13]] : !fir.ref +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: %[[V11:.*]] = fir.convert %arg0 : (!fir.ref>) -> !fir.ref>>> +// CHECK-NEXT: fir.store %[[V3]] to %[[V11]] : !fir.ref>>> +// CHECK-NEXT: return +// CHECK-NEXT: } + + // ----- // Check Minloc is not simplified when dimension of inputArr is unknown From 2eb0ac0b3e3c74875e9b376239a27b8eb389189c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 2 Jan 2024 11:21:18 +0000 Subject: [PATCH 019/313] [CostModel][X86] Add explicit Silvermont test coverage for bswap costs Test coverage for #62659 - we need to split SLM costs from other SSSE3 targets --- .../Analysis/CostModel/X86/bswap-codesize.ll | 23 +++++++++++++++++++ .../Analysis/CostModel/X86/bswap-latency.ll | 23 +++++++++++++++++++ .../CostModel/X86/bswap-sizelatency.ll | 23 +++++++++++++++++++ llvm/test/Analysis/CostModel/X86/bswap.ll | 23 +++++++++++++++++++ 4 files changed, 92 
insertions(+) diff --git a/llvm/test/Analysis/CostModel/X86/bswap-codesize.ll b/llvm/test/Analysis/CostModel/X86/bswap-codesize.ll index 21789ca583648..1d7f19d1664e1 100644 --- a/llvm/test/Analysis/CostModel/X86/bswap-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/bswap-codesize.ll @@ -7,6 +7,8 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW +; +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=slm | FileCheck %s --check-prefixes=SLM ; ; bswap(X) @@ -68,6 +70,13 @@ define void @cost_bswap_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SLM-LABEL: 'cost_bswap_i64' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.bswap.i64(i64 %a64) +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128) +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256) +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512) +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.bswap.i64(i64 %a64) %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128) @@ -132,6 +141,13 @@ define void @cost_bswap_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SLM-LABEL: 'cost_bswap_i32' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.bswap.i32(i32 %a32) +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128) +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256) +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512) +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.bswap.i32(i32 %a32) %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128) @@ -196,6 +212,13 @@ define void @cost_bswap_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> 
@llvm.bswap.v16i16(<16 x i16> %a256) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SLM-LABEL: 'cost_bswap_i16' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.bswap.i16(i16 %a16) +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128) +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a256) +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512) +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.bswap.i16(i16 %a16) %V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128) diff --git a/llvm/test/Analysis/CostModel/X86/bswap-latency.ll b/llvm/test/Analysis/CostModel/X86/bswap-latency.ll index 370369cf7e93d..2d4e660442581 100644 --- a/llvm/test/Analysis/CostModel/X86/bswap-latency.ll +++ b/llvm/test/Analysis/CostModel/X86/bswap-latency.ll @@ -7,6 +7,8 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW +; +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=latency -mcpu=slm | FileCheck %s --check-prefixes=SLM ; ; bswap(X) @@ -68,6 +70,13 @@ define void @cost_bswap_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SLM-LABEL: 'cost_bswap_i64' +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.bswap.i64(i64 %a64) +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128) +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256) +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512) +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.bswap.i64(i64 %a64) %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128) @@ -132,6 +141,13 @@ define void @cost_bswap_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 
1 for instruction: ret void +; +; SLM-LABEL: 'cost_bswap_i32' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.bswap.i32(i32 %a32) +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128) +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256) +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512) +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.bswap.i32(i32 %a32) %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128) @@ -196,6 +212,13 @@ define void @cost_bswap_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a256) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SLM-LABEL: 'cost_bswap_i16' +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.bswap.i16(i16 %a16) +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128) +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a256) +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512) +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.bswap.i16(i16 %a16) %V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128) diff --git a/llvm/test/Analysis/CostModel/X86/bswap-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/bswap-sizelatency.ll index 7c53e250c68fd..c6cfafb204620 100644 --- a/llvm/test/Analysis/CostModel/X86/bswap-sizelatency.ll +++ b/llvm/test/Analysis/CostModel/X86/bswap-sizelatency.ll @@ -7,6 +7,8 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW +; +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mcpu=slm | FileCheck %s --check-prefixes=SLM ; ; bswap(X) @@ -68,6 +70,13 @@ define void @cost_bswap_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SLM-LABEL: 'cost_bswap_i64' +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.bswap.i64(i64 %a64) +; 
SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128) +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256) +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512) +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.bswap.i64(i64 %a64) %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128) @@ -132,6 +141,13 @@ define void @cost_bswap_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SLM-LABEL: 'cost_bswap_i32' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.bswap.i32(i32 %a32) +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128) +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256) +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512) +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.bswap.i32(i32 %a32) %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128) @@ -196,6 +212,13 @@ define void @cost_bswap_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a256) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SLM-LABEL: 'cost_bswap_i16' +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.bswap.i16(i16 %a16) +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128) +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a256) +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512) +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.bswap.i16(i16 %a16) %V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128) diff --git a/llvm/test/Analysis/CostModel/X86/bswap.ll b/llvm/test/Analysis/CostModel/X86/bswap.ll index 2894c884bd7b4..1c1497965e839 100644 --- a/llvm/test/Analysis/CostModel/X86/bswap.ll +++ b/llvm/test/Analysis/CostModel/X86/bswap.ll @@ -7,6 +7,8 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu 
-passes="print" 2>&1 -disable-output -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW +; +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mcpu=slm | FileCheck %s --check-prefixes=SLM ; ; bswap(X) @@ -68,6 +70,13 @@ define void @cost_bswap_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'cost_bswap_i64' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.bswap.i64(i64 %a64) +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128) +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256) +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I64 = call i64 @llvm.bswap.i64(i64 %a64) %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128) @@ -132,6 +141,13 @@ define void @cost_bswap_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'cost_bswap_i32' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.bswap.i32(i32 %a32) +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128) +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256) +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I32 = call i32 @llvm.bswap.i32(i32 %a32) %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128) @@ -196,6 +212,13 @@ define void @cost_bswap_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a256) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'cost_bswap_i16' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.bswap.i16(i16 %a16) +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128) +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a256) +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for 
instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I16 = call i16 @llvm.bswap.i16(i16 %a16) %V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128) From c1764a7842aca1642dfc9942612059bdab440e51 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 2 Jan 2024 18:43:00 +0700 Subject: [PATCH 020/313] AMDGPU: Add bf16 vectors to register class definitions (#76214) Assorted intrinsics are currently using i16 in place of a proper bfloat type, but they should really switch to bfloat. Note this only changes the type lists in tablegen, these are still not registered to be truly treated as a legal type yet. Depends #76213 --- llvm/lib/Target/AMDGPU/SIInstructions.td | 10 ++++ llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 62 ++++++++++----------- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 22 ++++---- 3 files changed, 52 insertions(+), 42 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index f9bc623abcd04..8310c6b57dad5 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1487,8 +1487,18 @@ foreach Index = 0-31 in { // 16-bit bitcast def : BitConvert ; def : BitConvert ; +def : BitConvert ; +def : BitConvert ; + def : BitConvert ; def : BitConvert ; +def : BitConvert ; +def : BitConvert ; + +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; // 32-bit bitcast def : BitConvert ; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 36480057b8194..c94b894c5841f 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -376,7 +376,7 @@ def M0_CLASS : SIRegisterClass<"AMDGPU", [i32], 32, (add M0)> { let HasSGPR = 1; } -def M0_CLASS_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add M0_LO16)> { +def M0_CLASS_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16, (add M0_LO16)> { let CopyCost = 1; let Size = 16; let isAllocatable = 0; @@ -385,7 +385,7 @@ def M0_CLASS_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add M0_LO16)> { // TODO: Do we need to set DwarfRegAlias on register tuples? -def SGPR_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, +def SGPR_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16, (add (sequence "SGPR%u_LO16", 0, 105))> { let AllocationPriority = 0; let Size = 16; @@ -393,7 +393,7 @@ def SGPR_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, let HasSGPR = 1; } -def SGPR_HI16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, +def SGPR_HI16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16, (add (sequence "SGPR%u_HI16", 0, 105))> { let isAllocatable = 0; let Size = 16; @@ -402,7 +402,7 @@ def SGPR_HI16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, } // SGPR 32-bit registers -def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, +def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32, (add (sequence "SGPR%u", 0, 105))> { // Give all SGPR classes higher priority than VGPR classes, because // we want to spill SGPRs to VGPRs. 
@@ -451,14 +451,14 @@ def SGPR_512Regs : SIRegisterTuples.ret, SGPR_32, 105, 4, 16, "s" def SGPR_1024Regs : SIRegisterTuples.ret, SGPR_32, 105, 4, 32, "s">; // Trap handler TMP 32-bit registers -def TTMP_32 : SIRegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32, +def TTMP_32 : SIRegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16, v2bf16], 32, (add (sequence "TTMP%u", 0, 15))> { let isAllocatable = 0; let HasSGPR = 1; } // Trap handler TMP 16-bit registers -def TTMP_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, +def TTMP_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16, (add (sequence "TTMP%u_LO16", 0, 15))> { let Size = 16; let isAllocatable = 0; @@ -584,8 +584,8 @@ class RegisterTypes reg_types> { list types = reg_types; } -def Reg16Types : RegisterTypes<[i16, f16]>; -def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, p2, p3, p5, p6]>; +def Reg16Types : RegisterTypes<[i16, f16, bf16]>; +def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, v2bf16, p2, p3, p5, p6]>; let HasVGPR = 1 in { // VOP3 and VINTERP can access 256 lo and 256 hi registers. @@ -683,7 +683,7 @@ def AGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, } // AccVGPR 32-bit registers -def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, +def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32, (add (sequence "AGPR%u", 0, 255))> { let AllocationPriority = 0; let Size = 32; @@ -735,7 +735,7 @@ def AGPR_1024 : SIRegisterTuples.ret, AGPR_32, 255, 1, 32, "a">; // Register classes used as source and destination //===----------------------------------------------------------------------===// -def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, +def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32, (add FP_REG, SP_REG)> { let isAllocatable = 0; let CopyCost = -1; @@ -743,7 +743,7 @@ def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16 let BaseClassOrder = 10000; } -def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64, v8i16, v8f16], 32, +def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64, v8i16, v8f16, v8bf16], 32, (add PRIVATE_RSRC_REG)> { let isAllocatable = 0; let CopyCost = -1; @@ -760,7 +760,7 @@ def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32], 32, let GeneratePressureSet = 0, HasSGPR = 1 in { // Subset of SReg_32 without M0 for SMRD instructions and alike. // See comments in SIInstructions.td for more info. 
-def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, +def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32, (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI, SGPR_NULL, SGPR_NULL_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE_LO, SRC_SHARED_LIMIT_LO, SRC_PRIVATE_BASE_LO, SRC_PRIVATE_LIMIT_LO, SRC_SHARED_BASE_HI, @@ -769,7 +769,7 @@ def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2 let AllocationPriority = 0; } -def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, +def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16, (add SGPR_LO16, VCC_LO_LO16, VCC_HI_LO16, FLAT_SCR_LO_LO16, FLAT_SCR_HI_LO16, XNACK_MASK_LO_LO16, XNACK_MASK_HI_LO16, SGPR_NULL_LO16, SGPR_NULL_HI_LO16, TTMP_LO16, TMA_LO_LO16, TMA_HI_LO16, TBA_LO_LO16, TBA_HI_LO16, SRC_SHARED_BASE_LO_LO16, @@ -782,17 +782,17 @@ def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, let BaseClassOrder = 16; } -def SReg_32_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, +def SReg_32_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32, (add SReg_32_XM0_XEXEC, M0_CLASS)> { let AllocationPriority = 0; } -def SReg_32_XEXEC_HI : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, +def SReg_32_XEXEC_HI : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32, (add SReg_32_XEXEC, EXEC_LO)> { let AllocationPriority = 0; } -def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, +def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32, (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> { let AllocationPriority = 0; } @@ -800,7 +800,7 @@ def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i } // End GeneratePressureSet = 0 // Register class for all scalar registers (SGPRs + Special Registers) -def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, +def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32, (add SReg_32_XM0, M0_CLASS)> { let AllocationPriority = 0; let HasSGPR = 1; @@ -808,13 +808,13 @@ def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], } let GeneratePressureSet = 0 in { -def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, +def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32, (add SReg_32, LDS_DIRECT_CLASS)> { let isAllocatable = 0; let HasSGPR = 1; } -def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32, +def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16, v4bf16], 32, (add SGPR_64Regs)> { let CopyCost = 1; let AllocationPriority = 1; @@ -836,13 +836,13 @@ def Gfx_CCR_SGPR_64 : SIRegisterClass<"AMDGPU", SGPR_64.RegTypes, 32, let HasSGPR = 1; } -def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, +def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16, v4bf16], 32, (add TTMP_64Regs)> { let isAllocatable = 0; let HasSGPR = 1; } -def SReg_64_XEXEC : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, +def SReg_64_XEXEC : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16, v4bf16], 32, (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, 
SGPR_NULL64, SRC_SHARED_BASE, SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, TTMP_64, TBA, TMA)> { let CopyCost = 1; @@ -850,7 +850,7 @@ def SReg_64_XEXEC : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16 let HasSGPR = 1; } -def SReg_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, +def SReg_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16, v4bf16], 32, (add SReg_64_XEXEC, EXEC)> { let CopyCost = 1; let AllocationPriority = 1; @@ -905,11 +905,11 @@ multiclass SRegClass; -defm "" : SRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], SGPR_128Regs, TTMP_128Regs>; +defm "" : SRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], SGPR_128Regs, TTMP_128Regs>; defm "" : SRegClass<5, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>; defm "" : SRegClass<6, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>; defm "" : SRegClass<7, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>; -defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], SGPR_256Regs, TTMP_256Regs>; +defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16, v16bf16], SGPR_256Regs, TTMP_256Regs>; defm "" : SRegClass<9, [v9i32, v9f32], SGPR_288Regs, TTMP_288Regs>; defm "" : SRegClass<10, [v10i32, v10f32], SGPR_320Regs, TTMP_320Regs>; defm "" : SRegClass<11, [v11i32, v11f32], SGPR_352Regs, TTMP_352Regs>; @@ -920,7 +920,7 @@ defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16], SGPR_512 defm "" : SRegClass<32, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>; } -def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, +def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32, (add VGPR_32, LDS_DIRECT_CLASS)> { let isAllocatable = 0; let HasVGPR = 1; @@ -955,15 +955,15 @@ multiclass VRegClass regTypes, dag regList> { } } -defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4], +defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4bf16, v4i16, p0, p1, p4], (add VGPR_64)>; defm VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>; -defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], (add VGPR_128)>; +defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], (add VGPR_128)>; defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>; defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>; defm VReg_224 : VRegClass<7, [v7i32, v7f32], (add VGPR_224)>; -defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], (add VGPR_256)>; +defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16, v16bf16], (add VGPR_256)>; defm VReg_288 : VRegClass<9, [v9i32, v9f32], (add VGPR_288)>; defm VReg_320 : VRegClass<10, [v10i32, v10f32], (add VGPR_320)>; defm VReg_352 : VRegClass<11, [v11i32, v11f32], (add VGPR_352)>; @@ -993,7 +993,7 @@ multiclass ARegClass regTypes, dag regList> { defm AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16], (add AGPR_64)>; defm AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>; -defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], (add AGPR_128)>; +defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], (add AGPR_128)>; defm AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>; defm AReg_192 : ARegClass<6, [v6i32, v6f32, v3i64, v3f64], (add AGPR_192)>; defm AReg_224 : ARegClass<7, [v7i32, v7f32], (add AGPR_224)>; @@ -1032,14 +1032,14 @@ def 
VS_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, let HasVGPR = 1; } -def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, +def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32, (add VGPR_32, SReg_32, LDS_DIRECT_CLASS)> { let isAllocatable = 0; let HasVGPR = 1; let HasSGPR = 1; } -def VS_32_Lo128 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, +def VS_32_Lo128 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32, (add VGPR_32_Lo128, SReg_32, LDS_DIRECT_CLASS)> { let isAllocatable = 0; let HasVGPR = 1; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index d3cefb339d9e7..7f52501b5d903 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -190,9 +190,9 @@ multiclass MadFmaMixPats; def : GCNPat < - (build_vector f16:$elt0, (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))), + (build_vector f16:$elt0, (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers)))))), (v2f16 (mixhi_inst $src0_modifiers, $src0, $src1_modifiers, $src1, (i32 0), (i32 0), From 5842dfe34d89d9cc664d14f511ecb415d6609237 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 2 Jan 2024 12:06:35 +0000 Subject: [PATCH 021/313] [CostModel][X86] Update SSSE3/AVX1 BSWAP costs Drop atom/slm costs from the default bswap costs, and update the avx1 latency costs based off latest codegen. Based off analysis report from https://github.com/RKSimon/llvm-scripts/check_cost_tables.py Fixes #62659 --- .../lib/Target/X86/X86TargetTransformInfo.cpp | 17 ++++--- .../Analysis/CostModel/X86/bswap-latency.ll | 48 +++++++++---------- llvm/test/Analysis/CostModel/X86/bswap.ll | 36 +++++++------- 3 files changed, 52 insertions(+), 49 deletions(-) diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index e09dc7ff02a07..49631f38017a1 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -3733,10 +3733,10 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } }, { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } }, - { ISD::BSWAP, MVT::v4i64, { 5, 7, 5, 10 } }, - { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 3 } }, - { ISD::BSWAP, MVT::v8i32, { 5, 7, 5, 10 } }, - { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 3 } }, + { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } }, + { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } }, + { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } }, + { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } }, { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } }, { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } }, { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert @@ -3813,6 +3813,9 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd }; static const CostKindTblEntry SLMCostTbl[] = { + { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } }, + { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } }, + { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } }, { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps { ISD::FSQRT, 
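// Each entry's four cost values correspond to the four TTI cost kinds, in the
// order { reciprocal throughput, latency, code size, size-and-latency }.
// The BSWAP retuning in this commit affects the throughput and latency
// columns, which the updated bswap.ll and bswap-latency.ll cost-model tests
// check, respectively.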
MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd @@ -3851,9 +3854,9 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } }, { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } }, { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } }, - { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } }, - { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } }, - { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } }, + { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } }, + { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } }, + { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } }, { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } }, { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } }, { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } }, diff --git a/llvm/test/Analysis/CostModel/X86/bswap-latency.ll b/llvm/test/Analysis/CostModel/X86/bswap-latency.ll index 2d4e660442581..592541f4816f2 100644 --- a/llvm/test/Analysis/CostModel/X86/bswap-latency.ll +++ b/llvm/test/Analysis/CostModel/X86/bswap-latency.ll @@ -24,23 +24,23 @@ define void @cost_bswap_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64 ; ; SSSE3-LABEL: 'cost_bswap_i64' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.bswap.i64(i64 %a64) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSE42-LABEL: 'cost_bswap_i64' ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.bswap.i64(i64 %a64) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128) -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256) -; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512) ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 'cost_bswap_i64' ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.bswap.i64(i64 %a64) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256) -; AVX1-NEXT: Cost 
Model: Found an estimated cost of 14 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'cost_bswap_i64' @@ -95,23 +95,23 @@ define void @cost_bswap_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3 ; ; SSSE3-LABEL: 'cost_bswap_i32' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.bswap.i32(i32 %a32) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSE42-LABEL: 'cost_bswap_i32' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.bswap.i32(i32 %a32) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128) -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256) -; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512) ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 'cost_bswap_i32' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.bswap.i32(i32 %a32) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256) -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> 
@llvm.bswap.v8i32(<8 x i32> %a256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'cost_bswap_i32' @@ -166,16 +166,16 @@ define void @cost_bswap_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; ; SSSE3-LABEL: 'cost_bswap_i16' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.bswap.i16(i16 %a16) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a256) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a256) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSE42-LABEL: 'cost_bswap_i16' ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.bswap.i16(i16 %a16) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128) -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a256) -; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a256) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512) ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 'cost_bswap_i16' diff --git a/llvm/test/Analysis/CostModel/X86/bswap.ll b/llvm/test/Analysis/CostModel/X86/bswap.ll index 1c1497965e839..d1c7daa04cfa6 100644 --- a/llvm/test/Analysis/CostModel/X86/bswap.ll +++ b/llvm/test/Analysis/CostModel/X86/bswap.ll @@ -24,16 +24,16 @@ define void @cost_bswap_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64 ; ; SSSE3-LABEL: 'cost_bswap_i64' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.bswap.i64(i64 %a64) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128) +; SSSE3-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE42-LABEL: 'cost_bswap_i64' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.bswap.i64(i64 %a64) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128) -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256) -; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'cost_bswap_i64' @@ -95,16 +95,16 @@ define void @cost_bswap_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3 ; ; SSSE3-LABEL: 'cost_bswap_i32' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.bswap.i32(i32 %a32) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE42-LABEL: 'cost_bswap_i32' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.bswap.i32(i32 %a32) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128) -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256) -; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 
'cost_bswap_i32' @@ -166,16 +166,16 @@ define void @cost_bswap_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; ; SSSE3-LABEL: 'cost_bswap_i16' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.bswap.i16(i16 %a16) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a256) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a256) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE42-LABEL: 'cost_bswap_i16' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.bswap.i16(i16 %a16) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128) -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a256) -; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a256) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'cost_bswap_i16' From cf025c767ebc4c505e21d268d65f3a1b1cbc25ce Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 2 Jan 2024 13:02:20 +0000 Subject: [PATCH 022/313] [AMDGPU] GFX12 global_atomic_ordered_add_b64 instruction and intrinsic (#76149) --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 10 ++- llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 1 + .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 1 + .../Target/AMDGPU/AMDGPUSearchableTables.td | 1 + llvm/lib/Target/AMDGPU/FLATInstructions.td | 11 +++- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 1 + ...vm.amdgcn.global.atomic.ordered.add.b64.ll | 65 +++++++++++++++++++ llvm/test/MC/AMDGPU/gfx11_unsupported.s | 3 + llvm/test/MC/AMDGPU/gfx12_asm_vflat.s | 24 +++++++ .../Disassembler/AMDGPU/gfx12_dasm_vflat.txt | 12 ++++ 10 files changed, 124 insertions(+), 5 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index cb48f54b13a6c..531b111235452 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -10,6 +10,8 @@ // //===----------------------------------------------------------------------===// +def global_ptr_ty : LLVMQualPointerType<1>; + class AMDGPUReadPreloadRegisterIntrinsic : 
DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>; @@ -2353,10 +2355,10 @@ def int_amdgcn_s_get_waveid_in_workgroup : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; -class AMDGPUAtomicRtn : Intrinsic < +class AMDGPUAtomicRtn : Intrinsic < [vt], - [llvm_anyptr_ty, // vaddr - vt], // vdata(VGPR) + [pt, // vaddr + vt], // vdata(VGPR) [IntrArgMemOnly, IntrWillReturn, NoCapture>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>; @@ -2486,6 +2488,8 @@ def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var" [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg>, ImmArg>, IntrNoCallback, IntrNoFree]>; +def int_amdgcn_global_atomic_ordered_add_b64 : AMDGPUAtomicRtn; + def int_amdgcn_flat_atomic_fmin_num : AMDGPUAtomicRtn; def int_amdgcn_flat_atomic_fmax_num : AMDGPUAtomicRtn; def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index eaf72d7157ee2..36e07d944c942 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -642,6 +642,7 @@ defm int_amdgcn_global_atomic_fmax : noret_op; defm int_amdgcn_global_atomic_csub : noret_op; defm int_amdgcn_flat_atomic_fadd : local_addr_space_atomic_op; defm int_amdgcn_ds_fadd_v2bf16 : noret_op; +defm int_amdgcn_global_atomic_ordered_add_b64 : noret_op; defm int_amdgcn_flat_atomic_fmin_num : noret_op; defm int_amdgcn_flat_atomic_fmax_num : noret_op; defm int_amdgcn_global_atomic_fmin_num : noret_op; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index c9412f720c62e..fba060464a6e7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4690,6 +4690,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_flat_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: + case Intrinsic::amdgcn_global_atomic_ordered_add_b64: return getDefaultMappingAllVGPR(MI); case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index beb670669581f..4cc8871a00fe1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -243,6 +243,7 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 0dd2b3f5c2c91..615f8cd54d8f9 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -926,9 +926,11 @@ defm GLOBAL_LOAD_LDS_USHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_usho defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sshort">; defm GLOBAL_LOAD_LDS_DWORD : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dword">; -} // End is_flat_global = 1 - +let SubtargetPredicate = isGFX12Plus in { + defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : FLAT_Global_Atomic_Pseudo <"global_atomic_ordered_add_b64", VReg_64, i64>; +} // End SubtargetPredicate = isGFX12Plus +} // End 
is_flat_global = 1 let SubtargetPredicate = HasFlatScratchInsts in { defm SCRATCH_LOAD_UBYTE : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte", VGPR_32>; @@ -1529,6 +1531,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP_X2", "atomic_swap_global", i64> defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_global", i64, v2i64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i64>; +let OtherPredicates = [isGFX12Plus] in { + defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ORDERED_ADD_B64", "int_amdgcn_global_atomic_ordered_add_b64", i64, i64, /* isIntr */ 1>; +} + let OtherPredicates = [isGFX10Plus] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>; @@ -2654,6 +2660,7 @@ defm GLOBAL_ATOMIC_DEC_U64 : VGLOBAL_Real_Atomics_gfx12<0x04d, "GLOBAL_A defm GLOBAL_ATOMIC_MIN_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x051, "GLOBAL_ATOMIC_FMIN", "global_atomic_min_num_f32", true, "global_atomic_min_f32">; defm GLOBAL_ATOMIC_MAX_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x052, "GLOBAL_ATOMIC_FMAX", "global_atomic_max_num_f32", true, "global_atomic_max_f32">; defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056, "GLOBAL_ATOMIC_ADD_F32", "global_atomic_add_f32">; +defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VGLOBAL_Real_Atomics_gfx12<0x073, "GLOBAL_ATOMIC_ORDERED_ADD_B64", "global_atomic_ordered_add_b64">; // ENC_VSCRATCH. defm SCRATCH_LOAD_U8 : VSCRATCH_Real_AllAddr_gfx12<0x10, "SCRATCH_LOAD_UBYTE", "scratch_load_u8", true>; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index f3547db9e9bd9..f1002c738565d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1240,6 +1240,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::amdgcn_global_atomic_fmax: case Intrinsic::amdgcn_global_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_fmax_num: + case Intrinsic::amdgcn_global_atomic_ordered_add_b64: case Intrinsic::amdgcn_flat_atomic_fadd: case Intrinsic::amdgcn_flat_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmax: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll new file mode 100644 index 0000000000000..6a6c5b33e0dd8 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll @@ -0,0 +1,65 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-SDAG %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-GISEL %s + +declare i64 @llvm.amdgcn.global.atomic.ordered.add.b64(ptr addrspace(1), i64) + +define amdgpu_kernel void @global_atomic_ordered_add_b64_no_rtn(ptr addrspace(1) %addr, i64 %in) { +; GFX12-SDAG-LABEL: global_atomic_ordered_add_b64_no_rtn: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-SDAG-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[0:1] offset:-32 th:TH_ATOMIC_RETURN +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: 
global_atomic_ordered_add_b64_no_rtn: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[0:1] offset:-32 th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 -4 + %unused = call i64 @llvm.amdgcn.global.atomic.ordered.add.b64(ptr addrspace(1) %gep, i64 %in) + ret void +} + +define amdgpu_kernel void @global_atomic_ordered_add_b64_rtn(ptr addrspace(1) %addr, i64 %in, ptr addrspace(1) %use) { +; GFX12-SDAG-LABEL: global_atomic_ordered_add_b64_rtn: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 +; GFX12-SDAG-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: global_atomic_ordered_add_b64_rtn: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call i64 @llvm.amdgcn.global.atomic.ordered.add.b64(ptr addrspace(1) %gep, i64 %in) + store i64 %val, ptr addrspace(1) %use + ret void +} diff --git a/llvm/test/MC/AMDGPU/gfx11_unsupported.s b/llvm/test/MC/AMDGPU/gfx11_unsupported.s index 89078c1ad4e04..e01eb05e85588 100644 --- a/llvm/test/MC/AMDGPU/gfx11_unsupported.s +++ b/llvm/test/MC/AMDGPU/gfx11_unsupported.s @@ -2013,3 +2013,6 @@ ds_sub_clamp_rtn_u32 v5, v1, v2 ds_sub_clamp_u32 v1, v2 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_atomic_ordered_add_b64 v0, v[2:3], s[0:1] offset:64 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s b/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s index c0ffc5247d90e..95d352b421a28 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s @@ -1266,6 +1266,30 @@ global_atomic_or_b64 v[1:2], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN global_atomic_or_b64 v[1:2], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN // GFX12: encoding: [0x7c,0x80,0x12,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +global_atomic_ordered_add_b64 v0, v[2:3], s[0:1] offset:-64 +// GFX12: encoding: [0x00,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] + +global_atomic_ordered_add_b64 v0, v[2:3], s[0:1] offset:64 +// GFX12: encoding: 
[0x00,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] + +global_atomic_ordered_add_b64 v[0:1], v[2:3], off offset:-64 +// GFX12: encoding: [0x7c,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] + +global_atomic_ordered_add_b64 v[0:1], v[2:3], off offset:64 +// GFX12: encoding: [0x7c,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] + +global_atomic_ordered_add_b64 v[1:2], v0, v[2:3], s[0:1] offset:-64 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x00,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] + +global_atomic_ordered_add_b64 v[1:2], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x00,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] + +global_atomic_ordered_add_b64 v[1:2], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x7c,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] + +global_atomic_ordered_add_b64 v[1:2], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x7c,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] + global_atomic_sub_u32 v0, v2, s[0:1] offset:-64 // GFX12: encoding: [0x00,0x80,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt index d7f9daf295845..f4038cf10f50d 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt @@ -837,6 +837,18 @@ # GFX12: global_atomic_xor_b64 v[1:2], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x12,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] 0x7c,0xc0,0x12,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 +# GFX12: global_atomic_ordered_add_b64 v0, v[2:3], s[0:1] offset:64 ; encoding: [0x00,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX12: global_atomic_ordered_add_b64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX12: global_atomic_ordered_add_b64 v[1:2], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX12: global_atomic_ordered_add_b64 v[1:2], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + # GFX12: global_load_addtid_b32 v1, off offset:64 ; encoding: [0x7c,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] 0x7c,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 From 80aeb622117f6411e65bff42be5231720e8b8cef Mon Sep 17 00:00:00 2001 From: Alex Bradbury Date: Tue, 2 Jan 2024 13:14:28 +0000 Subject: [PATCH 023/313] [llvm][NFC] Use SDValue::getConstantOperandVal(i) where possible (#76708) This helper function shortens examples like `cast(Node->getOperand(1))->getZExtValue();` to `Node->getConstantOperandVal(1);`. Implemented with: `git grep -l "cast\(.*->getOperand\(.*\)\)->getZExtValue\(\)" | xargs sed -E -i 's/cast\((.*)->getOperand\((.*)\)\)->getZExtValue\(\)/\1->getConstantOperandVal(\2)/` and `git grep -l "cast\(.*\.getOperand\(.*\)\)->getZExtValue\(\)" | xargs sed -E -i 's/cast\((.*)\.getOperand\((.*)\)\)->getZExtValue\(\)/\1.getConstantOperandVal(\2)/'`. With a couple of simple manual fixes needed. 
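For illustration, a hand-written sketch of the before/after pattern (not an
excerpt from this patch; the variable name and operand index are placeholders):

    // Before: cast the operand to ConstantSDNode by hand and read its value.
    unsigned IntNo =
        cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();

    // After: the getConstantOperandVal helper does the cast and extraction.
    unsigned IntNo = Node->getConstantOperandVal(1);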
Result then processed by `git clang-format`. --- .../lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 9 +- .../CodeGen/SelectionDAG/ScheduleDAGFast.cpp | 3 +- .../SelectionDAG/ScheduleDAGRRList.cpp | 8 +- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 2 +- .../CodeGen/SelectionDAG/SelectionDAGISel.cpp | 3 +- .../Target/AArch64/AArch64ISelDAGToDAG.cpp | 39 +++-- .../Target/AArch64/AArch64ISelLowering.cpp | 46 +++--- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 8 +- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 18 ++- llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 6 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 55 ++++--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 4 +- llvm/lib/Target/ARC/ARCISelLowering.cpp | 2 +- llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp | 9 +- llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp | 28 ++-- llvm/lib/Target/ARM/ARMISelLowering.cpp | 65 ++++----- llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp | 2 +- llvm/lib/Target/AVR/AVRISelLowering.cpp | 5 +- llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp | 4 +- llvm/lib/Target/CSKY/CSKYISelLowering.cpp | 4 +- .../Target/Hexagon/HexagonISelDAGToDAG.cpp | 34 ++--- .../Target/Hexagon/HexagonISelDAGToDAGHVX.cpp | 6 +- .../Target/Hexagon/HexagonISelLowering.cpp | 9 +- .../Target/Hexagon/HexagonISelLoweringHVX.cpp | 4 +- llvm/lib/Target/Lanai/LanaiISelLowering.cpp | 4 +- .../LoongArch/LoongArchISelLowering.cpp | 38 ++--- llvm/lib/Target/M68k/M68kISelLowering.cpp | 5 +- llvm/lib/Target/MSP430/MSP430ISelLowering.cpp | 6 +- llvm/lib/Target/Mips/MipsISelLowering.cpp | 4 +- llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp | 8 +- llvm/lib/Target/Mips/MipsSEISelLowering.cpp | 6 +- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 6 +- llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 5 +- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 134 ++++++++---------- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 6 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 8 +- llvm/lib/Target/Sparc/SparcISelLowering.cpp | 4 +- .../Target/SystemZ/SystemZISelLowering.cpp | 76 +++++----- llvm/lib/Target/SystemZ/SystemZOperators.td | 4 +- llvm/lib/Target/VE/VEISelLowering.cpp | 14 +- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 2 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 2 +- llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp | 2 +- llvm/lib/Target/XCore/XCoreISelLowering.cpp | 56 ++++---- 44 files changed, 356 insertions(+), 407 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index a27febe15db83..34fa1f5a7ed1f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -495,7 +495,7 @@ void InstrEmitter::EmitSubregNode(SDNode *Node, // EXTRACT_SUBREG is lowered as %dst = COPY %src:sub. There are no // constraints on the %dst register, COPY can target all legal register // classes. - unsigned SubIdx = cast(Node->getOperand(1))->getZExtValue(); + unsigned SubIdx = Node->getConstantOperandVal(1); const TargetRegisterClass *TRC = TLI->getRegClassFor(Node->getSimpleValueType(0), Node->isDivergent()); @@ -611,7 +611,7 @@ InstrEmitter::EmitCopyToRegClassNode(SDNode *Node, unsigned VReg = getVR(Node->getOperand(0), VRBaseMap); // Create the new VReg in the destination class and emit a copy. 
- unsigned DstRCIdx = cast(Node->getOperand(1))->getZExtValue(); + unsigned DstRCIdx = Node->getConstantOperandVal(1); const TargetRegisterClass *DstRC = TRI->getAllocatableClass(TRI->getRegClass(DstRCIdx)); Register NewVReg = MRI->createVirtualRegister(DstRC); @@ -629,7 +629,7 @@ InstrEmitter::EmitCopyToRegClassNode(SDNode *Node, void InstrEmitter::EmitRegSequence(SDNode *Node, DenseMap &VRBaseMap, bool IsClone, bool IsCloned) { - unsigned DstRCIdx = cast(Node->getOperand(0))->getZExtValue(); + unsigned DstRCIdx = Node->getConstantOperandVal(0); const TargetRegisterClass *RC = TRI->getRegClass(DstRCIdx); Register NewVReg = MRI->createVirtualRegister(TRI->getAllocatableClass(RC)); const MCInstrDesc &II = TII->get(TargetOpcode::REG_SEQUENCE); @@ -1309,8 +1309,7 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned, // Add all of the operand registers to the instruction. for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) { - unsigned Flags = - cast(Node->getOperand(i))->getZExtValue(); + unsigned Flags = Node->getConstantOperandVal(i); const InlineAsm::Flag F(Flags); const unsigned NumVals = F.getNumOperandRegisters(); diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp index f73ddfee2b90f..e3acb58327a8c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp @@ -492,8 +492,7 @@ bool ScheduleDAGFast::DelayForLiveRegsBottomUp(SUnit *SU, --NumOps; // Ignore the glue operand. for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) { - unsigned Flags = - cast(Node->getOperand(i))->getZExtValue(); + unsigned Flags = Node->getConstantOperandVal(i); const InlineAsm::Flag F(Flags); unsigned NumVals = F.getNumOperandRegisters(); diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index 47c137d2bcad7..dcecb2e0e7faf 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -331,7 +331,7 @@ static void GetCostForDef(const ScheduleDAGSDNodes::RegDefIter &RegDefPos, unsigned Opcode = Node->getMachineOpcode(); if (Opcode == TargetOpcode::REG_SEQUENCE) { - unsigned DstRCIdx = cast(Node->getOperand(0))->getZExtValue(); + unsigned DstRCIdx = Node->getConstantOperandVal(0); const TargetRegisterClass *RC = TRI->getRegClass(DstRCIdx); RegClass = RC->getID(); Cost = RegSequenceCost; @@ -1369,8 +1369,7 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl &LRegs) { --NumOps; // Ignore the glue operand. 
for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) { - unsigned Flags = - cast(Node->getOperand(i))->getZExtValue(); + unsigned Flags = Node->getConstantOperandVal(i); const InlineAsm::Flag F(Flags); unsigned NumVals = F.getNumOperandRegisters(); @@ -2298,8 +2297,7 @@ void RegReductionPQBase::unscheduledNode(SUnit *SU) { continue; } if (POpc == TargetOpcode::REG_SEQUENCE) { - unsigned DstRCIdx = - cast(PN->getOperand(0))->getZExtValue(); + unsigned DstRCIdx = PN->getConstantOperandVal(0); const TargetRegisterClass *RC = TRI->getRegClass(DstRCIdx); unsigned RCId = RC->getID(); // REG_SEQUENCE is untyped, so getRepRegClassCostFor could not be used diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 81facf92e55ae..eb4deb6306fd5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -7408,7 +7408,7 @@ static bool isMemSrcFromConstant(SDValue Src, ConstantDataArraySlice &Slice) { Src.getOperand(0).getOpcode() == ISD::GlobalAddress && Src.getOperand(1).getOpcode() == ISD::Constant) { G = cast(Src.getOperand(0)); - SrcDelta = cast(Src.getOperand(1))->getZExtValue(); + SrcDelta = Src.getConstantOperandVal(1); } if (!G) return false; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 3dc6e4bbcf46b..f28211ac113ca 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -4181,8 +4181,7 @@ void SelectionDAGISel::CannotYetSelect(SDNode *N) { Msg << "\nIn function: " << MF->getName(); } else { bool HasInputChain = N->getOperand(0).getValueType() == MVT::Other; - unsigned iid = - cast(N->getOperand(HasInputChain))->getZExtValue(); + unsigned iid = N->getConstantOperandVal(HasInputChain); if (iid < Intrinsic::num_intrinsics) Msg << "intrinsic %" << Intrinsic::getBaseName((Intrinsic::ID)iid); else if (const TargetIntrinsicInfo *TII = TM.getIntrinsicInfo()) diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 463ec41b94e97..476d99c2a7e04 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -1950,7 +1950,7 @@ void AArch64DAGToDAGISel::SelectMultiVectorMove(SDNode *N, unsigned NumVecs, unsigned BaseReg, unsigned Op) { unsigned TileNum = 0; if (BaseReg != AArch64::ZA) - TileNum = cast(N->getOperand(2))->getZExtValue(); + TileNum = N->getConstantOperandVal(2); if (!SelectSMETile(BaseReg, TileNum)) return; @@ -2145,8 +2145,7 @@ void AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs, const EVT ResTys[] = {MVT::Untyped, MVT::Other}; - unsigned LaneNo = - cast(N->getOperand(NumVecs + 2))->getZExtValue(); + unsigned LaneNo = N->getConstantOperandVal(NumVecs + 2); SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64), N->getOperand(NumVecs + 3), N->getOperand(0)}; @@ -2185,8 +2184,7 @@ void AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs, const EVT ResTys[] = {MVT::i64, // Type of the write back register RegSeq->getValueType(0), MVT::Other}; - unsigned LaneNo = - cast(N->getOperand(NumVecs + 1))->getZExtValue(); + unsigned LaneNo = N->getConstantOperandVal(NumVecs + 1); SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, @@ -2237,8 +2235,7 @@ void AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs, SDValue RegSeq = createQTuple(Regs); - unsigned 
LaneNo = - cast(N->getOperand(NumVecs + 2))->getZExtValue(); + unsigned LaneNo = N->getConstantOperandVal(NumVecs + 2); SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64), N->getOperand(NumVecs + 3), N->getOperand(0)}; @@ -2269,8 +2266,7 @@ void AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs, const EVT ResTys[] = {MVT::i64, // Type of the write back register MVT::Other}; - unsigned LaneNo = - cast(N->getOperand(NumVecs + 1))->getZExtValue(); + unsigned LaneNo = N->getConstantOperandVal(NumVecs + 1); SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64), N->getOperand(NumVecs + 2), // Base Register @@ -2576,8 +2572,8 @@ static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc, case AArch64::UBFMXri: Opc = NOpc; Opd0 = N->getOperand(0); - Immr = cast(N->getOperand(1).getNode())->getZExtValue(); - Imms = cast(N->getOperand(2).getNode())->getZExtValue(); + Immr = N->getConstantOperandVal(1); + Imms = N->getConstantOperandVal(2); return true; } // Unreachable @@ -3877,7 +3873,7 @@ bool AArch64DAGToDAGISel::tryWriteRegister(SDNode *N) { assert(isa(N->getOperand(2)) && "Expected a constant integer expression."); unsigned Reg = PMapper->Encoding; - uint64_t Immed = cast(N->getOperand(2))->getZExtValue(); + uint64_t Immed = N->getConstantOperandVal(2); CurDAG->SelectNodeTo( N, State, MVT::Other, CurDAG->getTargetConstant(Reg, DL, MVT::i32), CurDAG->getTargetConstant(Immed, DL, MVT::i16), N->getOperand(0)); @@ -4173,8 +4169,7 @@ bool AArch64DAGToDAGISel::trySelectStackSlotTagP(SDNode *N) { SDValue IRG_SP = N->getOperand(2); if (IRG_SP->getOpcode() != ISD::INTRINSIC_W_CHAIN || - cast(IRG_SP->getOperand(1))->getZExtValue() != - Intrinsic::aarch64_irg_sp) { + IRG_SP->getConstantOperandVal(1) != Intrinsic::aarch64_irg_sp) { return false; } @@ -4183,7 +4178,7 @@ bool AArch64DAGToDAGISel::trySelectStackSlotTagP(SDNode *N) { int FI = cast(N->getOperand(1))->getIndex(); SDValue FiOp = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); - int TagOffset = cast(N->getOperand(3))->getZExtValue(); + int TagOffset = N->getConstantOperandVal(3); SDNode *Out = CurDAG->getMachineNode( AArch64::TAGPstack, DL, MVT::i64, @@ -4203,7 +4198,7 @@ void AArch64DAGToDAGISel::SelectTagP(SDNode *N) { // General case for unrelated pointers in Op1 and Op2. SDLoc DL(N); - int TagOffset = cast(N->getOperand(3))->getZExtValue(); + int TagOffset = N->getConstantOperandVal(3); SDNode *N1 = CurDAG->getMachineNode(AArch64::SUBP, DL, MVT::i64, {N->getOperand(1), N->getOperand(2)}); SDNode *N2 = CurDAG->getMachineNode(AArch64::ADDXrr, DL, MVT::i64, @@ -4219,7 +4214,7 @@ bool AArch64DAGToDAGISel::trySelectCastFixedLengthToScalableVector(SDNode *N) { assert(N->getOpcode() == ISD::INSERT_SUBVECTOR && "Invalid Node!"); // Bail when not a "cast" like insert_subvector. - if (cast(N->getOperand(2))->getZExtValue() != 0) + if (N->getConstantOperandVal(2) != 0) return false; if (!N->getOperand(0).isUndef()) return false; @@ -4250,7 +4245,7 @@ bool AArch64DAGToDAGISel::trySelectCastScalableToFixedLengthVector(SDNode *N) { assert(N->getOpcode() == ISD::EXTRACT_SUBVECTOR && "Invalid Node!"); // Bail when not a "cast" like extract_subvector. - if (cast(N->getOperand(1))->getZExtValue() != 0) + if (N->getConstantOperandVal(1) != 0) return false; // Bail when normal isel can do the job. 
@@ -4422,7 +4417,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { return; } case ISD::INTRINSIC_W_CHAIN: { - unsigned IntNo = cast(Node->getOperand(1))->getZExtValue(); + unsigned IntNo = Node->getConstantOperandVal(1); switch (IntNo) { default: break; @@ -5179,7 +5174,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } } break; case ISD::INTRINSIC_WO_CHAIN: { - unsigned IntNo = cast(Node->getOperand(0))->getZExtValue(); + unsigned IntNo = Node->getConstantOperandVal(0); switch (IntNo) { default: break; @@ -5782,7 +5777,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; } case ISD::INTRINSIC_VOID: { - unsigned IntNo = cast(Node->getOperand(1))->getZExtValue(); + unsigned IntNo = Node->getConstantOperandVal(1); if (Node->getNumOperands() >= 3) VT = Node->getOperand(2)->getValueType(0); switch (IntNo) { @@ -6806,7 +6801,7 @@ static EVT getMemVTFromNode(LLVMContext &Ctx, SDNode *Root) { if (Opcode != ISD::INTRINSIC_VOID && Opcode != ISD::INTRINSIC_W_CHAIN) return EVT(); - switch (cast(Root->getOperand(1))->getZExtValue()) { + switch (Root->getConstantOperandVal(1)) { default: return EVT(); case Intrinsic::aarch64_sme_ldr: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 50658a855cfb3..26f013a18f382 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2196,7 +2196,7 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode( } case ISD::INTRINSIC_WO_CHAIN: case ISD::INTRINSIC_VOID: { - unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); + unsigned IntNo = Op.getConstantOperandVal(0); switch (IntNo) { default: break; @@ -3922,9 +3922,9 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { // 4: bool isDataCache static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); - unsigned IsWrite = cast(Op.getOperand(2))->getZExtValue(); - unsigned Locality = cast(Op.getOperand(3))->getZExtValue(); - unsigned IsData = cast(Op.getOperand(4))->getZExtValue(); + unsigned IsWrite = Op.getConstantOperandVal(2); + unsigned Locality = Op.getConstantOperandVal(3); + unsigned IsData = Op.getConstantOperandVal(4); bool IsStream = !Locality; // When the locality number is set @@ -4973,10 +4973,10 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op, SDValue Chain = Op.getOperand(0); SDValue Addr = Op.getOperand(2); - unsigned IsWrite = cast(Op.getOperand(3))->getZExtValue(); - unsigned Locality = cast(Op.getOperand(4))->getZExtValue(); - unsigned IsStream = cast(Op.getOperand(5))->getZExtValue(); - unsigned IsData = cast(Op.getOperand(6))->getZExtValue(); + unsigned IsWrite = Op.getConstantOperandVal(3); + unsigned Locality = Op.getConstantOperandVal(4); + unsigned IsStream = Op.getConstantOperandVal(5); + unsigned IsData = Op.getConstantOperandVal(6); unsigned PrfOp = (IsWrite << 4) | // Load/Store bit (!IsData << 3) | // IsDataCache bit (Locality << 1) | // Cache level bits @@ -5039,7 +5039,7 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { - unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); + unsigned IntNo = Op.getConstantOperandVal(0); SDLoc dl(Op); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. 
@@ -5218,8 +5218,7 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); case Intrinsic::aarch64_sve_ptrue: - return getPTrue(DAG, dl, Op.getValueType(), - cast(Op.getOperand(1))->getZExtValue()); + return getPTrue(DAG, dl, Op.getValueType(), Op.getConstantOperandVal(1)); case Intrinsic::aarch64_sve_clz: return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); @@ -6478,7 +6477,7 @@ static unsigned getIntrinsicID(const SDNode *N) { default: return Intrinsic::not_intrinsic; case ISD::INTRINSIC_WO_CHAIN: { - unsigned IID = cast(N->getOperand(0))->getZExtValue(); + unsigned IID = N->getConstantOperandVal(0); if (IID < Intrinsic::num_intrinsics) return IID; return Intrinsic::not_intrinsic; @@ -10009,7 +10008,7 @@ SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, EVT VT = Op.getValueType(); SDLoc DL(Op); - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + unsigned Depth = Op.getConstantOperandVal(0); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64); while (Depth--) @@ -10076,7 +10075,7 @@ SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, EVT VT = Op.getValueType(); SDLoc DL(Op); - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + unsigned Depth = Op.getConstantOperandVal(0); SDValue ReturnAddress; if (Depth) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); @@ -10942,7 +10941,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); // Update the minimum and maximum lane number seen. - unsigned EltNo = cast(V.getOperand(1))->getZExtValue(); + unsigned EltNo = V.getConstantOperandVal(1); Source->MinElt = std::min(Source->MinElt, EltNo); Source->MaxElt = std::max(Source->MaxElt, EltNo); } @@ -13329,7 +13328,7 @@ SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, "Only cases that extract a fixed length vector are supported!"); EVT InVT = Op.getOperand(0).getValueType(); - unsigned Idx = cast(Op.getOperand(1))->getZExtValue(); + unsigned Idx = Op.getConstantOperandVal(1); unsigned Size = Op.getValueSizeInBits(); // If we don't have legal types yet, do nothing @@ -13375,7 +13374,7 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, "Only expect to lower inserts into scalable vectors!"); EVT InVT = Op.getOperand(1).getValueType(); - unsigned Idx = cast(Op.getOperand(2))->getZExtValue(); + unsigned Idx = Op.getConstantOperandVal(2); SDValue Vec0 = Op.getOperand(0); SDValue Vec1 = Op.getOperand(1); @@ -18398,8 +18397,8 @@ static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) { // TODO: we want the operands of the Cmp not the csel SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3); SetCCInfo.IsAArch64 = true; - SetCCInfo.Info.AArch64.CC = static_cast( - cast(Op.getOperand(2))->getZExtValue()); + SetCCInfo.Info.AArch64.CC = + static_cast(Op.getConstantOperandVal(2)); // Check that the operands matches the constraints: // (1) Both operands must be constants. 
@@ -21584,7 +21583,7 @@ static SDValue performNEONPostLDSTCombine(SDNode *N, bool IsDupOp = false; unsigned NewOpc = 0; unsigned NumVecs = 0; - unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); + unsigned IntNo = N->getConstantOperandVal(1); switch (IntNo) { default: llvm_unreachable("unexpected intrinsic for Neon base update"); case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post; @@ -22500,7 +22499,7 @@ static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, static SDValue performTBZCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { - unsigned Bit = cast(N->getOperand(2))->getZExtValue(); + unsigned Bit = N->getConstantOperandVal(2); bool Invert = false; SDValue TestSrc = N->getOperand(1); SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG); @@ -23788,7 +23787,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performMULLCombine(N, DCI, DAG); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: - switch (cast(N->getOperand(1))->getZExtValue()) { + switch (N->getConstantOperandVal(1)) { case Intrinsic::aarch64_sve_prfb_gather_scalar_offset: return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/); case Intrinsic::aarch64_sve_prfh_gather_scalar_offset: @@ -23939,8 +23938,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED); case Intrinsic::aarch64_rndr: case Intrinsic::aarch64_rndrrs: { - unsigned IntrinsicID = - cast(N->getOperand(1))->getZExtValue(); + unsigned IntrinsicID = N->getConstantOperandVal(1); auto Register = (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR : AArch64SysReg::RNDRRS); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index b0eac567ec9f1..28604af2cdb3c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -377,7 +377,7 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, return Subtarget->getRegisterInfo()->getRegClass(RegClass); } case AMDGPU::REG_SEQUENCE: { - unsigned RCID = cast(N->getOperand(0))->getZExtValue(); + unsigned RCID = N->getConstantOperandVal(0); const TargetRegisterClass *SuperRC = Subtarget->getRegisterInfo()->getRegClass(RCID); @@ -2672,7 +2672,7 @@ void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) { } void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) { - unsigned IntrID = cast(N->getOperand(1))->getZExtValue(); + unsigned IntrID = N->getConstantOperandVal(1); switch (IntrID) { case Intrinsic::amdgcn_ds_append: case Intrinsic::amdgcn_ds_consume: { @@ -2690,7 +2690,7 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) { } void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) { - unsigned IntrID = cast(N->getOperand(0))->getZExtValue(); + unsigned IntrID = N->getConstantOperandVal(0); unsigned Opcode; switch (IntrID) { case Intrinsic::amdgcn_wqm: @@ -2731,7 +2731,7 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) { } void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) { - unsigned IntrID = cast(N->getOperand(1))->getZExtValue(); + unsigned IntrID = N->getConstantOperandVal(1); switch (IntrID) { case Intrinsic::amdgcn_ds_gws_init: case Intrinsic::amdgcn_ds_gws_barrier: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 541a5b62450dd..8fbc90a6db9fd 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -682,7 +682,7 @@ static bool hasSourceMods(const SDNode *N) { case ISD::BITCAST: return false; case ISD::INTRINSIC_WO_CHAIN: { - switch (cast(N->getOperand(0))->getZExtValue()) { + switch (N->getConstantOperandVal(0)) { case Intrinsic::amdgcn_interp_p1: case Intrinsic::amdgcn_interp_p2: case Intrinsic::amdgcn_interp_mov: @@ -837,7 +837,7 @@ bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const { case ISD::TokenFactor: return true; case ISD::INTRINSIC_WO_CHAIN: { - unsigned IntrID = cast(N->getOperand(0))->getZExtValue(); + unsigned IntrID = N->getConstantOperandVal(0); switch (IntrID) { case Intrinsic::amdgcn_readfirstlane: case Intrinsic::amdgcn_readlane: @@ -1489,7 +1489,7 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SmallVector Args; - unsigned Start = cast(Op.getOperand(1))->getZExtValue(); + unsigned Start = Op.getConstantOperandVal(1); EVT VT = Op.getValueType(); EVT SrcVT = Op.getOperand(0).getValueType(); @@ -2502,8 +2502,7 @@ static bool valueIsKnownNeverF32Denorm(SDValue Src) { case ISD::FFREXP: return true; case ISD::INTRINSIC_WO_CHAIN: { - unsigned IntrinsicID = - cast(Src.getOperand(0))->getZExtValue(); + unsigned IntrinsicID = Src.getConstantOperandVal(0); switch (IntrinsicID) { case Intrinsic::amdgcn_frexp_mant: return true; @@ -3601,7 +3600,7 @@ static SDValue simplifyMul24(SDNode *Node24, SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1); unsigned NewOpcode = Node24->getOpcode(); if (IsIntrin) { - unsigned IID = cast(Node24->getOperand(0))->getZExtValue(); + unsigned IID = Node24->getConstantOperandVal(0); switch (IID) { case Intrinsic::amdgcn_mul_i24: NewOpcode = AMDGPUISD::MUL_I24; @@ -3821,7 +3820,7 @@ SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N, SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine( SDNode *N, DAGCombinerInfo &DCI) const { - unsigned IID = cast(N->getOperand(0))->getZExtValue(); + unsigned IID = N->getConstantOperandVal(0); switch (IID) { case Intrinsic::amdgcn_mul_i24: case Intrinsic::amdgcn_mul_u24: @@ -5652,7 +5651,7 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( break; } case ISD::INTRINSIC_WO_CHAIN: { - unsigned IID = cast(Op.getOperand(0))->getZExtValue(); + unsigned IID = Op.getConstantOperandVal(0); switch (IID) { case Intrinsic::amdgcn_workitem_id_x: case Intrinsic::amdgcn_workitem_id_y: @@ -5834,8 +5833,7 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, return SNaN; } case ISD::INTRINSIC_WO_CHAIN: { - unsigned IntrinsicID - = cast(Op.getOperand(0))->getZExtValue(); + unsigned IntrinsicID = Op.getConstantOperandVal(0); // TODO: Handle more intrinsics switch (IntrinsicID) { case Intrinsic::amdgcn_cubeid: diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index c1ba9c514874e..9a2fb0bc37b2c 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -424,8 +424,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const return lowerADDRSPACECAST(Op, DAG); case ISD::INTRINSIC_VOID: { SDValue Chain = Op.getOperand(0); - unsigned IntrinsicID = - cast(Op.getOperand(1))->getZExtValue(); + unsigned IntrinsicID = Op.getConstantOperandVal(1); switch (IntrinsicID) { case Intrinsic::r600_store_swizzle: { SDLoc DL(Op); @@ -449,8 +448,7 @@ SDValue 
R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const break; } case ISD::INTRINSIC_WO_CHAIN: { - unsigned IntrinsicID = - cast(Op.getOperand(0))->getZExtValue(); + unsigned IntrinsicID = Op.getConstantOperandVal(0); EVT VT = Op.getValueType(); SDLoc DL(Op); switch (IntrinsicID) { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index f1002c738565d..0e857e6ac71b6 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5389,7 +5389,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return SDValue(); // Get the rounding mode from the last operand - int RoundMode = cast(Op.getOperand(1))->getZExtValue(); + int RoundMode = Op.getConstantOperandVal(1); if (RoundMode == (int)RoundingMode::TowardPositive) Opc = AMDGPUISD::FPTRUNC_ROUND_UPWARD; else if (RoundMode == (int)RoundingMode::TowardNegative) @@ -5699,7 +5699,7 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, return; } case ISD::INTRINSIC_WO_CHAIN: { - unsigned IID = cast(N->getOperand(0))->getZExtValue(); + unsigned IID = N->getConstantOperandVal(0); switch (IID) { case Intrinsic::amdgcn_make_buffer_rsrc: Results.push_back(lowerPointerAsRsrcIntrin(N, DAG)); @@ -5837,7 +5837,7 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) { unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const { if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) { - switch (cast(Intr->getOperand(1))->getZExtValue()) { + switch (Intr->getConstantOperandVal(1)) { case Intrinsic::amdgcn_if: return AMDGPUISD::IF; case Intrinsic::amdgcn_else: @@ -5986,7 +5986,7 @@ SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, MVT VT = Op.getSimpleValueType(); SDLoc DL(Op); // Checking the depth - if (cast(Op.getOperand(0))->getZExtValue() != 0) + if (Op.getConstantOperandVal(0) != 0) return DAG.getConstant(0, DL, VT); MachineFunction &MF = DAG.getMachineFunction(); @@ -7635,7 +7635,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, EVT VT = Op.getValueType(); SDLoc DL(Op); - unsigned IntrinsicID = cast(Op.getOperand(0))->getZExtValue(); + unsigned IntrinsicID = Op.getConstantOperandVal(0); // TODO: Should this propagate fast-math-flags? @@ -7789,7 +7789,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getConstant(MF.getSubtarget().getWavefrontSize(), SDLoc(Op), MVT::i32); case Intrinsic::amdgcn_s_buffer_load: { - unsigned CPol = cast(Op.getOperand(3))->getZExtValue(); + unsigned CPol = Op.getConstantOperandVal(3); if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12) ? 
AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12)) @@ -8039,7 +8039,7 @@ SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG, SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { - unsigned IntrID = cast(Op.getOperand(1))->getZExtValue(); + unsigned IntrID = Op.getConstantOperandVal(1); SDLoc DL(Op); switch (IntrID) { @@ -8135,8 +8135,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, } case Intrinsic::amdgcn_buffer_load: case Intrinsic::amdgcn_buffer_load_format: { - unsigned Glc = cast(Op.getOperand(5))->getZExtValue(); - unsigned Slc = cast(Op.getOperand(6))->getZExtValue(); + unsigned Glc = Op.getConstantOperandVal(5); + unsigned Slc = Op.getConstantOperandVal(6); unsigned IdxEn = getIdxEn(Op.getOperand(3)); SDValue Ops[] = { Op.getOperand(0), // Chain @@ -8224,10 +8224,10 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, EVT LoadVT = Op.getValueType(); auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget); - unsigned Dfmt = cast(Op.getOperand(7))->getZExtValue(); - unsigned Nfmt = cast(Op.getOperand(8))->getZExtValue(); - unsigned Glc = cast(Op.getOperand(9))->getZExtValue(); - unsigned Slc = cast(Op.getOperand(10))->getZExtValue(); + unsigned Dfmt = Op.getConstantOperandVal(7); + unsigned Nfmt = Op.getConstantOperandVal(8); + unsigned Glc = Op.getConstantOperandVal(9); + unsigned Slc = Op.getConstantOperandVal(10); unsigned IdxEn = getIdxEn(Op.getOperand(3)); SDValue Ops[] = { Op.getOperand(0), // Chain @@ -8314,7 +8314,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_buffer_atomic_or: case Intrinsic::amdgcn_buffer_atomic_xor: case Intrinsic::amdgcn_buffer_atomic_fadd: { - unsigned Slc = cast(Op.getOperand(6))->getZExtValue(); + unsigned Slc = Op.getConstantOperandVal(6); unsigned IdxEn = getIdxEn(Op.getOperand(4)); SDValue Ops[] = { Op.getOperand(0), // Chain @@ -8475,7 +8475,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC); case Intrinsic::amdgcn_buffer_atomic_cmpswap: { - unsigned Slc = cast(Op.getOperand(7))->getZExtValue(); + unsigned Slc = Op.getConstantOperandVal(7); unsigned IdxEn = getIdxEn(Op.getOperand(5)); SDValue Ops[] = { Op.getOperand(0), // Chain @@ -8879,7 +8879,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); SDValue Chain = Op.getOperand(0); - unsigned IntrinsicID = cast(Op.getOperand(1))->getZExtValue(); + unsigned IntrinsicID = Op.getConstantOperandVal(1); MachineFunction &MF = DAG.getMachineFunction(); switch (IntrinsicID) { @@ -8944,10 +8944,10 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); if (IsD16) VData = handleD16VData(VData, DAG); - unsigned Dfmt = cast(Op.getOperand(8))->getZExtValue(); - unsigned Nfmt = cast(Op.getOperand(9))->getZExtValue(); - unsigned Glc = cast(Op.getOperand(10))->getZExtValue(); - unsigned Slc = cast(Op.getOperand(11))->getZExtValue(); + unsigned Dfmt = Op.getConstantOperandVal(8); + unsigned Nfmt = Op.getConstantOperandVal(9); + unsigned Glc = Op.getConstantOperandVal(10); + unsigned Slc = Op.getConstantOperandVal(11); unsigned IdxEn = getIdxEn(Op.getOperand(4)); SDValue Ops[] = { Chain, @@ -9030,8 +9030,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); if (IsD16) VData = handleD16VData(VData, 
DAG); - unsigned Glc = cast(Op.getOperand(6))->getZExtValue(); - unsigned Slc = cast(Op.getOperand(7))->getZExtValue(); + unsigned Glc = Op.getConstantOperandVal(6); + unsigned Slc = Op.getConstantOperandVal(7); unsigned IdxEn = getIdxEn(Op.getOperand(4)); SDValue Ops[] = { Chain, @@ -12070,8 +12070,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op, return false; } case ISD::INTRINSIC_WO_CHAIN: { - unsigned IntrinsicID - = cast(Op.getOperand(0))->getZExtValue(); + unsigned IntrinsicID = Op.getConstantOperandVal(0); // TODO: Handle more intrinsics switch (IntrinsicID) { case Intrinsic::amdgcn_cvt_pkrtz: @@ -15009,7 +15008,7 @@ void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op, unsigned Opc = Op.getOpcode(); switch (Opc) { case ISD::INTRINSIC_WO_CHAIN: { - unsigned IID = cast(Op.getOperand(0))->getZExtValue(); + unsigned IID = Op.getConstantOperandVal(0); switch (IID) { case Intrinsic::amdgcn_mbcnt_lo: case Intrinsic::amdgcn_mbcnt_hi: { @@ -15252,11 +15251,9 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N, case ISD::CALLSEQ_END: return true; case ISD::INTRINSIC_WO_CHAIN: - return AMDGPU::isIntrinsicSourceOfDivergence( - cast(N->getOperand(0))->getZExtValue()); + return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0)); case ISD::INTRINSIC_W_CHAIN: - return AMDGPU::isIntrinsicSourceOfDivergence( - cast(N->getOperand(1))->getZExtValue()); + return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1)); case AMDGPUISD::ATOMIC_CMP_SWAP: case AMDGPUISD::ATOMIC_LOAD_FMIN: case AMDGPUISD::ATOMIC_LOAD_FMAX: diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 2fb3957a1ca9d..aae6f2e842fd1 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -273,8 +273,8 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, // subtract the index by one. 
Offset0Idx -= get(Opc0).NumDefs; Offset1Idx -= get(Opc1).NumDefs; - Offset0 = cast(Load0->getOperand(Offset0Idx))->getZExtValue(); - Offset1 = cast(Load1->getOperand(Offset1Idx))->getZExtValue(); + Offset0 = Load0->getConstantOperandVal(Offset0Idx); + Offset1 = Load1->getConstantOperandVal(Offset1Idx); return true; } diff --git a/llvm/lib/Target/ARC/ARCISelLowering.cpp b/llvm/lib/Target/ARC/ARCISelLowering.cpp index 5d9a366f5ed54..2265f5db6737e 100644 --- a/llvm/lib/Target/ARC/ARCISelLowering.cpp +++ b/llvm/lib/Target/ARC/ARCISelLowering.cpp @@ -751,7 +751,7 @@ SDValue ARCTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc dl(Op); - assert(cast(Op.getOperand(0))->getZExtValue() == 0 && + assert(Op.getConstantOperandVal(0) == 0 && "Only support lowering frame addr of current frame."); Register FrameReg = ARI.getFrameRegister(MF); return DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index a0776296b8ebc..ef02dc9970114 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -4499,8 +4499,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, default: break; case ARM::LDRrs: case ARM::LDRBrs: { - unsigned ShOpVal = - cast(DefNode->getOperand(2))->getZExtValue(); + unsigned ShOpVal = DefNode->getConstantOperandVal(2); unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal); if (ShImm == 0 || (ShImm == 2 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)) @@ -4512,8 +4511,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, case ARM::t2LDRHs: case ARM::t2LDRSHs: { // Thumb2 mode: lsl only. - unsigned ShAmt = - cast(DefNode->getOperand(2))->getZExtValue(); + unsigned ShAmt = DefNode->getConstantOperandVal(2); if (ShAmt == 0 || ShAmt == 2) Latency = *Latency - 1; break; @@ -4526,8 +4524,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, default: break; case ARM::LDRrs: case ARM::LDRBrs: { - unsigned ShOpVal = - cast(DefNode->getOperand(2))->getZExtValue(); + unsigned ShOpVal = DefNode->getConstantOperandVal(2); unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal); if (ShImm == 0 || ((ShImm == 1 || ShImm == 2 || ShImm == 3) && diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index 984d8d3e0b08c..adc429b61bbcc 100644 --- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -2422,8 +2422,7 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating, MachineMemOperand *MemOp = cast(N)->getMemOperand(); SDValue Chain = N->getOperand(0); - unsigned Lane = - cast(N->getOperand(Vec0Idx + NumVecs))->getZExtValue(); + unsigned Lane = N->getConstantOperandVal(Vec0Idx + NumVecs); EVT VT = N->getOperand(Vec0Idx).getValueType(); bool is64BitVector = VT.is64BitVector(); @@ -2587,7 +2586,7 @@ void ARMDAGToDAGISel::SelectMVE_WB(SDNode *N, const uint16_t *Opcodes, Ops.push_back(N->getOperand(2)); // vector of base addresses - int32_t ImmValue = cast(N->getOperand(3))->getZExtValue(); + int32_t ImmValue = N->getConstantOperandVal(3); Ops.push_back(getI32Imm(ImmValue, Loc)); // immediate offset if (Predicated) @@ -2622,7 +2621,7 @@ void ARMDAGToDAGISel::SelectMVE_LongShift(SDNode *N, uint16_t Opcode, // The shift count if (Immediate) { - int32_t ImmValue = cast(N->getOperand(3))->getZExtValue(); + int32_t ImmValue = N->getConstantOperandVal(3); 
Ops.push_back(getI32Imm(ImmValue, Loc)); // immediate shift count } else { Ops.push_back(N->getOperand(3)); @@ -2630,7 +2629,7 @@ void ARMDAGToDAGISel::SelectMVE_LongShift(SDNode *N, uint16_t Opcode, // The immediate saturation operand, if any if (HasSaturationOperand) { - int32_t SatOp = cast(N->getOperand(4))->getZExtValue(); + int32_t SatOp = N->getConstantOperandVal(4); int SatBit = (SatOp == 64 ? 0 : 1); Ops.push_back(getI32Imm(SatBit, Loc)); } @@ -2685,7 +2684,7 @@ void ARMDAGToDAGISel::SelectMVE_VSHLC(SDNode *N, bool Predicated) { // and then an immediate shift count Ops.push_back(N->getOperand(1)); Ops.push_back(N->getOperand(2)); - int32_t ImmValue = cast(N->getOperand(3))->getZExtValue(); + int32_t ImmValue = N->getConstantOperandVal(3); Ops.push_back(getI32Imm(ImmValue, Loc)); // immediate shift count if (Predicated) @@ -4138,14 +4137,13 @@ void ARMDAGToDAGISel::Select(SDNode *N) { if (InGlue.getOpcode() == ARMISD::CMPZ) { if (InGlue.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN) { SDValue Int = InGlue.getOperand(0); - uint64_t ID = cast(Int->getOperand(1))->getZExtValue(); + uint64_t ID = Int->getConstantOperandVal(1); // Handle low-overhead loops. if (ID == Intrinsic::loop_decrement_reg) { SDValue Elements = Int.getOperand(2); - SDValue Size = CurDAG->getTargetConstant( - cast(Int.getOperand(3))->getZExtValue(), dl, - MVT::i32); + SDValue Size = CurDAG->getTargetConstant(Int.getConstantOperandVal(3), + dl, MVT::i32); SDValue Args[] = { Elements, Size, Int.getOperand(0) }; SDNode *LoopDec = @@ -4715,7 +4713,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) { case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: { - unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); + unsigned IntNo = N->getConstantOperandVal(1); switch (IntNo) { default: break; @@ -4732,9 +4730,9 @@ void ARMDAGToDAGISel::Select(SDNode *N) { Opc = (IntNo == Intrinsic::arm_mrrc ? ARM::MRRC : ARM::MRRC2); SmallVector Ops; - Ops.push_back(getI32Imm(cast(N->getOperand(2))->getZExtValue(), dl)); /* coproc */ - Ops.push_back(getI32Imm(cast(N->getOperand(3))->getZExtValue(), dl)); /* opc */ - Ops.push_back(getI32Imm(cast(N->getOperand(4))->getZExtValue(), dl)); /* CRm */ + Ops.push_back(getI32Imm(N->getConstantOperandVal(2), dl)); /* coproc */ + Ops.push_back(getI32Imm(N->getConstantOperandVal(3), dl)); /* opc */ + Ops.push_back(getI32Imm(N->getConstantOperandVal(4), dl)); /* CRm */ // The mrrc2 instruction in ARM doesn't allow predicates, the top 4 bits of the encoded // instruction will always be '1111' but it is possible in assembly language to specify @@ -5181,7 +5179,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) { } case ISD::INTRINSIC_WO_CHAIN: { - unsigned IntNo = cast(N->getOperand(0))->getZExtValue(); + unsigned IntNo = N->getConstantOperandVal(0); switch (IntNo) { default: break; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index d00b7853816e1..cf9646a0b81ed 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -4110,7 +4110,7 @@ SDValue ARMTargetLowering::LowerINTRINSIC_VOID( SDValue ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const { - unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); + unsigned IntNo = Op.getConstantOperandVal(0); SDLoc dl(Op); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. 
@@ -4289,13 +4289,13 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG, return Op.getOperand(0); SDLoc dl(Op); - unsigned isRead = ~cast(Op.getOperand(2))->getZExtValue() & 1; + unsigned isRead = ~Op.getConstantOperandVal(2) & 1; if (!isRead && (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension())) // ARMv7 with MP extension has PLDW. return Op.getOperand(0); - unsigned isData = cast(Op.getOperand(4))->getZExtValue(); + unsigned isData = Op.getConstantOperandVal(4); if (Subtarget->isThumb()) { // Invert the bits. isRead = ~isRead & 1; @@ -4800,7 +4800,7 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, LHS->hasOneUse() && isa(LHS.getOperand(1)) && LHS.getValueType() == MVT::i32 && isa(RHS) && !isSignedIntSetCC(CC)) { - unsigned Mask = cast(LHS.getOperand(1))->getZExtValue(); + unsigned Mask = LHS.getConstantOperandVal(1); auto *RHSC = cast(RHS.getNode()); uint64_t RHSV = RHSC->getZExtValue(); if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) { @@ -4823,9 +4823,8 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, isa(RHS) && cast(RHS)->getZExtValue() == 0x80000000U && CC == ISD::SETUGT && isa(LHS.getOperand(1)) && - cast(LHS.getOperand(1))->getZExtValue() < 31) { - unsigned ShiftAmt = - cast(LHS.getOperand(1))->getZExtValue() + 1; + LHS.getConstantOperandVal(1) < 31) { + unsigned ShiftAmt = LHS.getConstantOperandVal(1) + 1; SDValue Shift = DAG.getNode(ARMISD::LSLS, dl, DAG.getVTList(MVT::i32, MVT::i32), LHS.getOperand(0), @@ -6112,7 +6111,7 @@ SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ EVT VT = Op.getValueType(); SDLoc dl(Op); - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + unsigned Depth = Op.getConstantOperandVal(0); if (Depth) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); SDValue Offset = DAG.getConstant(4, dl, MVT::i32); @@ -6135,7 +6134,7 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc dl(Op); // FIXME probably not meaningful - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + unsigned Depth = Op.getConstantOperandVal(0); Register FrameReg = ARI.getFrameRegister(MF); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); while (Depth--) @@ -8221,7 +8220,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); // Update the minimum and maximum lane number seen. 
- unsigned EltNo = cast(V.getOperand(1))->getZExtValue(); + unsigned EltNo = V.getConstantOperandVal(1); Source->MinElt = std::min(Source->MinElt, EltNo); Source->MaxElt = std::max(Source->MaxElt, EltNo); } @@ -9034,7 +9033,7 @@ static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, SDValue Conv = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0)); - unsigned Lane = cast(Op.getOperand(2))->getZExtValue(); + unsigned Lane = Op.getConstantOperandVal(2); unsigned LaneWidth = getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8; unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth; @@ -9097,7 +9096,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, SDValue Conv = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0)); - unsigned Lane = cast(Op.getOperand(1))->getZExtValue(); + unsigned Lane = Op.getConstantOperandVal(1); unsigned LaneWidth = getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8; SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv, @@ -10682,7 +10681,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) { - unsigned IntNo = cast(N->getOperand(0))->getZExtValue(); + unsigned IntNo = N->getConstantOperandVal(0); unsigned Opc = 0; if (IntNo == Intrinsic::arm_smlald) Opc = ARMISD::SMLALD; @@ -14908,7 +14907,7 @@ static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) { ConstantSDNode *N11C = dyn_cast(N1.getOperand(1)); if (!N11C) return SDValue(); - unsigned InvMask = cast(N->getOperand(2))->getZExtValue(); + unsigned InvMask = N->getConstantOperandVal(2); unsigned LSB = llvm::countr_zero(~InvMask); unsigned Width = llvm::bit_width(~InvMask) - LSB; assert(Width < @@ -15448,8 +15447,7 @@ static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); - ARMCC::CondCodes Cond = - (ARMCC::CondCodes)cast(N->getOperand(2))->getZExtValue(); + ARMCC::CondCodes Cond = (ARMCC::CondCodes)N->getConstantOperandVal(2); SDLoc dl(N); // vcmp X, 0, cc -> vcmpz X, cc @@ -15794,7 +15792,7 @@ static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target, unsigned NewOpc = 0; unsigned NumVecs = 0; if (Target.isIntrinsic) { - unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); + unsigned IntNo = N->getConstantOperandVal(1); switch (IntNo) { default: llvm_unreachable("unexpected intrinsic for Neon base update"); @@ -16254,12 +16252,10 @@ static SDValue PerformMVEVLDCombine(SDNode *N, // For the stores, where there are multiple intrinsics we only actually want // to post-inc the last of the them. - unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); - if (IntNo == Intrinsic::arm_mve_vst2q && - cast(N->getOperand(5))->getZExtValue() != 1) + unsigned IntNo = N->getConstantOperandVal(1); + if (IntNo == Intrinsic::arm_mve_vst2q && N->getConstantOperandVal(5) != 1) return SDValue(); - if (IntNo == Intrinsic::arm_mve_vst4q && - cast(N->getOperand(7))->getZExtValue() != 3) + if (IntNo == Intrinsic::arm_mve_vst4q && N->getConstantOperandVal(7) != 3) return SDValue(); // Search for a use of the address operand that is an increment. 
@@ -16381,7 +16377,7 @@ static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { return false; unsigned NumVecs = 0; unsigned NewOpc = 0; - unsigned IntNo = cast(VLD->getOperand(1))->getZExtValue(); + unsigned IntNo = VLD->getConstantOperandVal(1); if (IntNo == Intrinsic::arm_neon_vld2lane) { NumVecs = 2; NewOpc = ARMISD::VLD2DUP; @@ -16397,8 +16393,7 @@ static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { // First check that all the vldN-lane uses are VDUPLANEs and that the lane // numbers match the load. - unsigned VLDLaneNo = - cast(VLD->getOperand(NumVecs+3))->getZExtValue(); + unsigned VLDLaneNo = VLD->getConstantOperandVal(NumVecs + 3); for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); UI != UE; ++UI) { // Ignore uses of the chain result. @@ -16406,7 +16401,7 @@ static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { continue; SDNode *User = *UI; if (User->getOpcode() != ARMISD::VDUPLANE || - VLDLaneNo != cast(User->getOperand(1))->getZExtValue()) + VLDLaneNo != User->getConstantOperandVal(1)) return false; } @@ -16479,7 +16474,7 @@ static SDValue PerformVDUPLANECombine(SDNode *N, // Make sure the VMOV element size is not bigger than the VDUPLANE elements. unsigned EltSize = Op.getScalarValueSizeInBits(); // The canonical VMOV for a zero vector uses a 32-bit element size. - unsigned Imm = cast(Op.getOperand(0))->getZExtValue(); + unsigned Imm = Op.getConstantOperandVal(0); unsigned EltBits; if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0) EltSize = 8; @@ -17479,7 +17474,7 @@ static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG) { SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; - unsigned IntNo = cast(N->getOperand(0))->getZExtValue(); + unsigned IntNo = N->getConstantOperandVal(0); switch (IntNo) { default: // Don't do anything for most intrinsics. @@ -17669,7 +17664,7 @@ SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N, case Intrinsic::arm_mve_addv: { // Turn this intrinsic straight into the appropriate ARMISD::VADDV node, // which allow PerformADDVecReduce to turn it into VADDLV when possible. - bool Unsigned = cast(N->getOperand(2))->getZExtValue(); + bool Unsigned = N->getConstantOperandVal(2); unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs; return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1)); } @@ -17678,7 +17673,7 @@ SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N, case Intrinsic::arm_mve_addlv_predicated: { // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR // which recombines the two outputs into an i64 - bool Unsigned = cast(N->getOperand(2))->getZExtValue(); + bool Unsigned = N->getConstantOperandVal(2); unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ? (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) : (Unsigned ? 
ARMISD::VADDLVpu : ARMISD::VADDLVps); @@ -18193,7 +18188,7 @@ static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm, return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate); } case ISD::INTRINSIC_W_CHAIN: { - unsigned IntOp = cast(N.getOperand(1))->getZExtValue(); + unsigned IntOp = N.getConstantOperandVal(1); if (IntOp != Intrinsic::test_start_loop_iterations && IntOp != Intrinsic::loop_decrement_reg) return SDValue(); @@ -18271,7 +18266,7 @@ static SDValue PerformHWLoopCombine(SDNode *N, SDLoc dl(Int); SelectionDAG &DAG = DCI.DAG; SDValue Elements = Int.getOperand(2); - unsigned IntOp = cast(Int->getOperand(1))->getZExtValue(); + unsigned IntOp = Int->getConstantOperandVal(1); assert((N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BR) && "expected single br user"); SDNode *Br = *N->use_begin(); @@ -18305,8 +18300,8 @@ static SDValue PerformHWLoopCombine(SDNode *N, DAG.ReplaceAllUsesOfValueWith(Int.getValue(2), Int.getOperand(0)); return Res; } else { - SDValue Size = DAG.getTargetConstant( - cast(Int.getOperand(3))->getZExtValue(), dl, MVT::i32); + SDValue Size = + DAG.getTargetConstant(Int.getConstantOperandVal(3), dl, MVT::i32); SDValue Args[] = { Int.getOperand(0), Elements, Size, }; SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl, DAG.getVTList(MVT::i32, MVT::Other), Args); @@ -19051,7 +19046,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, } case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: - switch (cast(N->getOperand(1))->getZExtValue()) { + switch (N->getConstantOperandVal(1)) { case Intrinsic::arm_neon_vld1: case Intrinsic::arm_neon_vld1x2: case Intrinsic::arm_neon_vld1x3: diff --git a/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp b/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp index 196122e45ab8d..e67a1e2ed5090 100644 --- a/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp +++ b/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp @@ -335,7 +335,7 @@ template <> bool AVRDAGToDAGISel::select(SDNode *N) { return false; } - int CST = (int)cast(BasePtr.getOperand(1))->getZExtValue(); + int CST = (int)BasePtr.getConstantOperandVal(1); SDValue Chain = ST->getChain(); EVT VT = ST->getValue().getValueType(); SDLoc DL(N); diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp index cd1dcfaea0eb1..d36bfb188ed36 100644 --- a/llvm/lib/Target/AVR/AVRISelLowering.cpp +++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp @@ -298,8 +298,7 @@ SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const { SDValue SrcHi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i16, Op.getOperand(0), DAG.getConstant(1, dl, MVT::i16)); - uint64_t ShiftAmount = - cast(N->getOperand(1))->getZExtValue(); + uint64_t ShiftAmount = N->getConstantOperandVal(1); if (ShiftAmount == 16) { // Special case these two operations because they appear to be used by the // generic codegen parts to lower 32-bit numbers. 
@@ -367,7 +366,7 @@ SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const { } } - uint64_t ShiftAmount = cast(N->getOperand(1))->getZExtValue(); + uint64_t ShiftAmount = N->getConstantOperandVal(1); SDValue Victim = N->getOperand(0); switch (Op.getOpcode()) { diff --git a/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp b/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp index 909c7c005735b..d8139958e9fcf 100644 --- a/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp +++ b/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp @@ -193,7 +193,7 @@ void BPFDAGToDAGISel::Select(SDNode *Node) { default: break; case ISD::INTRINSIC_W_CHAIN: { - unsigned IntNo = cast(Node->getOperand(1))->getZExtValue(); + unsigned IntNo = Node->getConstantOperandVal(1); switch (IntNo) { case Intrinsic::bpf_load_byte: case Intrinsic::bpf_load_half: @@ -469,7 +469,7 @@ void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node, if (BaseV.getOpcode() != ISD::INTRINSIC_W_CHAIN) return; - unsigned IntNo = cast(BaseV->getOperand(1))->getZExtValue(); + unsigned IntNo = BaseV->getConstantOperandVal(1); uint64_t MaskV = MaskN->getZExtValue(); if (!((IntNo == Intrinsic::bpf_load_byte && MaskV == 0xFF) || diff --git a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp index e3b4a2dc048ab..90f70b83a02d3 100644 --- a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp +++ b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp @@ -1219,7 +1219,7 @@ SDValue CSKYTargetLowering::LowerFRAMEADDR(SDValue Op, EVT VT = Op.getValueType(); SDLoc dl(Op); - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + unsigned Depth = Op.getConstantOperandVal(0); Register FrameReg = RI.getFrameRegister(MF); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); while (Depth--) @@ -1240,7 +1240,7 @@ SDValue CSKYTargetLowering::LowerRETURNADDR(SDValue Op, EVT VT = Op.getValueType(); SDLoc dl(Op); - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + unsigned Depth = Op.getConstantOperandVal(0); if (Depth) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); SDValue Offset = DAG.getConstant(4, dl, MVT::i32); diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index f930015026a5c..eb5c59672224e 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -192,7 +192,7 @@ MachineSDNode *HexagonDAGToDAGISel::LoadInstrForLoadIntrinsic(SDNode *IntN) { return nullptr; SDLoc dl(IntN); - unsigned IntNo = cast(IntN->getOperand(1))->getZExtValue(); + unsigned IntNo = IntN->getConstantOperandVal(1); static std::map LoadPciMap = { { Intrinsic::hexagon_circ_ldb, Hexagon::L2_loadrb_pci }, @@ -284,18 +284,18 @@ bool HexagonDAGToDAGISel::tryLoadOfLoadIntrinsic(LoadSDNode *N) { // can provide an address of an unsigned variable to store the result of // a sign-extending intrinsic into (or the other way around). 
ISD::LoadExtType IntExt; - switch (cast(C->getOperand(1))->getZExtValue()) { - case Intrinsic::hexagon_circ_ldub: - case Intrinsic::hexagon_circ_lduh: - IntExt = ISD::ZEXTLOAD; - break; - case Intrinsic::hexagon_circ_ldw: - case Intrinsic::hexagon_circ_ldd: - IntExt = ISD::NON_EXTLOAD; - break; - default: - IntExt = ISD::SEXTLOAD; - break; + switch (C->getConstantOperandVal(1)) { + case Intrinsic::hexagon_circ_ldub: + case Intrinsic::hexagon_circ_lduh: + IntExt = ISD::ZEXTLOAD; + break; + case Intrinsic::hexagon_circ_ldw: + case Intrinsic::hexagon_circ_ldd: + IntExt = ISD::NON_EXTLOAD; + break; + default: + IntExt = ISD::SEXTLOAD; + break; } if (N->getExtensionType() != IntExt) return false; @@ -325,7 +325,7 @@ bool HexagonDAGToDAGISel::SelectBrevLdIntrinsic(SDNode *IntN) { return false; const SDLoc &dl(IntN); - unsigned IntNo = cast(IntN->getOperand(1))->getZExtValue(); + unsigned IntNo = IntN->getConstantOperandVal(1); static const std::map LoadBrevMap = { { Intrinsic::hexagon_L2_loadrb_pbr, Hexagon::L2_loadrb_pbr }, @@ -366,7 +366,7 @@ bool HexagonDAGToDAGISel::SelectNewCircIntrinsic(SDNode *IntN) { return false; SDLoc DL(IntN); - unsigned IntNo = cast(IntN->getOperand(1))->getZExtValue(); + unsigned IntNo = IntN->getConstantOperandVal(1); SmallVector Ops; static std::map LoadNPcMap = { @@ -641,7 +641,7 @@ void HexagonDAGToDAGISel::SelectIntrinsicWChain(SDNode *N) { if (SelectNewCircIntrinsic(N)) return; - unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); + unsigned IntNo = N->getConstantOperandVal(1); if (IntNo == Intrinsic::hexagon_V6_vgathermw || IntNo == Intrinsic::hexagon_V6_vgathermw_128B || IntNo == Intrinsic::hexagon_V6_vgathermh || @@ -665,7 +665,7 @@ void HexagonDAGToDAGISel::SelectIntrinsicWChain(SDNode *N) { } void HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) { - unsigned IID = cast(N->getOperand(0))->getZExtValue(); + unsigned IID = N->getConstantOperandVal(0); unsigned Bits; switch (IID) { case Intrinsic::hexagon_S2_vsplatrb: diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp index e08566718d7cd..fb156f2583e8a 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp @@ -2895,7 +2895,7 @@ void HexagonDAGToDAGISel::SelectV65GatherPred(SDNode *N) { SDValue ImmOperand = CurDAG->getTargetConstant(0, dl, MVT::i32); unsigned Opcode; - unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); + unsigned IntNo = N->getConstantOperandVal(1); switch (IntNo) { default: llvm_unreachable("Unexpected HVX gather intrinsic."); @@ -2934,7 +2934,7 @@ void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) { SDValue ImmOperand = CurDAG->getTargetConstant(0, dl, MVT::i32); unsigned Opcode; - unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); + unsigned IntNo = N->getConstantOperandVal(1); switch (IntNo) { default: llvm_unreachable("Unexpected HVX gather intrinsic."); @@ -2963,7 +2963,7 @@ void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) { } void HexagonDAGToDAGISel::SelectHVXDualOutput(SDNode *N) { - unsigned IID = cast(N->getOperand(0))->getZExtValue(); + unsigned IID = N->getConstantOperandVal(0); SDNode *Result; switch (IID) { case Intrinsic::hexagon_V6_vaddcarry: { diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index a7d452e7227d7..51138091f4a55 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -669,8 +669,7 
@@ HexagonTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const { --NumOps; // Ignore the flag operand. for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) { - const InlineAsm::Flag Flags( - cast(Op.getOperand(i))->getZExtValue()); + const InlineAsm::Flag Flags(Op.getConstantOperandVal(i)); unsigned NumVals = Flags.getNumOperandRegisters(); ++i; // Skip the ID value. @@ -729,7 +728,7 @@ SDValue HexagonTargetLowering::LowerREADCYCLECOUNTER(SDValue Op, SDValue HexagonTargetLowering::LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); - unsigned IntNo = cast(Op.getOperand(1))->getZExtValue(); + unsigned IntNo = Op.getConstantOperandVal(1); // Lower the hexagon_prefetch builtin to DCFETCH, as above. if (IntNo == Intrinsic::hexagon_prefetch) { SDValue Addr = Op.getOperand(2); @@ -1176,7 +1175,7 @@ HexagonTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc dl(Op); - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + unsigned Depth = Op.getConstantOperandVal(0); if (Depth) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); SDValue Offset = DAG.getConstant(4, dl, MVT::i32); @@ -1198,7 +1197,7 @@ HexagonTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc dl(Op); - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + unsigned Depth = Op.getConstantOperandVal(0); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, HRI.getFrameRegister(), VT); while (Depth--) diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index db416a500f597..665e2d79c83d1 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -2127,7 +2127,7 @@ HexagonTargetLowering::LowerHvxFunnelShift(SDValue Op, SDValue HexagonTargetLowering::LowerHvxIntrinsic(SDValue Op, SelectionDAG &DAG) const { const SDLoc &dl(Op); - unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); + unsigned IntNo = Op.getConstantOperandVal(0); SmallVector Ops(Op->ops().begin(), Op->ops().end()); auto Swap = [&](SDValue P) { @@ -2922,7 +2922,7 @@ SDValue HexagonTargetLowering::RemoveTLWrapper(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == HexagonISD::TL_EXTEND || Op.getOpcode() == HexagonISD::TL_TRUNCATE); - unsigned Opc = cast(Op.getOperand(2))->getZExtValue(); + unsigned Opc = Op.getConstantOperandVal(2); return DAG.getNode(Opc, SDLoc(Op), ty(Op), Op.getOperand(0)); } diff --git a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp index cbb5c2b998e27..17d7ffb586f4e 100644 --- a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp +++ b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp @@ -1057,7 +1057,7 @@ SDValue LanaiTargetLowering::LowerRETURNADDR(SDValue Op, EVT VT = Op.getValueType(); SDLoc DL(Op); - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + unsigned Depth = Op.getConstantOperandVal(0); if (Depth) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); const unsigned Offset = -4; @@ -1080,7 +1080,7 @@ SDValue LanaiTargetLowering::LowerFRAMEADDR(SDValue Op, EVT VT = Op.getValueType(); SDLoc DL(Op); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Lanai::FP, VT); - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + unsigned Depth = Op.getConstantOperandVal(0); while (Depth--) { const unsigned Offset = -8; SDValue Ptr = DAG.getNode(ISD::ADD, DL, VT, FrameAddr, diff --git 
a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 80853ee319877..e14bbadf9ed22 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -588,7 +588,7 @@ SDValue LoongArchTargetLowering::lowerFRAMEADDR(SDValue Op, EVT VT = Op.getValueType(); SDLoc DL(Op); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, FrameReg, VT); - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + unsigned Depth = Op.getConstantOperandVal(0); int GRLenInBytes = Subtarget.getGRLen() / 8; while (Depth--) { @@ -607,7 +607,7 @@ SDValue LoongArchTargetLowering::lowerRETURNADDR(SDValue Op, return SDValue(); // Currently only support lowering return address for current frame. - if (cast(Op.getOperand(0))->getZExtValue() != 0) { + if (Op.getConstantOperandVal(0) != 0) { DAG.getContext()->emitError( "return address can only be determined for the current frame"); return SDValue(); @@ -1263,7 +1263,7 @@ LoongArchTargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op, return emitIntrinsicWithChainErrorMessage(Op, ErrorMsgReqLA64, DAG); case Intrinsic::loongarch_csrrd_w: case Intrinsic::loongarch_csrrd_d: { - unsigned Imm = cast(Op.getOperand(2))->getZExtValue(); + unsigned Imm = Op.getConstantOperandVal(2); return !isUInt<14>(Imm) ? emitIntrinsicWithChainErrorMessage(Op, ErrorMsgOOR, DAG) : DAG.getNode(LoongArchISD::CSRRD, DL, {GRLenVT, MVT::Other}, @@ -1271,7 +1271,7 @@ LoongArchTargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op, } case Intrinsic::loongarch_csrwr_w: case Intrinsic::loongarch_csrwr_d: { - unsigned Imm = cast(Op.getOperand(3))->getZExtValue(); + unsigned Imm = Op.getConstantOperandVal(3); return !isUInt<14>(Imm) ? emitIntrinsicWithChainErrorMessage(Op, ErrorMsgOOR, DAG) : DAG.getNode(LoongArchISD::CSRWR, DL, {GRLenVT, MVT::Other}, @@ -1280,7 +1280,7 @@ LoongArchTargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op, } case Intrinsic::loongarch_csrxchg_w: case Intrinsic::loongarch_csrxchg_d: { - unsigned Imm = cast(Op.getOperand(4))->getZExtValue(); + unsigned Imm = Op.getConstantOperandVal(4); return !isUInt<14>(Imm) ? emitIntrinsicWithChainErrorMessage(Op, ErrorMsgOOR, DAG) : DAG.getNode(LoongArchISD::CSRXCHG, DL, {GRLenVT, MVT::Other}, @@ -1306,7 +1306,7 @@ LoongArchTargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op, {Chain, Op.getOperand(2)}); } case Intrinsic::loongarch_lddir_d: { - unsigned Imm = cast(Op.getOperand(3))->getZExtValue(); + unsigned Imm = Op.getConstantOperandVal(3); return !isUInt<8>(Imm) ? emitIntrinsicWithChainErrorMessage(Op, ErrorMsgOOR, DAG) : Op; @@ -1314,7 +1314,7 @@ LoongArchTargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::loongarch_movfcsr2gr: { if (!Subtarget.hasBasicF()) return emitIntrinsicWithChainErrorMessage(Op, ErrorMsgReqF, DAG); - unsigned Imm = cast(Op.getOperand(2))->getZExtValue(); + unsigned Imm = Op.getConstantOperandVal(2); return !isUInt<2>(Imm) ? emitIntrinsicWithChainErrorMessage(Op, ErrorMsgOOR, DAG) : DAG.getNode(LoongArchISD::MOVFCSR2GR, DL, {VT, MVT::Other}, @@ -1460,7 +1460,7 @@ SDValue LoongArchTargetLowering::lowerINTRINSIC_VOID(SDValue Op, ASRT_LE_GT_CASE(asrtgt_d) #undef ASRT_LE_GT_CASE case Intrinsic::loongarch_ldpte_d: { - unsigned Imm = cast(Op.getOperand(3))->getZExtValue(); + unsigned Imm = Op.getConstantOperandVal(3); return !Subtarget.is64Bit() ? emitIntrinsicErrorMessage(Op, ErrorMsgReqLA64, DAG) : !isUInt<8>(Imm) ? 
emitIntrinsicErrorMessage(Op, ErrorMsgOOR, DAG) @@ -1473,53 +1473,53 @@ SDValue LoongArchTargetLowering::lowerINTRINSIC_VOID(SDValue Op, : SDValue(); case Intrinsic::loongarch_lasx_xvstelm_b: return (!isInt<8>(cast(Op.getOperand(4))->getSExtValue()) || - !isUInt<5>(cast(Op.getOperand(5))->getZExtValue())) + !isUInt<5>(Op.getConstantOperandVal(5))) ? emitIntrinsicErrorMessage(Op, ErrorMsgOOR, DAG) : SDValue(); case Intrinsic::loongarch_lsx_vstelm_b: return (!isInt<8>(cast(Op.getOperand(4))->getSExtValue()) || - !isUInt<4>(cast(Op.getOperand(5))->getZExtValue())) + !isUInt<4>(Op.getConstantOperandVal(5))) ? emitIntrinsicErrorMessage(Op, ErrorMsgOOR, DAG) : SDValue(); case Intrinsic::loongarch_lasx_xvstelm_h: return (!isShiftedInt<8, 1>( cast(Op.getOperand(4))->getSExtValue()) || - !isUInt<4>(cast(Op.getOperand(5))->getZExtValue())) + !isUInt<4>(Op.getConstantOperandVal(5))) ? emitIntrinsicErrorMessage( Op, "argument out of range or not a multiple of 2", DAG) : SDValue(); case Intrinsic::loongarch_lsx_vstelm_h: return (!isShiftedInt<8, 1>( cast(Op.getOperand(4))->getSExtValue()) || - !isUInt<3>(cast(Op.getOperand(5))->getZExtValue())) + !isUInt<3>(Op.getConstantOperandVal(5))) ? emitIntrinsicErrorMessage( Op, "argument out of range or not a multiple of 2", DAG) : SDValue(); case Intrinsic::loongarch_lasx_xvstelm_w: return (!isShiftedInt<8, 2>( cast(Op.getOperand(4))->getSExtValue()) || - !isUInt<3>(cast(Op.getOperand(5))->getZExtValue())) + !isUInt<3>(Op.getConstantOperandVal(5))) ? emitIntrinsicErrorMessage( Op, "argument out of range or not a multiple of 4", DAG) : SDValue(); case Intrinsic::loongarch_lsx_vstelm_w: return (!isShiftedInt<8, 2>( cast(Op.getOperand(4))->getSExtValue()) || - !isUInt<2>(cast(Op.getOperand(5))->getZExtValue())) + !isUInt<2>(Op.getConstantOperandVal(5))) ? emitIntrinsicErrorMessage( Op, "argument out of range or not a multiple of 4", DAG) : SDValue(); case Intrinsic::loongarch_lasx_xvstelm_d: return (!isShiftedInt<8, 3>( cast(Op.getOperand(4))->getSExtValue()) || - !isUInt<2>(cast(Op.getOperand(5))->getZExtValue())) + !isUInt<2>(Op.getConstantOperandVal(5))) ? emitIntrinsicErrorMessage( Op, "argument out of range or not a multiple of 8", DAG) : SDValue(); case Intrinsic::loongarch_lsx_vstelm_d: return (!isShiftedInt<8, 3>( cast(Op.getOperand(4))->getSExtValue()) || - !isUInt<1>(cast(Op.getOperand(5))->getZExtValue())) + !isUInt<1>(Op.getConstantOperandVal(5))) ? 
emitIntrinsicErrorMessage( Op, "argument out of range or not a multiple of 8", DAG) : SDValue(); @@ -1692,7 +1692,7 @@ replaceVPICKVE2GRResults(SDNode *Node, SmallVectorImpl &Results, SelectionDAG &DAG, const LoongArchSubtarget &Subtarget, unsigned ResOp) { const StringRef ErrorMsgOOR = "argument out of range"; - unsigned Imm = cast(Node->getOperand(2))->getZExtValue(); + unsigned Imm = Node->getConstantOperandVal(2); if (!isUInt(Imm)) { emitErrorAndReplaceIntrinsicResults(Node, Results, DAG, ErrorMsgOOR, /*WithChain=*/false); @@ -1995,7 +1995,7 @@ void LoongArchTargetLowering::ReplaceNodeResults( break; } case Intrinsic::loongarch_csrwr_w: { - unsigned Imm = cast(N->getOperand(3))->getZExtValue(); + unsigned Imm = N->getConstantOperandVal(3); if (!isUInt<14>(Imm)) { emitErrorAndReplaceIntrinsicResults(N, Results, DAG, ErrorMsgOOR); return; @@ -2010,7 +2010,7 @@ void LoongArchTargetLowering::ReplaceNodeResults( break; } case Intrinsic::loongarch_csrxchg_w: { - unsigned Imm = cast(N->getOperand(4))->getZExtValue(); + unsigned Imm = N->getConstantOperandVal(4); if (!isUInt<14>(Imm)) { emitErrorAndReplaceIntrinsicResults(N, Results, DAG, ErrorMsgOOR); return; diff --git a/llvm/lib/Target/M68k/M68kISelLowering.cpp b/llvm/lib/Target/M68k/M68kISelLowering.cpp index f42882dafa095..c4d7a0dec7f39 100644 --- a/llvm/lib/Target/M68k/M68kISelLowering.cpp +++ b/llvm/lib/Target/M68k/M68kISelLowering.cpp @@ -2278,8 +2278,7 @@ SDValue M68kTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { isNullConstant(Cond.getOperand(1).getOperand(0))) { SDValue Cmp = Cond.getOperand(1); - unsigned CondCode = - cast(Cond.getOperand(0))->getZExtValue(); + unsigned CondCode = Cond.getConstantOperandVal(0); if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (CondCode == M68k::COND_EQ || CondCode == M68k::COND_NE)) { @@ -3388,7 +3387,7 @@ SDValue M68kTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDNode *Node = Op.getNode(); SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); - unsigned Align = cast(Op.getOperand(2))->getZExtValue(); + unsigned Align = Op.getConstantOperandVal(2); EVT VT = Node->getValueType(0); // Chain the dynamic stack allocation so that it doesn't modify the stack diff --git a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp index ee7762c296bf5..d3b59138a5a95 100644 --- a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -964,7 +964,7 @@ SDValue MSP430TargetLowering::LowerShifts(SDValue Op, if (!isa(N->getOperand(1))) return Op; - uint64_t ShiftAmount = cast(N->getOperand(1))->getZExtValue(); + uint64_t ShiftAmount = N->getConstantOperandVal(1); // Expand the stuff into sequence of shifts. 
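// [Editor's sketch; illustrative only, not part of the patch] getConstantOperandVal()
// returns the zero-extended value, which is why the LoongArch xvstelm/vstelm hunks
// above convert only the unsigned lane-index operand and keep reading the signed
// element offset through ConstantSDNode::getSExtValue(). A hedged illustration
// (operand indices and ranges follow the lsx_vstelm_b layout shown above; the
// function name is hypothetical):
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/Support/MathExtras.h"

static bool vstelmOperandsInRangeExample(llvm::SDValue Op) {
  using namespace llvm;
  // Signed 8-bit memory offset: still needs an explicit sign-extended read.
  int64_t Offset = cast<ConstantSDNode>(Op.getOperand(4))->getSExtValue();
  // Unsigned lane index: a zero-extended read, so the new helper suffices.
  uint64_t Lane = Op.getConstantOperandVal(5);
  return isInt<8>(Offset) && isUInt<4>(Lane);
}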
SDValue Victim = N->getOperand(0); @@ -1269,7 +1269,7 @@ SDValue MSP430TargetLowering::LowerRETURNADDR(SDValue Op, if (verifyReturnAddressArgumentIsConstant(Op, DAG)) return SDValue(); - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + unsigned Depth = Op.getConstantOperandVal(0); SDLoc dl(Op); EVT PtrVT = Op.getValueType(); @@ -1295,7 +1295,7 @@ SDValue MSP430TargetLowering::LowerFRAMEADDR(SDValue Op, EVT VT = Op.getValueType(); SDLoc dl(Op); // FIXME probably not meaningful - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + unsigned Depth = Op.getConstantOperandVal(0); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, MSP430::R4, VT); while (Depth--) diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index a0cab80243868..483eba4e4f479 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -2508,7 +2508,7 @@ SDValue MipsTargetLowering::lowerFABS(SDValue Op, SelectionDAG &DAG) const { SDValue MipsTargetLowering:: lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { // check the depth - if (cast(Op.getOperand(0))->getZExtValue() != 0) { + if (Op.getConstantOperandVal(0) != 0) { DAG.getContext()->emitError( "return address can be determined only for current frame"); return SDValue(); @@ -2529,7 +2529,7 @@ SDValue MipsTargetLowering::lowerRETURNADDR(SDValue Op, return SDValue(); // check the depth - if (cast(Op.getOperand(0))->getZExtValue() != 0) { + if (Op.getConstantOperandVal(0) != 0) { DAG.getContext()->emitError( "return address can be determined only for current frame"); return SDValue(); diff --git a/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp index 8c865afd42079..0ed87ee0809a3 100644 --- a/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp +++ b/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp @@ -831,8 +831,7 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) { } case ISD::INTRINSIC_W_CHAIN: { - const unsigned IntrinsicOpcode = - cast(Node->getOperand(1))->getZExtValue(); + const unsigned IntrinsicOpcode = Node->getConstantOperandVal(1); switch (IntrinsicOpcode) { default: break; @@ -885,7 +884,7 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) { } case ISD::INTRINSIC_WO_CHAIN: { - switch (cast(Node->getOperand(0))->getZExtValue()) { + switch (Node->getConstantOperandVal(0)) { default: break; @@ -901,8 +900,7 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) { } case ISD::INTRINSIC_VOID: { - const unsigned IntrinsicOpcode = - cast(Node->getOperand(1))->getZExtValue(); + const unsigned IntrinsicOpcode = Node->getConstantOperandVal(1); switch (IntrinsicOpcode) { default: break; diff --git a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp index 5c34067c88889..f6ac41ed3479c 100644 --- a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp @@ -1528,7 +1528,7 @@ static SDValue lowerMSABitClearImm(SDValue Op, SelectionDAG &DAG) { SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); - unsigned Intrinsic = cast(Op->getOperand(0))->getZExtValue(); + unsigned Intrinsic = Op->getConstantOperandVal(0); switch (Intrinsic) { default: return SDValue(); @@ -2300,7 +2300,7 @@ static SDValue lowerMSALoadIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr, SDValue MipsSETargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { - unsigned Intr = 
cast(Op->getOperand(1))->getZExtValue(); + unsigned Intr = Op->getConstantOperandVal(1); switch (Intr) { default: return SDValue(); @@ -2375,7 +2375,7 @@ static SDValue lowerMSAStoreIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr, SDValue MipsSETargetLowering::lowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const { - unsigned Intr = cast(Op->getOperand(1))->getZExtValue(); + unsigned Intr = Op->getConstantOperandVal(1); switch (Intr) { default: return SDValue(); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 894a8636f4585..815c46edb6fa2 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -513,7 +513,7 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) { } bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) { - unsigned IID = cast(N->getOperand(1))->getZExtValue(); + unsigned IID = N->getConstantOperandVal(1); switch (IID) { default: return false; @@ -730,7 +730,7 @@ static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget, } bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) { - unsigned IID = cast(N->getOperand(0))->getZExtValue(); + unsigned IID = N->getConstantOperandVal(0); switch (IID) { default: return false; @@ -1246,7 +1246,7 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) { Op1 = N->getOperand(2); Mem = cast(N); - unsigned IID = cast(N->getOperand(1))->getZExtValue(); + unsigned IID = N->getConstantOperandVal(1); switch (IID) { default: return false; diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index b57d185bb638b..ed96339240d92 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -4902,8 +4902,7 @@ bool PPCDAGToDAGISel::trySelectLoopCountIntrinsic(SDNode *N) { return false; if (LHS.getOperand(0).getOpcode() != ISD::INTRINSIC_W_CHAIN || - cast(LHS.getOperand(0).getOperand(1))->getZExtValue() != - Intrinsic::loop_decrement) + LHS.getOperand(0).getConstantOperandVal(1) != Intrinsic::loop_decrement) return false; if (!isa(RHS)) @@ -6011,7 +6010,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) { // Op #3 is the Dest MBB // Op #4 is the Flag. // Prevent PPC::PRED_* from being selected into LI. - unsigned PCC = cast(N->getOperand(1))->getZExtValue(); + unsigned PCC = N->getConstantOperandVal(1); if (EnableBranchHint) PCC |= getBranchHint(PCC, *FuncInfo, N->getOperand(3)); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 385b3b74c34d6..8f27e6677afa5 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -2817,8 +2817,8 @@ bool PPCTargetLowering::SelectAddressRegImm( return true; // [r+i] } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) { // Match LOAD (ADD (X, Lo(G))). - assert(!cast(N.getOperand(1).getOperand(1))->getZExtValue() - && "Cannot handle constant offsets yet!"); + assert(!N.getOperand(1).getConstantOperandVal(1) && + "Cannot handle constant offsets yet!"); Disp = N.getOperand(1).getOperand(0); // The global address. assert(Disp.getOpcode() == ISD::TargetGlobalAddress || Disp.getOpcode() == ISD::TargetGlobalTLSAddress || @@ -3824,8 +3824,7 @@ SDValue PPCTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const { // Check all operands that may contain the LR. 
for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) { - const InlineAsm::Flag Flags( - cast(Op.getOperand(i))->getZExtValue()); + const InlineAsm::Flag Flags(Op.getConstantOperandVal(i)); unsigned NumVals = Flags.getNumOperandRegisters(); ++i; // Skip the ID value. @@ -10442,8 +10441,7 @@ SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG, /// information about the intrinsic. static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc, bool &isDot, const PPCSubtarget &Subtarget) { - unsigned IntrinsicID = - cast(Intrin.getOperand(0))->getZExtValue(); + unsigned IntrinsicID = Intrin.getConstantOperandVal(0); CompareOpc = -1; isDot = false; switch (IntrinsicID) { @@ -10728,8 +10726,7 @@ static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc, /// lower, do it, otherwise return null. SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { - unsigned IntrinsicID = - cast(Op.getOperand(0))->getZExtValue(); + unsigned IntrinsicID = Op.getConstantOperandVal(0); SDLoc dl(Op); @@ -10947,7 +10944,7 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, // Unpack the result based on how the target uses it. unsigned BitNo; // Bit # of CR6. bool InvertBit; // Invert result? - switch (cast(Op.getOperand(1))->getZExtValue()) { + switch (Op.getConstantOperandVal(1)) { default: // Can't happen, don't crash on invalid number though. case 0: // Return the value of the EQ bit of CR6. BitNo = 0; InvertBit = false; @@ -10983,7 +10980,7 @@ SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op, // the beginning of the argument list. int ArgStart = isa(Op.getOperand(0)) ? 0 : 1; SDLoc DL(Op); - switch (cast(Op.getOperand(ArgStart))->getZExtValue()) { + switch (Op.getConstantOperandVal(ArgStart)) { case Intrinsic::ppc_cfence: { assert(ArgStart == 1 && "llvm.ppc.cfence must carry a chain argument."); SDValue Val = Op.getOperand(ArgStart + 1); @@ -11548,7 +11545,7 @@ SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { return SDValue(); // Custom lower is only done for high or low doubleword. 
- int Idx = cast(Op0.getOperand(1))->getZExtValue(); + int Idx = Op0.getConstantOperandVal(1); if (Idx % 2 != 0) return SDValue(); @@ -11717,8 +11714,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, break; } case ISD::INTRINSIC_W_CHAIN: { - if (cast(N->getOperand(1))->getZExtValue() != - Intrinsic::loop_decrement) + if (N->getConstantOperandVal(1) != Intrinsic::loop_decrement) break; assert(N->getValueType(0) == MVT::i1 && @@ -11734,7 +11730,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, break; } case ISD::INTRINSIC_WO_CHAIN: { - switch (cast(N->getOperand(0))->getZExtValue()) { + switch (N->getConstantOperandVal(0)) { case Intrinsic::ppc_pack_longdouble: Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::ppcf128, N->getOperand(2), N->getOperand(1))); @@ -13654,7 +13650,7 @@ static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base, if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) { EVT VT; - switch (cast(N->getOperand(1))->getZExtValue()) { + switch (N->getConstantOperandVal(1)) { default: return false; case Intrinsic::ppc_altivec_lvx: case Intrinsic::ppc_altivec_lvxl: @@ -13682,7 +13678,7 @@ static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base, if (N->getOpcode() == ISD::INTRINSIC_VOID) { EVT VT; - switch (cast(N->getOperand(1))->getZExtValue()) { + switch (N->getConstantOperandVal(1)) { default: return false; case Intrinsic::ppc_altivec_stvx: case Intrinsic::ppc_altivec_stvxl: @@ -15546,8 +15542,7 @@ SDValue PPCTargetLowering::combineVReverseMemOP(ShuffleVectorSDNode *SVN, } static bool isStoreConditional(SDValue Intrin, unsigned &StoreWidth) { - unsigned IntrinsicID = - cast(Intrin.getOperand(1))->getZExtValue(); + unsigned IntrinsicID = Intrin.getConstantOperandVal(1); if (IntrinsicID == Intrinsic::ppc_stdcx) StoreWidth = 8; else if (IntrinsicID == Intrinsic::ppc_stwcx) @@ -15979,7 +15974,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, break; case ISD::INTRINSIC_WO_CHAIN: { bool isLittleEndian = Subtarget.isLittleEndian(); - unsigned IID = cast(N->getOperand(0))->getZExtValue(); + unsigned IID = N->getConstantOperandVal(0); Intrinsic::ID Intr = (isLittleEndian ? Intrinsic::ppc_altivec_lvsr : Intrinsic::ppc_altivec_lvsl); if (IID == Intr && N->getOperand(1)->getOpcode() == ISD::ADD) { @@ -15992,36 +15987,34 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, .zext(Add.getScalarValueSizeInBits()))) { SDNode *BasePtr = Add->getOperand(0).getNode(); for (SDNode *U : BasePtr->uses()) { - if (U->getOpcode() == ISD::INTRINSIC_WO_CHAIN && - cast(U->getOperand(0))->getZExtValue() == IID) { - // We've found another LVSL/LVSR, and this address is an aligned - // multiple of that one. The results will be the same, so use the - // one we've just found instead. + if (U->getOpcode() == ISD::INTRINSIC_WO_CHAIN && + U->getConstantOperandVal(0) == IID) { + // We've found another LVSL/LVSR, and this address is an aligned + // multiple of that one. The results will be the same, so use the + // one we've just found instead. 
- return SDValue(U, 0); - } + return SDValue(U, 0); + } } } if (isa(Add->getOperand(1))) { SDNode *BasePtr = Add->getOperand(0).getNode(); for (SDNode *U : BasePtr->uses()) { - if (U->getOpcode() == ISD::ADD && - isa(U->getOperand(1)) && - (cast(Add->getOperand(1))->getZExtValue() - - cast(U->getOperand(1))->getZExtValue()) % - (1ULL << Bits) == - 0) { - SDNode *OtherAdd = U; - for (SDNode *V : OtherAdd->uses()) { - if (V->getOpcode() == ISD::INTRINSIC_WO_CHAIN && - cast(V->getOperand(0))->getZExtValue() == - IID) { - return SDValue(V, 0); - } + if (U->getOpcode() == ISD::ADD && + isa(U->getOperand(1)) && + (Add->getConstantOperandVal(1) - U->getConstantOperandVal(1)) % + (1ULL << Bits) == + 0) { + SDNode *OtherAdd = U; + for (SDNode *V : OtherAdd->uses()) { + if (V->getOpcode() == ISD::INTRINSIC_WO_CHAIN && + V->getConstantOperandVal(0) == IID) { + return SDValue(V, 0); } } } + } } } @@ -16061,30 +16054,30 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, break; case ISD::INTRINSIC_W_CHAIN: - switch (cast(N->getOperand(1))->getZExtValue()) { - default: - break; - case Intrinsic::ppc_altivec_vsum4sbs: - case Intrinsic::ppc_altivec_vsum4shs: - case Intrinsic::ppc_altivec_vsum4ubs: { - // These sum-across intrinsics only have a chain due to the side effect - // that they may set the SAT bit. If we know the SAT bit will not be set - // for some inputs, we can replace any uses of their chain with the input - // chain. - if (BuildVectorSDNode *BVN = - dyn_cast(N->getOperand(3))) { - APInt APSplatBits, APSplatUndef; - unsigned SplatBitSize; - bool HasAnyUndefs; - bool BVNIsConstantSplat = BVN->isConstantSplat( - APSplatBits, APSplatUndef, SplatBitSize, HasAnyUndefs, 0, - !Subtarget.isLittleEndian()); - // If the constant splat vector is 0, the SAT bit will not be set. - if (BVNIsConstantSplat && APSplatBits == 0) - DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), N->getOperand(0)); + switch (N->getConstantOperandVal(1)) { + default: + break; + case Intrinsic::ppc_altivec_vsum4sbs: + case Intrinsic::ppc_altivec_vsum4shs: + case Intrinsic::ppc_altivec_vsum4ubs: { + // These sum-across intrinsics only have a chain due to the side effect + // that they may set the SAT bit. If we know the SAT bit will not be set + // for some inputs, we can replace any uses of their chain with the + // input chain. + if (BuildVectorSDNode *BVN = + dyn_cast(N->getOperand(3))) { + APInt APSplatBits, APSplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + bool BVNIsConstantSplat = BVN->isConstantSplat( + APSplatBits, APSplatUndef, SplatBitSize, HasAnyUndefs, 0, + !Subtarget.isLittleEndian()); + // If the constant splat vector is 0, the SAT bit will not be set. + if (BVNIsConstantSplat && APSplatBits == 0) + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), N->getOperand(0)); + } + return SDValue(); } - return SDValue(); - } case Intrinsic::ppc_vsx_lxvw4x: case Intrinsic::ppc_vsx_lxvd2x: // For little endian, VSX loads require generating lxvd2x/xxswapd. @@ -16098,7 +16091,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, // For little endian, VSX stores require generating xxswapd/stxvd2x. // Not needed on ISA 3.0 based CPUs since we have a non-permuting store. if (Subtarget.needsSwapsForVSXMemOps()) { - switch (cast(N->getOperand(1))->getZExtValue()) { + switch (N->getConstantOperandVal(1)) { default: break; case Intrinsic::ppc_vsx_stxvw4x: @@ -16327,7 +16320,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, // Unpack the result based on how the target uses it. 
PPC::Predicate CompOpc; - switch (cast(LHS.getOperand(1))->getZExtValue()) { + switch (LHS.getConstantOperandVal(1)) { default: // Can't happen, don't crash on invalid number though. case 0: // Branch on the value of the EQ bit of CR6. CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE; @@ -16406,7 +16399,7 @@ void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, break; } case ISD::INTRINSIC_WO_CHAIN: { - switch (cast(Op.getOperand(0))->getZExtValue()) { + switch (Op.getConstantOperandVal(0)) { default: break; case Intrinsic::ppc_altivec_vcmpbfp_p: case Intrinsic::ppc_altivec_vcmpeqfp_p: @@ -16433,7 +16426,7 @@ void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, break; } case ISD::INTRINSIC_W_CHAIN: { - switch (cast(Op.getOperand(1))->getZExtValue()) { + switch (Op.getConstantOperandVal(1)) { default: break; case Intrinsic::ppc_load2r: @@ -16868,7 +16861,7 @@ SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op, return SDValue(); SDLoc dl(Op); - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + unsigned Depth = Op.getConstantOperandVal(0); // Make sure the function does not optimize away the store of the RA to // the stack. @@ -16901,7 +16894,7 @@ SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op, SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + unsigned Depth = Op.getConstantOperandVal(0); MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -18086,8 +18079,7 @@ static void computeFlagsForAddressComputation(SDValue N, unsigned &FlagSet, FlagSet |= PPC::MOF_RPlusSImm34; // Signed 34-bit immediates. else FlagSet |= PPC::MOF_RPlusR; // Register. - } else if (RHS.getOpcode() == PPCISD::Lo && - !cast(RHS.getOperand(1))->getZExtValue()) + } else if (RHS.getOpcode() == PPCISD::Lo && !RHS.getConstantOperandVal(1)) FlagSet |= PPC::MOF_RPlusLo; // PPCISD::Lo. else FlagSet |= PPC::MOF_RPlusR; @@ -18131,7 +18123,7 @@ unsigned PPCTargetLowering::computeMOFlags(const SDNode *Parent, SDValue N, unsigned ParentOp = Parent->getOpcode(); if (Subtarget.isISA3_1() && ((ParentOp == ISD::INTRINSIC_W_CHAIN) || (ParentOp == ISD::INTRINSIC_VOID))) { - unsigned ID = cast(Parent->getOperand(1))->getZExtValue(); + unsigned ID = Parent->getConstantOperandVal(1); if ((ID == Intrinsic::ppc_vsx_lxvp) || (ID == Intrinsic::ppc_vsx_stxvp)) { SDValue IntrinOp = (ID == Intrinsic::ppc_vsx_lxvp) ? Parent->getOperand(2) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 098a320c91533..bfa3bf3cc74e2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -1360,7 +1360,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { if (N0.getOpcode() != ISD::AND || !isa(N0.getOperand(1))) break; - uint64_t C2 = cast(N0.getOperand(1))->getZExtValue(); + uint64_t C2 = N0.getConstantOperandVal(1); // Constant should be a mask. if (!isMask_64(C2)) @@ -1604,7 +1604,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { break; } case ISD::INTRINSIC_W_CHAIN: { - unsigned IntNo = cast(Node->getOperand(1))->getZExtValue(); + unsigned IntNo = Node->getConstantOperandVal(1); switch (IntNo) { // By default we do not custom select any intrinsic. 
default: @@ -1825,7 +1825,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { break; } case ISD::INTRINSIC_VOID: { - unsigned IntNo = cast(Node->getOperand(1))->getZExtValue(); + unsigned IntNo = Node->getConstantOperandVal(1); switch (IntNo) { case Intrinsic::riscv_vsseg2: case Intrinsic::riscv_vsseg3: diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 51580d15451ca..03a59f8a8b57c 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -7235,7 +7235,7 @@ SDValue RISCVTargetLowering::lowerFRAMEADDR(SDValue Op, EVT VT = Op.getValueType(); SDLoc DL(Op); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, FrameReg, VT); - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + unsigned Depth = Op.getConstantOperandVal(0); while (Depth--) { int Offset = -(XLenInBytes * 2); SDValue Ptr = DAG.getNode(ISD::ADD, DL, VT, FrameAddr, @@ -7260,7 +7260,7 @@ SDValue RISCVTargetLowering::lowerRETURNADDR(SDValue Op, EVT VT = Op.getValueType(); SDLoc DL(Op); - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + unsigned Depth = Op.getConstantOperandVal(0); if (Depth) { int Off = -XLenInBytes; SDValue FrameAddr = lowerFRAMEADDR(Op, DAG); @@ -11731,7 +11731,7 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, break; } case ISD::INTRINSIC_WO_CHAIN: { - unsigned IntNo = cast(N->getOperand(0))->getZExtValue(); + unsigned IntNo = N->getConstantOperandVal(0); switch (IntNo) { default: llvm_unreachable( @@ -14153,7 +14153,7 @@ static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG, for (SDNode *U : N0->uses()) { if (U->getOpcode() != ISD::SRA || !isa(U->getOperand(1)) || - cast(U->getOperand(1))->getZExtValue() > 32) + U->getConstantOperandVal(1) > 32) return SDValue(); } diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index 4f08014792110..78bdf3ae9a84b 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -2050,7 +2050,7 @@ static void LookThroughSetCC(SDValue &LHS, SDValue &RHS, LHS.getOperand(3).getOpcode() == SPISD::CMPFCC_V9))) && isOneConstant(LHS.getOperand(0)) && isNullConstant(LHS.getOperand(1))) { SDValue CMPCC = LHS.getOperand(3); - SPCC = cast(LHS.getOperand(2))->getZExtValue(); + SPCC = LHS.getConstantOperandVal(2); LHS = CMPCC.getOperand(0); RHS = CMPCC.getOperand(1); } @@ -3186,7 +3186,7 @@ static SDValue LowerATOMIC_LOAD_STORE(SDValue Op, SelectionDAG &DAG) { SDValue SparcTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { - unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); + unsigned IntNo = Op.getConstantOperandVal(0); SDLoc dl(Op); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 559f2ca476d70..045c4c0aac07a 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -2186,7 +2186,7 @@ SystemZTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // the mask of valid CC values if so. 
static bool isIntrinsicWithCCAndChain(SDValue Op, unsigned &Opcode, unsigned &CCValid) { - unsigned Id = cast(Op.getOperand(1))->getZExtValue(); + unsigned Id = Op.getConstantOperandVal(1); switch (Id) { case Intrinsic::s390_tbegin: Opcode = SystemZISD::TBEGIN; @@ -2212,7 +2212,7 @@ static bool isIntrinsicWithCCAndChain(SDValue Op, unsigned &Opcode, // CC value as its final argument. Provide the associated SystemZISD // opcode and the mask of valid CC values if so. static bool isIntrinsicWithCC(SDValue Op, unsigned &Opcode, unsigned &CCValid) { - unsigned Id = cast(Op.getOperand(0))->getZExtValue(); + unsigned Id = Op.getConstantOperandVal(0); switch (Id) { case Intrinsic::s390_vpkshs: case Intrinsic::s390_vpksfs: @@ -2600,10 +2600,9 @@ static bool shouldSwapCmpOperands(const Comparison &C) { return true; if (C.ICmpType != SystemZICMP::SignedOnly && Opcode0 == ISD::ZERO_EXTEND) return true; - if (C.ICmpType != SystemZICMP::SignedOnly && - Opcode0 == ISD::AND && + if (C.ICmpType != SystemZICMP::SignedOnly && Opcode0 == ISD::AND && C.Op0.getOperand(1).getOpcode() == ISD::Constant && - cast(C.Op0.getOperand(1))->getZExtValue() == 0xffffffff) + C.Op0.getConstantOperandVal(1) == 0xffffffff) return true; return false; @@ -3429,11 +3428,9 @@ SDValue SystemZTargetLowering::lowerBR_CC(SDValue Op, SelectionDAG &DAG) const { static bool isAbsolute(SDValue CmpOp, SDValue Pos, SDValue Neg) { return (Neg.getOpcode() == ISD::SUB && Neg.getOperand(0).getOpcode() == ISD::Constant && - cast(Neg.getOperand(0))->getZExtValue() == 0 && - Neg.getOperand(1) == Pos && - (Pos == CmpOp || - (Pos.getOpcode() == ISD::SIGN_EXTEND && - Pos.getOperand(0) == CmpOp))); + Neg.getConstantOperandVal(0) == 0 && Neg.getOperand(1) == Pos && + (Pos == CmpOp || (Pos.getOpcode() == ISD::SIGN_EXTEND && + Pos.getOperand(0) == CmpOp))); } // Return the absolute or negative absolute of Op; IsNegative decides which. @@ -3740,7 +3737,7 @@ SDValue SystemZTargetLowering::lowerFRAMEADDR(SDValue Op, MFI.setFrameAddressIsTaken(true); SDLoc DL(Op); - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + unsigned Depth = Op.getConstantOperandVal(0); EVT PtrVT = getPointerTy(DAG.getDataLayout()); // By definition, the frame address is the address of the back chain. 
(In @@ -3776,7 +3773,7 @@ SDValue SystemZTargetLowering::lowerRETURNADDR(SDValue Op, return SDValue(); SDLoc DL(Op); - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + unsigned Depth = Op.getConstantOperandVal(0); EVT PtrVT = getPointerTy(DAG.getDataLayout()); if (Depth > 0) { @@ -4226,7 +4223,7 @@ SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const { if (HighOp.getOpcode() == ISD::AND && HighOp.getOperand(1).getOpcode() == ISD::Constant) { SDValue HighOp0 = HighOp.getOperand(0); - uint64_t Mask = cast(HighOp.getOperand(1))->getZExtValue(); + uint64_t Mask = HighOp.getConstantOperandVal(1); if (DAG.MaskedValueIsZero(HighOp0, APInt(64, ~(Mask | 0xffffffff)))) HighOp = HighOp0; } @@ -4485,10 +4482,10 @@ SDValue SystemZTargetLowering::lowerCTPOP(SDValue Op, SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); - AtomicOrdering FenceOrdering = static_cast( - cast(Op.getOperand(1))->getZExtValue()); - SyncScope::ID FenceSSID = static_cast( - cast(Op.getOperand(2))->getZExtValue()); + AtomicOrdering FenceOrdering = + static_cast(Op.getConstantOperandVal(1)); + SyncScope::ID FenceSSID = + static_cast(Op.getConstantOperandVal(2)); // The only fence that needs an instruction is a sequentially-consistent // cross-thread fence. @@ -4773,13 +4770,13 @@ SDValue SystemZTargetLowering::lowerSTACKRESTORE(SDValue Op, SDValue SystemZTargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const { - bool IsData = cast(Op.getOperand(4))->getZExtValue(); + bool IsData = Op.getConstantOperandVal(4); if (!IsData) // Just preserve the chain. return Op.getOperand(0); SDLoc DL(Op); - bool IsWrite = cast(Op.getOperand(2))->getZExtValue(); + bool IsWrite = Op.getConstantOperandVal(2); unsigned Code = IsWrite ? SystemZ::PFD_WRITE : SystemZ::PFD_READ; auto *Node = cast(Op.getNode()); SDValue Ops[] = {Op.getOperand(0), DAG.getTargetConstant(Code, DL, MVT::i32), @@ -4825,7 +4822,7 @@ SystemZTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, SDValue(Node, 0), getCCResult(DAG, SDValue(Node, 1))); } - unsigned Id = cast(Op.getOperand(0))->getZExtValue(); + unsigned Id = Op.getConstantOperandVal(0); switch (Id) { case Intrinsic::thread_pointer: return lowerThreadPointer(SDLoc(Op), DAG); @@ -5628,7 +5625,7 @@ static SDValue tryBuildVectorShuffle(SelectionDAG &DAG, Op = Op.getOperand(0); if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Op.getOperand(1).getOpcode() == ISD::Constant) { - unsigned Elem = cast(Op.getOperand(1))->getZExtValue(); + unsigned Elem = Op.getConstantOperandVal(1); if (!GS.add(Op.getOperand(0), Elem)) return SDValue(); FoundOne = true; @@ -6727,8 +6724,7 @@ SDValue SystemZTargetLowering::combineLOAD( int Index = 1; if (User->getOpcode() == ISD::SRL && User->getOperand(1).getOpcode() == ISD::Constant && - cast(User->getOperand(1))->getZExtValue() == 64 && - User->hasOneUse()) { + User->getConstantOperandVal(1) == 64 && User->hasOneUse()) { User = *User->use_begin(); Index = 0; } @@ -6857,7 +6853,7 @@ static bool isMovedFromParts(SDValue Val, SDValue &LoPart, SDValue &HiPart) { std::swap(Op0, Op1); if (Op1.getOpcode() != ISD::SHL || !Op1.getNode()->hasOneUse() || Op1.getOperand(1).getOpcode() != ISD::Constant || - cast(Op1.getOperand(1))->getZExtValue() != 64) + Op1.getConstantOperandVal(1) != 64) return false; Op1 = Op1.getOperand(0); @@ -7149,20 +7145,18 @@ SDValue SystemZTargetLowering::combineFP_ROUND( unsigned OpNo = N->isStrictFPOpcode() ? 
1 : 0; SelectionDAG &DAG = DCI.DAG; SDValue Op0 = N->getOperand(OpNo); - if (N->getValueType(0) == MVT::f32 && - Op0.hasOneUse() && + if (N->getValueType(0) == MVT::f32 && Op0.hasOneUse() && Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Op0.getOperand(0).getValueType() == MVT::v2f64 && Op0.getOperand(1).getOpcode() == ISD::Constant && - cast(Op0.getOperand(1))->getZExtValue() == 0) { + Op0.getConstantOperandVal(1) == 0) { SDValue Vec = Op0.getOperand(0); for (auto *U : Vec->uses()) { - if (U != Op0.getNode() && - U->hasOneUse() && + if (U != Op0.getNode() && U->hasOneUse() && U->getOpcode() == ISD::EXTRACT_VECTOR_ELT && U->getOperand(0) == Vec && U->getOperand(1).getOpcode() == ISD::Constant && - cast(U->getOperand(1))->getZExtValue() == 1) { + U->getConstantOperandVal(1) == 1) { SDValue OtherRound = SDValue(*U->use_begin(), 0); if (OtherRound.getOpcode() == N->getOpcode() && OtherRound.getOperand(OpNo) == SDValue(U, 0) && @@ -7215,20 +7209,18 @@ SDValue SystemZTargetLowering::combineFP_EXTEND( unsigned OpNo = N->isStrictFPOpcode() ? 1 : 0; SelectionDAG &DAG = DCI.DAG; SDValue Op0 = N->getOperand(OpNo); - if (N->getValueType(0) == MVT::f64 && - Op0.hasOneUse() && + if (N->getValueType(0) == MVT::f64 && Op0.hasOneUse() && Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Op0.getOperand(0).getValueType() == MVT::v4f32 && Op0.getOperand(1).getOpcode() == ISD::Constant && - cast(Op0.getOperand(1))->getZExtValue() == 0) { + Op0.getConstantOperandVal(1) == 0) { SDValue Vec = Op0.getOperand(0); for (auto *U : Vec->uses()) { - if (U != Op0.getNode() && - U->hasOneUse() && + if (U != Op0.getNode() && U->hasOneUse() && U->getOpcode() == ISD::EXTRACT_VECTOR_ELT && U->getOperand(0) == Vec && U->getOperand(1).getOpcode() == ISD::Constant && - cast(U->getOperand(1))->getZExtValue() == 2) { + U->getConstantOperandVal(1) == 2) { SDValue OtherExtend = SDValue(*U->use_begin(), 0); if (OtherExtend.getOpcode() == N->getOpcode() && OtherExtend.getOperand(OpNo) == SDValue(U, 0) && @@ -7605,7 +7597,7 @@ SDValue SystemZTargetLowering::combineINTRINSIC( SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; - unsigned Id = cast(N->getOperand(1))->getZExtValue(); + unsigned Id = N->getConstantOperandVal(1); switch (Id) { // VECTOR LOAD (RIGHTMOST) WITH LENGTH with a length operand of 15 // or larger is simply a vector load. @@ -7679,7 +7671,7 @@ static APInt getDemandedSrcElements(SDValue Op, const APInt &DemandedElts, APInt SrcDemE; unsigned Opcode = Op.getOpcode(); if (Opcode == ISD::INTRINSIC_WO_CHAIN) { - unsigned Id = cast(Op.getOperand(0))->getZExtValue(); + unsigned Id = Op.getConstantOperandVal(0); switch (Id) { case Intrinsic::s390_vpksh: // PACKS case Intrinsic::s390_vpksf: @@ -7723,7 +7715,7 @@ static APInt getDemandedSrcElements(SDValue Op, const APInt &DemandedElts, SrcDemE = APInt(NumElts, 0); if (!DemandedElts[OpNo - 1]) break; - unsigned Mask = cast(Op.getOperand(3))->getZExtValue(); + unsigned Mask = Op.getConstantOperandVal(3); unsigned MaskBit = ((OpNo - 1) ? 1 : 4); // Demand input element 0 or 1, given by the mask bit value. SrcDemE.setBit((Mask & MaskBit)? 
1 : 0); @@ -7732,7 +7724,7 @@ static APInt getDemandedSrcElements(SDValue Op, const APInt &DemandedElts, case Intrinsic::s390_vsldb: { // VECTOR SHIFT LEFT DOUBLE BY BYTE assert(VT == MVT::v16i8 && "Unexpected type."); - unsigned FirstIdx = cast(Op.getOperand(3))->getZExtValue(); + unsigned FirstIdx = Op.getConstantOperandVal(3); assert (FirstIdx > 0 && FirstIdx < 16 && "Unused operand."); unsigned NumSrc0Els = 16 - FirstIdx; SrcDemE = APInt(NumElts, 0); @@ -7808,7 +7800,7 @@ SystemZTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, unsigned Opcode = Op.getOpcode(); if (Opcode == ISD::INTRINSIC_WO_CHAIN) { bool IsLogical = false; - unsigned Id = cast(Op.getOperand(0))->getZExtValue(); + unsigned Id = Op.getConstantOperandVal(0); switch (Id) { case Intrinsic::s390_vpksh: // PACKS case Intrinsic::s390_vpksf: @@ -7908,7 +7900,7 @@ SystemZTargetLowering::ComputeNumSignBitsForTargetNode( return 1; unsigned Opcode = Op.getOpcode(); if (Opcode == ISD::INTRINSIC_WO_CHAIN) { - unsigned Id = cast(Op.getOperand(0))->getZExtValue(); + unsigned Id = Op.getConstantOperandVal(0); switch (Id) { case Intrinsic::s390_vpksh: // PACKS case Intrinsic::s390_vpksf: diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td index af6cf340f8a32..d98bb886c1850 100644 --- a/llvm/lib/Target/SystemZ/SystemZOperators.td +++ b/llvm/lib/Target/SystemZ/SystemZOperators.td @@ -507,11 +507,11 @@ def z_subcarry : PatFrag<(ops node:$lhs, node:$rhs), // Signed and unsigned comparisons. def z_scmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, timm), [{ - unsigned Type = cast(N->getOperand(2))->getZExtValue(); + unsigned Type = N->getConstantOperandVal(2); return Type != SystemZICMP::UnsignedOnly; }]>; def z_ucmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, timm), [{ - unsigned Type = cast(N->getOperand(2))->getZExtValue(); + unsigned Type = N->getConstantOperandVal(2); return Type != SystemZICMP::SignedOnly; }]>; diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp index 0267aefd1e914..0e41a2d7aa03e 100644 --- a/llvm/lib/Target/VE/VEISelLowering.cpp +++ b/llvm/lib/Target/VE/VEISelLowering.cpp @@ -1101,10 +1101,10 @@ Instruction *VETargetLowering::emitTrailingFence(IRBuilderBase &Builder, SDValue VETargetLowering::lowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); - AtomicOrdering FenceOrdering = static_cast( - cast(Op.getOperand(1))->getZExtValue()); - SyncScope::ID FenceSSID = static_cast( - cast(Op.getOperand(2))->getZExtValue()); + AtomicOrdering FenceOrdering = + static_cast(Op.getConstantOperandVal(1)); + SyncScope::ID FenceSSID = + static_cast(Op.getConstantOperandVal(2)); // VE uses Release consistency, so need a fence instruction if it is a // cross-thread fence. @@ -1766,7 +1766,7 @@ static SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG, SDValue VETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); - unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); + unsigned IntNo = Op.getConstantOperandVal(0); switch (IntNo) { default: // Don't custom lower most intrinsics. 
return SDValue(); @@ -2937,8 +2937,8 @@ static bool isI32Insn(const SDNode *User, const SDNode *N) { if (User->getOperand(1).getNode() != N && User->getOperand(2).getNode() != N && isa(User->getOperand(3))) { - VECC::CondCode VECCVal = static_cast( - cast(User->getOperand(3))->getZExtValue()); + VECC::CondCode VECCVal = + static_cast(User->getConstantOperandVal(3)); return isIntVECondCode(VECCVal); } [[fallthrough]]; diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 77a997588c4fe..846eab93e1fea 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -5233,7 +5233,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { break; case X86ISD::VPTERNLOG: { - uint8_t Imm = cast(Node->getOperand(3))->getZExtValue(); + uint8_t Imm = Node->getConstantOperandVal(3); if (matchVPTERNLOG(Node, Node, Node, Node, Node->getOperand(0), Node->getOperand(1), Node->getOperand(2), Imm)) return; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a90ddf132c389..1e4b1361f98a6 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -31758,7 +31758,7 @@ static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) { static SDValue LowerPREFETCH(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - unsigned IsData = cast(Op.getOperand(4))->getZExtValue(); + unsigned IsData = Op.getConstantOperandVal(4); // We don't support non-data prefetch without PREFETCHI. // Just preserve the chain. diff --git a/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp index 1288597fc6b01..05003ec304adc 100644 --- a/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp +++ b/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp @@ -250,7 +250,7 @@ bool XCoreDAGToDAGISel::tryBRIND(SDNode *N) { SDValue Addr = N->getOperand(1); if (Addr->getOpcode() != ISD::INTRINSIC_W_CHAIN) return false; - unsigned IntNo = cast(Addr->getOperand(1))->getZExtValue(); + unsigned IntNo = Addr->getConstantOperandVal(1); if (IntNo != Intrinsic::xcore_checkevent) return false; SDValue nextAddr = Addr->getOperand(2); diff --git a/llvm/lib/Target/XCore/XCoreISelLowering.cpp b/llvm/lib/Target/XCore/XCoreISelLowering.cpp index 7736adab19e89..18feeaadb03c8 100644 --- a/llvm/lib/Target/XCore/XCoreISelLowering.cpp +++ b/llvm/lib/Target/XCore/XCoreISelLowering.cpp @@ -767,7 +767,7 @@ SDValue XCoreTargetLowering::LowerFRAMEADDR(SDValue Op, // An index of zero corresponds to the current function's frame address. // An index of one to the parent's frame address, and so on. // Depths > 0 not supported yet! - if (cast(Op.getOperand(0))->getZExtValue() > 0) + if (Op.getConstantOperandVal(0) > 0) return SDValue(); MachineFunction &MF = DAG.getMachineFunction(); @@ -783,7 +783,7 @@ LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const { // An index of zero corresponds to the current function's return address. // An index of one to the parent's return address, and so on. // Depths > 0 not supported yet! 
- if (cast(Op.getOperand(0))->getZExtValue() > 0) + if (Op.getConstantOperandVal(0) > 0) return SDValue(); MachineFunction &MF = DAG.getMachineFunction(); @@ -905,7 +905,7 @@ LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const { SDValue XCoreTargetLowering:: LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); - unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); + unsigned IntNo = Op.getConstantOperandVal(0); switch (IntNo) { case Intrinsic::xcore_crc8: EVT VT = Op.getValueType(); @@ -1497,7 +1497,7 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N, switch (N->getOpcode()) { default: break; case ISD::INTRINSIC_VOID: - switch (cast(N->getOperand(1))->getZExtValue()) { + switch (N->getConstantOperandVal(1)) { case Intrinsic::xcore_outt: case Intrinsic::xcore_outct: case Intrinsic::xcore_chkct: { @@ -1733,30 +1733,30 @@ void XCoreTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, break; case ISD::INTRINSIC_W_CHAIN: { - unsigned IntNo = cast(Op.getOperand(1))->getZExtValue(); - switch (IntNo) { - case Intrinsic::xcore_getts: - // High bits are known to be zero. - Known.Zero = APInt::getHighBitsSet(Known.getBitWidth(), - Known.getBitWidth() - 16); - break; - case Intrinsic::xcore_int: - case Intrinsic::xcore_inct: - // High bits are known to be zero. - Known.Zero = APInt::getHighBitsSet(Known.getBitWidth(), - Known.getBitWidth() - 8); - break; - case Intrinsic::xcore_testct: - // Result is either 0 or 1. - Known.Zero = APInt::getHighBitsSet(Known.getBitWidth(), - Known.getBitWidth() - 1); - break; - case Intrinsic::xcore_testwct: - // Result is in the range 0 - 4. - Known.Zero = APInt::getHighBitsSet(Known.getBitWidth(), - Known.getBitWidth() - 3); - break; - } + unsigned IntNo = Op.getConstantOperandVal(1); + switch (IntNo) { + case Intrinsic::xcore_getts: + // High bits are known to be zero. + Known.Zero = + APInt::getHighBitsSet(Known.getBitWidth(), Known.getBitWidth() - 16); + break; + case Intrinsic::xcore_int: + case Intrinsic::xcore_inct: + // High bits are known to be zero. + Known.Zero = + APInt::getHighBitsSet(Known.getBitWidth(), Known.getBitWidth() - 8); + break; + case Intrinsic::xcore_testct: + // Result is either 0 or 1. + Known.Zero = + APInt::getHighBitsSet(Known.getBitWidth(), Known.getBitWidth() - 1); + break; + case Intrinsic::xcore_testwct: + // Result is in the range 0 - 4. + Known.Zero = + APInt::getHighBitsSet(Known.getBitWidth(), Known.getBitWidth() - 3); + break; + } } break; } From 687c51a3972af17b3f225e692e79fd898a1b6f95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Tue, 2 Jan 2024 12:06:27 +0100 Subject: [PATCH 024/313] [clang][Interp][NFC] Remove unused using alias --- clang/lib/AST/Interp/Interp.h | 1 - 1 file changed, 1 deletion(-) diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index a240d74d63425..828d4ea35526d 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -37,7 +37,6 @@ namespace clang { namespace interp { -using APInt = llvm::APInt; using APSInt = llvm::APSInt; /// Convert a value to an APValue. 
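The SelectionDAG changes earlier in this series (MSP430, Mips, NVPTX, PowerPC, RISC-V, Sparc, SystemZ, VE, X86, XCore) all apply one mechanical rewrite. A minimal sketch of the idiom follows, assuming only the standard declarations in llvm/include/llvm/CodeGen/SelectionDAGNodes.h; the function name and the operand index are illustrative placeholders, not code from any patch in this series.

  #include "llvm/CodeGen/SelectionDAGNodes.h"
  #include <cassert>
  #include <cstdint>

  // getConstantOperandVal(I) wraps the cast and asserts that operand I really
  // is a ConstantSDNode, so the two forms below return the same value; the
  // shorter form is what the patch switches to.
  static uint64_t readIntrinsicId(const llvm::SDNode *N) {
    uint64_t Old =
        llvm::cast<llvm::ConstantSDNode>(N->getOperand(1))->getZExtValue();
    uint64_t New = N->getConstantOperandVal(1);
    assert(Old == New && "the two forms are equivalent");
    return New;
  }

The behavior is unchanged; the accessor only removes the repeated cast boilerplate at each call site.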
From c01e844a7ea7cce4d9477b04d2c9ccaff3606f04 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 2 Jan 2024 13:24:42 +0000 Subject: [PATCH 025/313] [AMDGPU] Update compute program resource registers for GFX12 (#75911) Co-authored-by: Konstantin Zhuravlyov --- llvm/docs/AMDGPUUsage.rst | 59 ++++++++-- .../llvm/Support/AMDHSAKernelDescriptor.h | 44 ++++++-- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 9 +- .../Disassembler/AMDGPUDisassembler.cpp | 52 ++++++--- .../MCTargetDesc/AMDGPUTargetStreamer.cpp | 4 +- llvm/test/MC/AMDGPU/hsa-diag-v4.s | 26 +++-- llvm/test/MC/AMDGPU/hsa-gfx12-v4.s | 2 - .../tools/llvm-objdump/ELF/AMDGPU/kd-gfx12.s | 105 ++++++++++++++++++ 8 files changed, 251 insertions(+), 50 deletions(-) create mode 100644 llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx12.s diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index f0c81bf878f7a..371e583c22a83 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -4406,7 +4406,15 @@ The fields used by CP for code objects before V3 also match those specified in ``COMPUTE_PGM_RSRC3`` configuration register. See - :ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx10-gfx12-table`. + :ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx10-gfx11-table`. + GFX12 + Compute Shader (CS) + program settings used by + CP to set up + ``COMPUTE_PGM_RSRC3`` + configuration + register. See + :ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx12-table`. 415:384 4 bytes COMPUTE_PGM_RSRC1 Compute Shader (CS) program settings used by CP to set up @@ -4831,13 +4839,16 @@ The fields used by CP for code objects before V3 also match those specified in Used by CP to set up ``COMPUTE_PGM_RSRC2.USER_SGPR``. - 6 1 bit ENABLE_TRAP_HANDLER Must be 0. + 6 1 bit ENABLE_TRAP_HANDLER GFX6-GFX11 + Must be 0. - This bit represents - ``COMPUTE_PGM_RSRC2.TRAP_PRESENT``, - which is set by the CP if - the runtime has installed a - trap handler. + This bit represents + ``COMPUTE_PGM_RSRC2.TRAP_PRESENT``, + which is set by the CP if + the runtime has installed a + trap handler. + GFX12 + Reserved, must be 0. 7 1 bit ENABLE_SGPR_WORKGROUP_ID_X Enable the setup of the system SGPR register for the work-group id in the X @@ -4957,7 +4968,7 @@ The fields used by CP for code objects before V3 also match those specified in 30 1 bit ENABLE_EXCEPTION_INT_DIVIDE_BY Integer Division by Zero _ZERO (rcp_iflag_f32 instruction only) - 31 1 bit Reserved, must be 0. + 31 1 bit RESERVED Reserved, must be 0. 32 **Total size 4 bytes.** ======= =================================================================================================================== @@ -4986,8 +4997,8 @@ The fields used by CP for code objects before V3 also match those specified in .. - .. table:: compute_pgm_rsrc3 for GFX10-GFX12 - :name: amdgpu-amdhsa-compute_pgm_rsrc3-gfx10-gfx12-table + .. table:: compute_pgm_rsrc3 for GFX10-GFX11 + :name: amdgpu-amdhsa-compute_pgm_rsrc3-gfx10-gfx11-table ======= ======= =============================== =========================================================================== Bits Size Field Name Description @@ -5036,6 +5047,32 @@ The fields used by CP for code objects before V3 also match those specified in 32 **Total size 4 bytes.** ======= =================================================================================================================== +.. + + .. 
table:: compute_pgm_rsrc3 for GFX12 + :name: amdgpu-amdhsa-compute_pgm_rsrc3-gfx12-table + + ======= ======= =============================== =========================================================================== + Bits Size Field Name Description + ======= ======= =============================== =========================================================================== + 3:0 4 bits RESERVED Reserved, must be 0. + 11:4 8 bits INST_PREF_SIZE Number of instruction bytes to prefetch, starting at the kernel's entry + point instruction, before wavefront starts execution. The value is 0..255 + with a granularity of 128 bytes. + 12 1 bit RESERVED Reserved, must be 0. + 13 1 bit GLG_EN If 1, group launch guarantee will be enabled for this dispatch + 30:14 17 bits RESERVED Reserved, must be 0. + 31 1 bit IMAGE_OP If 1, the kernel execution contains image instructions. If executed as + part of a graphics pipeline, image read instructions will stall waiting + for any necessary ``WAIT_SYNC`` fence to be performed in order to + indicate that earlier pipeline stages have completed writing to the + image. + + Not used for compute kernels that are not part of a graphics pipeline and + must be 0. + 32 **Total size 4 bytes.** + ======= =================================================================================================================== + .. .. table:: Floating Point Rounding Mode Enumeration Values @@ -15508,7 +15545,7 @@ terminated by an ``.end_amdhsa_kernel`` directive. ``.amdhsa_forward_progress`` 0 GFX10-GFX12 Controls FWD_PROGRESS in :ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx12-table`. ``.amdhsa_shared_vgpr_count`` 0 GFX10-GFX11 Controls SHARED_VGPR_COUNT in - :ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx10-gfx12-table`. + :ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx10-gfx11-table`. ``.amdhsa_exception_fp_ieee_invalid_op`` 0 GFX6-GFX12 Controls ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION in :ref:`amdgpu-amdhsa-compute_pgm_rsrc2-gfx6-gfx12-table`. ``.amdhsa_exception_fp_denorm_src`` 0 GFX6-GFX12 Controls ENABLE_EXCEPTION_FP_DENORMAL_SOURCE in diff --git a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h index 2de2cf4185d86..84cac3ef700e0 100644 --- a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h +++ b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h @@ -127,12 +127,20 @@ enum : int32_t { #undef COMPUTE_PGM_RSRC1 // Compute program resource register 2. Must match hardware definition. +// GFX6+. #define COMPUTE_PGM_RSRC2(NAME, SHIFT, WIDTH) \ AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC2_ ## NAME, SHIFT, WIDTH) +// [GFX6-GFX11]. +#define COMPUTE_PGM_RSRC2_GFX6_GFX11(NAME, SHIFT, WIDTH) \ + AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC2_GFX6_GFX11_##NAME, SHIFT, WIDTH) +// GFX12+. +#define COMPUTE_PGM_RSRC2_GFX12_PLUS(NAME, SHIFT, WIDTH) \ + AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC2_GFX12_PLUS_##NAME, SHIFT, WIDTH) enum : int32_t { COMPUTE_PGM_RSRC2(ENABLE_PRIVATE_SEGMENT, 0, 1), COMPUTE_PGM_RSRC2(USER_SGPR_COUNT, 1, 5), - COMPUTE_PGM_RSRC2(ENABLE_TRAP_HANDLER, 6, 1), + COMPUTE_PGM_RSRC2_GFX6_GFX11(ENABLE_TRAP_HANDLER, 6, 1), + COMPUTE_PGM_RSRC2_GFX12_PLUS(RESERVED1, 6, 1), COMPUTE_PGM_RSRC2(ENABLE_SGPR_WORKGROUP_ID_X, 7, 1), COMPUTE_PGM_RSRC2(ENABLE_SGPR_WORKGROUP_ID_Y, 8, 1), COMPUTE_PGM_RSRC2(ENABLE_SGPR_WORKGROUP_ID_Z, 9, 1), @@ -166,23 +174,37 @@ enum : int32_t { // Compute program resource register 3 for GFX10+. Must match hardware // definition. -// [GFX10]. 
-#define COMPUTE_PGM_RSRC3_GFX10(NAME, SHIFT, WIDTH) \ - AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_ ## NAME, SHIFT, WIDTH) // GFX10+. #define COMPUTE_PGM_RSRC3_GFX10_PLUS(NAME, SHIFT, WIDTH) \ AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_PLUS_ ## NAME, SHIFT, WIDTH) +// [GFX10]. +#define COMPUTE_PGM_RSRC3_GFX10(NAME, SHIFT, WIDTH) \ + AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_##NAME, SHIFT, WIDTH) +// [GFX10-GFX11]. +#define COMPUTE_PGM_RSRC3_GFX10_GFX11(NAME, SHIFT, WIDTH) \ + AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_GFX11_##NAME, SHIFT, WIDTH) // GFX11+. #define COMPUTE_PGM_RSRC3_GFX11_PLUS(NAME, SHIFT, WIDTH) \ AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX11_PLUS_ ## NAME, SHIFT, WIDTH) +// [GFX11]. +#define COMPUTE_PGM_RSRC3_GFX11(NAME, SHIFT, WIDTH) \ + AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX11_##NAME, SHIFT, WIDTH) +// GFX12+. +#define COMPUTE_PGM_RSRC3_GFX12_PLUS(NAME, SHIFT, WIDTH) \ + AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX12_PLUS_##NAME, SHIFT, WIDTH) enum : int32_t { - COMPUTE_PGM_RSRC3_GFX10_PLUS(SHARED_VGPR_COUNT, 0, 4), - COMPUTE_PGM_RSRC3_GFX10(RESERVED0, 4, 8), - COMPUTE_PGM_RSRC3_GFX11_PLUS(INST_PREF_SIZE, 4, 6), - COMPUTE_PGM_RSRC3_GFX11_PLUS(TRAP_ON_START, 10, 1), - COMPUTE_PGM_RSRC3_GFX11_PLUS(TRAP_ON_END, 11, 1), - COMPUTE_PGM_RSRC3_GFX10_PLUS(RESERVED1, 12, 19), - COMPUTE_PGM_RSRC3_GFX10(RESERVED2, 31, 1), + COMPUTE_PGM_RSRC3_GFX10_GFX11(SHARED_VGPR_COUNT, 0, 4), + COMPUTE_PGM_RSRC3_GFX12_PLUS(RESERVED0, 0, 4), + COMPUTE_PGM_RSRC3_GFX10(RESERVED1, 4, 8), + COMPUTE_PGM_RSRC3_GFX11(INST_PREF_SIZE, 4, 6), + COMPUTE_PGM_RSRC3_GFX11(TRAP_ON_START, 10, 1), + COMPUTE_PGM_RSRC3_GFX11(TRAP_ON_END, 11, 1), + COMPUTE_PGM_RSRC3_GFX12_PLUS(INST_PREF_SIZE, 4, 8), + COMPUTE_PGM_RSRC3_GFX10_PLUS(RESERVED2, 12, 1), + COMPUTE_PGM_RSRC3_GFX10_GFX11(RESERVED3, 13, 1), + COMPUTE_PGM_RSRC3_GFX12_PLUS(GLG_EN, 13, 1), + COMPUTE_PGM_RSRC3_GFX10_PLUS(RESERVED4, 14, 17), + COMPUTE_PGM_RSRC3_GFX10(RESERVED5, 31, 1), COMPUTE_PGM_RSRC3_GFX11_PLUS(IMAGE_OP, 31, 1), }; #undef COMPUTE_PGM_RSRC3_GFX10_PLUS diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 3b69a37728ea1..abd7e911beef3 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -5416,11 +5416,12 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS, Val, ValRange); } else if (ID == ".amdhsa_shared_vgpr_count") { - if (IVersion.Major < 10) - return Error(IDRange.Start, "directive requires gfx10+", IDRange); + if (IVersion.Major < 10 || IVersion.Major >= 12) + return Error(IDRange.Start, "directive requires gfx10 or gfx11", + IDRange); SharedVGPRCount = Val; PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3, - COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT, Val, + COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT, Val, ValRange); } else if (ID == ".amdhsa_exception_fp_ieee_invalid_op") { PARSE_BITS_ENTRY( @@ -5522,7 +5523,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { (AccumOffset / 4 - 1)); } - if (IVersion.Major >= 10) { + if (IVersion.Major >= 10 && IVersion.Major < 12) { // SharedVGPRCount < 16 checked by PARSE_ENTRY_BITS if (SharedVGPRCount && EnableWavefrontSize32 && *EnableWavefrontSize32) { return TokError("shared_vgpr_count directive not valid on " diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp 
index 6017634a73d1a..67be7b0fd6421 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -1999,34 +1999,60 @@ MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC3( if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX90A_RESERVED1) return MCDisassembler::Fail; } else if (isGFX10Plus()) { - if (!EnableWavefrontSize32 || !*EnableWavefrontSize32) { - PRINT_DIRECTIVE(".amdhsa_shared_vgpr_count", - COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT); + // Bits [0-3]. + if (!isGFX12Plus()) { + if (!EnableWavefrontSize32 || !*EnableWavefrontSize32) { + PRINT_DIRECTIVE(".amdhsa_shared_vgpr_count", + COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT); + } else { + PRINT_PSEUDO_DIRECTIVE_COMMENT( + "SHARED_VGPR_COUNT", + COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT); + } } else { - PRINT_PSEUDO_DIRECTIVE_COMMENT( - "SHARED_VGPR_COUNT", COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT); + if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX12_PLUS_RESERVED0) + return MCDisassembler::Fail; } - if (isGFX11Plus()) { + // Bits [4-11]. + if (isGFX11()) { PRINT_PSEUDO_DIRECTIVE_COMMENT("INST_PREF_SIZE", - COMPUTE_PGM_RSRC3_GFX11_PLUS_INST_PREF_SIZE); + COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE); PRINT_PSEUDO_DIRECTIVE_COMMENT("TRAP_ON_START", - COMPUTE_PGM_RSRC3_GFX11_PLUS_TRAP_ON_START); + COMPUTE_PGM_RSRC3_GFX11_TRAP_ON_START); PRINT_PSEUDO_DIRECTIVE_COMMENT("TRAP_ON_END", - COMPUTE_PGM_RSRC3_GFX11_PLUS_TRAP_ON_END); + COMPUTE_PGM_RSRC3_GFX11_TRAP_ON_END); + } else if (isGFX12Plus()) { + PRINT_PSEUDO_DIRECTIVE_COMMENT( + "INST_PREF_SIZE", COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE); + } else { + if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_RESERVED1) + return MCDisassembler::Fail; + } + + // Bits [12]. + if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED2) + return MCDisassembler::Fail; + + // Bits [13]. + if (isGFX12Plus()) { + PRINT_PSEUDO_DIRECTIVE_COMMENT("GLG_EN", + COMPUTE_PGM_RSRC3_GFX12_PLUS_GLG_EN); } else { - if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_RESERVED0) + if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_GFX11_RESERVED3) return MCDisassembler::Fail; } - if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED1) + // Bits [14-30]. + if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED4) return MCDisassembler::Fail; + // Bits [31]. 
if (isGFX11Plus()) { PRINT_PSEUDO_DIRECTIVE_COMMENT("IMAGE_OP", - COMPUTE_PGM_RSRC3_GFX11_PLUS_TRAP_ON_START); + COMPUTE_PGM_RSRC3_GFX11_PLUS_IMAGE_OP); } else { - if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_RESERVED2) + if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_RESERVED5) return MCDisassembler::Fail; } } else if (FourByteBuffer) { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index a855cf585205b..e135a4e25dd15 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -475,8 +475,10 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( PRINT_FIELD(OS, ".amdhsa_forward_progress", KD, compute_pgm_rsrc1, amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS); + } + if (IVersion.Major >= 10 && IVersion.Major < 12) { PRINT_FIELD(OS, ".amdhsa_shared_vgpr_count", KD, compute_pgm_rsrc3, - amdhsa::COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT); + amdhsa::COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT); } if (IVersion.Major >= 12) PRINT_FIELD(OS, ".amdhsa_round_robin_scheduling", KD, compute_pgm_rsrc1, diff --git a/llvm/test/MC/AMDGPU/hsa-diag-v4.s b/llvm/test/MC/AMDGPU/hsa-diag-v4.s index f7a554aedb746..069b71b7229cd 100644 --- a/llvm/test/MC/AMDGPU/hsa-diag-v4.s +++ b/llvm/test/MC/AMDGPU/hsa-diag-v4.s @@ -1,6 +1,7 @@ // RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx810 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,GFX8,PREGFX10,AMDHSA // RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,GFX10PLUS,GFX10,AMDHSA // RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx1100 -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,GFX10PLUS,GFX11,AMDHSA +// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx1200 -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,GFX10PLUS,GFX12,AMDHSA // RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd- -mcpu=gfx810 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,NONAMDHSA // RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GFX90A,PREGFX10,AMDHSA,ALL @@ -10,6 +11,7 @@ // GFX8-NOT: error: // GFX10: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-amdhsa--gfx1010:xnack+ // GFX11: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-amdhsa--gfx1100 +// GFX12: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-amdhsa--gfx1200 // NONAMDHSA: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-unknown--gfx810 .warning "test_target" .amdgcn_target "amdgcn-amd-amdhsa--gfx810:xnack+" @@ -228,8 +230,10 @@ .end_amdhsa_kernel // GCN-LABEL: warning: test_amdhsa_shared_vgpr_count_invalid1 -// PREGFX10: error: directive requires gfx10+ -// GFX10PLUS: error: .amdhsa_next_free_vgpr directive is required +// PREGFX10: error: directive requires gfx10 or 
gfx11 +// GFX10: error: .amdhsa_next_free_vgpr directive is required +// GFX11: error: .amdhsa_next_free_vgpr directive is required +// GFX12: error: directive requires gfx10 or gfx11 // NONAMDHSA: error: unknown directive .warning "test_amdhsa_shared_vgpr_count_invalid1" .amdhsa_kernel test_amdhsa_shared_vgpr_count_invalid1 @@ -237,8 +241,10 @@ .end_amdhsa_kernel // GCN-LABEL: warning: test_amdhsa_shared_vgpr_count_invalid2 -// PREGFX10: error: directive requires gfx10+ -// GFX10PLUS: error: shared_vgpr_count directive not valid on wavefront size 32 +// PREGFX10: error: directive requires gfx10 or gfx11 +// GFX10: error: shared_vgpr_count directive not valid on wavefront size 32 +// GFX11: error: shared_vgpr_count directive not valid on wavefront size 32 +// GFX12: error: directive requires gfx10 or gfx11 // NONAMDHSA: error: unknown directive .warning "test_amdhsa_shared_vgpr_count_invalid2" .amdhsa_kernel test_amdhsa_shared_vgpr_count_invalid2 @@ -249,8 +255,10 @@ .end_amdhsa_kernel // GCN-LABEL: warning: test_amdhsa_shared_vgpr_count_invalid3 -// PREGFX10: error: directive requires gfx10+ -// GFX10PLUS: error: value out of range +// PREGFX10: error: directive requires gfx10 or gfx11 +// GFX10: error: value out of range +// GFX11: error: value out of range +// GFX12: error: directive requires gfx10 or gfx11 // NONAMDHSA: error: unknown directive .warning "test_amdhsa_shared_vgpr_count_invalid3" .amdhsa_kernel test_amdhsa_shared_vgpr_count_invalid3 @@ -260,8 +268,10 @@ .end_amdhsa_kernel // GCN-LABEL: warning: test_amdhsa_shared_vgpr_count_invalid4 -// PREGFX10: error: directive requires gfx10+ -// GFX10PLUS: error: shared_vgpr_count*2 + compute_pgm_rsrc1.GRANULATED_WORKITEM_VGPR_COUNT cannot exceed 63 +// PREGFX10: error: directive requires gfx10 or gfx11 +// GFX10: error: shared_vgpr_count*2 + compute_pgm_rsrc1.GRANULATED_WORKITEM_VGPR_COUNT cannot exceed 63 +// GFX11: error: shared_vgpr_count*2 + compute_pgm_rsrc1.GRANULATED_WORKITEM_VGPR_COUNT cannot exceed 63 +// GFX12: error: directive requires gfx10 or gfx11 // NONAMDHSA: error: unknown directive .warning "test_amdhsa_shared_vgpr_count_invalid4" .amdhsa_kernel test_amdhsa_shared_vgpr_count_invalid4 diff --git a/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s b/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s index efbcec21f586b..186d98f78b986 100644 --- a/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s +++ b/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s @@ -118,7 +118,6 @@ disabled_user_sgpr: .amdhsa_workgroup_processor_mode 1 .amdhsa_memory_ordered 1 .amdhsa_forward_progress 1 - .amdhsa_shared_vgpr_count 0 .amdhsa_round_robin_scheduling 1 .amdhsa_exception_fp_ieee_invalid_op 1 .amdhsa_exception_fp_denorm_src 1 @@ -157,7 +156,6 @@ disabled_user_sgpr: // ASM-NEXT: .amdhsa_workgroup_processor_mode 1 // ASM-NEXT: .amdhsa_memory_ordered 1 // ASM-NEXT: .amdhsa_forward_progress 1 -// ASM-NEXT: .amdhsa_shared_vgpr_count 0 // ASM-NEXT: .amdhsa_round_robin_scheduling 1 // ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1 // ASM-NEXT: .amdhsa_exception_fp_denorm_src 1 diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx12.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx12.s new file mode 100644 index 0000000000000..e1d312d6035cb --- /dev/null +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx12.s @@ -0,0 +1,105 @@ +;; Test disassembly for gfx12 kernel descriptor. 
+ +; RUN: rm -rf %t && split-file %s %t && cd %t + +;--- 1.s +; RUN: llvm-mc --triple=amdgcn-amd-amdhsa -mattr=+wavefrontsize32,-wavefrontsize64 -filetype=obj -mcpu=gfx1200 < 1.s > 1.o +; RUN: llvm-objdump --disassemble-symbols=kernel.kd 1.o | tail -n +7 | tee 1-disasm.s | FileCheck 1.s +; RUN: llvm-mc --triple=amdgcn-amd-amdhsa -mattr=+wavefrontsize32,-wavefrontsize64 -filetype=obj -mcpu=gfx1200 < 1-disasm.s > 1-disasm.o +; RUN: cmp 1.o 1-disasm.o +; CHECK: .amdhsa_kernel kernel +; CHECK-NEXT: .amdhsa_group_segment_fixed_size 0 +; CHECK-NEXT: .amdhsa_private_segment_fixed_size 0 +; CHECK-NEXT: .amdhsa_kernarg_size 0 +; CHECK-NEXT: ; INST_PREF_SIZE 0 +; CHECK-NEXT: ; GLG_EN 0 +; CHECK-NEXT: ; IMAGE_OP 0 +; CHECK-NEXT: .amdhsa_next_free_vgpr 32 +; CHECK-NEXT: .amdhsa_reserve_vcc 0 +; CHECK-NEXT: .amdhsa_reserve_xnack_mask 0 +; CHECK-NEXT: .amdhsa_next_free_sgpr 8 +; CHECK-NEXT: .amdhsa_float_round_mode_32 0 +; CHECK-NEXT: .amdhsa_float_round_mode_16_64 0 +; CHECK-NEXT: .amdhsa_float_denorm_mode_32 0 +; CHECK-NEXT: .amdhsa_float_denorm_mode_16_64 3 +; CHECK-NEXT: .amdhsa_fp16_overflow 0 +; CHECK-NEXT: .amdhsa_workgroup_processor_mode 1 +; CHECK-NEXT: .amdhsa_memory_ordered 1 +; CHECK-NEXT: .amdhsa_forward_progress 0 +; CHECK-NEXT: .amdhsa_round_robin_scheduling 0 +; CHECK-NEXT: .amdhsa_enable_private_segment 0 +; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 +; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 +; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 +; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_info 0 +; CHECK-NEXT: .amdhsa_system_vgpr_workitem_id 0 +; CHECK-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0 +; CHECK-NEXT: .amdhsa_exception_fp_denorm_src 0 +; CHECK-NEXT: .amdhsa_exception_fp_ieee_div_zero 0 +; CHECK-NEXT: .amdhsa_exception_fp_ieee_overflow 0 +; CHECK-NEXT: .amdhsa_exception_fp_ieee_underflow 0 +; CHECK-NEXT: .amdhsa_exception_fp_ieee_inexact 0 +; CHECK-NEXT: .amdhsa_exception_int_div_zero 0 +; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +; CHECK-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; CHECK-NEXT: .amdhsa_wavefront_size32 1 +; CHECK-NEXT: .end_amdhsa_kernel +.amdhsa_kernel kernel + .amdhsa_next_free_vgpr 32 + .amdhsa_next_free_sgpr 32 + .amdhsa_wavefront_size32 1 +.end_amdhsa_kernel + +;--- 2.s +; RUN: llvm-mc --triple=amdgcn-amd-amdhsa -mattr=-wavefrontsize32,+wavefrontsize64 -filetype=obj -mcpu=gfx1200 < 2.s > 2.o +; RUN: llvm-objdump --disassemble-symbols=kernel.kd 2.o | tail -n +7 | tee 2-disasm.s | FileCheck 2.s +; RUN: llvm-mc --triple=amdgcn-amd-amdhsa -mattr=-wavefrontsize32,+wavefrontsize64 -filetype=obj -mcpu=gfx1200 < 2-disasm.s > 2-disasm.o +; RUN: cmp 2.o 2-disasm.o +; CHECK: .amdhsa_kernel kernel +; CHECK-NEXT: .amdhsa_group_segment_fixed_size 0 +; CHECK-NEXT: .amdhsa_private_segment_fixed_size 0 +; CHECK-NEXT: .amdhsa_kernarg_size 0 +; CHECK-NEXT: ; INST_PREF_SIZE 0 +; CHECK-NEXT: ; GLG_EN 0 +; CHECK-NEXT: ; IMAGE_OP 0 +; CHECK-NEXT: .amdhsa_next_free_vgpr 32 +; CHECK-NEXT: .amdhsa_reserve_vcc 0 +; CHECK-NEXT: .amdhsa_reserve_xnack_mask 0 +; CHECK-NEXT: .amdhsa_next_free_sgpr 8 +; CHECK-NEXT: .amdhsa_float_round_mode_32 0 +; CHECK-NEXT: .amdhsa_float_round_mode_16_64 0 +; CHECK-NEXT: .amdhsa_float_denorm_mode_32 0 +; CHECK-NEXT: .amdhsa_float_denorm_mode_16_64 3 +; CHECK-NEXT: .amdhsa_fp16_overflow 0 +; CHECK-NEXT: .amdhsa_workgroup_processor_mode 1 +; CHECK-NEXT: .amdhsa_memory_ordered 1 +; 
CHECK-NEXT: .amdhsa_forward_progress 0 +; CHECK-NEXT: .amdhsa_round_robin_scheduling 0 +; CHECK-NEXT: .amdhsa_enable_private_segment 0 +; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 +; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 +; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 +; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_info 0 +; CHECK-NEXT: .amdhsa_system_vgpr_workitem_id 0 +; CHECK-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0 +; CHECK-NEXT: .amdhsa_exception_fp_denorm_src 0 +; CHECK-NEXT: .amdhsa_exception_fp_ieee_div_zero 0 +; CHECK-NEXT: .amdhsa_exception_fp_ieee_overflow 0 +; CHECK-NEXT: .amdhsa_exception_fp_ieee_underflow 0 +; CHECK-NEXT: .amdhsa_exception_fp_ieee_inexact 0 +; CHECK-NEXT: .amdhsa_exception_int_div_zero 0 +; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +; CHECK-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; CHECK-NEXT: .amdhsa_wavefront_size32 0 +; CHECK-NEXT: .end_amdhsa_kernel +.amdhsa_kernel kernel + .amdhsa_next_free_vgpr 32 + .amdhsa_next_free_sgpr 32 + .amdhsa_wavefront_size32 0 +.end_amdhsa_kernel From 534034737a652a7f59ede2ac3553bff4ad97594f Mon Sep 17 00:00:00 2001 From: Tobias Gysi Date: Tue, 2 Jan 2024 14:27:10 +0100 Subject: [PATCH 026/313] [mlir][llvm] Import call site calling conventions (#76391) This revision adds support for importing call site calling conventions. Additionally, the revision also adds a roundtrip test for an indirect call with a non-standard calling convention. --- mlir/lib/Target/LLVMIR/ModuleImport.cpp | 2 ++ mlir/test/Dialect/LLVMIR/func.mlir | 6 ++-- .../LLVMIR/Import/calling-convention.ll | 32 +++++++++++++++++++ 3 files changed, 38 insertions(+), 2 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/Import/calling-convention.ll diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp index 905405e939882..528f113ea6f7f 100644 --- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp @@ -1439,6 +1439,7 @@ LogicalResult ModuleImport::convertInstruction(llvm::Instruction *inst) { } else { callOp = builder.create(loc, funcTy, operands); } + callOp.setCConv(convertCConvFromLLVM(callInst->getCallingConv())); setFastmathFlagsAttr(inst, callOp); if (!callInst->getType()->isVoidTy()) mapValue(inst, callOp.getResult()); @@ -1516,6 +1517,7 @@ LogicalResult ModuleImport::convertInstruction(llvm::Instruction *inst) { loc, funcTy, /*callee=*/nullptr, operands, directNormalDest, ValueRange(), lookupBlock(invokeInst->getUnwindDest()), unwindArgs); } + invokeOp.setCConv(convertCConvFromLLVM(invokeInst->getCallingConv())); if (!invokeInst->getType()->isVoidTy()) mapValue(inst, invokeOp.getResults().front()); else diff --git a/mlir/test/Dialect/LLVMIR/func.mlir b/mlir/test/Dialect/LLVMIR/func.mlir index d09df07676122..9dc1bc57034e0 100644 --- a/mlir/test/Dialect/LLVMIR/func.mlir +++ b/mlir/test/Dialect/LLVMIR/func.mlir @@ -191,14 +191,16 @@ module { // CHECK: llvm.func @test_ccs llvm.func @test_ccs() { + // CHECK-NEXT: %[[PTR:.*]] = llvm.mlir.addressof @cconv4 : !llvm.ptr + %ptr = llvm.mlir.addressof @cconv4 : !llvm.ptr // CHECK-NEXT: llvm.call @cconv1() : () -> () // CHECK-NEXT: llvm.call @cconv2() : () -> () // CHECK-NEXT: llvm.call fastcc @cconv3() : () -> () - // CHECK-NEXT: llvm.call cc_10 @cconv4() : () -> () + // CHECK-NEXT: llvm.call cc_10 %[[PTR]]() : !llvm.ptr, () -> () llvm.call @cconv1() : () -> () 
llvm.call ccc @cconv2() : () -> () llvm.call fastcc @cconv3() : () -> () - llvm.call cc_10 @cconv4() : () -> () + llvm.call cc_10 %ptr() : !llvm.ptr, () -> () llvm.return } diff --git a/mlir/test/Target/LLVMIR/Import/calling-convention.ll b/mlir/test/Target/LLVMIR/Import/calling-convention.ll new file mode 100644 index 0000000000000..bf5dc6a694f2a --- /dev/null +++ b/mlir/test/Target/LLVMIR/Import/calling-convention.ll @@ -0,0 +1,32 @@ +; RUN: mlir-translate -import-llvm -split-input-file %s | FileCheck %s + +; CHECK: llvm.func fastcc @cconv_fastcc() +declare fastcc void @cconv_fastcc() +; CHECK: llvm.func @cconv_ccc() +declare ccc void @cconv_ccc() + +; CHECK-LABEL: @call_cconv +define void @call_cconv() { + ; CHECK: llvm.call fastcc @cconv_fastcc() + call fastcc void @cconv_fastcc() + ; CHECK: llvm.call @cconv_ccc() + call ccc void @cconv_ccc() + ret void +} + +; // ----- + +; CHECK: llvm.func fastcc @cconv_fastcc() +declare fastcc void @cconv_fastcc() +declare i32 @__gxx_personality_v0(...) + +; CHECK-LABEL: @invoke_cconv +define i32 @invoke_cconv() personality ptr @__gxx_personality_v0 { + ; CHECK: llvm.invoke fastcc @cconv_fastcc() to ^bb2 unwind ^bb1 : () -> () + invoke fastcc void @cconv_fastcc() to label %bb2 unwind label %bb1 +bb1: + %1 = landingpad { ptr, i32 } cleanup + br label %bb2 +bb2: + ret i32 1 +} From 9943d33997e6bb85ad054c18ce44d037040d8565 Mon Sep 17 00:00:00 2001 From: Enna1 Date: Tue, 2 Jan 2024 21:32:18 +0800 Subject: [PATCH 027/313] [SLP][NFC] Fix assertion in vectorizeGEPIndices() (#76660) The index constraints for the collected getelementptr instructions should be single **and** non-constant. --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 6359dc65d2263..ced081d429164 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -16224,7 +16224,7 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) { for (auto *V : Candidates) { auto *GEP = cast(V); auto *GEPIdx = GEP->idx_begin()->get(); - assert(GEP->getNumIndices() == 1 || !isa(GEPIdx)); + assert(GEP->getNumIndices() == 1 && !isa(GEPIdx)); Bundle[BundleIndex++] = GEPIdx; } From 0b3d1a0b1bea12846c34adfdd19c8d7f930620ea Mon Sep 17 00:00:00 2001 From: Alex Bradbury Date: Tue, 2 Jan 2024 13:35:54 +0000 Subject: [PATCH 028/313] [RISCV][test] Add tests for RISCVInstrInfo::describeLoadedValue (#76041) Tests are in preparation for adding handling of the load of a constant value as Mips does (noted in ). I've opted to implement these tests as a C++ unit test as on balance I _think_ it's easier to follow and maintain than .mir tests trying to indirectly test this function. That said, you see the limitations with the test of describeLoadedValue on a memory operation where we'd rather pass `MachinePointerInfo::getFixedStack` but can't because we'd need to then ensure the necessary stack metadata for the function is present. 
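For readers skimming the series, the expectations the new unit test encodes
are roughly the following (a recap of the test added below rather than new
behaviour; the exact DIExpression spellings are in the diff):

  addi x1, x2, 0     -> described as x2 with an empty DIExpression (register move)
  addi x3, x0, 111   -> described as x0 plus DW_OP_plus_uconst 111 (a TODO notes
                        this could become DW_OP_constu once immediate loads are
                        recognised as such)
  addi x2, x3, 222   -> described as x3 plus DW_OP_plus_uconst 222
  lb   x1, -128(x2)  -> described as a 1-byte dereference of x2 - 128
                        (DW_OP_constu 128, DW_OP_minus, DW_OP_deref_size 1)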
--- .../Target/RISCV/RISCVInstrInfoTest.cpp | 78 +++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp b/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp index 5ef86bb3f7b46..5836239bc56fd 100644 --- a/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp +++ b/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp @@ -10,6 +10,7 @@ #include "RISCVSubtarget.h" #include "RISCVTargetMachine.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/TargetSelect.h" #include "llvm/Target/TargetLoweringObjectFile.h" @@ -174,6 +175,83 @@ TEST_P(RISCVInstrInfoTest, GetMemOperandsWithOffsetWidth) { EXPECT_EQ(Width, 4u); } +static void expectDIEPrintResult(const DIExpression *Expr, StringRef Expected) { + std::string Output; + raw_string_ostream OS(Output); + Expr->print(OS); + OS.flush(); + EXPECT_EQ(OS.str(), Expected); +} + +TEST_P(RISCVInstrInfoTest, DescribeLoadedValue) { + const RISCVInstrInfo *TII = ST->getInstrInfo(); + DebugLoc DL; + + MachineBasicBlock *MBB = MF->CreateMachineBasicBlock(); + MF->getProperties().set(MachineFunctionProperties::Property::NoVRegs); + + // Register move. + auto *MI1 = BuildMI(*MBB, MBB->begin(), DL, TII->get(RISCV::ADDI), RISCV::X1) + .addReg(RISCV::X2) + .addImm(0) + .getInstr(); + EXPECT_FALSE(TII->describeLoadedValue(*MI1, RISCV::X2).has_value()); + std::optional MI1Res = + TII->describeLoadedValue(*MI1, RISCV::X1); + ASSERT_TRUE(MI1Res.has_value()); + ASSERT_TRUE(MI1Res->first.isReg()); + EXPECT_EQ(MI1Res->first.getReg(), RISCV::X2); + expectDIEPrintResult(MI1Res->second, "!DIExpression()"); + + // Load immediate. + auto *MI2 = BuildMI(*MBB, MBB->begin(), DL, TII->get(RISCV::ADDI), RISCV::X3) + .addReg(RISCV::X0) + .addImm(111) + .getInstr(); + std::optional MI2Res = + TII->describeLoadedValue(*MI2, RISCV::X3); + ASSERT_TRUE(MI2Res.has_value()); + ASSERT_TRUE(MI2Res->first.isReg()); + EXPECT_EQ(MI2Res->first.getReg(), RISCV::X0); + // TODO: Could be a DW_OP_constu if this is recognised as a immediate load + // rather than just an addi. + expectDIEPrintResult(MI2Res->second, "!DIExpression(DW_OP_plus_uconst, 111)"); + + // Add immediate. + auto *MI3 = BuildMI(*MBB, MBB->begin(), DL, TII->get(RISCV::ADDI), RISCV::X2) + .addReg(RISCV::X3) + .addImm(222) + .getInstr(); + std::optional MI3Res = + TII->describeLoadedValue(*MI3, RISCV::X2); + ASSERT_TRUE(MI3Res.has_value()); + ASSERT_TRUE(MI3Res->first.isReg()); + EXPECT_EQ(MI3Res->first.getReg(), RISCV::X3); + expectDIEPrintResult(MI3Res->second, "!DIExpression(DW_OP_plus_uconst, 222)"); + + // Load value from memory. + // It would be better (more reflective of real-world describeLoadedValue + // usage) to test using MachinePointerInfo::getFixedStack, but + // unfortunately it would be overly fiddly to make this work. 
+ auto MMO = MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), + MachineMemOperand::MOLoad, 1, Align(1)); + auto *MI4 = BuildMI(*MBB, MBB->begin(), DL, TII->get(RISCV::LB), RISCV::X1) + .addReg(RISCV::X2) + .addImm(-128) + .addMemOperand(MMO) + .getInstr(); + std::optional MI4Res = + TII->describeLoadedValue(*MI4, RISCV::X1); + ASSERT_TRUE(MI4Res.has_value()); + ASSERT_TRUE(MI4Res->first.isReg()); + EXPECT_EQ(MI4Res->first.getReg(), RISCV::X2); + expectDIEPrintResult( + MI4Res->second, + "!DIExpression(DW_OP_constu, 128, DW_OP_minus, DW_OP_deref_size, 1)"); + + MF->deleteMachineBasicBlock(MBB); +} + } // namespace INSTANTIATE_TEST_SUITE_P(RV32And64, RISCVInstrInfoTest, From 91e8700bd6adf4587dcc1b3e2c43940f81220da1 Mon Sep 17 00:00:00 2001 From: Piyou Chen Date: Tue, 2 Jan 2024 21:39:08 +0800 Subject: [PATCH 029/313] [RISCV] Add overlapping constraints flag to RVV pseudo (#76489) This patch update some missing overlapping constraints flag in following pseudo: - VPseudoUnaryMaskRoundingMode - VPseudoTiedBinaryCarryIn - VPseudoTiedBinaryV_VM - VPseudoTiedBinaryV_XM - PseudoVEXT_VF2|4|8 - VPseudoConversionRoundingMode - VPseudoUnaryNoMask_FRM - VPseudoUnaryMask_FRM - VPseudoConversionRM - VPseudoVNCVTI_RM_W --- .../Target/RISCV/RISCVInstrInfoVPseudos.td | 58 ++++++++++++------- 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index be4bc3b58766e..30deeaa064486 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -1069,7 +1069,8 @@ class VPseudoUnaryMask : + string Constraint = "", + int TargetConstraintType = 1> : Pseudo<(outs GetVRegNoV0.R:$rd), (ins GetVRegNoV0.R:$merge, OpClass:$rs2, VMaskOp:$vm, ixlenimm:$rm, @@ -1079,6 +1080,7 @@ class VPseudoUnaryMaskRoundingMode : + string Constraint = "", + int TargetConstraintType = 1> : Pseudo<(outs RetClass:$rd), (ins RetClass:$merge, OpClass:$rs2, ixlenimm:$frm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>, @@ -1115,6 +1118,7 @@ class VPseudoUnaryNoMask_FRM : + string Constraint = "", + int TargetConstraintType = 1> : Pseudo<(outs GetVRegNoV0.R:$rd), (ins GetVRegNoV0.R:$merge, OpClass:$rs2, VMaskOp:$vm, ixlenimm:$frm, @@ -1133,6 +1138,7 @@ class VPseudoUnaryMask_FRM : + string Constraint, + int TargetConstraintType = 1> : Pseudo<(outs RetClass:$rd), !if(CarryIn, (ins RetClass:$merge, Op1Class:$rs2, Op2Class:$rs1, @@ -1540,6 +1547,7 @@ class VPseudoTiedBinaryCarryIn; } -multiclass VPseudoTiedBinaryV_VM { +multiclass VPseudoTiedBinaryV_VM { def "_VVM" # "_" # m.MX: VPseudoTiedBinaryCarryIn.R, - m.vrclass, m.vrclass, m, 1, "">; + m.vrclass, m.vrclass, m, 1, "", + TargetConstraintType>; } multiclass VPseudoBinaryV_XM; } -multiclass VPseudoTiedBinaryV_XM { +multiclass VPseudoTiedBinaryV_XM { def "_VXM" # "_" # m.MX: VPseudoTiedBinaryCarryIn.R, - m.vrclass, GPR, m, 1, "">; + m.vrclass, GPR, m, 1, "", + TargetConstraintType>; } multiclass VPseudoVMRG_FM { @@ -2596,45 +2606,48 @@ multiclass VPseudoVRCP_V_RM { } } -multiclass PseudoVEXT_VF2 { +multiclass PseudoVEXT_VF2 { defvar constraints = "@earlyclobber $rd"; foreach m = MxListVF2 in { defvar mx = m.MX; + defvar CurrTypeConstraints = !if(!or(!eq(mx, "MF4"), !eq(mx, "MF2"), !eq(mx, "M1")), 1, 3); let VLMul = m.value in { - def "_" # mx : VPseudoUnaryNoMask, + def "_" # mx : VPseudoUnaryNoMask, SchedUnary<"WriteVExtV", "ReadVExtV", mx, forceMergeOpRead=true>; def "_" # mx # "_MASK" : - VPseudoUnaryMask, + 
VPseudoUnaryMask, RISCVMaskedPseudo, SchedUnary<"WriteVExtV", "ReadVExtV", mx, forceMergeOpRead=true>; } } } -multiclass PseudoVEXT_VF4 { +multiclass PseudoVEXT_VF4 { defvar constraints = "@earlyclobber $rd"; foreach m = MxListVF4 in { defvar mx = m.MX; + defvar CurrTypeConstraints = !if(!or(!eq(mx, "MF2"), !eq(mx, "M1"), !eq(mx, "M2")), 1, 3); let VLMul = m.value in { - def "_" # mx : VPseudoUnaryNoMask, + def "_" # mx : VPseudoUnaryNoMask, SchedUnary<"WriteVExtV", "ReadVExtV", mx, forceMergeOpRead=true>; def "_" # mx # "_MASK" : - VPseudoUnaryMask, + VPseudoUnaryMask, RISCVMaskedPseudo, SchedUnary<"WriteVExtV", "ReadVExtV", mx, forceMergeOpRead=true>; } } } -multiclass PseudoVEXT_VF8 { +multiclass PseudoVEXT_VF8 { defvar constraints = "@earlyclobber $rd"; foreach m = MxListVF8 in { defvar mx = m.MX; + defvar CurrTypeConstraints = !if(!or(!eq(mx, "M1"), !eq(mx, "M2"), !eq(mx, "M4")), 1, 3); let VLMul = m.value in { - def "_" # mx : VPseudoUnaryNoMask, + def "_" # mx : VPseudoUnaryNoMask, SchedUnary<"WriteVExtV", "ReadVExtV", mx, forceMergeOpRead=true>; def "_" # mx # "_MASK" : - VPseudoUnaryMask, + VPseudoUnaryMask, RISCVMaskedPseudo, SchedUnary<"WriteVExtV", "ReadVExtV", mx, forceMergeOpRead=true>; } @@ -3619,7 +3632,7 @@ multiclass VPseudoConversionRoundingMode; def "_" # MInfo.MX # "_MASK" : VPseudoUnaryMaskRoundingMode, + Constraint, TargetConstraintType>, RISCVMaskedPseudo; } } @@ -3628,12 +3641,13 @@ multiclass VPseudoConversionRoundingMode { + string Constraint = "", + int TargetConstraintType = 1> { let VLMul = MInfo.value in { def "_" # MInfo.MX : VPseudoUnaryNoMask_FRM; + Constraint, TargetConstraintType>; def "_" # MInfo.MX # "_MASK" : VPseudoUnaryMask_FRM, + Constraint, TargetConstraintType>, RISCVMaskedPseudo; } } @@ -3761,7 +3775,7 @@ multiclass VPseudoVNCVTI_W_RM { multiclass VPseudoVNCVTI_RM_W { defvar constraint = "@earlyclobber $rd"; foreach m = MxListW in { - defm _W : VPseudoConversionRM, + defm _W : VPseudoConversionRM, SchedUnary<"WriteVFNCvtFToIV", "ReadVFNCvtFToIV", m.MX, forceMergeOpRead=true>; } From 33565750e49f683308fad3ba22a06fa7e75f592b Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Tue, 2 Jan 2024 14:45:33 +0100 Subject: [PATCH 030/313] [AMDGPU] Fix moveToValu for copy to phys SGPRs (#76715) Fixes #76031 --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 12 +++++ llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll | 25 ++++++----- .../si-fix-sgpr-copies-copy-to-sgpr.mir | 44 +++++++++++++++++++ 3 files changed, 69 insertions(+), 12 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-copy-to-sgpr.mir diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index aae6f2e842fd1..396d22c7ec18d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -7198,6 +7198,18 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, Register DstReg = Inst.getOperand(0).getReg(); const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); + // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and + // hope for the best. + if (Inst.isCopy() && DstReg.isPhysical() && + RI.isVGPR(MRI, Inst.getOperand(1).getReg())) { + // TODO: Only works for 32 bit registers. 
+ BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), + get(AMDGPU::V_READFIRSTLANE_B32), Inst.getOperand(0).getReg()) + .add(Inst.getOperand(1)); + Inst.eraseFromParent(); + return; + } + if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() && NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { // Instead of creating a copy where src and dst are the same register diff --git a/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll b/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll index ac196635b363a..5c1a709372042 100644 --- a/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll +++ b/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll @@ -4,19 +4,20 @@ define amdgpu_cs <2 x i32> @f() { ; CHECK-LABEL: f: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_mov_b32 s0, 0 -; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: s_mov_b32 s2, s0 -; CHECK-NEXT: s_mov_b32 s3, s0 -; CHECK-NEXT: s_mov_b32 s4, s0 -; CHECK-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 -; CHECK-NEXT: s_mov_b32 s5, s0 -; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_mov_b32 s5, s4 +; CHECK-NEXT: s_mov_b32 s6, s4 +; CHECK-NEXT: s_mov_b32 s7, s4 +; CHECK-NEXT: s_mov_b32 s0, s4 +; CHECK-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; CHECK-NEXT: s_mov_b32 s1, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1] -; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; CHECK-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, s[0:1], v[0:1] +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CHECK-NEXT: ; return to shader part epilog bb: %i = call <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32> zeroinitializer, i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-copy-to-sgpr.mir b/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-copy-to-sgpr.mir new file mode 100644 index 0000000000000..4292e76f37096 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-copy-to-sgpr.mir @@ -0,0 +1,44 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# RUN: llc -march=amdgcn -mcpu=tonga -run-pass=si-fix-sgpr-copies --verify-machineinstrs -o - %s | FileCheck %s + +# Copy to $sgpr0 is disconnected and becomes an IMPLICIT_DEF +# Inserted V_AND_B32 defines virtual register after use. 
+ +--- +name: si_fix_sgpr_copies_breaks_function +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $sgpr0 + + ; CHECK-LABEL: name: si_fix_sgpr_copies_breaks_function + ; CHECK: liveins: $sgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 + ; CHECK-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY]], killed [[S_MOV_B32_]], implicit-def dead $scc + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY killed [[S_LSHR_B32_]] + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -32768 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_1]] + ; CHECK-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 killed [[COPY1]], [[COPY2]], implicit $exec + ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[S_MOV_B32_2]], [[V_XOR_B32_e64_]], implicit $exec + ; CHECK-NEXT: $sgpr0 = V_READFIRSTLANE_B32 [[V_AND_B32_e64_]], implicit $exec + ; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0 + %0:sgpr_32 = COPY $sgpr0 + %2:sreg_32 = S_MOV_B32 16 + %3:sreg_32 = S_LSHR_B32 %0, killed %2, implicit-def dead $scc + %4:sreg_32 = COPY killed %3 + %5:sreg_32 = S_MOV_B32 -32768 + %7:vgpr_32 = COPY killed %5 + %6:vgpr_32 = V_XOR_B32_e64 killed %4, %7, implicit $exec + %8:sreg_32 = S_MOV_B32 65535 + %10:sreg_32 = COPY %6 + %9:sreg_32 = S_AND_B32 killed %8, killed %10, implicit-def dead $scc + $sgpr0 = COPY %9 + SI_RETURN_TO_EPILOG $sgpr0 + +... From a181b425659a22c5535d2513f7dd7c7cf14e2d69 Mon Sep 17 00:00:00 2001 From: Alex Bradbury Date: Tue, 2 Jan 2024 14:43:55 +0000 Subject: [PATCH 031/313] [llvm][NFC] Use SDValue::getConstantOperandAPInt(i) where possible The helper function allows examples like `cast(Op.getOperand(0))->getAPIntValue();` to be changed to `Op.getConstantOperandAPInt(0);`. See #76708 for further context. 
Although there are far fewer opportunities for replacement, I used a similar git grep and sed combo as before, given I already had it to hand: `git grep -l "cast\(.*->getOperand\(.*\)\)->getAPIntValue\(\)" | xargs sed -E -i 's/cast\((.*)->getOperand\((.*)\)\)->getAPIntValue\(\)/\1->getConstantOperandAPInt(\2)/'` and `git grep -l "cast\(.*\.getOperand\(.*\)\)->getAPIntValue\(\)" | xargs sed -E -i 's/cast\((.*)\.getOperand\((.*)\)\)->getAPIntValue\(\)/\1.getConstantOperandAPInt(\2)/'` --- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 2 +- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 4 ++-- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 2 +- llvm/lib/Target/ARM/ARMISelLowering.cpp | 4 ++-- llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp | 2 +- llvm/lib/Target/Mips/MipsSEISelLowering.cpp | 2 +- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index eb4deb6306fd5..0e17bba2398ed 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5470,7 +5470,7 @@ static SDValue FoldBUILD_VECTOR(const SDLoc &DL, EVT VT, Ops[i].getOperand(0).getValueType() != VT || (IdentitySrc && Ops[i].getOperand(0) != IdentitySrc) || !isa(Ops[i].getOperand(1)) || - cast(Ops[i].getOperand(1))->getAPIntValue() != i) { + Ops[i].getConstantOperandAPInt(1) != i) { IsIdentity = false; break; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 26f013a18f382..102fd0c3dae2a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -14245,7 +14245,7 @@ SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op, assert(VT != MVT::i64 && "Expected illegal VSCALE node"); SDLoc DL(Op); - APInt MulImm = cast(Op.getOperand(0))->getAPIntValue(); + APInt MulImm = Op.getConstantOperandAPInt(0); return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL, VT); } @@ -18341,7 +18341,7 @@ static bool isEssentiallyExtractHighSubvector(SDValue N) { return false; if (N.getOperand(0).getValueType().isScalableVector()) return false; - return cast(N.getOperand(1))->getAPIntValue() == + return N.getConstantOperandAPInt(1) == N.getOperand(0).getValueType().getVectorNumElements() / 2; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 28604af2cdb3c..bffea82ab8f49 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -724,7 +724,7 @@ bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N, unsigned ShAmtBits) const { assert(N->getOpcode() == ISD::AND); - const APInt &RHS = cast(N->getOperand(1))->getAPIntValue(); + const APInt &RHS = N->getConstantOperandAPInt(1); if (RHS.countr_one() >= ShAmtBits) return true; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index cf9646a0b81ed..9f3bcffc7a99f 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -14841,14 +14841,14 @@ static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) { assert(N->getOpcode() == ARMISD::BFI); SDValue From = N->getOperand(1); - ToMask = ~cast(N->getOperand(2))->getAPIntValue(); + ToMask = ~N->getConstantOperandAPInt(2); FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.popcount()); // If the 
Base came from a SHR #C, we can deduce that it is really testing bit // #C in the base of the SHR. if (From->getOpcode() == ISD::SRL && isa(From->getOperand(1))) { - APInt Shift = cast(From->getOperand(1))->getAPIntValue(); + APInt Shift = From->getConstantOperandAPInt(1); assert(Shift.getLimitedValue() < 32 && "Shift too large!"); FromMask <<= Shift.getLimitedValue(31); From = From->getOperand(0); diff --git a/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp b/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp index 12b35a0e43443..01b41f3b21593 100644 --- a/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp +++ b/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp @@ -330,7 +330,7 @@ bool MipsDAGToDAGISel::isUnneededShiftMask(SDNode *N, unsigned ShAmtBits) const { assert(N->getOpcode() == ISD::AND && "Unexpected opcode"); - const APInt &RHS = cast(N->getOperand(1))->getAPIntValue(); + const APInt &RHS = N->getConstantOperandAPInt(1); if (RHS.countr_one() >= ShAmtBits) { LLVM_DEBUG( dbgs() diff --git a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp index f6ac41ed3479c..e9788fa7ed739 100644 --- a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp @@ -1519,7 +1519,7 @@ static SDValue lowerMSABitClearImm(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); EVT ResTy = Op->getValueType(0); APInt BitImm = APInt(ResTy.getScalarSizeInBits(), 1) - << cast(Op->getOperand(2))->getAPIntValue(); + << Op->getConstantOperandAPInt(2); SDValue BitMask = DAG.getConstant(~BitImm, DL, ResTy); return DAG.getNode(ISD::AND, DL, ResTy, Op->getOperand(1), BitMask); diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 846eab93e1fea..73b10cf3067e1 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -487,7 +487,7 @@ namespace { // from PatFrags in tablegen. bool isUnneededShiftMask(SDNode *N, unsigned Width) const { assert(N->getOpcode() == ISD::AND && "Unexpected opcode"); - const APInt &Val = cast(N->getOperand(1))->getAPIntValue(); + const APInt &Val = N->getConstantOperandAPInt(1); if (Val.countr_one() >= Width) return true; From 02347fc7191ff4d073f439dde6523add3f5496de Mon Sep 17 00:00:00 2001 From: Ilya Biryukov Date: Tue, 2 Jan 2024 13:24:12 +0100 Subject: [PATCH 032/313] Reapply "[Sema] Fix crash on invalid code with parenthesized aggregate initialization" (#76272) With updates the libc++ tests. This reverts commit 2205d2334f3c859ad9f6c65ed950bfb3bb6f7cbe and relands 86dc6e15f22610bbb53eb4efda0a178ecefc933a and 7ab16fb5207fe187ab999f882069bd632d2e68e5. Original commit was reverted because of failing libc++ tests, see #76232 for the discussion. The errors in the tests are spurious in the first place (coming from initialization of invalid classes), so update the tests to match new behavior that does not show those errors. 
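As a rough illustration of the crash being fixed (the same shape as the
crash-GH76228.cpp test added below; the names are only illustrative):

  class incomplete;   // intentionally never defined
  struct foo {
    int a;
    incomplete b;     // incomplete member makes 'foo' an invalid decl
  };
  foo a1(0);          // C++20 parenthesized aggregate initialization

With the record marked invalid, TryOrBuildParenListInitialization now sets the
initialization sequence as failed up front, mirroring what braced list
initialization already does in CheckStructUnionTypes, instead of walking the
members and crashing.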
--- clang/lib/Sema/SemaInit.cpp | 8 ++++++ clang/test/SemaCXX/crash-GH76228.cpp | 28 +++++++++++++++++++ clang/test/SemaCXX/paren-list-agg-init.cpp | 2 +- .../transform_error.mandates.verify.cpp | 8 ------ .../transform_error.mandates.verify.cpp | 8 ------ 5 files changed, 37 insertions(+), 17 deletions(-) create mode 100644 clang/test/SemaCXX/crash-GH76228.cpp diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 61d244f3bb979..cc9db5ded1149 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -5512,6 +5512,14 @@ static void TryOrBuildParenListInitialization( } else if (auto *RT = Entity.getType()->getAs()) { bool IsUnion = RT->isUnionType(); const CXXRecordDecl *RD = cast(RT->getDecl()); + if (RD->isInvalidDecl()) { + // Exit early to avoid confusion when processing members. + // We do the same for braced list initialization in + // `CheckStructUnionTypes`. + Sequence.SetFailed( + clang::InitializationSequence::FK_ParenthesizedListInitFailed); + return; + } if (!IsUnion) { for (const CXXBaseSpecifier &Base : RD->bases()) { diff --git a/clang/test/SemaCXX/crash-GH76228.cpp b/clang/test/SemaCXX/crash-GH76228.cpp new file mode 100644 index 0000000000000..33a9395823127 --- /dev/null +++ b/clang/test/SemaCXX/crash-GH76228.cpp @@ -0,0 +1,28 @@ +// RUN: %clang_cc1 -std=c++20 -verify %s +// Check we don't crash on incomplete members and bases when handling parenthesized initialization. +class incomplete; // expected-note@-0 3 {{forward declaration of 'incomplete'}} +struct foo { + int a; + incomplete b; + // expected-error@-1 {{incomplete type}} +}; +foo a1(0); + +struct one_int { + int a; +}; +struct bar : one_int, incomplete {}; +// expected-error@-1 {{incomplete type}} +bar a2(0); + +incomplete a3[3](1,2,3); +// expected-error@-1 {{incomplete type}} + +struct qux : foo { +}; +qux a4(0); + +struct fred { + foo a[3]; +}; +fred a5(0); diff --git a/clang/test/SemaCXX/paren-list-agg-init.cpp b/clang/test/SemaCXX/paren-list-agg-init.cpp index f60b20e0d4656..c1964a5a9eb00 100644 --- a/clang/test/SemaCXX/paren-list-agg-init.cpp +++ b/clang/test/SemaCXX/paren-list-agg-init.cpp @@ -289,7 +289,7 @@ int test() { // used to crash S a(0, 1); S b(0); - S c(0, 0, 1); // beforecxx20-warning {{aggregate initialization of type 'S' from a parenthesized list of values is a C++20 extension}} + S c(0, 0, 1); S d {0, 1}; S e {0}; diff --git a/libcxx/test/libcxx/utilities/expected/expected.expected/transform_error.mandates.verify.cpp b/libcxx/test/libcxx/utilities/expected/expected.expected/transform_error.mandates.verify.cpp index 965e82a7b4034..3260a8cbc7f8e 100644 --- a/libcxx/test/libcxx/utilities/expected/expected.expected/transform_error.mandates.verify.cpp +++ b/libcxx/test/libcxx/utilities/expected/expected.expected/transform_error.mandates.verify.cpp @@ -46,11 +46,9 @@ void test() { { std::expected e; e.transform_error(return_unexpected); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} // expected-error-re@*:* {{static assertion failed {{.*}}[expected.object.general] A program that instantiates the definition of template expected for {{.*}} is ill-formed.}} e.transform_error(return_no_object); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* 
{{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} // expected-error-re@*:* {{static assertion failed {{.*}}[expected.object.general] A program that instantiates the definition of template expected for {{.*}} is ill-formed.}} } @@ -58,27 +56,21 @@ void test() { { const std::expected e; e.transform_error(return_unexpected); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* 2 {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} e.transform_error(return_no_object); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* 2 {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} } // Test && overload { std::expected e; std::move(e).transform_error(return_unexpected); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* 2 {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} std::move(e).transform_error(return_no_object); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* 2 {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} } // Test const&& overload { const std::expected e; std::move(e).transform_error(return_unexpected); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* 2 {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} std::move(e).transform_error(return_no_object); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* 2 {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} } } // clang-format on diff --git a/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp b/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp index 09aa1332e9800..21dc247687918 100644 --- a/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp +++ b/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp @@ -46,11 +46,9 @@ void test() { { std::expected e; e.transform_error(return_unexpected); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} // expected-error-re@*:* {{static assertion failed {{.*}}A program that instantiates expected with a E that is not a valid argument for unexpected is ill-formed}} e.transform_error(return_no_object); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} // expected-error-re@*:* {{static assertion failed {{.*}}A 
program that instantiates expected with a E that is not a valid argument for unexpected is ill-formed}} } @@ -58,27 +56,21 @@ void test() { { const std::expected e; e.transform_error(return_unexpected); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} e.transform_error(return_no_object); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} } // Test && overload { std::expected e; std::move(e).transform_error(return_unexpected); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} std::move(e).transform_error(return_no_object); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} } // Test const&& overload { const std::expected e; std::move(e).transform_error(return_unexpected); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} std::move(e).transform_error(return_no_object); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} } } // clang-format on From 9d5b0965c43b4e8b3f21c106fe829391b1382277 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 2 Jan 2024 16:13:14 +0100 Subject: [PATCH 033/313] [InstCombine] Add helper for commutative icmp folds (NFCI) Add a common place for icmp folds that should be tried with both operand orders, so we don't have to repeat this pattern for individual folds. --- .../InstCombine/InstCombineCompares.cpp | 88 ++++++++----------- .../InstCombine/InstCombineInternal.h | 7 +- 2 files changed, 41 insertions(+), 54 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 54d74905b960e..3875e59c3ede3 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -5039,10 +5039,10 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I, } /// Fold icmp Pred min|max(X, Y), Z. 
-Instruction * -InstCombinerImpl::foldICmpWithMinMaxImpl(Instruction &I, - MinMaxIntrinsic *MinMax, Value *Z, - ICmpInst::Predicate Pred) { +Instruction *InstCombinerImpl::foldICmpWithMinMax(Instruction &I, + MinMaxIntrinsic *MinMax, + Value *Z, + ICmpInst::Predicate Pred) { Value *X = MinMax->getLHS(); Value *Y = MinMax->getRHS(); if (ICmpInst::isSigned(Pred) && !MinMax->isSigned()) @@ -5171,24 +5171,6 @@ InstCombinerImpl::foldICmpWithMinMaxImpl(Instruction &I, return nullptr; } -Instruction *InstCombinerImpl::foldICmpWithMinMax(ICmpInst &Cmp) { - ICmpInst::Predicate Pred = Cmp.getPredicate(); - Value *Lhs = Cmp.getOperand(0); - Value *Rhs = Cmp.getOperand(1); - - if (MinMaxIntrinsic *MinMax = dyn_cast(Lhs)) { - if (Instruction *Res = foldICmpWithMinMaxImpl(Cmp, MinMax, Rhs, Pred)) - return Res; - } - - if (MinMaxIntrinsic *MinMax = dyn_cast(Rhs)) { - if (Instruction *Res = foldICmpWithMinMaxImpl( - Cmp, MinMax, Lhs, ICmpInst::getSwappedPredicate(Pred))) - return Res; - } - - return nullptr; -} // Canonicalize checking for a power-of-2-or-zero value: static Instruction *foldICmpPow2Test(ICmpInst &I, @@ -6853,6 +6835,34 @@ static Instruction *foldReductionIdiom(ICmpInst &I, return nullptr; } +// This helper will be called with icmp operands in both orders. +Instruction *InstCombinerImpl::foldICmpCommutative(ICmpInst::Predicate Pred, + Value *Op0, Value *Op1, + ICmpInst &CxtI) { + // Try to optimize 'icmp GEP, P' or 'icmp P, GEP'. + if (auto *GEP = dyn_cast(Op0)) + if (Instruction *NI = foldGEPICmp(GEP, Op1, Pred, CxtI)) + return NI; + + if (auto *SI = dyn_cast(Op0)) + if (Instruction *NI = foldSelectICmp(Pred, SI, Op1, CxtI)) + return NI; + + if (auto *MinMax = dyn_cast(Op0)) + if (Instruction *Res = foldICmpWithMinMax(CxtI, MinMax, Op1, Pred)) + return Res; + + { + Value *X; + const APInt *C; + // icmp X+Cst, X + if (match(Op0, m_Add(m_Value(X), m_APInt(C))) && Op1 == X) + return foldICmpAddOpConst(X, *C, Pred); + } + + return nullptr; +} + Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) { bool Changed = false; const SimplifyQuery Q = SQ.getWithInstruction(&I); @@ -6976,20 +6986,11 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) { if (Instruction *Res = foldICmpInstWithConstantNotInt(I)) return Res; - // Try to optimize 'icmp GEP, P' or 'icmp P, GEP'. - if (auto *GEP = dyn_cast(Op0)) - if (Instruction *NI = foldGEPICmp(GEP, Op1, I.getPredicate(), I)) - return NI; - if (auto *GEP = dyn_cast(Op1)) - if (Instruction *NI = foldGEPICmp(GEP, Op0, I.getSwappedPredicate(), I)) - return NI; - - if (auto *SI = dyn_cast(Op0)) - if (Instruction *NI = foldSelectICmp(I.getPredicate(), SI, Op1, I)) - return NI; - if (auto *SI = dyn_cast(Op1)) - if (Instruction *NI = foldSelectICmp(I.getSwappedPredicate(), SI, Op0, I)) - return NI; + if (Instruction *Res = foldICmpCommutative(I.getPredicate(), Op0, Op1, I)) + return Res; + if (Instruction *Res = + foldICmpCommutative(I.getSwappedPredicate(), Op1, Op0, I)) + return Res; // In case of a comparison with two select instructions having the same // condition, check whether one of the resulting branches can be simplified. 
@@ -7040,9 +7041,6 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) { if (Instruction *R = foldICmpWithCastOp(I)) return R; - if (Instruction *Res = foldICmpWithMinMax(I)) - return Res; - { Value *X, *Y; // Transform (X & ~Y) == 0 --> (X & Y) != 0 @@ -7144,18 +7142,6 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) { !ACXI->isWeak()) return ExtractValueInst::Create(ACXI, 1); - { - Value *X; - const APInt *C; - // icmp X+Cst, X - if (match(Op0, m_Add(m_Value(X), m_APInt(C))) && Op1 == X) - return foldICmpAddOpConst(X, *C, I.getPredicate()); - - // icmp X, X+Cst - if (match(Op1, m_Add(m_Value(X), m_APInt(C))) && Op0 == X) - return foldICmpAddOpConst(X, *C, I.getSwappedPredicate()); - } - if (Instruction *Res = foldICmpWithHighBitMask(I, Builder)) return Res; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 9e76a0cf17b18..bdaf7550b4b42 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -648,9 +648,8 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final Instruction *foldICmpInstWithConstantAllowUndef(ICmpInst &Cmp, const APInt &C); Instruction *foldICmpBinOp(ICmpInst &Cmp, const SimplifyQuery &SQ); - Instruction *foldICmpWithMinMaxImpl(Instruction &I, MinMaxIntrinsic *MinMax, - Value *Z, ICmpInst::Predicate Pred); - Instruction *foldICmpWithMinMax(ICmpInst &Cmp); + Instruction *foldICmpWithMinMax(Instruction &I, MinMaxIntrinsic *MinMax, + Value *Z, ICmpInst::Predicate Pred); Instruction *foldICmpEquality(ICmpInst &Cmp); Instruction *foldIRemByPowerOfTwoToBitTest(ICmpInst &I); Instruction *foldSignBitTest(ICmpInst &I); @@ -708,6 +707,8 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final const APInt &C); Instruction *foldICmpBitCast(ICmpInst &Cmp); Instruction *foldICmpWithTrunc(ICmpInst &Cmp); + Instruction *foldICmpCommutative(ICmpInst::Predicate Pred, Value *Op0, + Value *Op1, ICmpInst &CxtI); // Helpers of visitSelectInst(). Instruction *foldSelectOfBools(SelectInst &SI); From 795c989c387970578c89ef038d5432583510d32f Mon Sep 17 00:00:00 2001 From: Simon Camphausen Date: Tue, 2 Jan 2024 16:53:36 +0100 Subject: [PATCH 034/313] [mlir][EmitC] Disallow string attributes as initial values (#75310) --- mlir/lib/Dialect/EmitC/IR/EmitC.cpp | 58 ++++++++++++++---------- mlir/test/Dialect/EmitC/invalid_ops.mlir | 22 ++++++--- mlir/test/Target/Cpp/const.mlir | 6 +-- 3 files changed, 52 insertions(+), 34 deletions(-) diff --git a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp index fd32efe783bcf..8508d29d2306a 100644 --- a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp +++ b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp @@ -50,6 +50,32 @@ void mlir::emitc::buildTerminatedBody(OpBuilder &builder, Location loc) { builder.create(loc); } +/// Check that the type of the initial value is compatible with the operations +/// result type. 
+static LogicalResult verifyInitializationAttribute(Operation *op, + Attribute value) { + assert(op->getNumResults() == 1 && "operation must have 1 result"); + + if (llvm::isa(value)) + return success(); + + if (llvm::isa(value)) + return op->emitOpError() + << "string attributes are not supported, use #emitc.opaque instead"; + + Type resultType = op->getResult(0).getType(); + Type attrType = cast(value).getType(); + + if (resultType != attrType) + return op->emitOpError() + << "requires attribute to either be an #emitc.opaque attribute or " + "it's type (" + << attrType << ") to match the op's result type (" << resultType + << ")"; + + return success(); +} + //===----------------------------------------------------------------------===// // AddOp //===----------------------------------------------------------------------===// @@ -169,21 +195,14 @@ LogicalResult emitc::CallOpaqueOp::verify() { // ConstantOp //===----------------------------------------------------------------------===// -/// The constant op requires that the attribute's type matches the return type. LogicalResult emitc::ConstantOp::verify() { - if (llvm::isa(getValueAttr())) - return success(); - - // Value must not be empty - StringAttr strAttr = llvm::dyn_cast(getValueAttr()); - if (strAttr && strAttr.empty()) - return emitOpError() << "value must not be empty"; - - auto value = cast(getValueAttr()); - Type type = getType(); - if (!llvm::isa(value.getType()) && type != value.getType()) - return emitOpError() << "requires attribute's type (" << value.getType() - << ") to match op's return type (" << type << ")"; + Attribute value = getValueAttr(); + if (failed(verifyInitializationAttribute(getOperation(), value))) + return failure(); + if (auto opaqueValue = llvm::dyn_cast(value)) { + if (opaqueValue.getValue().empty()) + return emitOpError() << "value must not be empty"; + } return success(); } @@ -561,17 +580,8 @@ LogicalResult SubOp::verify() { // VariableOp //===----------------------------------------------------------------------===// -/// The variable op requires that the attribute's type matches the return type. 
LogicalResult emitc::VariableOp::verify() { - if (llvm::isa(getValueAttr())) - return success(); - - auto value = cast(getValueAttr()); - Type type = getType(); - if (!llvm::isa(value.getType()) && type != value.getType()) - return emitOpError() << "requires attribute's type (" << value.getType() - << ") to match op's return type (" << type << ")"; - return success(); + return verifyInitializationAttribute(getOperation(), getValueAttr()); } //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/EmitC/invalid_ops.mlir b/mlir/test/Dialect/EmitC/invalid_ops.mlir index 6ad646d7c62f1..46eccb1c24eea 100644 --- a/mlir/test/Dialect/EmitC/invalid_ops.mlir +++ b/mlir/test/Dialect/EmitC/invalid_ops.mlir @@ -1,7 +1,15 @@ // RUN: mlir-opt %s -split-input-file -verify-diagnostics +func.func @const_attribute_str() { + // expected-error @+1 {{'emitc.constant' op string attributes are not supported, use #emitc.opaque instead}} + %c0 = "emitc.constant"(){value = "NULL"} : () -> !emitc.ptr + return +} + +// ----- + func.func @const_attribute_return_type_1() { - // expected-error @+1 {{'emitc.constant' op requires attribute's type ('i64') to match op's return type ('i32')}} + // expected-error @+1 {{'emitc.constant' op requires attribute to either be an #emitc.opaque attribute or it's type ('i64') to match the op's result type ('i32')}} %c0 = "emitc.constant"(){value = 42: i64} : () -> i32 return } @@ -9,8 +17,8 @@ func.func @const_attribute_return_type_1() { // ----- func.func @const_attribute_return_type_2() { - // expected-error @+1 {{'emitc.constant' op requires attribute's type ('!emitc.opaque<"char">') to match op's return type ('!emitc.opaque<"mychar">')}} - %c0 = "emitc.constant"(){value = "CHAR_MIN" : !emitc.opaque<"char">} : () -> !emitc.opaque<"mychar"> + // expected-error @+1 {{'emitc.constant' op attribute 'value' failed to satisfy constraint: An opaque attribute or TypedAttr instance}} + %c0 = "emitc.constant"(){value = unit} : () -> i32 return } @@ -18,7 +26,7 @@ func.func @const_attribute_return_type_2() { func.func @empty_constant() { // expected-error @+1 {{'emitc.constant' op value must not be empty}} - %c0 = "emitc.constant"(){value = ""} : () -> i32 + %c0 = "emitc.constant"(){value = #emitc.opaque<"">} : () -> i32 return } @@ -98,7 +106,7 @@ func.func @illegal_operand() { // ----- func.func @var_attribute_return_type_1() { - // expected-error @+1 {{'emitc.variable' op requires attribute's type ('i64') to match op's return type ('i32')}} + // expected-error @+1 {{'emitc.variable' op requires attribute to either be an #emitc.opaque attribute or it's type ('i64') to match the op's result type ('i32')}} %c0 = "emitc.variable"(){value = 42: i64} : () -> i32 return } @@ -106,8 +114,8 @@ func.func @var_attribute_return_type_1() { // ----- func.func @var_attribute_return_type_2() { - // expected-error @+1 {{'emitc.variable' op requires attribute's type ('!emitc.ptr') to match op's return type ('!emitc.ptr')}} - %c0 = "emitc.variable"(){value = "nullptr" : !emitc.ptr} : () -> !emitc.ptr + // expected-error @+1 {{'emitc.variable' op attribute 'value' failed to satisfy constraint: An opaque attribute or TypedAttr instance}} + %c0 = "emitc.variable"(){value = unit} : () -> i32 return } diff --git a/mlir/test/Target/Cpp/const.mlir b/mlir/test/Target/Cpp/const.mlir index e6c94732e9f6b..28a547909a0ac 100644 --- a/mlir/test/Target/Cpp/const.mlir +++ b/mlir/test/Target/Cpp/const.mlir @@ -2,7 +2,7 @@ // RUN: mlir-translate -mlir-to-cpp 
-declare-variables-at-top %s | FileCheck %s -check-prefix=CPP-DECLTOP func.func @emitc_constant() { - %c0 = "emitc.constant"(){value = #emitc.opaque<"">} : () -> i32 + %c0 = "emitc.constant"(){value = #emitc.opaque<"INT_MAX">} : () -> i32 %c1 = "emitc.constant"(){value = 42 : i32} : () -> i32 %c2 = "emitc.constant"(){value = -1 : i32} : () -> i32 %c3 = "emitc.constant"(){value = -1 : si8} : () -> si8 @@ -11,7 +11,7 @@ func.func @emitc_constant() { return } // CPP-DEFAULT: void emitc_constant() { -// CPP-DEFAULT-NEXT: int32_t [[V0:[^ ]*]]; +// CPP-DEFAULT-NEXT: int32_t [[V0:[^ ]*]] = INT_MAX; // CPP-DEFAULT-NEXT: int32_t [[V1:[^ ]*]] = 42; // CPP-DEFAULT-NEXT: int32_t [[V2:[^ ]*]] = -1; // CPP-DEFAULT-NEXT: int8_t [[V3:[^ ]*]] = -1; @@ -25,7 +25,7 @@ func.func @emitc_constant() { // CPP-DECLTOP-NEXT: int8_t [[V3:[^ ]*]]; // CPP-DECLTOP-NEXT: uint8_t [[V4:[^ ]*]]; // CPP-DECLTOP-NEXT: char [[V5:[^ ]*]]; -// CPP-DECLTOP-NEXT: ; +// CPP-DECLTOP-NEXT: [[V0]] = INT_MAX; // CPP-DECLTOP-NEXT: [[V1]] = 42; // CPP-DECLTOP-NEXT: [[V2]] = -1; // CPP-DECLTOP-NEXT: [[V3]] = -1; From fc9dbc999bc711a99b94b42453240b38a6509b0d Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 8 Dec 2023 16:11:04 +0100 Subject: [PATCH 035/313] [InstCombine] Add extra test for icmp of gep fold (NFC) --- llvm/test/Transforms/InstCombine/icmp-gep.ll | 24 ++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/icmp-gep.ll b/llvm/test/Transforms/InstCombine/icmp-gep.ll index 99c784d15eb30..fb297e04ceb06 100644 --- a/llvm/test/Transforms/InstCombine/icmp-gep.ll +++ b/llvm/test/Transforms/InstCombine/icmp-gep.ll @@ -5,6 +5,7 @@ target datalayout = "e-p:64:64:64-p1:16:16:16-p2:32:32:32-p3:64:64:64-i1:8:8-i8: declare ptr @getptr() declare void @use(ptr) +declare void @use.i1(i1) define i1 @eq_base(ptr %x, i64 %y) { ; CHECK-LABEL: @eq_base( @@ -443,6 +444,29 @@ define i1 @test60_extra_use_const_operands_no_inbounds(ptr %foo, i64 %i, i64 %j) ret i1 %cmp } +define void @test60_extra_use_fold(ptr %foo, i64 %start.idx, i64 %end.offset) { +; CHECK-LABEL: @test60_extra_use_fold( +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr [[FOO:%.*]], i64 [[START_IDX:%.*]] +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 [[END_OFFSET:%.*]] +; CHECK-NEXT: call void @use(ptr [[GEP1]]) +; CHECK-NEXT: call void @use(ptr [[GEP2]]) +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq ptr [[GEP1]], [[GEP2]] +; CHECK-NEXT: call void @use.i1(i1 [[CMP1]]) +; CHECK-NEXT: [[CMP2:%.*]] = icmp ult ptr [[GEP1]], [[GEP2]] +; CHECK-NEXT: call void @use.i1(i1 [[CMP2]]) +; CHECK-NEXT: ret void +; + %gep1 = getelementptr inbounds i32, ptr %foo, i64 %start.idx + %gep2 = getelementptr inbounds i8, ptr %foo, i64 %end.offset + call void @use(ptr %gep1) + call void @use(ptr %gep2) + %cmp1 = icmp eq ptr %gep1, %gep2 + call void @use.i1(i1 %cmp1) + %cmp2 = icmp ult ptr %gep1, %gep2 + call void @use.i1(i1 %cmp2) + ret void +} + define i1 @test_scalable_same(ptr %x) { ; CHECK-LABEL: @test_scalable_same( ; CHECK-NEXT: ret i1 false From 1bb85fa9c038044e949dac6e3b07e9835d1110a6 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Tue, 2 Jan 2024 10:07:01 -0600 Subject: [PATCH 036/313] [libc] Lock the output stream for the 'puts' call (#76513) Summary: The `puts` function consists of an initial write and then another write to append the newline. When executing code in parallel, it is possible for these writes to becomes disjointed. 
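As a hypothetical illustration (not an observed trace), consider two threads:

  // Thread 1            // Thread 2
  puts("A");             puts("B");

Without holding the stream lock across both writes of each call, the four
writes may interleave as "A", "B", "\n", "\n", so the output reads "AB"
followed by two newlines instead of one line per call.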
This code adds an explicit lock call to ensure that the string is always appended by the newline as the users expects. Wasn't sure if this required a test as it would be difficult since reproducing it would be flaky. --- libc/src/stdio/generic/puts.cpp | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/libc/src/stdio/generic/puts.cpp b/libc/src/stdio/generic/puts.cpp index d8d69332566cd..0f19ec5267671 100644 --- a/libc/src/stdio/generic/puts.cpp +++ b/libc/src/stdio/generic/puts.cpp @@ -15,9 +15,26 @@ namespace LIBC_NAMESPACE { +namespace { + +// Simple helper to unlock the file once destroyed. +struct ScopedLock { + ScopedLock(LIBC_NAMESPACE::File *stream) : stream(stream) { stream->lock(); } + ~ScopedLock() { stream->unlock(); } + +private: + LIBC_NAMESPACE::File *stream; +}; + +} // namespace + LLVM_LIBC_FUNCTION(int, puts, (const char *__restrict str)) { cpp::string_view str_view(str); - auto result = LIBC_NAMESPACE::stdout->write(str, str_view.size()); + + // We need to lock the stream to ensure the newline is always appended. + ScopedLock lock(LIBC_NAMESPACE::stdout); + + auto result = LIBC_NAMESPACE::stdout->write_unlocked(str, str_view.size()); if (result.has_error()) libc_errno = result.error; size_t written = result.value; @@ -25,7 +42,7 @@ LLVM_LIBC_FUNCTION(int, puts, (const char *__restrict str)) { // The stream should be in an error state in this case. return EOF; } - result = LIBC_NAMESPACE::stdout->write("\n", 1); + result = LIBC_NAMESPACE::stdout->write_unlocked("\n", 1); if (result.has_error()) libc_errno = result.error; written = result.value; From 2292fd0129362865d07777329fa38850d7a642a3 Mon Sep 17 00:00:00 2001 From: Jungwook Park Date: Tue, 2 Jan 2024 16:11:44 +0000 Subject: [PATCH 037/313] [mlir][spirv] Add support for C-API/python binding to SPIR-V dialect (#76055) Enable bindings. --------- Co-authored-by: jungpark-mlir --- mlir/include/mlir-c/Dialect/SPIRV.h | 26 ++++++++++++++++++++++ mlir/lib/CAPI/Dialect/CMakeLists.txt | 9 ++++++++ mlir/lib/CAPI/Dialect/SPIRV.cpp | 13 +++++++++++ mlir/python/CMakeLists.txt | 7 ++++++ mlir/python/mlir/dialects/SPIRVOps.td | 14 ++++++++++++ mlir/python/mlir/dialects/spirv.py | 5 +++++ mlir/test/python/dialects/spirv_dialect.py | 21 +++++++++++++++++ 7 files changed, 95 insertions(+) create mode 100644 mlir/include/mlir-c/Dialect/SPIRV.h create mode 100644 mlir/lib/CAPI/Dialect/SPIRV.cpp create mode 100644 mlir/python/mlir/dialects/SPIRVOps.td create mode 100644 mlir/python/mlir/dialects/spirv.py create mode 100644 mlir/test/python/dialects/spirv_dialect.py diff --git a/mlir/include/mlir-c/Dialect/SPIRV.h b/mlir/include/mlir-c/Dialect/SPIRV.h new file mode 100644 index 0000000000000..f22708c9db043 --- /dev/null +++ b/mlir/include/mlir-c/Dialect/SPIRV.h @@ -0,0 +1,26 @@ +//===-- mlir-c/Dialect/SPIRV.h - C API for SPIRV dialect ----------*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM +// Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_C_DIALECT_SPIRV_H +#define MLIR_C_DIALECT_SPIRV_H + +#include "mlir-c/IR.h" +#include "mlir-c/Support.h" + +#ifdef __cplusplus +extern "C" { +#endif + +MLIR_DECLARE_CAPI_DIALECT_REGISTRATION(SPIRV, spirv); + +#ifdef __cplusplus +} +#endif + +#endif // MLIR_C_DIALECT_SPIRV_H diff --git a/mlir/lib/CAPI/Dialect/CMakeLists.txt b/mlir/lib/CAPI/Dialect/CMakeLists.txt index d815eba48d9b9..b2952da17a41c 100644 --- a/mlir/lib/CAPI/Dialect/CMakeLists.txt +++ b/mlir/lib/CAPI/Dialect/CMakeLists.txt @@ -171,6 +171,15 @@ add_mlir_upstream_c_api_library(MLIRCAPIFunc MLIRFuncDialect ) +add_mlir_upstream_c_api_library(MLIRCAPISPIRV + SPIRV.cpp + + PARTIAL_SOURCES_INTENDED + LINK_LIBS PUBLIC + MLIRCAPIIR + MLIRSPIRVDialect +) + add_mlir_upstream_c_api_library(MLIRCAPITensor Tensor.cpp diff --git a/mlir/lib/CAPI/Dialect/SPIRV.cpp b/mlir/lib/CAPI/Dialect/SPIRV.cpp new file mode 100644 index 0000000000000..9bfe26b9598e2 --- /dev/null +++ b/mlir/lib/CAPI/Dialect/SPIRV.cpp @@ -0,0 +1,13 @@ +//===- SPIRV.cpp - C Interface for SPIRV dialect --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir-c/Dialect/SPIRV.h" +#include "mlir/CAPI/Registration.h" +#include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h" + +MLIR_DEFINE_CAPI_DIALECT_REGISTRATION(SPIRV, spirv, mlir::spirv::SPIRVDialect) diff --git a/mlir/python/CMakeLists.txt b/mlir/python/CMakeLists.txt index 3c9cf304d88a2..266b86090fe17 100644 --- a/mlir/python/CMakeLists.txt +++ b/mlir/python/CMakeLists.txt @@ -378,6 +378,13 @@ declare_mlir_dialect_python_bindings( "../../include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td" ) +declare_mlir_dialect_python_bindings( + ADD_TO_PARENT MLIRPythonSources.Dialects + ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/mlir" + TD_FILE dialects/SPIRVOps.td + SOURCES dialects/spirv.py + DIALECT_NAME spirv) + declare_mlir_dialect_python_bindings( ADD_TO_PARENT MLIRPythonSources.Dialects ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/mlir" diff --git a/mlir/python/mlir/dialects/SPIRVOps.td b/mlir/python/mlir/dialects/SPIRVOps.td new file mode 100644 index 0000000000000..eaae0e609d623 --- /dev/null +++ b/mlir/python/mlir/dialects/SPIRVOps.td @@ -0,0 +1,14 @@ +//===-- SPIRVOps.td - Entry point for SPIRVOps bind --------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef PYTHON_BINDINGS_SPIRV_OPS +#define PYTHON_BINDINGS_SPIRV_OPS + +include "mlir/Dialect/SPIRV/IR/SPIRVOps.td" + +#endif diff --git a/mlir/python/mlir/dialects/spirv.py b/mlir/python/mlir/dialects/spirv.py new file mode 100644 index 0000000000000..269678a2032eb --- /dev/null +++ b/mlir/python/mlir/dialects/spirv.py @@ -0,0 +1,5 @@ +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from ._spirv_ops_gen import * diff --git a/mlir/test/python/dialects/spirv_dialect.py b/mlir/test/python/dialects/spirv_dialect.py new file mode 100644 index 0000000000000..d5b9e6cedb5d3 --- /dev/null +++ b/mlir/test/python/dialects/spirv_dialect.py @@ -0,0 +1,21 @@ +# RUN: %PYTHON %s | FileCheck %s + +from mlir.ir import * +import mlir.dialects.spirv as spirv + + +def run(f): + print("\nTEST:", f.__name__) + f() + + +# CHECK-LABEL: TEST: testConstantOp +@run +def testConstantOps(): + with Context() as ctx, Location.unknown(): + module = Module.create() + with InsertionPoint(module.body): + i32 = IntegerType.get_signless(32) + spirv.ConstantOp(value=IntegerAttr.get(i32, 42), constant=i32) + # CHECK: spirv.Constant 42 : i32 + print(module) From 4b9194952d73c34d4d58a5dc3aeddead130b5f0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= Date: Tue, 2 Jan 2024 17:26:39 +0100 Subject: [PATCH 038/313] [GlobalIsel] Combine selects with constants (#76089) A first small step at combining selects. --- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 18 +- .../include/llvm/Target/GlobalISel/Combine.td | 15 +- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 353 +++++++++++--- .../AArch64/GlobalISel/combine-select.mir | 246 ++++++++++ llvm/test/CodeGen/AArch64/andcompare.ll | 2 +- llvm/test/CodeGen/AArch64/arm64-ccmp.ll | 67 +-- llvm/test/CodeGen/AArch64/call-rv-marker.ll | 447 ++++++++++++++++-- .../combine-fold-binop-into-select.mir | 28 +- ...-divergent-i1-phis-no-lane-mask-merging.ll | 6 +- .../GlobalISel/divergence-structurizer.ll | 6 +- llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll | 6 +- llvm/test/CodeGen/AMDGPU/fptrunc.ll | 46 +- llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll | 75 ++- llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll | 137 +++--- .../AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll | 8 +- llvm/test/CodeGen/AMDGPU/rsq.f64.ll | 430 +++++++++-------- 16 files changed, 1406 insertions(+), 484 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index e7debc652a0a8..dcc1a4580b14a 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -769,9 +769,6 @@ class CombinerHelper { bool matchCombineFSubFpExtFNegFMulToFMadOrFMA(MachineInstr &MI, BuildFnTy &MatchInfo); - /// Fold boolean selects to logical operations. - bool matchSelectToLogical(MachineInstr &MI, BuildFnTy &MatchInfo); - bool matchCombineFMinMaxNaN(MachineInstr &MI, unsigned &Info); /// Transform G_ADD(x, G_SUB(y, x)) to y. @@ -814,6 +811,9 @@ class CombinerHelper { // Given a binop \p MI, commute operands 1 and 2. void applyCommuteBinOpOperands(MachineInstr &MI); + /// Combine selects. + bool matchSelect(MachineInstr &MI, BuildFnTy &MatchInfo); + private: /// Checks for legality of an indexed variant of \p LdSt. bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const; @@ -904,6 +904,18 @@ class CombinerHelper { /// select (fcmp uge x, 1.0) 1.0, x -> fminnm x, 1.0 bool matchFPSelectToMinMax(Register Dst, Register Cond, Register TrueVal, Register FalseVal, BuildFnTy &MatchInfo); + + /// Try to fold selects to logical operations. 
+ bool tryFoldBoolSelectToLogic(GSelect *Select, BuildFnTy &MatchInfo); + + bool tryFoldSelectOfConstants(GSelect *Select, BuildFnTy &MatchInfo); + + bool isOneOrOneSplat(Register Src, bool AllowUndefs); + bool isZeroOrZeroSplat(Register Src, bool AllowUndefs); + bool isConstantSplatVector(Register Src, int64_t SplatValue, + bool AllowUndefs); + + std::optional getConstantOrConstantSplatVector(Register Src); }; } // namespace llvm diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 77db371adaf77..6bda80681432a 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -437,13 +437,6 @@ def select_constant_cmp: GICombineRule< (apply [{ Helper.replaceSingleDefInstWithOperand(*${root}, ${matchinfo}); }]) >; -def select_to_logical : GICombineRule< - (defs root:$root, build_fn_matchinfo:$matchinfo), - (match (wip_match_opcode G_SELECT):$root, - [{ return Helper.matchSelectToLogical(*${root}, ${matchinfo}); }]), - (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }]) ->; - // Fold (C op x) -> (x op C) // TODO: handle more isCommutable opcodes // TODO: handle compares (currently not marked as isCommutable) @@ -1242,6 +1235,12 @@ def select_to_minmax: GICombineRule< [{ return Helper.matchSimplifySelectToMinMax(*${root}, ${info}); }]), (apply [{ Helper.applyBuildFn(*${root}, ${info}); }])>; +def match_selects : GICombineRule< + (defs root:$root, build_fn_matchinfo:$matchinfo), + (match (wip_match_opcode G_SELECT):$root, + [{ return Helper.matchSelect(*${root}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>; + // FIXME: These should use the custom predicate feature once it lands. def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero, undef_to_negative_one, @@ -1282,7 +1281,7 @@ def width_reduction_combines : GICombineGroup<[reduce_shl_of_extend, def phi_combines : GICombineGroup<[extend_through_phis]>; def select_combines : GICombineGroup<[select_undef_cmp, select_constant_cmp, - select_to_logical]>; + match_selects]>; def trivial_combines : GICombineGroup<[copy_prop, mul_to_shl, add_p2i_to_ptradd, mul_by_neg_one, idempotent_prop]>; diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 91a64d59e154d..8b15bdb0aca30 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -5940,62 +5940,6 @@ bool CombinerHelper::matchCombineFSubFpExtFNegFMulToFMadOrFMA( return false; } -bool CombinerHelper::matchSelectToLogical(MachineInstr &MI, - BuildFnTy &MatchInfo) { - GSelect &Sel = cast(MI); - Register DstReg = Sel.getReg(0); - Register Cond = Sel.getCondReg(); - Register TrueReg = Sel.getTrueReg(); - Register FalseReg = Sel.getFalseReg(); - - auto *TrueDef = getDefIgnoringCopies(TrueReg, MRI); - auto *FalseDef = getDefIgnoringCopies(FalseReg, MRI); - - const LLT CondTy = MRI.getType(Cond); - const LLT OpTy = MRI.getType(TrueReg); - if (CondTy != OpTy || OpTy.getScalarSizeInBits() != 1) - return false; - - // We have a boolean select. 
- - // select Cond, Cond, F --> or Cond, F - // select Cond, 1, F --> or Cond, F - auto MaybeCstTrue = isConstantOrConstantSplatVector(*TrueDef, MRI); - if (Cond == TrueReg || (MaybeCstTrue && MaybeCstTrue->isOne())) { - MatchInfo = [=](MachineIRBuilder &MIB) { - MIB.buildOr(DstReg, Cond, FalseReg); - }; - return true; - } - - // select Cond, T, Cond --> and Cond, T - // select Cond, T, 0 --> and Cond, T - auto MaybeCstFalse = isConstantOrConstantSplatVector(*FalseDef, MRI); - if (Cond == FalseReg || (MaybeCstFalse && MaybeCstFalse->isZero())) { - MatchInfo = [=](MachineIRBuilder &MIB) { - MIB.buildAnd(DstReg, Cond, TrueReg); - }; - return true; - } - - // select Cond, T, 1 --> or (not Cond), T - if (MaybeCstFalse && MaybeCstFalse->isOne()) { - MatchInfo = [=](MachineIRBuilder &MIB) { - MIB.buildOr(DstReg, MIB.buildNot(OpTy, Cond), TrueReg); - }; - return true; - } - - // select Cond, 0, F --> and (not Cond), F - if (MaybeCstTrue && MaybeCstTrue->isZero()) { - MatchInfo = [=](MachineIRBuilder &MIB) { - MIB.buildAnd(DstReg, MIB.buildNot(OpTy, Cond), FalseReg); - }; - return true; - } - return false; -} - bool CombinerHelper::matchCombineFMinMaxNaN(MachineInstr &MI, unsigned &IdxToPropagate) { bool PropagateNaN; @@ -6318,3 +6262,300 @@ void CombinerHelper::applyCommuteBinOpOperands(MachineInstr &MI) { MI.getOperand(2).setReg(LHSReg); Observer.changedInstr(MI); } + +bool CombinerHelper::isOneOrOneSplat(Register Src, bool AllowUndefs) { + LLT SrcTy = MRI.getType(Src); + if (SrcTy.isFixedVector()) + return isConstantSplatVector(Src, 1, AllowUndefs); + if (SrcTy.isScalar()) { + if (AllowUndefs && getOpcodeDef(Src, MRI) != nullptr) + return true; + auto IConstant = getIConstantVRegValWithLookThrough(Src, MRI); + return IConstant && IConstant->Value == 1; + } + return false; // scalable vector +} + +bool CombinerHelper::isZeroOrZeroSplat(Register Src, bool AllowUndefs) { + LLT SrcTy = MRI.getType(Src); + if (SrcTy.isFixedVector()) + return isConstantSplatVector(Src, 0, AllowUndefs); + if (SrcTy.isScalar()) { + if (AllowUndefs && getOpcodeDef(Src, MRI) != nullptr) + return true; + auto IConstant = getIConstantVRegValWithLookThrough(Src, MRI); + return IConstant && IConstant->Value == 0; + } + return false; // scalable vector +} + +// Ignores COPYs during conformance checks. +// FIXME scalable vectors. +bool CombinerHelper::isConstantSplatVector(Register Src, int64_t SplatValue, + bool AllowUndefs) { + GBuildVector *BuildVector = getOpcodeDef(Src, MRI); + if (!BuildVector) + return false; + unsigned NumSources = BuildVector->getNumSources(); + + for (unsigned I = 0; I < NumSources; ++I) { + GImplicitDef *ImplicitDef = + getOpcodeDef(BuildVector->getSourceReg(I), MRI); + if (ImplicitDef && AllowUndefs) + continue; + if (ImplicitDef && !AllowUndefs) + return false; + std::optional IConstant = + getIConstantVRegValWithLookThrough(BuildVector->getSourceReg(I), MRI); + if (IConstant && IConstant->Value == SplatValue) + continue; + return false; + } + return true; +} + +// Ignores COPYs during lookups. 
+// FIXME scalable vectors +std::optional +CombinerHelper::getConstantOrConstantSplatVector(Register Src) { + auto IConstant = getIConstantVRegValWithLookThrough(Src, MRI); + if (IConstant) + return IConstant->Value; + + GBuildVector *BuildVector = getOpcodeDef(Src, MRI); + if (!BuildVector) + return std::nullopt; + unsigned NumSources = BuildVector->getNumSources(); + + std::optional Value = std::nullopt; + for (unsigned I = 0; I < NumSources; ++I) { + std::optional IConstant = + getIConstantVRegValWithLookThrough(BuildVector->getSourceReg(I), MRI); + if (!IConstant) + return std::nullopt; + if (!Value) + Value = IConstant->Value; + else if (*Value != IConstant->Value) + return std::nullopt; + } + return Value; +} + +// TODO: use knownbits to determine zeros +bool CombinerHelper::tryFoldSelectOfConstants(GSelect *Select, + BuildFnTy &MatchInfo) { + uint32_t Flags = Select->getFlags(); + Register Dest = Select->getReg(0); + Register Cond = Select->getCondReg(); + Register True = Select->getTrueReg(); + Register False = Select->getFalseReg(); + LLT CondTy = MRI.getType(Select->getCondReg()); + LLT TrueTy = MRI.getType(Select->getTrueReg()); + + // We only do this combine for scalar boolean conditions. + if (CondTy != LLT::scalar(1)) + return false; + + // Both are scalars. + std::optional TrueOpt = + getIConstantVRegValWithLookThrough(True, MRI); + std::optional FalseOpt = + getIConstantVRegValWithLookThrough(False, MRI); + + if (!TrueOpt || !FalseOpt) + return false; + + APInt TrueValue = TrueOpt->Value; + APInt FalseValue = FalseOpt->Value; + + // select Cond, 1, 0 --> zext (Cond) + if (TrueValue.isOne() && FalseValue.isZero()) { + MatchInfo = [=](MachineIRBuilder &B) { + B.setInstrAndDebugLoc(*Select); + B.buildZExtOrTrunc(Dest, Cond); + }; + return true; + } + + // select Cond, -1, 0 --> sext (Cond) + if (TrueValue.isAllOnes() && FalseValue.isZero()) { + MatchInfo = [=](MachineIRBuilder &B) { + B.setInstrAndDebugLoc(*Select); + B.buildSExtOrTrunc(Dest, Cond); + }; + return true; + } + + // select Cond, 0, 1 --> zext (!Cond) + if (TrueValue.isZero() && FalseValue.isOne()) { + MatchInfo = [=](MachineIRBuilder &B) { + B.setInstrAndDebugLoc(*Select); + Register Inner = MRI.createGenericVirtualRegister(CondTy); + B.buildNot(Inner, Cond); + B.buildZExtOrTrunc(Dest, Inner); + }; + return true; + } + + // select Cond, 0, -1 --> sext (!Cond) + if (TrueValue.isZero() && FalseValue.isAllOnes()) { + MatchInfo = [=](MachineIRBuilder &B) { + B.setInstrAndDebugLoc(*Select); + Register Inner = MRI.createGenericVirtualRegister(CondTy); + B.buildNot(Inner, Cond); + B.buildSExtOrTrunc(Dest, Inner); + }; + return true; + } + + // select Cond, C1, C1-1 --> add (zext Cond), C1-1 + if (TrueValue - 1 == FalseValue) { + MatchInfo = [=](MachineIRBuilder &B) { + B.setInstrAndDebugLoc(*Select); + Register Inner = MRI.createGenericVirtualRegister(TrueTy); + B.buildZExtOrTrunc(Inner, Cond); + B.buildAdd(Dest, Inner, False); + }; + return true; + } + + // select Cond, C1, C1+1 --> add (sext Cond), C1+1 + if (TrueValue + 1 == FalseValue) { + MatchInfo = [=](MachineIRBuilder &B) { + B.setInstrAndDebugLoc(*Select); + Register Inner = MRI.createGenericVirtualRegister(TrueTy); + B.buildSExtOrTrunc(Inner, Cond); + B.buildAdd(Dest, Inner, False); + }; + return true; + } + + // select Cond, Pow2, 0 --> (zext Cond) << log2(Pow2) + if (TrueValue.isPowerOf2() && FalseValue.isZero()) { + MatchInfo = [=](MachineIRBuilder &B) { + B.setInstrAndDebugLoc(*Select); + Register Inner = MRI.createGenericVirtualRegister(TrueTy); + 
B.buildZExtOrTrunc(Inner, Cond); + // The shift amount must be scalar. + LLT ShiftTy = TrueTy.isVector() ? TrueTy.getElementType() : TrueTy; + auto ShAmtC = B.buildConstant(ShiftTy, TrueValue.exactLogBase2()); + B.buildShl(Dest, Inner, ShAmtC, Flags); + }; + return true; + } + // select Cond, -1, C --> or (sext Cond), C + if (TrueValue.isAllOnes()) { + MatchInfo = [=](MachineIRBuilder &B) { + B.setInstrAndDebugLoc(*Select); + Register Inner = MRI.createGenericVirtualRegister(TrueTy); + B.buildSExtOrTrunc(Inner, Cond); + B.buildOr(Dest, Inner, False, Flags); + }; + return true; + } + + // select Cond, C, -1 --> or (sext (not Cond)), C + if (FalseValue.isAllOnes()) { + MatchInfo = [=](MachineIRBuilder &B) { + B.setInstrAndDebugLoc(*Select); + Register Not = MRI.createGenericVirtualRegister(CondTy); + B.buildNot(Not, Cond); + Register Inner = MRI.createGenericVirtualRegister(TrueTy); + B.buildSExtOrTrunc(Inner, Not); + B.buildOr(Dest, Inner, True, Flags); + }; + return true; + } + + return false; +} + +// TODO: use knownbits to determine zeros +bool CombinerHelper::tryFoldBoolSelectToLogic(GSelect *Select, + BuildFnTy &MatchInfo) { + uint32_t Flags = Select->getFlags(); + Register DstReg = Select->getReg(0); + Register Cond = Select->getCondReg(); + Register True = Select->getTrueReg(); + Register False = Select->getFalseReg(); + LLT CondTy = MRI.getType(Select->getCondReg()); + LLT TrueTy = MRI.getType(Select->getTrueReg()); + + // Boolean or fixed vector of booleans. + if (CondTy.isScalableVector() || + (CondTy.isFixedVector() && + CondTy.getElementType().getScalarSizeInBits() != 1) || + CondTy.getScalarSizeInBits() != 1) + return false; + + if (CondTy != TrueTy) + return false; + + // select Cond, Cond, F --> or Cond, F + // select Cond, 1, F --> or Cond, F + if ((Cond == True) || isOneOrOneSplat(True, /* AllowUndefs */ true)) { + MatchInfo = [=](MachineIRBuilder &B) { + B.setInstrAndDebugLoc(*Select); + Register Ext = MRI.createGenericVirtualRegister(TrueTy); + B.buildZExtOrTrunc(Ext, Cond); + B.buildOr(DstReg, Ext, False, Flags); + }; + return true; + } + + // select Cond, T, Cond --> and Cond, T + // select Cond, T, 0 --> and Cond, T + if ((Cond == False) || isZeroOrZeroSplat(False, /* AllowUndefs */ true)) { + MatchInfo = [=](MachineIRBuilder &B) { + B.setInstrAndDebugLoc(*Select); + Register Ext = MRI.createGenericVirtualRegister(TrueTy); + B.buildZExtOrTrunc(Ext, Cond); + B.buildAnd(DstReg, Ext, True); + }; + return true; + } + + // select Cond, T, 1 --> or (not Cond), T + if (isOneOrOneSplat(False, /* AllowUndefs */ true)) { + MatchInfo = [=](MachineIRBuilder &B) { + B.setInstrAndDebugLoc(*Select); + // First the not. + Register Inner = MRI.createGenericVirtualRegister(CondTy); + B.buildNot(Inner, Cond); + // Then an ext to match the destination register. + Register Ext = MRI.createGenericVirtualRegister(TrueTy); + B.buildZExtOrTrunc(Ext, Inner); + B.buildOr(DstReg, Ext, True, Flags); + }; + return true; + } + + // select Cond, 0, F --> and (not Cond), F + if (isZeroOrZeroSplat(True, /* AllowUndefs */ true)) { + MatchInfo = [=](MachineIRBuilder &B) { + B.setInstrAndDebugLoc(*Select); + // First the not. + Register Inner = MRI.createGenericVirtualRegister(CondTy); + B.buildNot(Inner, Cond); + // Then an ext to match the destination register. 
+ Register Ext = MRI.createGenericVirtualRegister(TrueTy); + B.buildZExtOrTrunc(Ext, Inner); + B.buildAnd(DstReg, Ext, False); + }; + return true; + } + + return false; +} + +bool CombinerHelper::matchSelect(MachineInstr &MI, BuildFnTy &MatchInfo) { + GSelect *Select = cast(&MI); + + if (tryFoldSelectOfConstants(Select, MatchInfo)) + return true; + + if (tryFoldBoolSelectToLogic(Select, MatchInfo)) + return true; + + return false; +} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir index 81d38a5b08047..be2de620fa456 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir @@ -298,3 +298,249 @@ body: | %ext:_(s32) = G_ANYEXT %sel $w0 = COPY %ext(s32) ... +--- +# select cond, 1, 0 --> zext(Cond) +name: select_cond_1_0_to_zext_cond +body: | + bb.1: + liveins: $x0, $x1, $x2 + ; CHECK-LABEL: name: select_cond_1_0_to_zext_cond + ; CHECK: liveins: $x0, $x1, $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %c(s1) + ; CHECK-NEXT: $w0 = COPY %ext(s32) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %c:_(s1) = G_TRUNC %0 + %t:_(s1) = G_TRUNC %1 + %f:_(s1) = G_TRUNC %2 + %zero:_(s1) = G_CONSTANT i1 0 + %one:_(s1) = G_CONSTANT i1 1 + %sel:_(s1) = G_SELECT %c, %one, %zero + %ext:_(s32) = G_ANYEXT %sel + $w0 = COPY %ext(s32) +... +--- +# select cond, 0, 1 --> zext(!Cond) +name: select_cond_0_1_to_sext_not_cond +body: | + bb.1: + liveins: $x0, $x1, $x2 + ; CHECK-LABEL: name: select_cond_0_1_to_sext_not_cond + ; CHECK: liveins: $x0, $x1, $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: %one:_(s1) = G_CONSTANT i1 true + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR %c, %one + ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT [[XOR]](s1) + ; CHECK-NEXT: $w0 = COPY %ext(s32) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %c:_(s1) = G_TRUNC %0 + %t:_(s1) = G_TRUNC %1 + %f:_(s1) = G_TRUNC %2 + %zero:_(s1) = G_CONSTANT i1 0 + %one:_(s1) = G_CONSTANT i1 1 + %sel:_(s1) = G_SELECT %c, %zero, %one + %ext:_(s32) = G_ANYEXT %sel + $w0 = COPY %ext(s32) +... +--- +# select cond, 2, 1 --> and (zext Cond), false +name: select_cond_2_1_to_and_zext_cond_false +body: | + bb.1: + liveins: $x0, $x1, $x2 + ; CHECK-LABEL: name: select_cond_2_1_to_and_zext_cond_false + ; CHECK: liveins: $x0, $x1, $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: %one:_(s8) = G_CONSTANT i8 101 + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s8) = G_ZEXT %c(s1) + ; CHECK-NEXT: %sel:_(s8) = G_ADD [[ZEXT]], %one + ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s8) + ; CHECK-NEXT: $w0 = COPY %ext(s32) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %c:_(s1) = G_TRUNC %0 + %t:_(s1) = G_TRUNC %1 + %f:_(s1) = G_TRUNC %2 + %two:_(s8) = G_CONSTANT i8 102 + %one:_(s8) = G_CONSTANT i8 101 + %sel:_(s8) = G_SELECT %c, %two, %one + %ext:_(s32) = G_ANYEXT %sel + $w0 = COPY %ext(s32) +... 
+--- +# select cond, 1, 2 --> and (ext Cond), false +name: select_cond_1_2_to_and_sext_cond_false +body: | + bb.1: + liveins: $x0, $x1, $x2 + ; CHECK-LABEL: name: select_cond_1_2_to_and_sext_cond_false + ; CHECK: liveins: $x0, $x1, $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: %one:_(s8) = G_CONSTANT i8 102 + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s8) = G_SEXT %c(s1) + ; CHECK-NEXT: %sel:_(s8) = G_ADD [[SEXT]], %one + ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s8) + ; CHECK-NEXT: $w0 = COPY %ext(s32) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %c:_(s1) = G_TRUNC %0 + %t:_(s1) = G_TRUNC %1 + %f:_(s1) = G_TRUNC %2 + %two:_(s8) = G_CONSTANT i8 101 + %one:_(s8) = G_CONSTANT i8 102 + %sel:_(s8) = G_SELECT %c, %two, %one + %ext:_(s32) = G_ANYEXT %sel + $w0 = COPY %ext(s32) +... +--- +# select cond, 64, 0 --> (zext Cond) << log2(Pow2) +name: select_cond_64_0_to_shift +body: | + bb.1: + liveins: $x0, $x1, $x2 + ; CHECK-LABEL: name: select_cond_64_0_to_shift + ; CHECK: liveins: $x0, $x1, $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s8) = G_ZEXT %c(s1) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 6 + ; CHECK-NEXT: %sel:_(s8) = G_SHL [[ZEXT]], [[C]](s8) + ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s8) + ; CHECK-NEXT: $w0 = COPY %ext(s32) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %c:_(s1) = G_TRUNC %0 + %t:_(s1) = G_TRUNC %1 + %f:_(s1) = G_TRUNC %2 + %two:_(s8) = G_CONSTANT i8 64 + %one:_(s8) = G_CONSTANT i8 0 + %sel:_(s8) = G_SELECT %c, %two, %one + %ext:_(s32) = G_ANYEXT %sel + $w0 = COPY %ext(s32) +... +--- +# select cond, -1, 0 --> sext Cond +name: select_cond_minus_1_0_to_sext_cond +body: | + bb.1: + liveins: $x0, $x1, $x2 + ; CHECK-LABEL: name: select_cond_minus_1_0_to_sext_cond + ; CHECK: liveins: $x0, $x1, $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: %ext:_(s32) = G_SEXT %c(s1) + ; CHECK-NEXT: $w0 = COPY %ext(s32) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %c:_(s1) = G_TRUNC %0 + %t:_(s1) = G_TRUNC %1 + %f:_(s1) = G_TRUNC %2 + %two:_(s8) = G_CONSTANT i8 255 + %one:_(s8) = G_CONSTANT i8 0 + %sel:_(s8) = G_SELECT %c, %two, %one + %ext:_(s32) = G_ANYEXT %sel + $w0 = COPY %ext(s32) +... +--- +# select cond, 0, -1 --> sext (!Cond) +name: select_cond_0_minus_1_to_sext_not_cond +body: | + bb.1: + liveins: $x0, $x1, $x2 + ; CHECK-LABEL: name: select_cond_0_minus_1_to_sext_not_cond + ; CHECK: liveins: $x0, $x1, $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR %c, [[C]] + ; CHECK-NEXT: %ext:_(s32) = G_SEXT [[XOR]](s1) + ; CHECK-NEXT: $w0 = COPY %ext(s32) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %c:_(s1) = G_TRUNC %0 + %t:_(s1) = G_TRUNC %1 + %f:_(s1) = G_TRUNC %2 + %two:_(s8) = G_CONSTANT i8 0 + %one:_(s8) = G_CONSTANT i8 255 + %sel:_(s8) = G_SELECT %c, %two, %one + %ext:_(s32) = G_ANYEXT %sel + $w0 = COPY %ext(s32) +... 
+--- +# select cond, -1, 101 --> or (sext Cond), 101 +name: select_cond_minus_1_101_to_or_sext_cond_101 +body: | + bb.1: + liveins: $x0, $x1, $x2 + ; CHECK-LABEL: name: select_cond_minus_1_101_to_or_sext_cond_101 + ; CHECK: liveins: $x0, $x1, $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: %one:_(s8) = G_CONSTANT i8 101 + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s8) = G_SEXT %c(s1) + ; CHECK-NEXT: %sel:_(s8) = G_OR [[SEXT]], %one + ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s8) + ; CHECK-NEXT: $w0 = COPY %ext(s32) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %c:_(s1) = G_TRUNC %0 + %t:_(s1) = G_TRUNC %1 + %f:_(s1) = G_TRUNC %2 + %two:_(s8) = G_CONSTANT i8 255 + %one:_(s8) = G_CONSTANT i8 101 + %sel:_(s8) = G_SELECT %c, %two, %one + %ext:_(s32) = G_ANYEXT %sel + $w0 = COPY %ext(s32) +... +--- +# select cond, 101, -1 --> or (sext (not Cond), 101 +name: select_cond_101_minus_1_to_or_sext_not_cond_101 +body: | + bb.1: + liveins: $x0, $x1, $x2 + ; CHECK-LABEL: name: select_cond_101_minus_1_to_or_sext_not_cond_101 + ; CHECK: liveins: $x0, $x1, $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: %two:_(s8) = G_CONSTANT i8 101 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR %c, [[C]] + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s8) = G_SEXT [[XOR]](s1) + ; CHECK-NEXT: %sel:_(s8) = G_OR [[SEXT]], %two + ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s8) + ; CHECK-NEXT: $w0 = COPY %ext(s32) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %c:_(s1) = G_TRUNC %0 + %t:_(s1) = G_TRUNC %1 + %f:_(s1) = G_TRUNC %2 + %two:_(s8) = G_CONSTANT i8 101 + %one:_(s8) = G_CONSTANT i8 255 + %sel:_(s8) = G_SELECT %c, %two, %one + %ext:_(s32) = G_ANYEXT %sel + $w0 = COPY %ext(s32) +... 
diff --git a/llvm/test/CodeGen/AArch64/andcompare.ll b/llvm/test/CodeGen/AArch64/andcompare.ll index 9a7fa04982990..cbacd17c846d4 100644 --- a/llvm/test/CodeGen/AArch64/andcompare.ll +++ b/llvm/test/CodeGen/AArch64/andcompare.ll @@ -2451,7 +2451,7 @@ define i32 @cmp_to_ands3(i32 %num, i32 %a) { ; ; GISEL-LABEL: cmp_to_ands3: ; GISEL: // %bb.0: -; GISEL-NEXT: mov w8, #23 +; GISEL-NEXT: mov w8, #23 // =0x17 ; GISEL-NEXT: and w8, w0, w8 ; GISEL-NEXT: cmp w8, #7 ; GISEL-NEXT: csel w0, w1, wzr, hi diff --git a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll index 821f6e403a271..446526986b883 100644 --- a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll +++ b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll @@ -14,7 +14,7 @@ define i32 @single_same(i32 %a, i32 %b) nounwind ssp { ; CHECK-NEXT: bl _foo ; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload ; CHECK-NEXT: LBB0_2: ; %if.end -; CHECK-NEXT: mov w0, #7 +; CHECK-NEXT: mov w0, #7 ; =0x7 ; CHECK-NEXT: ret entry: %cmp = icmp eq i32 %a, 5 @@ -42,7 +42,7 @@ define i32 @single_different(i32 %a, i32 %b) nounwind ssp { ; SDISEL-NEXT: bl _foo ; SDISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload ; SDISEL-NEXT: LBB1_2: ; %if.end -; SDISEL-NEXT: mov w0, #7 +; SDISEL-NEXT: mov w0, #7 ; =0x7 ; SDISEL-NEXT: ret ; ; GISEL-LABEL: single_different: @@ -55,7 +55,7 @@ define i32 @single_different(i32 %a, i32 %b) nounwind ssp { ; GISEL-NEXT: bl _foo ; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload ; GISEL-NEXT: LBB1_2: ; %if.end -; GISEL-NEXT: mov w0, #7 +; GISEL-NEXT: mov w0, #7 ; =0x7 ; GISEL-NEXT: ret entry: %cmp = icmp sle i32 %a, 5 @@ -88,7 +88,7 @@ define i32 @single_flagclobber(i32 %a, i32 %b) nounwind ssp { ; SDISEL-NEXT: bl _foo ; SDISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload ; SDISEL-NEXT: LBB2_3: ; %if.end -; SDISEL-NEXT: mov w0, #7 +; SDISEL-NEXT: mov w0, #7 ; =0x7 ; SDISEL-NEXT: ret ; ; GISEL-LABEL: single_flagclobber: @@ -106,7 +106,7 @@ define i32 @single_flagclobber(i32 %a, i32 %b) nounwind ssp { ; GISEL-NEXT: bl _foo ; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload ; GISEL-NEXT: LBB2_3: ; %if.end -; GISEL-NEXT: mov w0, #7 +; GISEL-NEXT: mov w0, #7 ; =0x7 ; GISEL-NEXT: ret entry: %cmp = icmp eq i32 %a, 5 @@ -144,7 +144,7 @@ define i32 @single_flagclobber_tbz(i32 %a, i32 %b) nounwind ssp { ; CHECK-NEXT: bl _foo ; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload ; CHECK-NEXT: LBB3_3: ; %if.end -; CHECK-NEXT: mov w0, #7 +; CHECK-NEXT: mov w0, #7 ; =0x7 ; CHECK-NEXT: ret entry: %cmp = icmp eq i32 %a, 5 @@ -178,13 +178,13 @@ define i32 @speculate_division(i32 %a, i32 %b) nounwind ssp { ; SDISEL-NEXT: ccmp w8, #16, #0, ge ; SDISEL-NEXT: b.le LBB4_2 ; SDISEL-NEXT: ; %bb.1: ; %if.end -; SDISEL-NEXT: mov w0, #7 +; SDISEL-NEXT: mov w0, #7 ; =0x7 ; SDISEL-NEXT: ret ; SDISEL-NEXT: LBB4_2: ; %if.then ; SDISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill ; SDISEL-NEXT: bl _foo ; SDISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload -; SDISEL-NEXT: mov w0, #7 +; SDISEL-NEXT: mov w0, #7 ; =0x7 ; SDISEL-NEXT: ret ; ; GISEL-LABEL: speculate_division: @@ -194,13 +194,13 @@ define i32 @speculate_division(i32 %a, i32 %b) nounwind ssp { ; GISEL-NEXT: ccmp w8, #17, #0, gt ; GISEL-NEXT: b.lt LBB4_2 ; GISEL-NEXT: ; %bb.1: ; %if.end -; GISEL-NEXT: mov w0, #7 +; GISEL-NEXT: mov w0, #7 ; =0x7 ; GISEL-NEXT: ret ; GISEL-NEXT: LBB4_2: ; %if.then ; GISEL-NEXT: stp x29, x30, [sp, #-16]! 
; 16-byte Folded Spill ; GISEL-NEXT: bl _foo ; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload -; GISEL-NEXT: mov w0, #7 +; GISEL-NEXT: mov w0, #7 ; =0x7 ; GISEL-NEXT: ret entry: %cmp = icmp sgt i32 %a, 0 @@ -230,13 +230,13 @@ define i32 @single_fcmp(i32 %a, float %b) nounwind ssp { ; SDISEL-NEXT: fccmp s0, s1, #8, ge ; SDISEL-NEXT: b.ge LBB5_2 ; SDISEL-NEXT: ; %bb.1: ; %if.end -; SDISEL-NEXT: mov w0, #7 +; SDISEL-NEXT: mov w0, #7 ; =0x7 ; SDISEL-NEXT: ret ; SDISEL-NEXT: LBB5_2: ; %if.then ; SDISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill ; SDISEL-NEXT: bl _foo ; SDISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload -; SDISEL-NEXT: mov w0, #7 +; SDISEL-NEXT: mov w0, #7 ; =0x7 ; SDISEL-NEXT: ret ; ; GISEL-LABEL: single_fcmp: @@ -248,13 +248,13 @@ define i32 @single_fcmp(i32 %a, float %b) nounwind ssp { ; GISEL-NEXT: fccmp s0, s1, #8, gt ; GISEL-NEXT: b.ge LBB5_2 ; GISEL-NEXT: ; %bb.1: ; %if.end -; GISEL-NEXT: mov w0, #7 +; GISEL-NEXT: mov w0, #7 ; =0x7 ; GISEL-NEXT: ret ; GISEL-NEXT: LBB5_2: ; %if.then ; GISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill ; GISEL-NEXT: bl _foo ; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload -; GISEL-NEXT: mov w0, #7 +; GISEL-NEXT: mov w0, #7 ; =0x7 ; GISEL-NEXT: ret entry: %cmp = icmp sgt i32 %a, 0 @@ -318,7 +318,7 @@ define i32 @cbz_head(i32 %a, i32 %b) nounwind ssp { ; CHECK-NEXT: bl _foo ; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload ; CHECK-NEXT: LBB7_2: ; %if.end -; CHECK-NEXT: mov w0, #7 +; CHECK-NEXT: mov w0, #7 ; =0x7 ; CHECK-NEXT: ret entry: %cmp = icmp eq i32 %a, 0 @@ -346,13 +346,13 @@ define i32 @immediate_range(i32 %a, i32 %b) nounwind ssp { ; CHECK-NEXT: cmp w1, #32 ; CHECK-NEXT: b.eq LBB8_3 ; CHECK-NEXT: ; %bb.2: ; %if.end -; CHECK-NEXT: mov w0, #7 +; CHECK-NEXT: mov w0, #7 ; =0x7 ; CHECK-NEXT: ret ; CHECK-NEXT: LBB8_3: ; %if.then ; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
; 16-byte Folded Spill ; CHECK-NEXT: bl _foo ; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload -; CHECK-NEXT: mov w0, #7 +; CHECK-NEXT: mov w0, #7 ; =0x7 ; CHECK-NEXT: ret entry: %cmp = icmp eq i32 %a, 5 @@ -380,7 +380,7 @@ define i32 @cbz_second(i32 %a, i32 %b) nounwind ssp { ; CHECK-NEXT: bl _foo ; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload ; CHECK-NEXT: LBB9_2: ; %if.end -; CHECK-NEXT: mov w0, #7 +; CHECK-NEXT: mov w0, #7 ; =0x7 ; CHECK-NEXT: ret entry: %cmp = icmp eq i32 %a, 0 @@ -408,7 +408,7 @@ define i32 @cbnz_second(i32 %a, i32 %b) nounwind ssp { ; CHECK-NEXT: bl _foo ; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload ; CHECK-NEXT: LBB10_2: ; %if.end -; CHECK-NEXT: mov w0, #7 +; CHECK-NEXT: mov w0, #7 ; =0x7 ; CHECK-NEXT: ret entry: %cmp = icmp eq i32 %a, 0 @@ -466,7 +466,7 @@ define i64 @select_and(i32 %w0, i32 %w1, i64 %x2, i64 %x3) { ; ; GISEL-LABEL: select_and: ; GISEL: ; %bb.0: -; GISEL-NEXT: mov w8, #5 +; GISEL-NEXT: mov w8, #5 ; =0x5 ; GISEL-NEXT: cmp w8, w1 ; GISEL-NEXT: ccmp w0, w1, #0, ne ; GISEL-NEXT: csel x0, x2, x3, lt @@ -488,7 +488,7 @@ define i64 @select_or(i32 %w0, i32 %w1, i64 %x2, i64 %x3) { ; ; GISEL-LABEL: select_or: ; GISEL: ; %bb.0: -; GISEL-NEXT: mov w8, #5 +; GISEL-NEXT: mov w8, #5 ; =0x5 ; GISEL-NEXT: cmp w8, w1 ; GISEL-NEXT: ccmp w0, w1, #8, eq ; GISEL-NEXT: csel x0, x2, x3, lt @@ -510,7 +510,7 @@ define float @select_or_float(i32 %w0, i32 %w1, float %x2, float %x3) { ; ; GISEL-LABEL: select_or_float: ; GISEL: ; %bb.0: -; GISEL-NEXT: mov w8, #5 +; GISEL-NEXT: mov w8, #5 ; =0x5 ; GISEL-NEXT: cmp w8, w1 ; GISEL-NEXT: ccmp w0, w1, #8, eq ; GISEL-NEXT: fcsel s0, s0, s1, lt @@ -528,17 +528,22 @@ define i64 @gccbug(i64 %x0, i64 %x1) { ; SDISEL-NEXT: cmp x0, #2 ; SDISEL-NEXT: ccmp x0, #4, #4, ne ; SDISEL-NEXT: ccmp x1, #0, #0, eq -; SDISEL-NEXT: mov w8, #1 +; SDISEL-NEXT: mov w8, #1 ; =0x1 ; SDISEL-NEXT: cinc x0, x8, eq ; SDISEL-NEXT: ret ; ; GISEL-LABEL: gccbug: ; GISEL: ; %bb.0: -; GISEL-NEXT: mov w8, #2 +; GISEL-NEXT: cmp x1, #0 +; GISEL-NEXT: cset w8, eq ; GISEL-NEXT: cmp x0, #2 -; GISEL-NEXT: ccmp x0, #4, #4, ne -; GISEL-NEXT: ccmp x1, #0, #0, eq -; GISEL-NEXT: csinc x0, x8, xzr, eq +; GISEL-NEXT: cset w9, eq +; GISEL-NEXT: cmp x0, #4 +; GISEL-NEXT: cset w10, eq +; GISEL-NEXT: orr w9, w10, w9 +; GISEL-NEXT: and w8, w9, w8 +; GISEL-NEXT: and x8, x8, #0x1 +; GISEL-NEXT: add x0, x8, #1 ; GISEL-NEXT: ret %cmp0 = icmp eq i64 %x1, 0 %cmp1 = icmp eq i64 %x0, 2 @@ -592,7 +597,7 @@ define i32 @select_andor32(i32 %v1, i32 %v2, i32 %v3) { ; SDISEL-LABEL: select_andor32: ; SDISEL: ; %bb.0: ; SDISEL-NEXT: cmp w1, w2 -; SDISEL-NEXT: mov w8, #32 +; SDISEL-NEXT: mov w8, #32 ; =0x20 ; SDISEL-NEXT: ccmp w0, w8, #4, lt ; SDISEL-NEXT: ccmp w0, w1, #0, eq ; SDISEL-NEXT: csel w0, w0, w1, eq @@ -600,7 +605,7 @@ define i32 @select_andor32(i32 %v1, i32 %v2, i32 %v3) { ; ; GISEL-LABEL: select_andor32: ; GISEL: ; %bb.0: -; GISEL-NEXT: mov w8, #32 +; GISEL-NEXT: mov w8, #32 ; =0x20 ; GISEL-NEXT: cmp w1, w2 ; GISEL-NEXT: ccmp w0, w8, #4, lt ; GISEL-NEXT: ccmp w0, w1, #0, eq @@ -701,11 +706,11 @@ define i32 @select_noccmp3(i32 %v0, i32 %v1, i32 %v2) { ; SDISEL-NEXT: ccmp w0, #13, #0, ge ; SDISEL-NEXT: cset w8, gt ; SDISEL-NEXT: cmp w0, #22 -; SDISEL-NEXT: mov w9, #44 +; SDISEL-NEXT: mov w9, #44 ; =0x2c ; SDISEL-NEXT: ccmp w0, w9, #0, ge ; SDISEL-NEXT: csel w8, wzr, w8, le ; SDISEL-NEXT: cmp w0, #99 -; SDISEL-NEXT: mov w9, #77 +; SDISEL-NEXT: mov w9, #77 ; =0x4d ; SDISEL-NEXT: ccmp w0, w9, #4, ne ; SDISEL-NEXT: cset w9, eq ; SDISEL-NEXT: tst w8, w9 
diff --git a/llvm/test/CodeGen/AArch64/call-rv-marker.ll b/llvm/test/CodeGen/AArch64/call-rv-marker.ll index fc06809ad09fb..de8f5bbfb484d 100644 --- a/llvm/test/CodeGen/AArch64/call-rv-marker.ll +++ b/llvm/test/CodeGen/AArch64/call-rv-marker.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -o - %s | FileCheck --check-prefix=SELDAG --check-prefix=CHECK %s ; RUN: llc -global-isel -o - %s | FileCheck --check-prefix=GISEL --check-prefix=CHECK %s @@ -25,37 +26,93 @@ declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) @fptr = dso_local global ptr null, align 8 define dso_local ptr @rv_marker_1_retain() { -; CHECK-LABEL: _rv_marker_1_retain: -; CHECK: bl _foo1 -; CHECK-NEXT: mov x29, x29 -; CHECK-NEXT: bl _objc_retainAutoreleasedReturnValue +; SELDAG-LABEL: rv_marker_1_retain: +; SELDAG: ; %bb.0: ; %entry +; SELDAG-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; SELDAG-NEXT: .cfi_def_cfa_offset 16 +; SELDAG-NEXT: .cfi_offset w30, -8 +; SELDAG-NEXT: .cfi_offset w29, -16 +; SELDAG-NEXT: bl _foo1 +; SELDAG-NEXT: mov x29, x29 +; SELDAG-NEXT: bl _objc_retainAutoreleasedReturnValue +; SELDAG-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; SELDAG-NEXT: ret ; +; GISEL-LABEL: rv_marker_1_retain: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; GISEL-NEXT: .cfi_def_cfa_offset 16 +; GISEL-NEXT: .cfi_offset w30, -8 +; GISEL-NEXT: .cfi_offset w29, -16 +; GISEL-NEXT: bl _foo1 +; GISEL-NEXT: mov x29, x29 +; GISEL-NEXT: bl _objc_retainAutoreleasedReturnValue +; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; GISEL-NEXT: ret entry: %call = call ptr @foo1() [ "clang.arc.attachedcall"(ptr @objc_retainAutoreleasedReturnValue) ] ret ptr %call } define dso_local ptr @rv_marker_1_unsafeClaim() { -; CHECK-LABEL: _rv_marker_1_unsafeClaim: -; CHECK: bl _foo1 -; CHECK-NEXT: mov x29, x29 -; CHECK-NEXT: bl _objc_unsafeClaimAutoreleasedReturnValue +; SELDAG-LABEL: rv_marker_1_unsafeClaim: +; SELDAG: ; %bb.0: ; %entry +; SELDAG-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; SELDAG-NEXT: .cfi_def_cfa_offset 16 +; SELDAG-NEXT: .cfi_offset w30, -8 +; SELDAG-NEXT: .cfi_offset w29, -16 +; SELDAG-NEXT: bl _foo1 +; SELDAG-NEXT: mov x29, x29 +; SELDAG-NEXT: bl _objc_unsafeClaimAutoreleasedReturnValue +; SELDAG-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; SELDAG-NEXT: ret ; +; GISEL-LABEL: rv_marker_1_unsafeClaim: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; GISEL-NEXT: .cfi_def_cfa_offset 16 +; GISEL-NEXT: .cfi_offset w30, -8 +; GISEL-NEXT: .cfi_offset w29, -16 +; GISEL-NEXT: bl _foo1 +; GISEL-NEXT: mov x29, x29 +; GISEL-NEXT: bl _objc_unsafeClaimAutoreleasedReturnValue +; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; GISEL-NEXT: ret entry: %call = call ptr @foo1() [ "clang.arc.attachedcall"(ptr @objc_unsafeClaimAutoreleasedReturnValue) ] ret ptr %call } define dso_local void @rv_marker_2_select(i32 %c) { -; CHECK-LABEL: _rv_marker_2_select: -; SELDAG: cinc w0, w8, eq -; GISEL: csinc w0, w8, wzr, eq -; CHECK-NEXT: bl _foo0 -; CHECK-NEXT: mov x29, x29 -; CHECK-NEXT: bl _objc_retainAutoreleasedReturnValue -; CHECK-NEXT: ldp x29, x30, [sp], #16 -; CHECK-NEXT: b _foo2 +; SELDAG-LABEL: rv_marker_2_select: +; SELDAG: ; %bb.0: ; %entry +; SELDAG-NEXT: stp x29, x30, [sp, #-16]! 
; 16-byte Folded Spill +; SELDAG-NEXT: .cfi_def_cfa_offset 16 +; SELDAG-NEXT: .cfi_offset w30, -8 +; SELDAG-NEXT: .cfi_offset w29, -16 +; SELDAG-NEXT: mov w8, #1 ; =0x1 +; SELDAG-NEXT: cmp w0, #0 +; SELDAG-NEXT: cinc w0, w8, eq +; SELDAG-NEXT: bl _foo0 +; SELDAG-NEXT: mov x29, x29 +; SELDAG-NEXT: bl _objc_retainAutoreleasedReturnValue +; SELDAG-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; SELDAG-NEXT: b _foo2 ; +; GISEL-LABEL: rv_marker_2_select: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; GISEL-NEXT: .cfi_def_cfa_offset 16 +; GISEL-NEXT: .cfi_offset w30, -8 +; GISEL-NEXT: .cfi_offset w29, -16 +; GISEL-NEXT: mov w8, #1 ; =0x1 +; GISEL-NEXT: cmp w0, #0 +; GISEL-NEXT: cinc w0, w8, eq +; GISEL-NEXT: bl _foo0 +; GISEL-NEXT: mov x29, x29 +; GISEL-NEXT: bl _objc_retainAutoreleasedReturnValue +; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; GISEL-NEXT: b _foo2 entry: %tobool.not = icmp eq i32 %c, 0 %.sink = select i1 %tobool.not, i32 2, i32 1 @@ -65,11 +122,121 @@ entry: } define dso_local void @rv_marker_3() personality ptr @__gxx_personality_v0 { -; CHECK-LABEL: _rv_marker_3: -; CHECK: bl _foo1 -; CHECK-NEXT: mov x29, x29 -; CHECK-NEXT: bl _objc_retainAutoreleasedReturnValue +; SELDAG-LABEL: rv_marker_3: +; SELDAG: Lfunc_begin0: +; SELDAG-NEXT: .cfi_startproc +; SELDAG-NEXT: .cfi_personality 155, ___gxx_personality_v0 +; SELDAG-NEXT: .cfi_lsda 16, Lexception0 +; SELDAG-NEXT: ; %bb.0: ; %entry +; SELDAG-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill +; SELDAG-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; SELDAG-NEXT: .cfi_def_cfa_offset 32 +; SELDAG-NEXT: .cfi_offset w30, -8 +; SELDAG-NEXT: .cfi_offset w29, -16 +; SELDAG-NEXT: .cfi_offset w19, -24 +; SELDAG-NEXT: .cfi_offset w20, -32 +; SELDAG-NEXT: bl _foo1 +; SELDAG-NEXT: mov x29, x29 +; SELDAG-NEXT: bl _objc_retainAutoreleasedReturnValue +; SELDAG-NEXT: mov x19, x0 +; SELDAG-NEXT: Ltmp0: +; SELDAG-NEXT: bl _objc_object +; SELDAG-NEXT: Ltmp1: +; SELDAG-NEXT: ; %bb.1: ; %invoke.cont +; SELDAG-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; SELDAG-NEXT: mov x0, x19 +; SELDAG-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload +; SELDAG-NEXT: b _objc_release +; SELDAG-NEXT: LBB3_2: ; %lpad +; SELDAG-NEXT: Ltmp2: +; SELDAG-NEXT: mov x20, x0 +; SELDAG-NEXT: mov x0, x19 +; SELDAG-NEXT: bl _objc_release +; SELDAG-NEXT: mov x0, x20 +; SELDAG-NEXT: bl __Unwind_Resume +; SELDAG-NEXT: Lfunc_end0: +; SELDAG-NEXT: .cfi_endproc +; SELDAG-NEXT: .section __TEXT,__gcc_except_tab +; SELDAG-NEXT: .p2align 2, 0x0 +; SELDAG-NEXT: GCC_except_table3: +; SELDAG-NEXT: Lexception0: +; SELDAG-NEXT: .byte 255 ; @LPStart Encoding = omit +; SELDAG-NEXT: .byte 255 ; @TType Encoding = omit +; SELDAG-NEXT: .byte 1 ; Call site Encoding = uleb128 +; SELDAG-NEXT: .uleb128 Lcst_end0-Lcst_begin0 +; SELDAG-NEXT: Lcst_begin0: +; SELDAG-NEXT: .uleb128 Lfunc_begin0-Lfunc_begin0 ; >> Call Site 1 << +; SELDAG-NEXT: .uleb128 Ltmp0-Lfunc_begin0 ; Call between Lfunc_begin0 and Ltmp0 +; SELDAG-NEXT: .byte 0 ; has no landing pad +; SELDAG-NEXT: .byte 0 ; On action: cleanup +; SELDAG-NEXT: .uleb128 Ltmp0-Lfunc_begin0 ; >> Call Site 2 << +; SELDAG-NEXT: .uleb128 Ltmp1-Ltmp0 ; Call between Ltmp0 and Ltmp1 +; SELDAG-NEXT: .uleb128 Ltmp2-Lfunc_begin0 ; jumps to Ltmp2 +; SELDAG-NEXT: .byte 0 ; On action: cleanup +; SELDAG-NEXT: .uleb128 Ltmp1-Lfunc_begin0 ; >> Call Site 3 << +; SELDAG-NEXT: .uleb128 Lfunc_end0-Ltmp1 ; Call between Ltmp1 and Lfunc_end0 +; SELDAG-NEXT: .byte 0 ; has no 
landing pad +; SELDAG-NEXT: .byte 0 ; On action: cleanup +; SELDAG-NEXT: Lcst_end0: +; SELDAG-NEXT: .p2align 2, 0x0 ; +; GISEL-LABEL: rv_marker_3: +; GISEL: Lfunc_begin0: +; GISEL-NEXT: .cfi_startproc +; GISEL-NEXT: .cfi_personality 155, ___gxx_personality_v0 +; GISEL-NEXT: .cfi_lsda 16, Lexception0 +; GISEL-NEXT: ; %bb.0: ; %entry +; GISEL-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill +; GISEL-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; GISEL-NEXT: .cfi_def_cfa_offset 32 +; GISEL-NEXT: .cfi_offset w30, -8 +; GISEL-NEXT: .cfi_offset w29, -16 +; GISEL-NEXT: .cfi_offset w19, -24 +; GISEL-NEXT: .cfi_offset w20, -32 +; GISEL-NEXT: bl _foo1 +; GISEL-NEXT: mov x29, x29 +; GISEL-NEXT: bl _objc_retainAutoreleasedReturnValue +; GISEL-NEXT: mov x19, x0 +; GISEL-NEXT: Ltmp0: +; GISEL-NEXT: bl _objc_object +; GISEL-NEXT: Ltmp1: +; GISEL-NEXT: ; %bb.1: ; %invoke.cont +; GISEL-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; GISEL-NEXT: mov x0, x19 +; GISEL-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload +; GISEL-NEXT: b _objc_release +; GISEL-NEXT: LBB3_2: ; %lpad +; GISEL-NEXT: Ltmp2: +; GISEL-NEXT: mov x20, x0 +; GISEL-NEXT: mov x0, x19 +; GISEL-NEXT: bl _objc_release +; GISEL-NEXT: mov x0, x20 +; GISEL-NEXT: bl __Unwind_Resume +; GISEL-NEXT: Lfunc_end0: +; GISEL-NEXT: .cfi_endproc +; GISEL-NEXT: .section __TEXT,__gcc_except_tab +; GISEL-NEXT: .p2align 2, 0x0 +; GISEL-NEXT: GCC_except_table3: +; GISEL-NEXT: Lexception0: +; GISEL-NEXT: .byte 255 ; @LPStart Encoding = omit +; GISEL-NEXT: .byte 255 ; @TType Encoding = omit +; GISEL-NEXT: .byte 1 ; Call site Encoding = uleb128 +; GISEL-NEXT: .uleb128 Lcst_end0-Lcst_begin0 +; GISEL-NEXT: Lcst_begin0: +; GISEL-NEXT: .uleb128 Lfunc_begin0-Lfunc_begin0 ; >> Call Site 1 << +; GISEL-NEXT: .uleb128 Ltmp0-Lfunc_begin0 ; Call between Lfunc_begin0 and Ltmp0 +; GISEL-NEXT: .byte 0 ; has no landing pad +; GISEL-NEXT: .byte 0 ; On action: cleanup +; GISEL-NEXT: .uleb128 Ltmp0-Lfunc_begin0 ; >> Call Site 2 << +; GISEL-NEXT: .uleb128 Ltmp1-Ltmp0 ; Call between Ltmp0 and Ltmp1 +; GISEL-NEXT: .uleb128 Ltmp2-Lfunc_begin0 ; jumps to Ltmp2 +; GISEL-NEXT: .byte 0 ; On action: cleanup +; GISEL-NEXT: .uleb128 Ltmp1-Lfunc_begin0 ; >> Call Site 3 << +; GISEL-NEXT: .uleb128 Lfunc_end0-Ltmp1 ; Call between Ltmp1 and Lfunc_end0 +; GISEL-NEXT: .byte 0 ; has no landing pad +; GISEL-NEXT: .byte 0 ; On action: cleanup +; GISEL-NEXT: Lcst_end0: +; GISEL-NEXT: .p2align 2, 0x0 entry: %call = call ptr @foo1() [ "clang.arc.attachedcall"(ptr @objc_retainAutoreleasedReturnValue) ] invoke void @objc_object(ptr %call) #5 @@ -87,13 +254,151 @@ lpad: ; preds = %entry } define dso_local void @rv_marker_4() personality ptr @__gxx_personality_v0 { -; CHECK-LABEL: _rv_marker_4: -; CHECK: Ltmp3: -; CHECK-NEXT: bl _foo1 -; CHECK-NEXT: mov x29, x29 -; CHECK-NEXT: bl _objc_retainAutoreleasedReturnValue -; CHECK-NEXT: Ltmp4: +; SELDAG-LABEL: rv_marker_4: +; SELDAG: Lfunc_begin1: +; SELDAG-NEXT: .cfi_startproc +; SELDAG-NEXT: .cfi_personality 155, ___gxx_personality_v0 +; SELDAG-NEXT: .cfi_lsda 16, Lexception1 +; SELDAG-NEXT: ; %bb.0: ; %entry +; SELDAG-NEXT: sub sp, sp, #48 +; SELDAG-NEXT: stp x20, x19, [sp, #16] ; 16-byte Folded Spill +; SELDAG-NEXT: stp x29, x30, [sp, #32] ; 16-byte Folded Spill +; SELDAG-NEXT: .cfi_def_cfa_offset 48 +; SELDAG-NEXT: .cfi_offset w30, -8 +; SELDAG-NEXT: .cfi_offset w29, -16 +; SELDAG-NEXT: .cfi_offset w19, -24 +; SELDAG-NEXT: .cfi_offset w20, -32 +; SELDAG-NEXT: Ltmp3: +; SELDAG-NEXT: bl _foo1 +; SELDAG-NEXT: mov x29, x29 +; SELDAG-NEXT: 
bl _objc_retainAutoreleasedReturnValue +; SELDAG-NEXT: Ltmp4: +; SELDAG-NEXT: ; %bb.1: ; %invoke.cont +; SELDAG-NEXT: Ltmp6: +; SELDAG-NEXT: mov x19, x0 +; SELDAG-NEXT: bl _objc_object +; SELDAG-NEXT: Ltmp7: +; SELDAG-NEXT: ; %bb.2: ; %invoke.cont2 +; SELDAG-NEXT: mov x0, x19 +; SELDAG-NEXT: bl _objc_release +; SELDAG-NEXT: add x0, sp, #15 +; SELDAG-NEXT: bl __ZN1SD1Ev +; SELDAG-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload +; SELDAG-NEXT: ldp x20, x19, [sp, #16] ; 16-byte Folded Reload +; SELDAG-NEXT: add sp, sp, #48 +; SELDAG-NEXT: ret +; SELDAG-NEXT: LBB4_3: ; %lpad1 +; SELDAG-NEXT: Ltmp8: +; SELDAG-NEXT: mov x20, x0 +; SELDAG-NEXT: mov x0, x19 +; SELDAG-NEXT: bl _objc_release +; SELDAG-NEXT: b LBB4_5 +; SELDAG-NEXT: LBB4_4: ; %lpad +; SELDAG-NEXT: Ltmp5: +; SELDAG-NEXT: mov x20, x0 +; SELDAG-NEXT: LBB4_5: ; %ehcleanup +; SELDAG-NEXT: add x0, sp, #15 +; SELDAG-NEXT: bl __ZN1SD1Ev +; SELDAG-NEXT: mov x0, x20 +; SELDAG-NEXT: bl __Unwind_Resume +; SELDAG-NEXT: Lfunc_end1: +; SELDAG-NEXT: .cfi_endproc +; SELDAG-NEXT: .section __TEXT,__gcc_except_tab +; SELDAG-NEXT: .p2align 2, 0x0 +; SELDAG-NEXT: GCC_except_table4: +; SELDAG-NEXT: Lexception1: +; SELDAG-NEXT: .byte 255 ; @LPStart Encoding = omit +; SELDAG-NEXT: .byte 255 ; @TType Encoding = omit +; SELDAG-NEXT: .byte 1 ; Call site Encoding = uleb128 +; SELDAG-NEXT: .uleb128 Lcst_end1-Lcst_begin1 +; SELDAG-NEXT: Lcst_begin1: +; SELDAG-NEXT: .uleb128 Ltmp3-Lfunc_begin1 ; >> Call Site 1 << +; SELDAG-NEXT: .uleb128 Ltmp4-Ltmp3 ; Call between Ltmp3 and Ltmp4 +; SELDAG-NEXT: .uleb128 Ltmp5-Lfunc_begin1 ; jumps to Ltmp5 +; SELDAG-NEXT: .byte 0 ; On action: cleanup +; SELDAG-NEXT: .uleb128 Ltmp6-Lfunc_begin1 ; >> Call Site 2 << +; SELDAG-NEXT: .uleb128 Ltmp7-Ltmp6 ; Call between Ltmp6 and Ltmp7 +; SELDAG-NEXT: .uleb128 Ltmp8-Lfunc_begin1 ; jumps to Ltmp8 +; SELDAG-NEXT: .byte 0 ; On action: cleanup +; SELDAG-NEXT: .uleb128 Ltmp7-Lfunc_begin1 ; >> Call Site 3 << +; SELDAG-NEXT: .uleb128 Lfunc_end1-Ltmp7 ; Call between Ltmp7 and Lfunc_end1 +; SELDAG-NEXT: .byte 0 ; has no landing pad +; SELDAG-NEXT: .byte 0 ; On action: cleanup +; SELDAG-NEXT: Lcst_end1: +; SELDAG-NEXT: .p2align 2, 0x0 ; +; GISEL-LABEL: rv_marker_4: +; GISEL: Lfunc_begin1: +; GISEL-NEXT: .cfi_startproc +; GISEL-NEXT: .cfi_personality 155, ___gxx_personality_v0 +; GISEL-NEXT: .cfi_lsda 16, Lexception1 +; GISEL-NEXT: ; %bb.0: ; %entry +; GISEL-NEXT: sub sp, sp, #48 +; GISEL-NEXT: stp x20, x19, [sp, #16] ; 16-byte Folded Spill +; GISEL-NEXT: stp x29, x30, [sp, #32] ; 16-byte Folded Spill +; GISEL-NEXT: .cfi_def_cfa_offset 48 +; GISEL-NEXT: .cfi_offset w30, -8 +; GISEL-NEXT: .cfi_offset w29, -16 +; GISEL-NEXT: .cfi_offset w19, -24 +; GISEL-NEXT: .cfi_offset w20, -32 +; GISEL-NEXT: Ltmp3: +; GISEL-NEXT: bl _foo1 +; GISEL-NEXT: mov x29, x29 +; GISEL-NEXT: bl _objc_retainAutoreleasedReturnValue +; GISEL-NEXT: Ltmp4: +; GISEL-NEXT: ; %bb.1: ; %invoke.cont +; GISEL-NEXT: Ltmp6: +; GISEL-NEXT: mov x19, x0 +; GISEL-NEXT: bl _objc_object +; GISEL-NEXT: Ltmp7: +; GISEL-NEXT: ; %bb.2: ; %invoke.cont2 +; GISEL-NEXT: mov x0, x19 +; GISEL-NEXT: bl _objc_release +; GISEL-NEXT: add x0, sp, #15 +; GISEL-NEXT: bl __ZN1SD1Ev +; GISEL-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload +; GISEL-NEXT: ldp x20, x19, [sp, #16] ; 16-byte Folded Reload +; GISEL-NEXT: add sp, sp, #48 +; GISEL-NEXT: ret +; GISEL-NEXT: LBB4_3: ; %lpad1 +; GISEL-NEXT: Ltmp8: +; GISEL-NEXT: mov x20, x0 +; GISEL-NEXT: mov x0, x19 +; GISEL-NEXT: bl _objc_release +; GISEL-NEXT: b LBB4_5 +; GISEL-NEXT: LBB4_4: ; %lpad +; 
GISEL-NEXT: Ltmp5: +; GISEL-NEXT: mov x20, x0 +; GISEL-NEXT: LBB4_5: ; %ehcleanup +; GISEL-NEXT: add x0, sp, #15 +; GISEL-NEXT: bl __ZN1SD1Ev +; GISEL-NEXT: mov x0, x20 +; GISEL-NEXT: bl __Unwind_Resume +; GISEL-NEXT: Lfunc_end1: +; GISEL-NEXT: .cfi_endproc +; GISEL-NEXT: .section __TEXT,__gcc_except_tab +; GISEL-NEXT: .p2align 2, 0x0 +; GISEL-NEXT: GCC_except_table4: +; GISEL-NEXT: Lexception1: +; GISEL-NEXT: .byte 255 ; @LPStart Encoding = omit +; GISEL-NEXT: .byte 255 ; @TType Encoding = omit +; GISEL-NEXT: .byte 1 ; Call site Encoding = uleb128 +; GISEL-NEXT: .uleb128 Lcst_end1-Lcst_begin1 +; GISEL-NEXT: Lcst_begin1: +; GISEL-NEXT: .uleb128 Ltmp3-Lfunc_begin1 ; >> Call Site 1 << +; GISEL-NEXT: .uleb128 Ltmp4-Ltmp3 ; Call between Ltmp3 and Ltmp4 +; GISEL-NEXT: .uleb128 Ltmp5-Lfunc_begin1 ; jumps to Ltmp5 +; GISEL-NEXT: .byte 0 ; On action: cleanup +; GISEL-NEXT: .uleb128 Ltmp6-Lfunc_begin1 ; >> Call Site 2 << +; GISEL-NEXT: .uleb128 Ltmp7-Ltmp6 ; Call between Ltmp6 and Ltmp7 +; GISEL-NEXT: .uleb128 Ltmp8-Lfunc_begin1 ; jumps to Ltmp8 +; GISEL-NEXT: .byte 0 ; On action: cleanup +; GISEL-NEXT: .uleb128 Ltmp7-Lfunc_begin1 ; >> Call Site 3 << +; GISEL-NEXT: .uleb128 Lfunc_end1-Ltmp7 ; Call between Ltmp7 and Lfunc_end1 +; GISEL-NEXT: .byte 0 ; has no landing pad +; GISEL-NEXT: .byte 0 ; On action: cleanup +; GISEL-NEXT: Lcst_end1: +; GISEL-NEXT: .p2align 2, 0x0 entry: %s = alloca %struct.S, align 1 call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %s) #2 @@ -129,11 +434,53 @@ ehcleanup: ; preds = %lpad1, %lpad } define dso_local ptr @rv_marker_5_indirect_call() { -; CHECK-LABEL: _rv_marker_5_indirect_call: -; CHECK: ldr [[ADDR:x[0-9]+]], [ -; CHECK-NEXT: blr [[ADDR]] -; CHECK-NEXT: mov x29, x29 -; CHECK-NEXT: bl _objc_retainAutoreleasedReturnValue +; SELDAG-LABEL: rv_marker_5_indirect_call: +; SELDAG: ; %bb.0: ; %entry +; SELDAG-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill +; SELDAG-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; SELDAG-NEXT: .cfi_def_cfa_offset 32 +; SELDAG-NEXT: .cfi_offset w30, -8 +; SELDAG-NEXT: .cfi_offset w29, -16 +; SELDAG-NEXT: .cfi_offset w19, -24 +; SELDAG-NEXT: .cfi_offset w20, -32 +; SELDAG-NEXT: Lloh0: +; SELDAG-NEXT: adrp x8, _fptr@PAGE +; SELDAG-NEXT: Lloh1: +; SELDAG-NEXT: ldr x8, [x8, _fptr@PAGEOFF] +; SELDAG-NEXT: blr x8 +; SELDAG-NEXT: mov x29, x29 +; SELDAG-NEXT: bl _objc_retainAutoreleasedReturnValue +; SELDAG-NEXT: mov x19, x0 +; SELDAG-NEXT: bl _foo2 +; SELDAG-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; SELDAG-NEXT: mov x0, x19 +; SELDAG-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload +; SELDAG-NEXT: ret +; SELDAG-NEXT: .loh AdrpLdr Lloh0, Lloh1 +; +; GISEL-LABEL: rv_marker_5_indirect_call: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: stp x20, x19, [sp, #-32]! 
; 16-byte Folded Spill +; GISEL-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; GISEL-NEXT: .cfi_def_cfa_offset 32 +; GISEL-NEXT: .cfi_offset w30, -8 +; GISEL-NEXT: .cfi_offset w29, -16 +; GISEL-NEXT: .cfi_offset w19, -24 +; GISEL-NEXT: .cfi_offset w20, -32 +; GISEL-NEXT: Lloh0: +; GISEL-NEXT: adrp x8, _fptr@PAGE +; GISEL-NEXT: Lloh1: +; GISEL-NEXT: ldr x8, [x8, _fptr@PAGEOFF] +; GISEL-NEXT: blr x8 +; GISEL-NEXT: mov x29, x29 +; GISEL-NEXT: bl _objc_retainAutoreleasedReturnValue +; GISEL-NEXT: mov x19, x0 +; GISEL-NEXT: bl _foo2 +; GISEL-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; GISEL-NEXT: mov x0, x19 +; GISEL-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload +; GISEL-NEXT: ret +; GISEL-NEXT: .loh AdrpLdr Lloh0, Lloh1 entry: %0 = load ptr, ptr @fptr, align 8 %call = call ptr %0() [ "clang.arc.attachedcall"(ptr @objc_retainAutoreleasedReturnValue) ] @@ -144,13 +491,35 @@ entry: declare ptr @foo(i64, i64, i64) define dso_local void @rv_marker_multiarg(i64 %a, i64 %b, i64 %c) { -; CHECK-LABEL: _rv_marker_multiarg: -; CHECK: mov [[TMP:x[0-9]+]], x0 -; CHECK-NEXT: mov x0, x2 -; CHECK-NEXT: mov x2, [[TMP]] -; CHECK-NEXT: bl _foo -; CHECK-NEXT: mov x29, x29 -; CHECK-NEXT: bl _objc_retainAutoreleasedReturnValue +; SELDAG-LABEL: rv_marker_multiarg: +; SELDAG: ; %bb.0: +; SELDAG-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; SELDAG-NEXT: .cfi_def_cfa_offset 16 +; SELDAG-NEXT: .cfi_offset w30, -8 +; SELDAG-NEXT: .cfi_offset w29, -16 +; SELDAG-NEXT: mov x8, x0 +; SELDAG-NEXT: mov x0, x2 +; SELDAG-NEXT: mov x2, x8 +; SELDAG-NEXT: bl _foo +; SELDAG-NEXT: mov x29, x29 +; SELDAG-NEXT: bl _objc_retainAutoreleasedReturnValue +; SELDAG-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; SELDAG-NEXT: ret +; +; GISEL-LABEL: rv_marker_multiarg: +; GISEL: ; %bb.0: +; GISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; GISEL-NEXT: .cfi_def_cfa_offset 16 +; GISEL-NEXT: .cfi_offset w30, -8 +; GISEL-NEXT: .cfi_offset w29, -16 +; GISEL-NEXT: mov x3, x0 +; GISEL-NEXT: mov x0, x2 +; GISEL-NEXT: mov x2, x3 +; GISEL-NEXT: bl _foo +; GISEL-NEXT: mov x29, x29 +; GISEL-NEXT: bl _objc_retainAutoreleasedReturnValue +; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; GISEL-NEXT: ret call ptr @foo(i64 %c, i64 %b, i64 %a) [ "clang.arc.attachedcall"(ptr @objc_retainAutoreleasedReturnValue) ] ret void } @@ -158,3 +527,5 @@ define dso_local void @rv_marker_multiarg(i64 %a, i64 %b, i64 %c) { declare ptr @objc_retainAutoreleasedReturnValue(ptr) declare ptr @objc_unsafeClaimAutoreleasedReturnValue(ptr) declare i32 @__gxx_personality_v0(...) +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir index 9f3ad8b444446..96a776f6fbb69 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir @@ -450,8 +450,9 @@ body: | ; CHECK-NEXT: %reg:_(s32) = COPY $vgpr0 ; CHECK-NEXT: %variable:_(s32) = COPY $vgpr0 ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero - ; CHECK-NEXT: %and:_(s32) = G_SELECT %cond(s1), %zero, %variable + ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(ne), %reg(s32), %zero + ; CHECK-NEXT: %select:_(s32) = G_SEXT %cond(s1) + ; CHECK-NEXT: %and:_(s32) = G_AND %select, %variable ; CHECK-NEXT: S_ENDPGM 0, implicit %and(s32) %reg:_(s32) = COPY $vgpr0 %variable:_(s32) = COPY $vgpr0 @@ -476,7 +477,8 @@ body: | ; CHECK-NEXT: %variable:_(s32) = COPY $vgpr0 ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero - ; CHECK-NEXT: %and:_(s32) = G_SELECT %cond(s1), %variable, %zero + ; CHECK-NEXT: %select:_(s32) = G_SEXT %cond(s1) + ; CHECK-NEXT: %and:_(s32) = G_AND %select, %variable ; CHECK-NEXT: S_ENDPGM 0, implicit %and(s32) %reg:_(s32) = COPY $vgpr0 %variable:_(s32) = COPY $vgpr0 @@ -500,9 +502,9 @@ body: | ; CHECK-NEXT: %reg:_(s32) = COPY $vgpr0 ; CHECK-NEXT: %variable:_(s32) = COPY $vgpr0 ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero - ; CHECK-NEXT: %neg1:_(s32) = G_CONSTANT i32 -1 - ; CHECK-NEXT: %or:_(s32) = G_SELECT %cond(s1), %variable, %neg1 + ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(ne), %reg(s32), %zero + ; CHECK-NEXT: %select:_(s32) = G_SEXT %cond(s1) + ; CHECK-NEXT: %or:_(s32) = G_OR %select, %variable ; CHECK-NEXT: S_ENDPGM 0, implicit %or(s32) %reg:_(s32) = COPY $vgpr0 %variable:_(s32) = COPY $vgpr0 @@ -527,8 +529,8 @@ body: | ; CHECK-NEXT: %variable:_(s32) = COPY $vgpr0 ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero - ; CHECK-NEXT: %neg1:_(s32) = G_CONSTANT i32 -1 - ; CHECK-NEXT: %or:_(s32) = G_SELECT %cond(s1), %neg1, %variable + ; CHECK-NEXT: %select:_(s32) = G_SEXT %cond(s1) + ; CHECK-NEXT: %or:_(s32) = G_OR %select, %variable ; CHECK-NEXT: S_ENDPGM 0, implicit %or(s32) %reg:_(s32) = COPY $vgpr0 %variable:_(s32) = COPY $vgpr0 @@ -667,9 +669,9 @@ body: | ; CHECK-NEXT: %variable:_(s32) = COPY $vgpr0 ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero - ; CHECK-NEXT: %neg1:_(s32) = G_CONSTANT i32 -1 ; CHECK-NEXT: %otherconst:_(s32) = G_CONSTANT i32 123 - ; CHECK-NEXT: %select:_(s32) = G_SELECT %cond(s1), %neg1, %otherconst + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT %cond(s1) + ; CHECK-NEXT: %select:_(s32) = G_OR [[SEXT]], %otherconst ; CHECK-NEXT: %or:_(s32) = G_OR %select, %variable ; CHECK-NEXT: S_ENDPGM 0, implicit %or(s32) %reg:_(s32) = COPY $vgpr0 @@ -749,8 +751,7 @@ body: | ; CHECK-NEXT: %reg:_(s32) = COPY $vgpr0 ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; CHECK-NEXT: %srem:_(s32) = G_SELECT %cond(s1), [[C]], %zero + ; CHECK-NEXT: %srem:_(s32) = G_ZEXT %cond(s1) ; CHECK-NEXT: S_ENDPGM 0, implicit %srem(s32) %reg:_(s32) = COPY 
$vgpr0 %zero:_(s32) = G_CONSTANT i32 0 @@ -802,8 +803,7 @@ body: | ; CHECK-NEXT: %reg:_(s32) = COPY $vgpr0 ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; CHECK-NEXT: %udiv:_(s32) = G_SELECT %cond(s1), [[C]], %zero + ; CHECK-NEXT: %udiv:_(s32) = G_ZEXT %cond(s1) ; CHECK-NEXT: S_ENDPGM 0, implicit %udiv(s32) %reg:_(s32) = COPY $vgpr0 %zero:_(s32) = G_CONSTANT i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll index ccf4e84fbbbd1..4ac1fad6deecd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll @@ -37,7 +37,8 @@ define amdgpu_ps void @divergent_i1_phi_uniform_branch(ptr addrspace(1) %out, i3 ; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 1, v2 ; GFX10-NEXT: global_store_dword v[3:4], v5, off ; GFX10-NEXT: .LBB0_3: ; %exit -; GFX10-NEXT: v_cndmask_b32_e64 v2, 2, 1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s0 +; GFX10-NEXT: v_add_nc_u32_e32 v2, 2, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm A: @@ -72,7 +73,8 @@ define amdgpu_ps void @divergent_i1_phi_uniform_branch_simple(ptr addrspace(1) % ; GFX10-NEXT: .LBB1_2: ; %B ; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 1, v2 ; GFX10-NEXT: .LBB1_3: ; %exit -; GFX10-NEXT: v_cndmask_b32_e64 v2, 2, 1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s0 +; GFX10-NEXT: v_add_nc_u32_e32 v2, 2, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm A: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll index afd271c995770..c1f3924e466d5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll @@ -14,7 +14,8 @@ define amdgpu_ps void @divergent_i1_phi_if_then(ptr addrspace(1) %out, i32 %tid, ; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 1, v2 ; GFX10-NEXT: ; %bb.2: ; %exit ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 2, 1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s0 +; GFX10-NEXT: v_add_nc_u32_e32 v2, 2, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm A: @@ -51,7 +52,8 @@ define amdgpu_ps void @divergent_i1_phi_if_else(ptr addrspace(1) %out, i32 %tid, ; GFX10-NEXT: v_cmp_le_u32_e64 s0, 1, v2 ; GFX10-NEXT: ; %bb.4: ; %exit ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 2, 1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s0 +; GFX10-NEXT: v_add_nc_u32_e32 v2, 2, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll index 794b10eea58b9..0cd409f726af2 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll @@ -1517,7 +1517,8 @@ define float @v_recip_sqrt_f32_ulp25(float %x) { ; CODEGEN-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CODEGEN-IEEE-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; CODEGEN-IEEE-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; 
CODEGEN-IEEE-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; CODEGEN-IEEE-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; CODEGEN-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 ; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc @@ -1558,7 +1559,8 @@ define float @v_recip_sqrt_f32_ulp25(float %x) { ; IR-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; IR-IEEE-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; IR-IEEE-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; IR-IEEE-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; IR-IEEE-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; IR-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 ; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll index 97216b6c94693..b516660f3bdc6 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll @@ -230,15 +230,16 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s5, s2 ; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; VI-SAFE-GISEL-NEXT: s_movk_i32 s5, 0x7e00 -; VI-SAFE-GISEL-NEXT: s_cselect_b32 s5, s5, 0x7c00 +; VI-SAFE-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; VI-SAFE-GISEL-NEXT: s_sub_i32 s7, 1, s4 ; VI-SAFE-GISEL-NEXT: s_lshl_b32 s6, s4, 12 ; VI-SAFE-GISEL-NEXT: s_max_i32 s7, s7, 0 ; VI-SAFE-GISEL-NEXT: s_or_b32 s6, s2, s6 ; VI-SAFE-GISEL-NEXT: s_min_i32 s7, s7, 13 ; VI-SAFE-GISEL-NEXT: s_bitset1_b32 s2, 12 +; VI-SAFE-GISEL-NEXT: s_lshl_b32 s5, s5, 9 ; VI-SAFE-GISEL-NEXT: s_lshr_b32 s8, s2, s7 +; VI-SAFE-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00 ; VI-SAFE-GISEL-NEXT: s_lshl_b32 s7, s8, s7 ; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s7, s2 ; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0 @@ -358,20 +359,21 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s5, s2 -; GFX10-SAFE-GISEL-NEXT: s_movk_i32 s5, 0x7e00 ; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s5, s5, 0x7c00 +; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX10-SAFE-GISEL-NEXT: s_sub_i32 s6, 1, s4 -; GFX10-SAFE-GISEL-NEXT: s_or_b32 s7, s2, 0x1000 +; GFX10-SAFE-GISEL-NEXT: s_or_b32 s8, s2, 0x1000 ; GFX10-SAFE-GISEL-NEXT: s_max_i32 s6, s6, 0 -; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s9, s4, 12 +; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s7, s4, 12 ; GFX10-SAFE-GISEL-NEXT: s_min_i32 s6, s6, 13 -; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s2, s9 -; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s8, s7, s6 -; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s6, s8, s6 -; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s6, s7 +; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s5, s5, 9 +; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s9, s8, s6 +; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s2, s7 +; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s6, s9, s6 +; GFX10-SAFE-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00 +; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s6, s8 ; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-SAFE-GISEL-NEXT: s_or_b32 s6, s8, s6 +; GFX10-SAFE-GISEL-NEXT: s_or_b32 s6, s9, s6 ; GFX10-SAFE-GISEL-NEXT: s_cmp_lt_i32 s4, 1 ; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, s6, s2 ; GFX10-SAFE-GISEL-NEXT: s_and_b32 s6, s2, 7 @@ -497,24 +499,24 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX11-SAFE-GISEL-NEXT: s_and_b32 s5, s5, 0xffe ; 
GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0 -; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s5, s2 -; GFX11-SAFE-GISEL-NEXT: s_movk_i32 s5, 0x7e00 ; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s5, s5, 0x7c00 +; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-SAFE-GISEL-NEXT: s_sub_i32 s6, 1, s4 -; GFX11-SAFE-GISEL-NEXT: s_or_b32 s7, s2, 0x1000 +; GFX11-SAFE-GISEL-NEXT: s_or_b32 s8, s2, 0x1000 ; GFX11-SAFE-GISEL-NEXT: s_max_i32 s6, s6, 0 -; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s9, s4, 12 +; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s7, s4, 12 ; GFX11-SAFE-GISEL-NEXT: s_min_i32 s6, s6, 13 -; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s2, s9 -; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s8, s7, s6 -; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s6, s8, s6 -; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s6, s7 +; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s5, s5, 9 +; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s9, s8, s6 +; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s2, s7 +; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s6, s9, s6 +; GFX11-SAFE-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00 +; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s6, s8 ; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0 ; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-SAFE-GISEL-NEXT: s_or_b32 s6, s8, s6 +; GFX11-SAFE-GISEL-NEXT: s_or_b32 s6, s9, s6 ; GFX11-SAFE-GISEL-NEXT: s_cmp_lt_i32 s4, 1 ; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, s6, s2 ; GFX11-SAFE-GISEL-NEXT: s_and_b32 s6, s2, 7 diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll index 046f262469695..31e481bf7aa4d 100644 --- a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll @@ -1850,7 +1850,8 @@ define float @v_sqrt_f32_ulp2(float %x) { ; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000 ; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc @@ -1886,7 +1887,8 @@ define float @v_sqrt_f32_ulp25(float %x) { ; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000 ; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc @@ -1922,7 +1924,8 @@ define float @v_sqrt_f32_ulp3(float %x) { ; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000 ; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc @@ -1957,7 +1960,8 @@ 
define float @v_sqrt_f32_ulp2_fabs(float %x) { ; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000 ; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, v1 -; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, s[4:5] +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GISEL-IEEE-NEXT: v_ldexp_f32_e64 v0, |v0|, v1 ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, s[4:5] @@ -2090,10 +2094,12 @@ define <2 x float> @v_sqrt_v2f32_ulp2(<2 x float> %x) { ; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x800000 ; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v3, 0, 32, vcc +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v2 +; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v3, 5, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v3 -; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, s[4:5] +; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v2 ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 @@ -2232,10 +2238,12 @@ define <2 x float> @v_sqrt_v2f32_ulp2_fabs(<2 x float> %x) { ; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x800000 ; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, v2 -; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v3, 0, 32, s[4:5] +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] ; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[6:7], |v1|, v2 +; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v3, 5, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[6:7] ; GISEL-IEEE-NEXT: v_ldexp_f32_e64 v0, |v0|, v3 -; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, s[6:7] +; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GISEL-IEEE-NEXT: v_ldexp_f32_e64 v1, |v1|, v2 ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 @@ -2328,7 +2336,8 @@ define float @v_sqrt_f32_ulp2_noncontractable_rcp(float %x) { ; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000 ; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc @@ -2425,7 +2434,8 @@ define float @v_sqrt_f32_ulp2_noncontractable_fdiv(float %x, float %y) { ; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x800000 ; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, vcc @@ -2509,7 +2519,8 @@ define float @v_sqrt_f32_ulp2_contractable_fdiv(float %x, float %y) { ; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x800000 ; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; 
GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, vcc @@ -2589,7 +2600,8 @@ define float @v_sqrt_f32_ulp2_contractable_fdiv_arcp(float %x, float %y) { ; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x800000 ; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, vcc @@ -2658,10 +2670,12 @@ define <2 x float> @v_sqrt_v2f32_ulp2_noncontractable_rcp(<2 x float> %x) { ; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x800000 ; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v3, 0, 32, vcc +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v2 +; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v3, 5, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v3 -; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, s[4:5] +; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v2 ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 @@ -2802,10 +2816,12 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv(<2 x float> %x, <2 x flo ; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-IEEE-NEXT: v_mov_b32_e32 v4, 0x800000 ; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v4 -; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v5, 0, 32, vcc +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v4 +; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v5, 5, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] ; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v5 -; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 32, s[4:5] +; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v4, 5, v4 ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v4 ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 @@ -2929,10 +2945,12 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv_arcp(<2 x float> %x, <2 ; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-IEEE-NEXT: v_mov_b32_e32 v4, 0x800000 ; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v4 -; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v5, 0, 32, vcc +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v4 +; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v5, 5, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] ; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v5 -; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 32, s[4:5] +; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v4, 5, v4 ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v4 ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 @@ -3029,7 +3047,8 @@ define float @v_sqrt_f32_known_never_posdenormal_ulp2(float nofpclass(psub) %x) ; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000 ; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 
; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc @@ -3064,7 +3083,8 @@ define float @v_sqrt_f32_nsz_known_never_posdenormal_ulp2(float nofpclass(psub) ; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000 ; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc @@ -3099,7 +3119,8 @@ define float @v_sqrt_f32_known_never_negdenormal(float nofpclass(nsub) %x) { ; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000 ; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc @@ -3698,7 +3719,8 @@ define float @v_sqrt_f32_known_never_zero_never_ninf_ulp2(float nofpclass(zero n ; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000 ; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc @@ -3733,7 +3755,8 @@ define float @v_sqrt_f32_known_never_ninf_ulp2(float nofpclass(ninf) %x) { ; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000 ; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc @@ -3768,7 +3791,8 @@ define float @v_sqrt_f32_nsz_known_never_ninf_ulp2(float nofpclass(ninf) %x) { ; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000 ; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc @@ -3911,7 +3935,8 @@ define float @v_elim_redun_check_ult_sqrt_ulp3(float %in) { ; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000 ; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v1, v0, v1 ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 ; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, vcc diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll index 196a3705ac818..1a3d00211ca9b 100644 --- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll @@ -40,8 +40,8 @@ 
define double @v_sqrt_f64(double %x) { ; GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -100,8 +100,8 @@ define double @v_sqrt_f64_fneg(double %x) { ; GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -v[0:1], v[2:3] -; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GISEL-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -161,8 +161,8 @@ define double @v_sqrt_f64_fabs(double %x) { ; GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3] -; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -222,8 +222,8 @@ define double @v_sqrt_f64_fneg_fabs(double %x) { ; GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -|v[0:1]|, v[2:3] -; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GISEL-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -284,8 +284,8 @@ define double @v_sqrt_f64_ninf(double %x) { ; GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -344,8 +344,8 @@ define double @v_sqrt_f64_no_infs_attribute(double %x) "no-infs-fp-math"="true" ; GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -404,8 +404,8 @@ define double @v_sqrt_f64_nnan(double %x) { ; GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -464,8 +464,8 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64(double 
inreg %x) { ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: v_bfrev_b32_e32 v1, 8 ; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -533,8 +533,8 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_ninf(double inreg %x) { ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: v_bfrev_b32_e32 v1, 8 ; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -602,8 +602,8 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn(double inreg %x) { ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: v_bfrev_b32_e32 v1, 8 ; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -671,8 +671,8 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn_nnan_ninf(double inreg %x) { ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: v_bfrev_b32_e32 v1, 8 ; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -740,8 +740,8 @@ define double @v_sqrt_f64_nsz(double %x) { ; GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -800,8 +800,8 @@ define double @v_sqrt_f64_nnan_ninf(double %x) { ; GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -860,8 +860,8 @@ define double @v_sqrt_f64_nnan_ninf_nsz(double %x) { ; GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -920,8 +920,8 @@ define double @v_sqrt_f64_afn(double %x) { ; 
GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -980,8 +980,8 @@ define double @v_sqrt_f64_afn_nsz(double %x) { ; GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -1062,10 +1062,11 @@ define <2 x double> @v_sqrt_v2f64_afn(<2 x double> %x) { ; GISEL-NEXT: v_mov_b32_e32 v5, s5 ; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] ; GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] -; GISEL-NEXT: v_mov_b32_e32 v6, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[4:5] -; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v7 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 ; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] ; GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] @@ -1139,8 +1140,8 @@ define double @v_sqrt_f64_afn_nnan(double %x) { ; GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -1199,8 +1200,8 @@ define double @v_sqrt_f64_fabs_afn_ninf(double %x) { ; GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3] -; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -1260,8 +1261,8 @@ define double @v_sqrt_f64_afn_nnan_ninf(double %x) { ; GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -1342,10 +1343,11 @@ define <2 x double> @v_sqrt_v2f64_afn_nnan_ninf(<2 x double> %x) { ; GISEL-NEXT: v_mov_b32_e32 v5, s5 ; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] ; GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] -; GISEL-NEXT: v_mov_b32_e32 v6, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e64 
v4, 0, v6, s[4:5] -; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v7 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 ; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] ; GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] @@ -1419,8 +1421,8 @@ define double @v_sqrt_f64_afn_nnan_ninf_nsz(double %x) { ; GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -1479,8 +1481,8 @@ define double @v_sqrt_f64__approx_func_fp_math(double %x) #2 { ; GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -1539,8 +1541,8 @@ define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 { ; GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -1599,8 +1601,8 @@ define double @v_sqrt_f64__unsafe_attr(double %x) #4 { ; GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -1681,10 +1683,11 @@ define <2 x double> @v_sqrt_v2f64(<2 x double> %x) { ; GISEL-NEXT: v_mov_b32_e32 v5, s5 ; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] ; GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] -; GISEL-NEXT: v_mov_b32_e32 v6, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[4:5] -; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v7 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 ; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] ; GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] @@ -1796,16 +1799,18 @@ define <3 x double> @v_sqrt_v3f64(<3 x double> %x) { ; GISEL-NEXT: s_mov_b32 s4, 0 ; GISEL-NEXT: s_brev_b32 s5, 8 ; GISEL-NEXT: v_mov_b32_e32 v6, s4 -; GISEL-NEXT: v_mov_b32_e32 v7, s5 ; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v7, s5 ; GISEL-NEXT: 
v_cmp_lt_f64_e64 s[4:5], v[2:3], v[6:7] ; GISEL-NEXT: v_cmp_lt_f64_e64 s[6:7], v[4:5], v[6:7] -; GISEL-NEXT: v_mov_b32_e32 v8, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v8, vcc -; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, v8, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v8, s[6:7] -; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v9 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v8 ; GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 ; GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] ; GISEL-NEXT: v_rsq_f64_e32 v[8:9], v[2:3] @@ -1824,8 +1829,8 @@ define <3 x double> @v_sqrt_v3f64(<3 x double> %x) { ; GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[20:21], v[8:9] ; GISEL-NEXT: v_fma_f64 v[14:15], v[14:15], v[20:21], v[14:15] ; GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[22:23], v[10:11] -; GISEL-NEXT: v_fma_f64 v[16:17], v[16:17], v[22:23], v[16:17] ; GISEL-NEXT: v_fma_f64 v[18:19], -v[6:7], v[6:7], v[0:1] +; GISEL-NEXT: v_fma_f64 v[16:17], v[16:17], v[22:23], v[16:17] ; GISEL-NEXT: v_fma_f64 v[20:21], -v[8:9], v[8:9], v[2:3] ; GISEL-NEXT: v_fma_f64 v[22:23], -v[10:11], v[10:11], v[4:5] ; GISEL-NEXT: v_fma_f64 v[6:7], v[18:19], v[12:13], v[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll index b3912aea55f79..fcc57b8bb7075 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll @@ -102,9 +102,9 @@ define amdgpu_cs void @vgpr_inverse_ballot(i64 %input, ptr addrspace(1) %out) { ; GISEL: ; %bb.0: ; %entry ; GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GISEL-NEXT: v_readfirstlane_b32 s1, v1 -; GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off +; GISEL-NEXT: v_mov_b32_e32 v5, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GISEL-NEXT: global_store_b64 v[2:3], v[4:5], off ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm @@ -164,8 +164,8 @@ define amdgpu_cs void @phi_uniform(i64 inreg %s0_1, i64 inreg %s2, ptr addrspace ; GISEL-NEXT: s_add_u32 s0, s0, 1 ; GISEL-NEXT: s_addc_u32 s1, s1, 0 ; GISEL-NEXT: .LBB5_2: ; %endif -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll index 3dc565ceed0d0..3ad98719c689c 100644 --- a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll @@ -62,12 +62,12 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) { ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; SI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8 ; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc -; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; 
SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] @@ -146,8 +146,8 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) { ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8 ; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -242,12 +242,12 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) { ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; SI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8 ; SI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |s[0:1]|, v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc -; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], |s[0:1]|, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], |s[0:1]|, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] @@ -326,8 +326,8 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) { ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8 ; VI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |s[0:1]|, v[0:1] -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], |s[0:1]|, v0 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -423,12 +423,12 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) { ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; SI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8 ; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc -; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] @@ -507,8 +507,8 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) { ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8 ; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], 
v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -603,12 +603,12 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) { ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; SI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8 ; SI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -s[0:1], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc -; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], -s[0:1], v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], -s[0:1], v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] @@ -687,8 +687,8 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) { ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8 ; VI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -s[0:1], v[0:1] -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], -s[0:1], v0 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -784,12 +784,12 @@ define double @v_rsq_f64(double %x) { ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc -; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] @@ -866,8 +866,8 @@ define double @v_rsq_f64(double %x) { ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -953,12 +953,12 @@ define double @v_rsq_f64_fabs(double %x) { ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; SI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3] -; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc -; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[2:3], 
v[0:1], v[2:3] @@ -1035,8 +1035,8 @@ define double @v_rsq_f64_fabs(double %x) { ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; VI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3] -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -1123,12 +1123,12 @@ define double @v_rsq_f64_missing_contract0(double %x) { ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc -; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] @@ -1205,8 +1205,8 @@ define double @v_rsq_f64_missing_contract0(double %x) { ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -1292,12 +1292,12 @@ define double @v_rsq_f64_missing_contract1(double %x) { ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc -; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] @@ -1374,8 +1374,8 @@ define double @v_rsq_f64_missing_contract1(double %x) { ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -1461,12 +1461,12 @@ define double @v_neg_rsq_f64(double %x) { ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc 
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] @@ -1543,8 +1543,8 @@ define double @v_neg_rsq_f64(double %x) { ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -1664,26 +1664,27 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-NEXT: s_mov_b32 s4, 0 ; SI-GISEL-NEXT: s_brev_b32 s5, 8 ; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v12, vcc -; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 ; SI-GISEL-NEXT: v_mov_b32_e32 v10, s4 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 ; SI-GISEL-NEXT: v_mov_b32_e32 v11, s5 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] ; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11] ; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80 +; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] -; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_mov_b32_e32 v20, 0x3ff00000 +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v12, s[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc ; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] @@ -1815,10 +1816,11 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) { ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5 ; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] ; VI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] -; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc -; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v7 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[4:5] +; VI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], 
v[0:1] ; VI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] @@ -1965,26 +1967,27 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-NEXT: s_mov_b32 s4, 0 ; SI-GISEL-NEXT: s_brev_b32 s5, 8 ; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v12, vcc -; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 ; SI-GISEL-NEXT: v_mov_b32_e32 v10, s4 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 ; SI-GISEL-NEXT: v_mov_b32_e32 v11, s5 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] ; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11] ; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80 +; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] -; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_mov_b32_e32 v20, 0xbff00000 +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v12, s[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc ; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] @@ -2116,10 +2119,11 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) { ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5 ; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] ; VI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] -; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc -; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v7 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[4:5] +; VI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] ; VI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] @@ -2234,17 +2238,17 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) { ; SI-GISEL-NEXT: s_mov_b32 s4, 0 ; SI-GISEL-NEXT: s_brev_b32 s5, 8 ; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v12, vcc -; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 ; SI-GISEL-NEXT: v_mov_b32_e32 v10, s4 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 ; SI-GISEL-NEXT: v_mov_b32_e32 v11, s5 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] ; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11] ; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80 +; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] -; 
SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -2252,7 +2256,8 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) { ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v12, s[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc ; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] @@ -2356,10 +2361,11 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) { ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5 ; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] ; VI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] -; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc -; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v7 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[4:5] +; VI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] ; VI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] @@ -2507,17 +2513,17 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-NEXT: s_mov_b32 s4, 0 ; SI-GISEL-NEXT: s_brev_b32 s5, 8 ; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v12, vcc -; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 ; SI-GISEL-NEXT: v_mov_b32_e32 v10, s4 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 ; SI-GISEL-NEXT: v_mov_b32_e32 v11, s5 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] ; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11] ; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80 +; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] -; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -2525,7 +2531,8 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v12, s[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc ; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] @@ -2659,10 +2666,11 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) { ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5 ; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] ; VI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] -; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100 -; 
VI-GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc -; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v7 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[4:5] +; VI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] ; VI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] @@ -2775,12 +2783,12 @@ define double @v_rsq_f64_fneg_fabs(double %x) { ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; SI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -|v[0:1]|, v[2:3] -; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc -; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] @@ -2857,8 +2865,8 @@ define double @v_rsq_f64_fneg_fabs(double %x) { ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; VI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -|v[0:1]|, v[2:3] -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -2946,12 +2954,12 @@ define double @v_rsq_f64__afn_sqrt(double %x) { ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc -; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] @@ -3028,8 +3036,8 @@ define double @v_rsq_f64__afn_sqrt(double %x) { ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -3107,12 +3115,12 @@ define double @v_rsq_f64__afn_fdiv(double %x) { ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc -; SI-GISEL-NEXT: 
v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] ; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 @@ -3177,8 +3185,8 @@ define double @v_rsq_f64__afn_fdiv(double %x) { ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -3252,12 +3260,12 @@ define double @v_rsq_f64__afn(double %x) { ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc -; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] ; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 @@ -3322,8 +3330,8 @@ define double @v_rsq_f64__afn(double %x) { ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -3398,12 +3406,12 @@ define double @v_neg_rsq_f64__afn(double %x) { ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc -; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] ; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 @@ -3470,8 +3478,8 @@ define double @v_neg_rsq_f64__afn(double %x) { ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; 
VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -3546,12 +3554,12 @@ define double @v_rsq_f64__afn_ninf(double %x) { ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc -; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] ; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 @@ -3616,8 +3624,8 @@ define double @v_rsq_f64__afn_ninf(double %x) { ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -3691,12 +3699,12 @@ define double @v_rsq_f64__afn_nnan(double %x) { ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc -; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] ; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 @@ -3761,8 +3769,8 @@ define double @v_rsq_f64__afn_nnan(double %x) { ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -3836,12 +3844,12 @@ define double @v_rsq_f64__afn_nnan_ninf(double %x) { ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc -; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_rsq_f64_e32 
v[2:3], v[0:1] ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] ; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 @@ -3906,8 +3914,8 @@ define double @v_rsq_f64__afn_nnan_ninf(double %x) { ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -3982,12 +3990,12 @@ define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) { ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc -; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] ; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 @@ -4054,8 +4062,8 @@ define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) { ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -4138,12 +4146,12 @@ define double @v_rsq_f64__nnan_ninf(double %x) { ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc -; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] @@ -4220,8 +4228,8 @@ define double @v_rsq_f64__nnan_ninf(double %x) { ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -4325,13 +4333,15 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) { ; SI-GISEL-NEXT: s_mov_b32 s4, 0 ; 
SI-GISEL-NEXT: s_brev_b32 s5, 8 ; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v12, vcc -; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 ; SI-GISEL-NEXT: v_mov_b32_e32 v10, s4 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 ; SI-GISEL-NEXT: v_mov_b32_e32 v11, s5 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] ; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11] +; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0xffffff80 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v12, vcc ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 @@ -4339,30 +4349,29 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) { ; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v12, s[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v8 ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] ; SI-GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0xffffff80 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v12, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v13 ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[10:11], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[2:3], v[10:11] -; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v13 -; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5 ; SI-GISEL-NEXT: v_mov_b32_e32 v13, 0x260 +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13 ; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] ; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] ; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13 -; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[10:11], v[6:7], v[8:9] ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[10:11], v[6:7], v[8:9] ; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v13 ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[10:11], v[6:7], v[8:9] ; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v12, s[4:5] ; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v13 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[0:1] @@ -4451,10 +4460,11 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) { ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5 ; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] ; VI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] -; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc -; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[4:5] -; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v7 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] 
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 ; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] ; VI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] @@ -4550,12 +4560,12 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 { ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; SI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8 ; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc -; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] ; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 @@ -4622,8 +4632,8 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 { ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8 ; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -4706,12 +4716,12 @@ define double @v_rsq_f64_unsafe(double %x) #0 { ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc -; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] ; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 @@ -4776,8 +4786,8 @@ define double @v_rsq_f64_unsafe(double %x) #0 { ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -5112,12 +5122,12 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) { ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; SI-GISEL-NEXT: v_bfrev_b32_e32 v5, 8 ; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5] -; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc -; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xffffff80 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-GISEL-NEXT: 
v_lshlrev_b32_e32 v4, 8, v4 +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; SI-GISEL-NEXT: v_mov_b32_e32 v11, 0x260 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 @@ -5193,8 +5203,8 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) { ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; VI-GISEL-NEXT: v_bfrev_b32_e32 v5, 8 ; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5] -; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] ; VI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 @@ -5279,12 +5289,12 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) { ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; SI-GISEL-NEXT: v_bfrev_b32_e32 v5, 8 ; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5] -; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc -; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xffffff80 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; SI-GISEL-NEXT: v_mov_b32_e32 v11, 0x260 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 @@ -5360,8 +5370,8 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) { ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; VI-GISEL-NEXT: v_bfrev_b32_e32 v5, 8 ; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5] -; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] ; VI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 @@ -5446,12 +5456,12 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) { ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; SI-GISEL-NEXT: v_bfrev_b32_e32 v5, 8 ; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5] -; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc -; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xffffff80 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; SI-GISEL-NEXT: v_mov_b32_e32 v11, 0x260 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 @@ -5527,8 +5537,8 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) { ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; VI-GISEL-NEXT: v_bfrev_b32_e32 v5, 8 ; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5] -; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; 
VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] ; VI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 @@ -5616,17 +5626,17 @@ define double @v_div_const_contract_sqrt_f64(double %x) { ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc -; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-GISEL-NEXT: s_mov_b32 s6, 0 ; SI-GISEL-NEXT: s_mov_b32 s7, 0x40700000 +; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x40700000 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] -; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x40700000 ; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 ; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] @@ -5702,10 +5712,10 @@ define double @v_div_const_contract_sqrt_f64(double %x) { ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 ; VI-GISEL-NEXT: s_mov_b32 s4, 0 ; VI-GISEL-NEXT: s_mov_b32 s5, 0x40700000 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 From 02c2bf8c054c8a425f7347a4a276e2dbf4b10e5a Mon Sep 17 00:00:00 2001 From: Alex Bradbury Date: Tue, 2 Jan 2024 16:28:24 +0000 Subject: [PATCH 039/313] [RISCV] Change heuristic used for load clustering (#75341) Split out from #73789, so as to leave that PR just for flipping load clustering to on by default. Clusters if the operations are within a cache line of each other (as AMDGPU does in shouldScheduleLoadsNear). X86 does something similar, but does `((Offset2 - Offset1) / 8 > 64)`. I'm not sure if that's intentionally set to 512 bytes or if the division is in error. Adopts the suggestion from @wangpc-pp to query the cache line size and use it if available. We also cap the maximum cluster size to cap the potential register pressure impact (which may lead to additional spills). --- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 1dcff7eb563e2..cd98438eed882 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -2282,9 +2282,14 @@ bool RISCVInstrInfo::shouldClusterMemOps( return false; } - // TODO: Use a more carefully chosen heuristic, e.g. only cluster if offsets - // indicate they likely share a cache line. - return ClusterSize <= 4; + unsigned CacheLineSize = + BaseOps1.front()->getParent()->getMF()->getSubtarget().getCacheLineSize(); + // Assume a cache line size of 64 bytes if no size is set in RISCVSubtarget. + CacheLineSize = CacheLineSize ? 
CacheLineSize : 64; + // Cluster if the memory operations are on the same or a neighbouring cache + // line, but limit the maximum ClusterSize to avoid creating too much + // additional register pressure. + return ClusterSize <= 4 && std::abs(Offset1 - Offset2) < CacheLineSize; } // Set BaseReg (the base register operand), Offset (the byte offset being From 571ad7324f3a25f507a1014a0467890f17772c13 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Tue, 2 Jan 2024 08:29:06 -0800 Subject: [PATCH 040/313] [flang] Defer processing of non-pointer variable initializers (#76475) Initializers in entity-decls don't need to have their expressions analyzed immediately in name resolution unless of course they are defining the values of named constants. By deferring the expression analysis, the compiler can better handle references to module and internal procedures that might appear in structure constructors; at present, these are typically rejected as being forward references (which they can be) to subprogram names that can't yet be checked for compatibility with the characteristics of the corresponding procedure component. --- flang/lib/Semantics/resolve-names.cpp | 105 +++++++++++++++------- flang/test/Semantics/bad-forward-type.f90 | 1 - flang/test/Semantics/init01.f90 | 2 +- flang/test/Semantics/pointer01.f90 | 1 + flang/test/Semantics/symbol15.f90 | 24 ++--- 5 files changed, 87 insertions(+), 46 deletions(-) diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index f5f7b99aba255..e30eb5070d789 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -698,7 +698,7 @@ class ScopeHandler : public ImplicitRulesVisitor { bool CheckPossibleBadForwardRef(const Symbol &); bool inSpecificationPart_{false}; - bool inDataStmtObject_{false}; + bool deferImplicitTyping_{false}; bool inEquivalenceStmt_{false}; // Some information is collected from a specification part for deferred @@ -1629,6 +1629,7 @@ class ResolveNamesVisitor : public virtual ScopeHandler, bool BeginScopeForNode(const ProgramTree &); void EndScopeForNode(const ProgramTree &); void FinishSpecificationParts(const ProgramTree &); + void FinishExecutionParts(const ProgramTree &); void FinishDerivedTypeInstantiation(Scope &); void ResolveExecutionParts(const ProgramTree &); void UseCUDABuiltinNames(); @@ -2533,7 +2534,7 @@ void ScopeHandler::ApplyImplicitRules( // or object, it'll be caught later. 
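To make the intent of the deferral concrete, here is a hedged sketch of the kind of source it is meant to accept; the module name m, type t, interface ifunc, and function f are invented for illustration and are not taken from the patch's tests:

```fortran
! Hypothetical example: the structure constructor in the initializer of x
! names the module procedure f, which is defined only after CONTAINS.
! Deferring analysis of such initializers lets this forward reference resolve.
module m
  implicit none
  abstract interface
    integer function ifunc()
    end function
  end interface
  type :: t
    procedure(ifunc), pointer, nopass :: p
  end type
  type(t) :: x = t(f)   ! forward reference to f, the case this change targets
contains
  integer function f()
    f = 1
  end function
end module
```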
return; } - if (inDataStmtObject_) { + if (deferImplicitTyping_) { return; } if (!context().HasError(symbol)) { @@ -2709,7 +2710,7 @@ const DeclTypeSpec &ScopeHandler::MakeLogicalType(int kind) { } void ScopeHandler::NotePossibleBadForwardRef(const parser::Name &name) { - if (inSpecificationPart_ && !inDataStmtObject_ && name.symbol) { + if (inSpecificationPart_ && !deferImplicitTyping_ && name.symbol) { auto kind{currScope().kind()}; if ((kind == Scope::Kind::Subprogram && !currScope().IsStmtFunction()) || kind == Scope::Kind::BlockConstruct) { @@ -6802,7 +6803,8 @@ bool ConstructVisitor::Pre(const parser::DataStmtObject &x) { auto flagRestorer{common::ScopedSet(inSpecificationPart_, false)}; common::visit(common::visitors{ [&](const Indirection &y) { - auto restorer{common::ScopedSet(inDataStmtObject_, true)}; + auto restorer{ + common::ScopedSet(deferImplicitTyping_, true)}; Walk(y.value()); const parser::Name &first{ parser::GetFirstName(y.value())}; @@ -7386,7 +7388,7 @@ const parser::Name *DeclarationVisitor::ResolveName(const parser::Name &name) { } return &name; } - if (isImplicitNoneType() && !inDataStmtObject_) { + if (isImplicitNoneType() && !deferImplicitTyping_) { Say(name, "No explicit type declared for '%s'"_err_en_US); return nullptr; } @@ -7548,7 +7550,15 @@ void DeclarationVisitor::Initialization(const parser::Name &name, common::visit( common::visitors{ [&](const parser::ConstantExpr &expr) { - NonPointerInitialization(name, expr); + Walk(expr); + if (IsNamedConstant(ultimate) || inComponentDecl) { + NonPointerInitialization(name, expr); + } else { + // Defer analysis so forward references to nested subprograms + // can be properly resolved when they appear in structure + // constructors. + ultimate.set(Symbol::Flag::InDataStmt); + } }, [&](const parser::NullInit &null) { // => NULL() Walk(null); @@ -7569,10 +7579,12 @@ void DeclarationVisitor::Initialization(const parser::Name &name, } } }, - [&](const parser::InitialDataTarget &) { + [&](const parser::InitialDataTarget &target) { // Defer analysis to the end of the specification part // so that forward references and attribute checks like SAVE // work better. + auto restorer{common::ScopedSet(deferImplicitTyping_, true)}; + Walk(target); ultimate.set(Symbol::Flag::InDataStmt); }, [&](const std::list> &values) { @@ -7590,12 +7602,27 @@ void DeclarationVisitor::PointerInitialization( Symbol &ultimate{name.symbol->GetUltimate()}; if (!context().HasError(ultimate)) { if (IsPointer(ultimate)) { - if (auto *details{ultimate.detailsIf()}) { - CHECK(!details->init()); - Walk(target); - if (MaybeExpr expr{EvaluateExpr(target)}) { - // Validation is done in declaration checking. + Walk(target); + if (MaybeExpr expr{EvaluateExpr(target)}) { + // Validation is done in declaration checking. 
+ if (auto *details{ultimate.detailsIf()}) { + CHECK(!details->init()); details->set_init(std::move(*expr)); + ultimate.set(Symbol::Flag::InDataStmt, false); + } else if (auto *details{ultimate.detailsIf()}) { + // something like "REAL, EXTERNAL, POINTER :: p => t" + if (evaluate::IsNullProcedurePointer(*expr)) { + CHECK(!details->init()); + details->set_init(nullptr); + } else if (const Symbol * + targetSymbol{evaluate::UnwrapWholeSymbolDataRef(*expr)}) { + CHECK(!details->init()); + details->set_init(*targetSymbol); + } else { + Say(name, + "Procedure pointer '%s' must be initialized with a procedure name or NULL()"_err_en_US); + context().SetError(ultimate); + } } } } else { @@ -7635,27 +7662,23 @@ void DeclarationVisitor::PointerInitialization( void DeclarationVisitor::NonPointerInitialization( const parser::Name &name, const parser::ConstantExpr &expr) { - if (name.symbol) { + if (!context().HasError(name.symbol)) { Symbol &ultimate{name.symbol->GetUltimate()}; - if (!context().HasError(ultimate) && !context().HasError(name.symbol)) { + if (!context().HasError(ultimate)) { if (IsPointer(ultimate)) { Say(name, "'%s' is a pointer but is not initialized like one"_err_en_US); } else if (auto *details{ultimate.detailsIf()}) { - CHECK(!details->init()); - if (IsAllocatable(ultimate)) { + if (details->init()) { + } else if (IsAllocatable(ultimate)) { Say(name, "Allocatable object '%s' cannot be initialized"_err_en_US); - return; - } - Walk(expr); - if (ultimate.owner().IsParameterizedDerivedType()) { + } else if (ultimate.owner().IsParameterizedDerivedType()) { // Save the expression for per-instantiation analysis. details->set_unanalyzedPDTComponentInit(&expr.thing.value()); - } else { - if (MaybeExpr folded{EvaluateNonPointerInitializer( - ultimate, expr, expr.thing.value().source)}) { - details->set_init(std::move(*folded)); - } + } else if (MaybeExpr folded{EvaluateNonPointerInitializer( + ultimate, expr, expr.thing.value().source)}) { + details->set_init(std::move(*folded)); + ultimate.set(Symbol::Flag::InDataStmt, false); } } else { Say(name, "'%s' is not an object that can be initialized"_err_en_US); @@ -8424,6 +8447,7 @@ bool ResolveNamesVisitor::Pre(const parser::ProgramUnit &x) { ResolveSpecificationParts(root); FinishSpecificationParts(root); ResolveExecutionParts(root); + FinishExecutionParts(root); ResolveAccParts(context(), x); ResolveOmpParts(context(), x); return false; @@ -8841,6 +8865,8 @@ class DeferredCheckVisitor { } } + bool Pre(const parser::BlockConstruct &x) { return true; } + void Post(const parser::ProcInterface &pi) { if (const auto *name{std::get_if(&pi.u)}) { resolver_.CheckExplicitInterface(*name); @@ -8871,7 +8897,6 @@ class DeferredCheckVisitor { resolver_.CheckBindings(tbps); } } - bool Pre(const parser::StmtFunctionStmt &stmtFunc) { return false; } private: void Init(const parser::Name &name, @@ -8880,6 +8905,9 @@ class DeferredCheckVisitor { if (const auto *target{ std::get_if(&init->u)}) { resolver_.PointerInitialization(name, *target); + } else if (const auto *expr{ + std::get_if(&init->u)}) { + resolver_.NonPointerInitialization(name, *expr); } } } @@ -8894,15 +8922,16 @@ void ResolveNamesVisitor::FinishSpecificationParts(const ProgramTree &node) { if (!node.scope()) { return; // error occurred creating scope } + auto flagRestorer{common::ScopedSet(inSpecificationPart_, true)}; SetScope(*node.scope()); - // The initializers of pointers, the default initializers of pointer - // components, non-deferred type-bound procedure bindings have not - // yet been 
traversed. - // We do that now, when any (formerly) forward references that appear + // The initializers of pointers and non-PARAMETER objects, the default + // initializers of components, and non-deferred type-bound procedure + // bindings have not yet been traversed. + // We do that now, when any forward references that appeared // in those initializers will resolve to the right symbols without - // incurring spurious errors with IMPLICIT NONE. + // incurring spurious errors with IMPLICIT NONE or forward references + // to nested subprograms. DeferredCheckVisitor{*this}.Walk(node.spec()); - DeferredCheckVisitor{*this}.Walk(node.exec()); // for BLOCK for (Scope &childScope : currScope().children()) { if (childScope.IsParameterizedDerivedTypeInstantiation()) { FinishDerivedTypeInstantiation(childScope); @@ -8913,6 +8942,18 @@ void ResolveNamesVisitor::FinishSpecificationParts(const ProgramTree &node) { } } +void ResolveNamesVisitor::FinishExecutionParts(const ProgramTree &node) { + if (node.scope()) { + SetScope(*node.scope()); + if (node.exec()) { + DeferredCheckVisitor{*this}.Walk(*node.exec()); + } + for (const auto &child : node.children()) { + FinishExecutionParts(child); + } + } +} + // Duplicate and fold component object pointer default initializer designators // using the actual type parameter values of each particular instantiation. // Validation is done later in declaration checking. diff --git a/flang/test/Semantics/bad-forward-type.f90 b/flang/test/Semantics/bad-forward-type.f90 index 19e23e654642f..432d450a15f3f 100644 --- a/flang/test/Semantics/bad-forward-type.f90 +++ b/flang/test/Semantics/bad-forward-type.f90 @@ -84,7 +84,6 @@ subroutine s9 type con Type(t(3)), pointer :: y end type - !ERROR: Cannot construct value for derived type 't' before it is defined Integer :: nn = Size(Transfer(t(3)(666),[0])) type :: t(n) integer, kind :: n = 3 diff --git a/flang/test/Semantics/init01.f90 b/flang/test/Semantics/init01.f90 index 9f75a8d556733..0f5a2144c79f9 100644 --- a/flang/test/Semantics/init01.f90 +++ b/flang/test/Semantics/init01.f90 @@ -90,7 +90,7 @@ subroutine components(n) real, pointer :: p10 => o3%x associate (a1 => o3, a2 => o3%x) block - real, pointer :: p11 => a1 + type(t3), pointer :: p11 => a1 real, pointer :: p12 => a2 end block end associate diff --git a/flang/test/Semantics/pointer01.f90 b/flang/test/Semantics/pointer01.f90 index cb860f3a3f437..9e87d1b689eb2 100644 --- a/flang/test/Semantics/pointer01.f90 +++ b/flang/test/Semantics/pointer01.f90 @@ -16,6 +16,7 @@ program main !ERROR: 'inner' cannot have the POINTER attribute pointer inner real obj + !ERROR: 'ip' is a pointer but is not initialized like one !ERROR: 'ip' may not have both the POINTER and PARAMETER attributes integer, parameter :: ip = 123 pointer ip diff --git a/flang/test/Semantics/symbol15.f90 b/flang/test/Semantics/symbol15.f90 index 318819e224cd8..97dc50a23845f 100644 --- a/flang/test/Semantics/symbol15.f90 +++ b/flang/test/Semantics/symbol15.f90 @@ -14,10 +14,10 @@ subroutine iface !DEF: /m/op2 POINTER, PUBLIC ObjectEntity REAL(4) !DEF: /m/null INTRINSIC, PUBLIC, PURE (Function) ProcEntity real, pointer :: op2 => null() - !DEF: /m/op3 POINTER, PUBLIC (InDataStmt) ObjectEntity REAL(4) + !DEF: /m/op3 POINTER, PUBLIC ObjectEntity REAL(4) !DEF: /m/x PUBLIC, TARGET ObjectEntity REAL(4) real, pointer :: op3 => x - !DEF: /m/op4 POINTER, PUBLIC (InDataStmt) ObjectEntity REAL(4) + !DEF: /m/op4 POINTER, PUBLIC ObjectEntity REAL(4) !DEF: /m/y PUBLIC, TARGET ObjectEntity REAL(4) real, pointer :: op4 => 
y(1) !REF: /m/iface @@ -50,10 +50,10 @@ subroutine iface !DEF: /m/t1/opc2 POINTER ObjectEntity REAL(4) !REF: /m/null real, pointer :: opc2 => null() - !DEF: /m/t1/opc3 POINTER (InDataStmt) ObjectEntity REAL(4) + !DEF: /m/t1/opc3 POINTER ObjectEntity REAL(4) !REF: /m/x real, pointer :: opc3 => x - !DEF: /m/t1/opc4 POINTER (InDataStmt) ObjectEntity REAL(4) + !DEF: /m/t1/opc4 POINTER ObjectEntity REAL(4) !REF: /m/y real, pointer :: opc4 => y(1) !REF: /m/iface @@ -100,10 +100,10 @@ subroutine iface !DEF: /m/pdt1/opc2 POINTER ObjectEntity REAL(4) !REF: /m/null real, pointer :: opc2 => null() - !DEF: /m/pdt1/opc3 POINTER (InDataStmt) ObjectEntity REAL(4) + !DEF: /m/pdt1/opc3 POINTER ObjectEntity REAL(4) !REF: /m/x real, pointer :: opc3 => x - !DEF: /m/pdt1/opc4 POINTER (InDataStmt) ObjectEntity REAL(4) + !DEF: /m/pdt1/opc4 POINTER ObjectEntity REAL(4) !REF: /m/y !REF: /m/pdt1/k real, pointer :: opc4 => y(k) @@ -160,10 +160,10 @@ subroutine iface subroutine ext2 end subroutine end interface - !DEF: /m/op10 POINTER, PUBLIC(InDataStmt) ObjectEntity REAL(4) + !DEF: /m/op10 POINTER, PUBLIC ObjectEntity REAL(4) !REF: /m/x real, pointer :: op10 => x - !DEF: /m/op11 POINTER, PUBLIC(InDataStmt) ObjectEntity REAL(4) + !DEF: /m/op11 POINTER, PUBLIC ObjectEntity REAL(4) !REF: /m/y real, pointer :: op11 => y(1) !REF: /m/iface @@ -176,10 +176,10 @@ subroutine ext2 procedure(iface), pointer :: pp11 => ext2 !DEF: /m/t2 PUBLIC DerivedType type :: t2 - !DEF: /m/t2/opc10 POINTER (InDataStmt) ObjectEntity REAL(4) + !DEF: /m/t2/opc10 POINTER ObjectEntity REAL(4) !REF: /m/x real, pointer :: opc10 => x - !DEF: /m/t2/opc11 POINTER (InDataStmt) ObjectEntity REAL(4) + !DEF: /m/t2/opc11 POINTER ObjectEntity REAL(4) !REF: /m/y real, pointer :: opc11 => y(1) !REF: /m/iface @@ -203,10 +203,10 @@ subroutine ext2 type :: pdt2(k) !REF: /m/pdt2/k integer, kind :: k - !DEF: /m/pdt2/opc10 POINTER (InDataStmt) ObjectEntity REAL(4) + !DEF: /m/pdt2/opc10 POINTER ObjectEntity REAL(4) !REF: /m/x real, pointer :: opc10 => x - !DEF: /m/pdt2/opc11 POINTER (InDataStmt) ObjectEntity REAL(4) + !DEF: /m/pdt2/opc11 POINTER ObjectEntity REAL(4) !REF: /m/y !REF: /m/pdt2/k real, pointer :: opc11 => y(k) From 120ad2508af8b5093f5d9d9f5e7566936320e769 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Tue, 2 Jan 2024 08:42:10 -0800 Subject: [PATCH 041/313] [flang][runtime] Extension: NAMELIST input may omit terminal '/' (#76476) ... when it is followed eventually by the '&' that begins the next NAMELIST input group. This is a gfortran extension. --- flang/docs/Extensions.md | 3 +++ flang/runtime/edit-input.cpp | 7 ++++++- flang/runtime/io-stmt.cpp | 6 ++++++ flang/runtime/namelist.cpp | 25 +++++++++++++++---------- 4 files changed, 30 insertions(+), 11 deletions(-) diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md index 6c6588025a392..ab040b61703c8 100644 --- a/flang/docs/Extensions.md +++ b/flang/docs/Extensions.md @@ -315,6 +315,9 @@ end * When a file included via an `INCLUDE` line or `#include` directive has a continuation marker at the end of its last line in free form, Fortran line continuation works. +* A `NAMELIST` input group may omit its trailing `/` character if + it is followed by another `NAMELIST` input group. +* A `NAMELIST` input group may begin with either `&` or `$`. 
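As a rough, invented illustration of the input now accepted (the group names grp1/grp2, the file name in.dat, and the values are assumptions, not taken from the patch):

```fortran
! Hypothetical sketch: the first input group below omits its trailing '/';
! the '&' that begins the second group terminates it instead, and either
! '&' or '$' may introduce a group.
!
! Assumed contents of in.dat:
!   &grp1 i = 1, j = 2
!   &grp2 k = 3 /
program read_two_groups
  implicit none
  integer :: i = 0, j = 0, k = 0
  namelist /grp1/ i, j
  namelist /grp2/ k
  open (10, file='in.dat', status='old')
  read (10, nml=grp1)
  read (10, nml=grp2)
  close (10)
  print *, i, j, k   ! expected: 1 2 3
end program
```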
### Extensions supported when enabled by options diff --git a/flang/runtime/edit-input.cpp b/flang/runtime/edit-input.cpp index c4fa186e289db..0fa6368ee591c 100644 --- a/flang/runtime/edit-input.cpp +++ b/flang/runtime/edit-input.cpp @@ -21,7 +21,8 @@ namespace Fortran::runtime::io { static inline bool IsCharValueSeparator(const DataEdit &edit, char32_t ch) { char32_t comma{ edit.modes.editingFlags & decimalComma ? char32_t{';'} : char32_t{','}}; - return ch == ' ' || ch == '\t' || ch == '/' || ch == comma; + return ch == ' ' || ch == '\t' || ch == comma || ch == '/' || + (edit.IsNamelist() && (ch == '&' || ch == '$')); } static bool CheckCompleteListDirectedField( @@ -917,6 +918,10 @@ static bool EditListDirectedCharacterInput( case '/': isSep = true; break; + case '&': + case '$': + isSep = edit.IsNamelist(); + break; case ',': isSep = !(edit.modes.editingFlags & decimalComma); break; diff --git a/flang/runtime/io-stmt.cpp b/flang/runtime/io-stmt.cpp index 921c6e625edb5..7052a6acf41ce 100644 --- a/flang/runtime/io-stmt.cpp +++ b/flang/runtime/io-stmt.cpp @@ -580,6 +580,12 @@ std::optional IoStatementState::NextInField( case '*': case '\n': // for stream access return std::nullopt; + case '&': + case '$': + if (edit.IsNamelist()) { + return std::nullopt; + } + break; case ',': if (!(edit.modes.editingFlags & decimalComma)) { return std::nullopt; diff --git a/flang/runtime/namelist.cpp b/flang/runtime/namelist.cpp index 61815a7cc8a40..d9908bf7089ac 100644 --- a/flang/runtime/namelist.cpp +++ b/flang/runtime/namelist.cpp @@ -82,7 +82,7 @@ bool IONAME(OutputNamelist)(Cookie cookie, const NamelistGroup &group) { static constexpr bool IsLegalIdStart(char32_t ch) { return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_' || - ch == '@' || ch == '$'; + ch == '@'; } static constexpr bool IsLegalIdChar(char32_t ch) { @@ -378,12 +378,13 @@ static bool HandleComponent(IoStatementState &io, Descriptor &desc, return false; } -// Advance to the terminal '/' of a namelist group. +// Advance to the terminal '/' of a namelist group or leading '&'/'$' +// of the next. static void SkipNamelistGroup(IoStatementState &io) { std::size_t byteCount{0}; while (auto ch{io.GetNextNonBlank(byteCount)}) { io.HandleRelativePosition(byteCount); - if (*ch == '/') { + if (*ch == '/' || *ch == '&' || *ch == '$') { break; } else if (*ch == '\'' || *ch == '"') { // Skip quoted character literal @@ -418,7 +419,7 @@ bool IONAME(InputNamelist)(Cookie cookie, const NamelistGroup &group) { std::size_t byteCount{0}; while (true) { next = io.GetNextNonBlank(byteCount); - while (next && *next != '&') { + while (next && *next != '&' && *next != '$') { // Extension: comment lines without ! 
before namelist groups if (!io.AdvanceRecord()) { next.reset(); @@ -430,9 +431,10 @@ bool IONAME(InputNamelist)(Cookie cookie, const NamelistGroup &group) { handler.SignalEnd(); return false; } - if (*next != '&') { + if (*next != '&' && *next != '$') { handler.SignalError( - "NAMELIST input group does not begin with '&' (at '%lc')", *next); + "NAMELIST input group does not begin with '&' or '$' (at '%lc')", + *next); return false; } io.HandleRelativePosition(byteCount); @@ -448,7 +450,7 @@ bool IONAME(InputNamelist)(Cookie cookie, const NamelistGroup &group) { // Read the group's items while (true) { next = io.GetNextNonBlank(byteCount); - if (!next || *next == '/') { + if (!next || *next == '/' || *next == '&' || *next == '$') { break; } if (!GetLowerCaseName(io, name, sizeof name)) { @@ -540,12 +542,15 @@ bool IONAME(InputNamelist)(Cookie cookie, const NamelistGroup &group) { io.HandleRelativePosition(byteCount); } } - if (!next || *next != '/') { + if (next && *next == '/') { + io.HandleRelativePosition(byteCount); + } else if (*next && (*next == '&' || *next == '$')) { + // stop at beginning of next group + } else { handler.SignalError( "No '/' found after NAMELIST group '%s'", group.groupName); return false; } - io.HandleRelativePosition(byteCount); return true; } @@ -565,7 +570,7 @@ bool IsNamelistNameOrSlash(IoStatementState &io) { // TODO: how to deal with NaN(...) ambiguity? return ch && (*ch == '=' || *ch == '(' || *ch == '%'); } else { - return *ch == '/'; + return *ch == '/' || *ch == '&' || *ch == '$'; } } } From 3bbdbb22a50705a78ea2668d4ab227889cabdc84 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Tue, 2 Jan 2024 08:54:10 -0800 Subject: [PATCH 042/313] [flang] Fix parsing time explosion (#76533) When parsing a deeply-nested expression like A1(A2(A3(A4(A5(A6(...A99(i)...)))))) the parser can get into an exponential state due to the need to consider the possibility that each "An(...)" might be the beginning of a reference to a procedure component ("An(...)%PROC(...)") so that alternative has to be attempted first before proceeding to try parsing "An(...)" as a function reference or as an array element designator. The parser for a structure component, which is used by the procedure designator parser, was not protected with the usual failure memoization technique, leading to exponentially bad behavior parsing a deeply-nested expression. Fix by exploiting the instrumented() parser combinator so that failed structure component parsers aren't repeated. Fixes https://github.com/llvm/llvm-project/issues/76477. --- flang/lib/Parser/Fortran-parsers.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/flang/lib/Parser/Fortran-parsers.cpp b/flang/lib/Parser/Fortran-parsers.cpp index c070bc1de3735..0dd95d69d3c66 100644 --- a/flang/lib/Parser/Fortran-parsers.cpp +++ b/flang/lib/Parser/Fortran-parsers.cpp @@ -1151,8 +1151,9 @@ TYPE_PARSER(construct(name, // R913 structure-component -> data-ref // The final part-ref in the data-ref is not allowed to have subscripts. 
-TYPE_PARSER(construct( - construct(some(Parser{} / percentOrDot)), name)) +TYPE_CONTEXT_PARSER("component"_en_US, + construct( + construct(some(Parser{} / percentOrDot)), name)) // R919 subscript -> scalar-int-expr constexpr auto subscript{scalarIntExpr}; From b29d632eea48a14f46af2a9f04bd28798cb55612 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Tue, 2 Jan 2024 09:04:26 -0800 Subject: [PATCH 043/313] [flang] Accept BIND(C) derived type for Cray pointees (#76538) The compiler requires that a Cray pointee have a SEQUENCE type, but a recent bug report points out that a BIND(C) type should also be accepted. Fixes https://github.com/llvm/llvm-project/issues/76529. --- flang/include/flang/Evaluate/tools.h | 1 + flang/lib/Evaluate/tools.cpp | 10 +++++++--- flang/lib/Semantics/resolve-names.cpp | 10 ++++------ flang/test/Semantics/resolve61.f90 | 7 ++++++- 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h index 8a47a9f651661..51414d61785f0 100644 --- a/flang/include/flang/Evaluate/tools.h +++ b/flang/include/flang/Evaluate/tools.h @@ -1227,6 +1227,7 @@ bool IsFunctionResult(const Symbol &); bool IsKindTypeParameter(const Symbol &); bool IsLenTypeParameter(const Symbol &); bool IsExtensibleType(const DerivedTypeSpec *); +bool IsSequenceOrBindCType(const DerivedTypeSpec *); bool IsBuiltinDerivedType(const DerivedTypeSpec *derived, const char *name); bool IsBuiltinCPtr(const Symbol &); bool IsEventType(const DerivedTypeSpec *); diff --git a/flang/lib/Evaluate/tools.cpp b/flang/lib/Evaluate/tools.cpp index 8c755da4a2d8b..44a6fa4333cf3 100644 --- a/flang/lib/Evaluate/tools.cpp +++ b/flang/lib/Evaluate/tools.cpp @@ -1724,9 +1724,13 @@ bool IsLenTypeParameter(const Symbol &symbol) { } bool IsExtensibleType(const DerivedTypeSpec *derived) { - return derived && !IsIsoCType(derived) && - !derived->typeSymbol().attrs().test(Attr::BIND_C) && - !derived->typeSymbol().get().sequence(); + return !IsSequenceOrBindCType(derived) && !IsIsoCType(derived); +} + +bool IsSequenceOrBindCType(const DerivedTypeSpec *derived) { + return derived && + (derived->typeSymbol().attrs().test(Attr::BIND_C) || + derived->typeSymbol().get().sequence()); } bool IsBuiltinDerivedType(const DerivedTypeSpec *derived, const char *name) { diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index e30eb5070d789..64fc7de120873 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -5940,9 +5940,9 @@ void DeclarationVisitor::Post(const parser::BasedPointer &bp) { } if (const auto *pointeeType{pointee->GetType()}) { if (const auto *derived{pointeeType->AsDerived()}) { - if (!derived->typeSymbol().get().sequence()) { + if (!IsSequenceOrBindCType(derived)) { Say(pointeeName, - "Type of Cray pointee '%s' is a non-sequence derived type"_err_en_US); + "Type of Cray pointee '%s' is a derived type that is neither SEQUENCE nor BIND(C)"_err_en_US); } } } @@ -6177,15 +6177,13 @@ void DeclarationVisitor::CheckCommonBlocks() { Say(name, "Unlimited polymorphic pointer '%s' may not appear in a COMMON block"_err_en_US); } else if (const auto *derived{type->AsDerived()}) { - auto &typeSymbol{derived->typeSymbol()}; - if (!typeSymbol.attrs().test(Attr::BIND_C) && - !typeSymbol.get().sequence()) { + if (!IsSequenceOrBindCType(derived)) { Say(name, "Derived type '%s' in COMMON block must have the BIND or" " SEQUENCE attribute"_err_en_US); } 
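A hedged, standalone sketch of what is now accepted for Cray pointees (the names rec, pointee, and base are invented for illustration and do not come from the patch's tests):

```fortran
! Hypothetical example: the Cray pointee's derived type carries BIND(C)
! rather than SEQUENCE; previously only SEQUENCE types were accepted here.
subroutine cray_pointee_example
  type, bind(c) :: rec
    integer :: field
  end type
  type(rec) :: pointee
  pointer (base, pointee)
  pointee%field = 1
end subroutine
```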
UnorderedSymbolSet typeSet; - CheckCommonBlockDerivedType(name, typeSymbol, typeSet); + CheckCommonBlockDerivedType(name, derived->typeSymbol(), typeSet); } } } diff --git a/flang/test/Semantics/resolve61.f90 b/flang/test/Semantics/resolve61.f90 index 6728050243ec3..d6499f07b8609 100644 --- a/flang/test/Semantics/resolve61.f90 +++ b/flang/test/Semantics/resolve61.f90 @@ -107,11 +107,16 @@ subroutine p12 type t2 integer c2 end type + type, bind(c) :: t3 + integer c3 + end type type(t1) :: x1 type(t2) :: x2 + type(t3) :: x3 pointer(a, x1) - !ERROR: Type of Cray pointee 'x2' is a non-sequence derived type + !ERROR: Type of Cray pointee 'x2' is a derived type that is neither SEQUENCE nor BIND(C) pointer(b, x2) + pointer(c, x3) end subroutine p13 From 7c55dd8de64823deb71bbeff8543e31ab6264cd9 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Tue, 2 Jan 2024 09:15:04 -0800 Subject: [PATCH 044/313] [flang] Accept multiple spaces after compiler directive sentinel (#76541) The prescanner allows multiple spaces within a compiler directive, but not between the directive's sentinel (e.g., !DIR$) and the directive's first token. Fixes https://github.com/llvm/llvm-project/issues/76537. --- flang/lib/Parser/prescan.cpp | 8 +++++--- flang/test/Parser/compiler-directives.f90 | 1 + 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp index 79cdaccf1fbfe..68d7d9f0c53c4 100644 --- a/flang/lib/Parser/prescan.cpp +++ b/flang/lib/Parser/prescan.cpp @@ -139,16 +139,18 @@ void Prescanner::Statement() { SkipSpaces(); } } else { - // Compiler directive. Emit normalized sentinel. + // Compiler directive. Emit normalized sentinel, squash following spaces. EmitChar(tokens, '!'); ++at_, ++column_; for (const char *sp{directiveSentinel_}; *sp != '\0'; ++sp, ++at_, ++column_) { EmitChar(tokens, *sp); } - if (*at_ == ' ') { + if (*at_ == ' ' || *at_ == '\t') { EmitChar(tokens, ' '); - ++at_, ++column_; + while (*at_ == ' ' || *at_ == '\t') { + ++at_, ++column_; + } } tokens.CloseToken(); } diff --git a/flang/test/Parser/compiler-directives.f90 b/flang/test/Parser/compiler-directives.f90 index 88cfd0944faf0..67e8d5b292aa0 100644 --- a/flang/test/Parser/compiler-directives.f90 +++ b/flang/test/Parser/compiler-directives.f90 @@ -17,6 +17,7 @@ module m !dir$ integer !dir$ integer=64 !dir$ integer = 64 + !dir$ integer = 64 PROC(4) !dir$ optimize:1 !dir$ optimize : 1 From 49ee8b53ef39c158d40d76128828379dd34ea61f Mon Sep 17 00:00:00 2001 From: SunilKuravinakop <98882378+SunilKuravinakop@users.noreply.github.com> Date: Tue, 2 Jan 2024 22:46:02 +0530 Subject: [PATCH 045/313] [OpenMP] atomic compare fail : Codegen support (#75709) This is a continuation of https://reviews.llvm.org/D123235 ([OpenMP] atomic compare fail : Parser & AST support). In this branch Support for codegen support for atomic compare fail is being added. 
--------- Co-authored-by: Sunil Kuravinakop --- clang/lib/CodeGen/CGStmtOpenMP.cpp | 61 +++++--- clang/lib/Sema/SemaOpenMP.cpp | 2 +- clang/test/OpenMP/atomic_compare_codegen.cpp | 144 ++++++++++++++++++ .../llvm/Frontend/OpenMP/OMPIRBuilder.h | 7 + llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 12 +- 5 files changed, 203 insertions(+), 23 deletions(-) diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index ed426098ac691..e362c9da51fe3 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -6406,13 +6406,11 @@ static void emitOMPAtomicCaptureExpr(CodeGenFunction &CGF, } } -static void emitOMPAtomicCompareExpr(CodeGenFunction &CGF, - llvm::AtomicOrdering AO, const Expr *X, - const Expr *V, const Expr *R, - const Expr *E, const Expr *D, - const Expr *CE, bool IsXBinopExpr, - bool IsPostfixUpdate, bool IsFailOnly, - SourceLocation Loc) { +static void emitOMPAtomicCompareExpr( + CodeGenFunction &CGF, llvm::AtomicOrdering AO, llvm::AtomicOrdering FailAO, + const Expr *X, const Expr *V, const Expr *R, const Expr *E, const Expr *D, + const Expr *CE, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly, + SourceLocation Loc) { llvm::OpenMPIRBuilder &OMPBuilder = CGF.CGM.getOpenMPRuntime().getOMPBuilder(); @@ -6477,13 +6475,21 @@ static void emitOMPAtomicCompareExpr(CodeGenFunction &CGF, R->getType().isVolatileQualified()}; } - CGF.Builder.restoreIP(OMPBuilder.createAtomicCompare( - CGF.Builder, XOpVal, VOpVal, ROpVal, EVal, DVal, AO, Op, IsXBinopExpr, - IsPostfixUpdate, IsFailOnly)); + if (FailAO == llvm::AtomicOrdering::NotAtomic) { + // fail clause was not mentionend on the + // "#pragma omp atomic compare" construct. + CGF.Builder.restoreIP(OMPBuilder.createAtomicCompare( + CGF.Builder, XOpVal, VOpVal, ROpVal, EVal, DVal, AO, Op, IsXBinopExpr, + IsPostfixUpdate, IsFailOnly)); + } else + CGF.Builder.restoreIP(OMPBuilder.createAtomicCompare( + CGF.Builder, XOpVal, VOpVal, ROpVal, EVal, DVal, AO, Op, IsXBinopExpr, + IsPostfixUpdate, IsFailOnly, FailAO)); } static void emitOMPAtomicExpr(CodeGenFunction &CGF, OpenMPClauseKind Kind, - llvm::AtomicOrdering AO, bool IsPostfixUpdate, + llvm::AtomicOrdering AO, + llvm::AtomicOrdering FailAO, bool IsPostfixUpdate, const Expr *X, const Expr *V, const Expr *R, const Expr *E, const Expr *UE, const Expr *D, const Expr *CE, bool IsXLHSInRHSPart, @@ -6504,12 +6510,8 @@ static void emitOMPAtomicExpr(CodeGenFunction &CGF, OpenMPClauseKind Kind, IsXLHSInRHSPart, Loc); break; case OMPC_compare: { - emitOMPAtomicCompareExpr(CGF, AO, X, V, R, E, D, CE, IsXLHSInRHSPart, - IsPostfixUpdate, IsFailOnly, Loc); - break; - } - case OMPC_fail: { - //TODO + emitOMPAtomicCompareExpr(CGF, AO, FailAO, X, V, R, E, D, CE, + IsXLHSInRHSPart, IsPostfixUpdate, IsFailOnly, Loc); break; } default: @@ -6519,6 +6521,8 @@ static void emitOMPAtomicExpr(CodeGenFunction &CGF, OpenMPClauseKind Kind, void CodeGenFunction::EmitOMPAtomicDirective(const OMPAtomicDirective &S) { llvm::AtomicOrdering AO = llvm::AtomicOrdering::Monotonic; + // Fail Memory Clause Ordering. 
+ llvm::AtomicOrdering FailAO = llvm::AtomicOrdering::NotAtomic; bool MemOrderingSpecified = false; if (S.getSingleClause()) { AO = llvm::AtomicOrdering::SequentiallyConsistent; @@ -6572,12 +6576,27 @@ void CodeGenFunction::EmitOMPAtomicDirective(const OMPAtomicDirective &S) { } } + if (KindsEncountered.contains(OMPC_compare) && + KindsEncountered.contains(OMPC_fail)) { + Kind = OMPC_compare; + const auto *FailClause = S.getSingleClause(); + if (FailClause) { + OpenMPClauseKind FailParameter = FailClause->getFailParameter(); + if (FailParameter == llvm::omp::OMPC_relaxed) + FailAO = llvm::AtomicOrdering::Monotonic; + else if (FailParameter == llvm::omp::OMPC_acquire) + FailAO = llvm::AtomicOrdering::Acquire; + else if (FailParameter == llvm::omp::OMPC_seq_cst) + FailAO = llvm::AtomicOrdering::SequentiallyConsistent; + } + } + LexicalScope Scope(*this, S.getSourceRange()); EmitStopPoint(S.getAssociatedStmt()); - emitOMPAtomicExpr(*this, Kind, AO, S.isPostfixUpdate(), S.getX(), S.getV(), - S.getR(), S.getExpr(), S.getUpdateExpr(), S.getD(), - S.getCondExpr(), S.isXLHSInRHSPart(), S.isFailOnly(), - S.getBeginLoc()); + emitOMPAtomicExpr(*this, Kind, AO, FailAO, S.isPostfixUpdate(), S.getX(), + S.getV(), S.getR(), S.getExpr(), S.getUpdateExpr(), + S.getD(), S.getCondExpr(), S.isXLHSInRHSPart(), + S.isFailOnly(), S.getBeginLoc()); } static void emitCommonOMPTargetDirective(CodeGenFunction &CGF, diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 3826994ef2126..f34d2959dc619 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -12683,7 +12683,7 @@ StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, break; } case OMPC_fail: { - if (AtomicKind != OMPC_compare) { + if (!EncounteredAtomicKinds.contains(OMPC_compare)) { Diag(C->getBeginLoc(), diag::err_omp_atomic_fail_no_compare) << SourceRange(C->getBeginLoc(), C->getEndLoc()); return StmtError(); diff --git a/clang/test/OpenMP/atomic_compare_codegen.cpp b/clang/test/OpenMP/atomic_compare_codegen.cpp index 43607cc65ad9b..03e5081a5c1d8 100644 --- a/clang/test/OpenMP/atomic_compare_codegen.cpp +++ b/clang/test/OpenMP/atomic_compare_codegen.cpp @@ -13806,6 +13806,64 @@ double dxevd() { return dv; } + +double fail_dxevd() { + double dx, dv, de, dd; + +#pragma omp atomic compare capture relaxed fail(relaxed) + {if(dx == de) { dx = dv; } else { dd = dx; }} + +#pragma omp atomic compare capture acquire fail(relaxed) + {if(dx == de) { dx = dv; } else { dd = dx; }} + +#pragma omp atomic compare capture release fail(relaxed) + {if(dx == de) { dx = dv; } else { dd = dx; }} + +#pragma omp atomic compare capture acq_rel fail(relaxed) + {if(dx == de) { dx = dv; } else { dd = dx; }} + +#pragma omp atomic compare capture seq_cst fail(relaxed) + {if(dx == de) { dx = dv; } else { dd = dx; }} + +#pragma omp atomic compare capture relaxed fail(acquire) + {if(dx == de) { dx = dv; } else { dd = dx; }} + +#pragma omp atomic compare capture acquire fail(acquire) + {if(dx == de) { dx = dv; } else { dd = dx; }} + +#pragma omp atomic compare capture release fail(acquire) + {if(dx == de) { dx = dv; } else { dd = dx; }} + +#pragma omp atomic compare capture acq_rel fail(acquire) + {if(dx == de) { dx = dv; } else { dd = dx; }} + +#pragma omp atomic compare capture seq_cst fail(acquire) + {if(dx == de) { dx = dv; } else { dd = dx; }} + +#pragma omp atomic compare capture relaxed fail(seq_cst) + {if(dx == de) { dx = dv; } else { dd = dx; }} + +#pragma omp atomic compare capture acquire fail(seq_cst) + {if(dx == de) 
{ dx = dv; } else { dd = dx; }} + +#pragma omp atomic compare capture release fail(seq_cst) + {if(dx == de) { dx = dv; } else { dd = dx; }} + +#pragma omp atomic compare capture acq_rel fail(seq_cst) + {if(dx == de) { dx = dv; } else { dd = dx; }} + +#pragma omp atomic compare capture seq_cst fail(seq_cst) + {if(dx == de) { dx = dv; } else { dd = dx; }} + +#pragma omp atomic compare seq_cst fail(acquire) + dx = dx < de ? de : dx; + +#pragma omp atomic compare relaxed fail(seq_cst) + dx = dx > de ? de : dx; + + return dx; +} + #endif // CHECK-LABEL: @foo( // CHECK-NEXT: entry: @@ -61966,3 +62024,89 @@ double dxevd() { // SIMD-ONLY0-NEXT: [[TMP180:%.*]] = load double, ptr [[DV]], align 8 // SIMD-ONLY0-NEXT: ret double [[TMP180]] // +// CHECK-LABEL: {{.+}}fail_dxevd{{.+}} +// CHECK-NEXT: entry: +// CHECK: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} bitcast double{{.+}} +// CHECK-NEXT: {{.+}} bitcast double{{.+}} +// CHECK-NEXT: {{.+}}cmpxchg ptr {{.+}} monotonic monotonic{{.+}} +// CHECK: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} bitcast double{{.+}} +// CHECK-NEXT: {{.+}} bitcast double{{.+}} +// CHECK: {{.+}}cmpxchg ptr {{.+}} acquire monotonic{{.+}} +// CHECK: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} bitcast double{{.+}} +// CHECK-NEXT: {{.+}} bitcast double{{.+}} +// CHECK: {{.+}}cmpxchg ptr {{.+}} release monotonic{{.+}} +// CHECK: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} bitcast double{{.+}} +// CHECK-NEXT: {{.+}} bitcast double{{.+}} +// CHECK: {{.+}}cmpxchg ptr {{.+}} acq_rel monotonic{{.+}} +// CHECK: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} bitcast double{{.+}} +// CHECK-NEXT: {{.+}} bitcast double{{.+}} +// CHECK-NEXT: {{.+}}cmpxchg ptr {{.+}} seq_cst monotonic{{.+}} +// CHECK: {{.+}}__kmpc_flush{{.+}} +// CHECK-NEXT: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} bitcast double{{.+}} +// CHECK-NEXT: {{.+}} bitcast double{{.+}} +// CHECK: {{.+}}cmpxchg ptr {{.+}} monotonic acquire{{.+}} +// CHECK: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} bitcast double{{.+}} +// CHECK-NEXT: {{.+}} bitcast double{{.+}} +// CHECK: {{.+}}cmpxchg ptr {{.+}} acquire acquire{{.+}} +// CHECK: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} bitcast double{{.+}} +// CHECK-NEXT: {{.+}} bitcast double{{.+}} +// CHECK: {{.+}}cmpxchg ptr {{.+}} release acquire{{.+}} +// CHECK: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} bitcast double{{.+}} +// CHECK-NEXT: {{.+}} bitcast double{{.+}} +// CHECK: {{.+}}cmpxchg ptr {{.+}} acq_rel acquire{{.+}} +// CHECK: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} bitcast double{{.+}} +// CHECK-NEXT: {{.+}} bitcast double{{.+}} +// CHECK: {{.+}}cmpxchg ptr {{.+}} seq_cst acquire{{.+}} +// CHECK: {{.+}}__kmpc_flush{{.+}} +// CHECK-NEXT: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} bitcast double{{.+}} +// CHECK-NEXT: {{.+}} bitcast double{{.+}} +// CHECK: {{.+}}cmpxchg ptr {{.+}} monotonic seq_cst{{.+}} +// CHECK: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} bitcast double{{.+}} +// CHECK-NEXT: {{.+}} bitcast double{{.+}} 
+// CHECK: {{.+}}cmpxchg ptr {{.+}} acquire seq_cst{{.+}} +// CHECK: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} bitcast double{{.+}} +// CHECK-NEXT: {{.+}} bitcast double{{.+}} +// CHECK: {{.+}}cmpxchg ptr {{.+}} release seq_cst{{.+}} +// CHECK: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} bitcast double{{.+}} +// CHECK-NEXT: {{.+}} bitcast double{{.+}} +// CHECK: {{.+}}cmpxchg ptr {{.+}} acq_rel seq_cst{{.+}} +// CHECK: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} bitcast double{{.+}} +// CHECK-NEXT: {{.+}} bitcast double{{.+}} +// CHECK: {{.+}}cmpxchg ptr {{.+}} seq_cst seq_cst{{.+}} +// CHECK: call void {{.+}}__kmpc_flush{{.+}} +// CHECK-NEXT: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} atomicrmw fmax {{.+}} seq_cst{{.+}} +// CHECK-NEXT: call void {{.+}}__kmpc_flush{{.+}} +// CHECK-NEXT: {{.+}} load double,{{.+}} +// CHECK-NEXT: {{.+}} atomicrmw fmin {{.+}} monotonic{{.+}} +// CHECK: ret double {{.+}} diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index abbef03d02cb1..669104307fa0e 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -2562,6 +2562,13 @@ class OpenMPIRBuilder { AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly); + InsertPointTy createAtomicCompare(const LocationDescription &Loc, + AtomicOpValue &X, AtomicOpValue &V, + AtomicOpValue &R, Value *E, Value *D, + AtomicOrdering AO, + omp::OMPAtomicCompareOp Op, + bool IsXBinopExpr, bool IsPostfixUpdate, + bool IsFailOnly, AtomicOrdering Failure); /// Create the control flow structure of a canonical OpenMP loop. 
/// diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index ce428f78dc843..f6cf358119fb7 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -6026,6 +6026,17 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare( omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly) { + AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO); + return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr, + IsPostfixUpdate, IsFailOnly, Failure); +} + +OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare( + const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, + AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, + omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, + bool IsFailOnly, AtomicOrdering Failure) { + if (!updateToLocation(Loc)) return Loc.IP; @@ -6040,7 +6051,6 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare( bool IsInteger = E->getType()->isIntegerTy(); if (Op == OMPAtomicCompareOp::EQ) { - AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO); AtomicCmpXchgInst *Result = nullptr; if (!IsInteger) { IntegerType *IntCastTy = From dea30aca3a56bb72d4e1eddb04f98c53bcb5992a Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Tue, 2 Jan 2024 09:25:49 -0800 Subject: [PATCH 046/313] [flang][runtime] NAMELIST input into storage sequence (#76584) Nearly every Fortran compiler supports the extension of NAMELIST input into a storage sequence identified by its initial scalar array element. For example, &GROUP A(1) = 1. 2. 3. / should be processed as if the input had been &GROUP A(1:) = 1. 2. 3. / Fixes llvm-test-suite/Fortran/gfortran/regression/namelist_24.f90. --- flang/docs/Extensions.md | 4 ++++ flang/runtime/namelist.cpp | 38 ++++++++++++++++++++++++++++++++++---- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md index ab040b61703c8..da208f58da883 100644 --- a/flang/docs/Extensions.md +++ b/flang/docs/Extensions.md @@ -657,3 +657,7 @@ end but every Fortran compiler allows the encoding to be changed on an open unit. +* A `NAMELIST` input item that references a scalar element of a vector + or contiguous array can be used as the initial element of a storage + sequence. For example, "&GRP A(1)=1. 2. 3./" is treated as if had been + "&GRP A(1:)=1. 2. 3./". diff --git a/flang/runtime/namelist.cpp b/flang/runtime/namelist.cpp index d9908bf7089ac..e6997bcf945b8 100644 --- a/flang/runtime/namelist.cpp +++ b/flang/runtime/namelist.cpp @@ -247,6 +247,28 @@ static bool HandleSubscripts(IoStatementState &io, Descriptor &desc, return false; } +static void StorageSequenceExtension( + Descriptor &desc, const Descriptor &source) { + // Support the near-universal extension of NAMELIST input into a + // designatable storage sequence identified by its initial scalar array + // element. For example, treat "A(1) = 1. 2. 3." as if it had been + // "A(1:) = 1. 2. 3.". + if (desc.rank() == 0 && (source.rank() == 1 || source.IsContiguous())) { + if (auto stride{source.rank() == 1 + ? 
source.GetDimension(0).ByteStride() + : static_cast(source.ElementBytes())}; + stride != 0) { + desc.raw().attribute = CFI_attribute_pointer; + desc.raw().rank = 1; + desc.GetDimension(0) + .SetBounds(1, + source.Elements() - + ((source.OffsetElement() - desc.OffsetElement()) / stride)) + .SetByteStride(stride); + } + } +} + static bool HandleSubstring( IoStatementState &io, Descriptor &desc, const char *name) { IoErrorHandler &handler{io.GetIoErrorHandler()}; @@ -480,10 +502,14 @@ bool IONAME(InputNamelist)(Cookie cookie, const NamelistGroup &group) { bool hadSubscripts{false}; bool hadSubstring{false}; if (next && (*next == '(' || *next == '%')) { + const Descriptor *lastSubscriptBase{nullptr}; + Descriptor *lastSubscriptDescriptor{nullptr}; do { Descriptor &mutableDescriptor{staticDesc[whichStaticDesc].descriptor()}; whichStaticDesc ^= 1; io.HandleRelativePosition(byteCount); // skip over '(' or '%' + lastSubscriptDescriptor = nullptr; + lastSubscriptBase = nullptr; if (*next == '(') { if (!hadSubstring && (hadSubscripts || useDescriptor->rank() == 0)) { mutableDescriptor = *useDescriptor; @@ -497,11 +523,12 @@ bool IONAME(InputNamelist)(Cookie cookie, const NamelistGroup &group) { "NAMELIST group '%s'", name, group.groupName); return false; + } else if (HandleSubscripts( + io, mutableDescriptor, *useDescriptor, name)) { + lastSubscriptBase = useDescriptor; + lastSubscriptDescriptor = &mutableDescriptor; } else { - if (!HandleSubscripts( - io, mutableDescriptor, *useDescriptor, name)) { - return false; - } + return false; } hadSubscripts = true; } else { @@ -514,6 +541,9 @@ bool IONAME(InputNamelist)(Cookie cookie, const NamelistGroup &group) { useDescriptor = &mutableDescriptor; next = io.GetCurrentChar(byteCount); } while (next && (*next == '(' || *next == '%')); + if (lastSubscriptDescriptor) { + StorageSequenceExtension(*lastSubscriptDescriptor, *lastSubscriptBase); + } } // Skip the '=' next = io.GetNextNonBlank(byteCount); From 289eb995807116fedcec5c5614246330411f7b3b Mon Sep 17 00:00:00 2001 From: Oleg Shyshkov Date: Tue, 2 Jan 2024 18:27:17 +0100 Subject: [PATCH 047/313] [mlir][bazel] Add SPIRV python binding --- .../mlir/python/BUILD.bazel | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel index 5882a311c9b6f..6f6f2b3798e85 100644 --- a/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel @@ -1057,6 +1057,37 @@ filegroup( ], ) +##---------------------------------------------------------------------------## +# SPIRV dialect. +##---------------------------------------------------------------------------## + +gentbl_filegroup( + name = "SPIRVOpsPyGen", + tbl_outs = [ + ( + [ + "-gen-python-op-bindings", + "-bind-dialect=spirv", + ], + "mlir/dialects/_spirv_ops_gen.py", + ), + ], + tblgen = "//mlir:mlir-tblgen", + td_file = "mlir/dialects/SPIRVOps.td", + deps = [ + "//mlir:OpBaseTdFiles", + "//mlir:SPIRVOpsTdFiles", + ], +) + +filegroup( + name = "SPIRVOpsPyFiles", + srcs = [ + "mlir/dialects/spirv.py", + ":SPIRVOpsPyGen", + ], +) + ##---------------------------------------------------------------------------## # Tensor dialect. 
##---------------------------------------------------------------------------## From cab156c4129e5948a6322054480e66d3ca17b919 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Tue, 2 Jan 2024 09:32:54 -0800 Subject: [PATCH 048/313] [flang][runtime] Don't round hexadecimal floating-point input (#76586) Fortran 2023 subclause 13.7.2.3.8 discusses input rounding only in the context of decimal-to-binary conversion. There is no mention of rounding for hexadecimal floating-point input conversion. At least one Fortran compiler seems to have interpreted this silence as implying no rounding. (Note that this is not the same thing as rounding to zero (RZ), which would return +/-HUGE() for overflow.) --- flang/docs/Extensions.md | 6 ++++++ flang/runtime/edit-input.cpp | 38 +++--------------------------------- 2 files changed, 9 insertions(+), 35 deletions(-) diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md index da208f58da883..16eb67f2e27c8 100644 --- a/flang/docs/Extensions.md +++ b/flang/docs/Extensions.md @@ -648,6 +648,12 @@ end only in function references, but not an explicit `INTRINSIC` statement, its name is not brought into other scopes by a `USE` statement. +* Should hexadecimal floating-point input editing apply any rounding? + F'2023 subclause 13.7.2.3.8 only discusses rounding in the context of + decimal-to-binary conversion; it would seem to not apply, and so + we don't round. This seems to be how the Intel Fortran compilers + behave. + ## De Facto Standard Features * `EXTENDS_TYPE_OF()` returns `.TRUE.` if both of its arguments have the diff --git a/flang/runtime/edit-input.cpp b/flang/runtime/edit-input.cpp index 0fa6368ee591c..0c2341a4dfac7 100644 --- a/flang/runtime/edit-input.cpp +++ b/flang/runtime/edit-input.cpp @@ -625,31 +625,6 @@ decimal::ConversionToBinaryResult ConvertHexadecimal( fraction <<= 1; --expo; } - // Rounding - bool increase{false}; - switch (rounding) { - case decimal::RoundNearest: // RN & RP - increase = roundingBit && (guardBit | ((int)fraction & 1)); - break; - case decimal::RoundUp: // RU - increase = !isNegative && (roundingBit | guardBit); - break; - case decimal::RoundDown: // RD - increase = isNegative && (roundingBit | guardBit); - break; - case decimal::RoundToZero: // RZ - break; - case decimal::RoundCompatible: // RC - increase = roundingBit != 0; - break; - } - if (increase) { - ++fraction; - if (fraction >> binaryPrecision) { - fraction >>= 1; - ++expo; - } - } } // Package & return result constexpr RawType significandMask{(one << RealType::significandBits) - 1}; @@ -660,16 +635,9 @@ decimal::ConversionToBinaryResult ConvertHexadecimal( expo = 0; // subnormal flags |= decimal::Underflow; } else if (expo >= RealType::maxExponent) { - if (rounding == decimal::RoundToZero || - (rounding == decimal::RoundDown && !isNegative) || - (rounding == decimal::RoundUp && isNegative)) { - expo = RealType::maxExponent - 1; // +/-HUGE() - fraction = significandMask; - } else { - expo = RealType::maxExponent; // +/-Inf - fraction = 0; - flags |= decimal::Overflow; - } + expo = RealType::maxExponent; // +/-Inf + fraction = 0; + flags |= decimal::Overflow; } else { fraction &= significandMask; // remove explicit normalization unless x87 } From 78348b691504bf9ec212add73cc37d2fd8371f83 Mon Sep 17 00:00:00 2001 From: Han-Chung Wang Date: Tue, 2 Jan 2024 09:34:24 -0800 Subject: [PATCH 049/313] [mlir][tensor] Improve tensor.pack simplication pattern. 
(#76606) A tensor.pack op can be rewritten to a tensor.expand_shape op if the packing only happens on inner most dimension. This also formats the lit checks better. --- .../Transforms/PackAndUnpackPatterns.cpp | 14 +++++- .../Dialect/Tensor/simplify-pack-unpack.mlir | 50 ++++++++++++++++--- 2 files changed, 54 insertions(+), 10 deletions(-) diff --git a/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp b/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp index 67651a2e38c82..e20450c95ffd5 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp @@ -35,10 +35,20 @@ struct SimplifyPackToExpandShape : public OpRewritePattern { LogicalResult matchAndRewrite(PackOp packOp, PatternRewriter &rewriter) const override { + if (packOp.getPaddingValue()) + return rewriter.notifyMatchFailure(packOp, "expects no padding value"); + + if (!packOp.getOuterDimsPerm().empty()) + return rewriter.notifyMatchFailure(packOp, "expects no outer_dims_perm"); + RankedTensorType sourceType = packOp.getSourceType(); RankedTensorType destType = packOp.getDestType(); - if (sourceType.getRank() != 1 || packOp.getPaddingValue()) - return failure(); + ArrayRef dimsPos = packOp.getInnerDimsPos(); + if (dimsPos.size() != 1 || (dimsPos[0] + 1 != sourceType.getRank())) { + return rewriter.notifyMatchFailure( + packOp, "expects packing at the innermost dimension"); + } + auto reassociation = getReassociationIndicesForReshape(sourceType, destType); if (!reassociation) diff --git a/mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir b/mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir index 049076a67bae5..bdfe18acd86c5 100644 --- a/mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir +++ b/mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir @@ -1,9 +1,9 @@ // RUN: mlir-opt -split-input-file -test-tensor-transform-patterns="test-simplify-pack-unpack-patterns" %s | FileCheck %s -// CHECK: func.func @single_dim_packing( -// CHECK-SAME: %[[ARG0:.+]]: tensor<256xf32>) -// CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1]] : tensor<256xf32> into tensor<8x32xf32> -// CHECK: return %[[EXPANDED]] : tensor<8x32xf32> +// CHECK-LABEL: func.func @single_dim_packing( +// CHECK-SAME: %[[ARG0:.+]]: tensor<256xf32>) +// CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1]] : tensor<256xf32> into tensor<8x32xf32> +// CHECK: return %[[EXPANDED]] : tensor<8x32xf32> func.func @single_dim_packing(%arg0: tensor<256xf32>) -> tensor<8x32xf32> { %empty = tensor.empty() : tensor<8x32xf32> %0 = tensor.pack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<256xf32> -> tensor<8x32xf32> @@ -12,13 +12,47 @@ func.func @single_dim_packing(%arg0: tensor<256xf32>) -> tensor<8x32xf32> { // ----- -// CHECK: func.func @single_dim_packing_with_padding( -// CHECK-SAME: %[[ARG0:.+]]: tensor<255xf32>) -// CHECK-NOT: tensor.expand_shape -// CHECK: tensor.pack +// CHECK-LABEL: func.func @single_dim_packing_with_padding( +// CHECK-SAME: %[[ARG0:.+]]: tensor<255xf32>) +// CHECK-NOT: tensor.expand_shape +// CHECK: tensor.pack func.func @single_dim_packing_with_padding(%arg0: tensor<255xf32>) -> tensor<8x32xf32> { %empty = tensor.empty() : tensor<8x32xf32> %cst = arith.constant 0.000000e+00 : f32 %0 = tensor.pack %arg0 padding_value(%cst : f32) inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<255xf32> -> tensor<8x32xf32> return %0 : tensor<8x32xf32> } + +// ----- + +// CHECK-LABEL: func.func 
@single_last_inner_dim_packing( +// CHECK-SAME: %[[ARG0:.+]]: tensor<5x256xf32>) +// CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0], [1, 2]] : tensor<5x256xf32> into tensor<5x8x32xf32> +// CHECK: return %[[EXPANDED]] : tensor<5x8x32xf32> +func.func @single_last_inner_dim_packing(%arg0: tensor<5x256xf32>) -> tensor<5x8x32xf32> { + %empty = tensor.empty() : tensor<5x8x32xf32> + %0 = tensor.pack %arg0 inner_dims_pos = [1] inner_tiles = [32] into %empty : tensor<5x256xf32> -> tensor<5x8x32xf32> + return %0 : tensor<5x8x32xf32> +} + +// ----- + +// CHECK-LABEL: func.func @packing_with_outer_dims_perm( +// CHECK-NOT: tensor.expand_shape +// CHECK: tensor.pack +func.func @packing_with_outer_dims_perm(%arg0: tensor<5x256xf32>) -> tensor<8x5x32xf32> { + %empty = tensor.empty() : tensor<8x5x32xf32> + %0 = tensor.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [1] inner_tiles = [32] into %empty : tensor<5x256xf32> -> tensor<8x5x32xf32> + return %0 : tensor<8x5x32xf32> +} + +// ----- + +// CHECK-LABEL: func.func @single_first_inner_dim_packing( +// CHECK-NOT: tensor.expand_shape +// CHECK: tensor.pack +func.func @single_first_inner_dim_packing(%arg0: tensor<256x5xf32>) -> tensor<8x5x32xf32> { + %empty = tensor.empty() : tensor<8x5x32xf32> + %0 = tensor.pack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<256x5xf32> -> tensor<8x5x32xf32> + return %0 : tensor<8x5x32xf32> +} From 4c1f488b78237e3388ac44d587b7b2e0c1d772b9 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Tue, 2 Jan 2024 09:38:52 -0800 Subject: [PATCH 050/313] [flang] Fix folding of NEAREST(TINY(1.),-1.) (#76590) The code to fold NEAREST would return a value that's too large when transitioning from a normal number to a subnormal. Fixes llvm-test-suite/Fortran/gfortran/regression/nearest_1.f90. --- flang/lib/Evaluate/real.cpp | 2 +- flang/test/Evaluate/fold-nearest.f90 | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/flang/lib/Evaluate/real.cpp b/flang/lib/Evaluate/real.cpp index 4fecaa1a131f8..cb8be98bb6f40 100644 --- a/flang/lib/Evaluate/real.cpp +++ b/flang/lib/Evaluate/real.cpp @@ -350,7 +350,7 @@ ValueWithRealFlags> Real::NEAREST(bool upward) const { isNegative = !isNegative; } else { auto sub1{fraction.SubtractSigned(one)}; - if (sub1.overflow) { + if (sub1.overflow && expo > 1) { nearest = Fraction{0}.NOT(); --expo; } else { diff --git a/flang/test/Evaluate/fold-nearest.f90 b/flang/test/Evaluate/fold-nearest.f90 index 99af303128411..bd8b020c392ac 100644 --- a/flang/test/Evaluate/fold-nearest.f90 +++ b/flang/test/Evaluate/fold-nearest.f90 @@ -26,6 +26,8 @@ module m1 logical, parameter :: test_14 = nearest(0., negZero) == -minSubnormal !WARN: warning: NEAREST: S argument is zero logical, parameter :: test_15 = nearest(negZero, 0.) == minSubnormal + logical, parameter :: test_16 = nearest(tiny(1.),-1.) == 1.1754942E-38 + logical, parameter :: test_17 = nearest(tiny(1.),1.) == 1.1754945E-38 end module module m2 From 8f3357b75b6f0093e3e5df8adb140c9dad24f881 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Tue, 2 Jan 2024 09:44:16 -0800 Subject: [PATCH 051/313] [flang][runtime] Don't use -1 in I/O API for "default unit" (#76642) The I/O runtime's API allows -1 to be passed for a unit number in a READ, WRITE, or PRINT statement, where it gets replaced by 5 or 6 as appropriate. 
This turns out to have been a bad idea, as it prevents the I/O runtime from detecting and reporting a program's invalid attempt to use -1 as an I/O unit number. So just pass 5 or 6 as appropriate. --- flang/include/flang/Runtime/io-api.h | 28 +++++++----- flang/include/flang/Runtime/magic-numbers.h | 4 ++ flang/lib/Lower/IO.cpp | 27 ++++++----- flang/module/iso_fortran_env.f90 | 7 +-- flang/runtime/io-api.cpp | 45 +++++++++++-------- flang/runtime/unit.cpp | 10 +++-- flang/test/Lower/HLFIR/calls-f77.f90 | 2 +- .../Lower/HLFIR/convert-mbox-to-value.f90 | 4 +- .../parallel-lastprivate-clause-scalar.f90 | 4 +- .../parallel-lastprivate-clause-scalar.f90 | 4 +- flang/test/Lower/array-character.f90 | 4 +- flang/test/Lower/array-expression-slice-1.f90 | 4 +- flang/test/Lower/array-expression.f90 | 2 +- flang/test/Lower/array-temp.f90 | 12 ++--- flang/test/Lower/host-associated.f90 | 2 +- flang/test/Lower/io-statement-2.f90 | 4 +- flang/test/Lower/vector-subscript-io.f90 | 26 +++++------ 17 files changed, 107 insertions(+), 82 deletions(-) diff --git a/flang/include/flang/Runtime/io-api.h b/flang/include/flang/Runtime/io-api.h index 41574e3bb80ad..0277f0ea9e97e 100644 --- a/flang/include/flang/Runtime/io-api.h +++ b/flang/include/flang/Runtime/io-api.h @@ -14,6 +14,7 @@ #include "flang/Common/uint128.h" #include "flang/Runtime/entry-names.h" #include "flang/Runtime/iostat.h" +#include "flang/Runtime/magic-numbers.h" #include #include @@ -29,7 +30,9 @@ class IoStatementState; using Cookie = IoStatementState *; using ExternalUnit = int; using AsynchronousId = int; -static constexpr ExternalUnit DefaultUnit{-1}; // READ(*), WRITE(*), PRINT + +static constexpr ExternalUnit DefaultOutputUnit{FORTRAN_DEFAULT_OUTPUT_UNIT}; +static constexpr ExternalUnit DefaultInputUnit{FORTRAN_DEFAULT_INPUT_UNIT}; // INQUIRE specifiers are encoded as simple base-26 packings of // the spellings of their keywords. @@ -57,7 +60,8 @@ extern "C" { // These functions initiate data transfer statements (READ, WRITE, PRINT). 
// Example: PRINT *, 666 is implemented as the series of calls: -// Cookie cookie{BeginExternalListOutput(DefaultUnit,__FILE__,__LINE__)}; +// Cookie cookie{BeginExternalListOutput(DefaultOutputUnit, +// __FILE__, __LINE__)}; // OutputInteger32(cookie, 666); // EndIoStatement(cookie); // Formatted I/O with explicit formats can supply the format as a @@ -135,19 +139,21 @@ enum Iostat IONAME(CheckUnitNumberInRange128)(common::int128_t unit, const char *sourceFile = nullptr, int sourceLine = 0); // External synchronous I/O initiation -Cookie IONAME(BeginExternalListOutput)(ExternalUnit = DefaultUnit, +Cookie IONAME(BeginExternalListOutput)(ExternalUnit = DefaultOutputUnit, const char *sourceFile = nullptr, int sourceLine = 0); -Cookie IONAME(BeginExternalListInput)(ExternalUnit = DefaultUnit, +Cookie IONAME(BeginExternalListInput)(ExternalUnit = DefaultInputUnit, const char *sourceFile = nullptr, int sourceLine = 0); Cookie IONAME(BeginExternalFormattedOutput)(const char *format, std::size_t, - const Descriptor *formatDescriptor = nullptr, ExternalUnit = DefaultUnit, - const char *sourceFile = nullptr, int sourceLine = 0); + const Descriptor *formatDescriptor = nullptr, + ExternalUnit = DefaultOutputUnit, const char *sourceFile = nullptr, + int sourceLine = 0); Cookie IONAME(BeginExternalFormattedInput)(const char *format, std::size_t, - const Descriptor *formatDescriptor = nullptr, ExternalUnit = DefaultUnit, - const char *sourceFile = nullptr, int sourceLine = 0); -Cookie IONAME(BeginUnformattedOutput)(ExternalUnit = DefaultUnit, + const Descriptor *formatDescriptor = nullptr, + ExternalUnit = DefaultInputUnit, const char *sourceFile = nullptr, + int sourceLine = 0); +Cookie IONAME(BeginUnformattedOutput)(ExternalUnit = DefaultOutputUnit, const char *sourceFile = nullptr, int sourceLine = 0); -Cookie IONAME(BeginUnformattedInput)(ExternalUnit = DefaultUnit, +Cookie IONAME(BeginUnformattedInput)(ExternalUnit = DefaultInputUnit, const char *sourceFile = nullptr, int sourceLine = 0); // WAIT(ID=) @@ -190,7 +196,7 @@ Cookie IONAME(BeginInquireIoLength)( // This call makes the runtime library defer those particular error/end // conditions to the EndIoStatement() call rather than terminating // the image. E.g., for READ(*,*,END=666) A, B, (C(J),J=1,N) -// Cookie cookie{BeginExternalListInput(DefaultUnit,__FILE__,__LINE__)}; +// Cookie cookie{BeginExternalListInput(DefaultInputUnit,__FILE__,__LINE__)}; // EnableHandlers(cookie, false, false, true /*END=*/, false); // if (InputReal64(cookie, &A)) { // if (InputReal64(cookie, &B)) { diff --git a/flang/include/flang/Runtime/magic-numbers.h b/flang/include/flang/Runtime/magic-numbers.h index d00d5027d4ed2..196b13ad3755b 100644 --- a/flang/include/flang/Runtime/magic-numbers.h +++ b/flang/include/flang/Runtime/magic-numbers.h @@ -27,6 +27,10 @@ start at 100 so as to never conflict with those codes. 
#ifndef FORTRAN_RUNTIME_MAGIC_NUMBERS_H_ #define FORTRAN_RUNTIME_MAGIC_NUMBERS_H_ +#define FORTRAN_DEFAULT_OUTPUT_UNIT 6 +#define FORTRAN_DEFAULT_INPUT_UNIT 5 +#define FORTRAN_ERROR_UNIT 0 + #define FORTRAN_RUNTIME_IOSTAT_END (-1) #define FORTRAN_RUNTIME_IOSTAT_EOR (-2) #define FORTRAN_RUNTIME_IOSTAT_FLUSH (-3) diff --git a/flang/lib/Lower/IO.cpp b/flang/lib/Lower/IO.cpp index a60ca92a8733e..3933ebeb9b3cc 100644 --- a/flang/lib/Lower/IO.cpp +++ b/flang/lib/Lower/IO.cpp @@ -1850,24 +1850,25 @@ static mlir::Value genIOUnit(Fortran::lower::AbstractConverter &converter, mlir::Location loc, const Fortran::parser::IoUnit *iounit, mlir::Type ty, ConditionSpecInfo &csi, - Fortran::lower::StatementContext &stmtCtx) { + Fortran::lower::StatementContext &stmtCtx, + int defaultUnitNumber) { auto &builder = converter.getFirOpBuilder(); if (iounit) if (auto *e = std::get_if(&iounit->u)) return genIOUnitNumber(converter, loc, Fortran::semantics::GetExpr(*e), ty, csi, stmtCtx); return builder.create( - loc, builder.getIntegerAttr(ty, Fortran::runtime::io::DefaultUnit)); + loc, builder.getIntegerAttr(ty, defaultUnitNumber)); } template -static mlir::Value getIOUnit(Fortran::lower::AbstractConverter &converter, - mlir::Location loc, const A &stmt, mlir::Type ty, - ConditionSpecInfo &csi, - Fortran::lower::StatementContext &stmtCtx) { +static mlir::Value +getIOUnit(Fortran::lower::AbstractConverter &converter, mlir::Location loc, + const A &stmt, mlir::Type ty, ConditionSpecInfo &csi, + Fortran::lower::StatementContext &stmtCtx, int defaultUnitNumber) { const Fortran::parser::IoUnit *iounit = stmt.iounit ? &*stmt.iounit : getIOControl(stmt); - return genIOUnit(converter, loc, iounit, ty, csi, stmtCtx); + return genIOUnit(converter, loc, iounit, ty, csi, stmtCtx, defaultUnitNumber); } //===----------------------------------------------------------------------===// // Generators for each IO statement type. @@ -2091,7 +2092,7 @@ getBeginDataTransferFunc(mlir::Location loc, fir::FirOpBuilder &builder, } /// Generate the arguments of a begin data transfer statement call. -template +template void genBeginDataTransferCallArgs( llvm::SmallVectorImpl &ioArgs, Fortran::lower::AbstractConverter &converter, mlir::Location loc, @@ -2149,14 +2150,14 @@ void genBeginDataTransferCallArgs( TODO(loc, "asynchronous"); maybeGetFormatArgs(); ioArgs.push_back(getIOUnit(converter, loc, stmt, - ioFuncTy.getInput(ioArgs.size()), csi, - stmtCtx)); + ioFuncTy.getInput(ioArgs.size()), csi, stmtCtx, + defaultUnitNumber)); } } else { // PRINT - maybe explicit format; default unit maybeGetFormatArgs(); ioArgs.push_back(builder.create( loc, builder.getIntegerAttr(ioFuncTy.getInput(ioArgs.size()), - Fortran::runtime::io::DefaultUnit))); + defaultUnitNumber))); } // File name and line number are always the last two arguments. ioArgs.push_back( @@ -2193,7 +2194,9 @@ genDataTransferStmt(Fortran::lower::AbstractConverter &converter, loc, builder, isFormatted, isList || isNml, isInternal, isInternalWithDesc, isAsync); llvm::SmallVector ioArgs; - genBeginDataTransferCallArgs( + genBeginDataTransferCallArgs< + hasIOCtrl, isInput ? 
Fortran::runtime::io::DefaultInputUnit + : Fortran::runtime::io::DefaultOutputUnit>( ioArgs, converter, loc, stmt, ioFunc.getFunctionType(), isFormatted, isList || isNml, isInternal, isAsync, descRef, csi, stmtCtx); mlir::Value cookie = diff --git a/flang/module/iso_fortran_env.f90 b/flang/module/iso_fortran_env.f90 index 61d8a07e61133..6ee153592e1c6 100644 --- a/flang/module/iso_fortran_env.f90 +++ b/flang/module/iso_fortran_env.f90 @@ -9,7 +9,7 @@ ! See Fortran 2018, clause 16.10.2 ! TODO: These are placeholder values so that some tests can be run. -include '../include/flang/Runtime/magic-numbers.h' ! IOSTAT values +include '../include/flang/Runtime/magic-numbers.h' module iso_fortran_env @@ -130,8 +130,9 @@ module iso_fortran_env integer, parameter :: current_team = -1, initial_team = -2, parent_team = -3 - integer, parameter :: input_unit = 5, output_unit = 6 - integer, parameter :: error_unit = 0 + integer, parameter :: output_unit = FORTRAN_DEFAULT_OUTPUT_UNIT + integer, parameter :: input_unit = FORTRAN_DEFAULT_INPUT_UNIT + integer, parameter :: error_unit = FORTRAN_ERROR_UNIT integer, parameter :: iostat_end = FORTRAN_RUNTIME_IOSTAT_END integer, parameter :: iostat_eor = FORTRAN_RUNTIME_IOSTAT_EOR integer, parameter :: iostat_inquire_internal_unit = & diff --git a/flang/runtime/io-api.cpp b/flang/runtime/io-api.cpp index 9a69a26246412..79d43c7cc884f 100644 --- a/flang/runtime/io-api.cpp +++ b/flang/runtime/io-api.cpp @@ -185,9 +185,6 @@ template class STATE, typename... A> Cookie BeginExternalListIO( int unitNumber, const char *sourceFile, int sourceLine, A &&...xs) { Terminator terminator{sourceFile, sourceLine}; - if (unitNumber == DefaultUnit) { - unitNumber = DIR == Direction::Input ? 5 : 6; - } Cookie errorCookie{nullptr}; ExternalFileUnit *unit{GetOrCreateUnit( unitNumber, DIR, false /*!unformatted*/, terminator, errorCookie)}; @@ -246,9 +243,6 @@ Cookie BeginExternalFormattedIO(const char *format, std::size_t formatLength, const Descriptor *formatDescriptor, ExternalUnit unitNumber, const char *sourceFile, int sourceLine) { Terminator terminator{sourceFile, sourceLine}; - if (unitNumber == DefaultUnit) { - unitNumber = DIR == Direction::Input ? 
5 : 6; - } Cookie errorCookie{nullptr}; ExternalFileUnit *unit{GetOrCreateUnit( unitNumber, DIR, false /*!unformatted*/, terminator, errorCookie)}; @@ -761,7 +755,8 @@ bool IONAME(SetAccess)(Cookie cookie, const char *keyword, std::size_t length) { IoStatementState &io{*cookie}; auto *open{io.get_if()}; if (!open) { - if (!io.get_if()) { + if (!io.get_if() && + !io.get_if()) { io.GetIoErrorHandler().Crash( "SetAccess() called when not in an OPEN statement"); } @@ -796,7 +791,8 @@ bool IONAME(SetAction)(Cookie cookie, const char *keyword, std::size_t length) { IoStatementState &io{*cookie}; auto *open{io.get_if()}; if (!open) { - if (!io.get_if()) { + if (!io.get_if() && + !io.get_if()) { io.GetIoErrorHandler().Crash( "SetAction() called when not in an OPEN statement"); } @@ -852,7 +848,8 @@ bool IONAME(SetAsynchronous)( handler.SignalError(IostatBadAsynchronous); } } - } else if (!io.get_if()) { + } else if (!io.get_if() && + !io.get_if()) { handler.Crash("SetAsynchronous() called when not in an OPEN or external " "I/O statement"); } @@ -864,7 +861,8 @@ bool IONAME(SetCarriagecontrol)( IoStatementState &io{*cookie}; auto *open{io.get_if()}; if (!open) { - if (!io.get_if()) { + if (!io.get_if() && + !io.get_if()) { io.GetIoErrorHandler().Crash( "SetCarriageControl() called when not in an OPEN statement"); } @@ -895,7 +893,8 @@ bool IONAME(SetConvert)( IoStatementState &io{*cookie}; auto *open{io.get_if()}; if (!open) { - if (!io.get_if()) { + if (!io.get_if() && + !io.get_if()) { io.GetIoErrorHandler().Crash( "SetConvert() called when not in an OPEN statement"); } @@ -919,7 +918,8 @@ bool IONAME(SetEncoding)( IoStatementState &io{*cookie}; auto *open{io.get_if()}; if (!open) { - if (!io.get_if()) { + if (!io.get_if() && + !io.get_if()) { io.GetIoErrorHandler().Crash( "SetEncoding() called when not in an OPEN statement"); } @@ -949,7 +949,8 @@ bool IONAME(SetForm)(Cookie cookie, const char *keyword, std::size_t length) { IoStatementState &io{*cookie}; auto *open{io.get_if()}; if (!open) { - if (!io.get_if()) { + if (!io.get_if() && + !io.get_if()) { io.GetIoErrorHandler().Crash( "SetForm() called when not in an OPEN statement"); } @@ -977,7 +978,8 @@ bool IONAME(SetPosition)( IoStatementState &io{*cookie}; auto *open{io.get_if()}; if (!open) { - if (!io.get_if()) { + if (!io.get_if() && + !io.get_if()) { io.GetIoErrorHandler().Crash( "SetPosition() called when not in an OPEN statement"); } @@ -1008,7 +1010,8 @@ bool IONAME(SetRecl)(Cookie cookie, std::size_t n) { IoStatementState &io{*cookie}; auto *open{io.get_if()}; if (!open) { - if (!io.get_if()) { + if (!io.get_if() && + !io.get_if()) { io.GetIoErrorHandler().Crash( "SetRecl() called when not in an OPEN statement"); } @@ -1093,7 +1096,8 @@ bool IONAME(SetFile)(Cookie cookie, const char *path, std::size_t chars) { } open->set_path(path, chars); return true; - } else if (!io.get_if()) { + } else if (!io.get_if() && + !io.get_if()) { io.GetIoErrorHandler().Crash( "SetFile() called when not in an OPEN statement"); } @@ -1104,7 +1108,8 @@ bool IONAME(GetNewUnit)(Cookie cookie, int &unit, int kind) { IoStatementState &io{*cookie}; auto *open{io.get_if()}; if (!open) { - if (!io.get_if()) { + if (!io.get_if() && + !io.get_if()) { io.GetIoErrorHandler().Crash( "GetNewUnit() called when not in an OPEN statement"); } @@ -1361,7 +1366,8 @@ std::size_t IONAME(GetSize)(Cookie cookie) { if (const auto *formatted{ io.get_if>()}) { return formatted->GetEditDescriptorChars(); - } else if (!io.get_if()) { + } else if (!io.get_if() && + !io.get_if()) { 
handler.Crash("GetIoSize() called for an I/O statement that is not a " "formatted READ()"); } @@ -1376,7 +1382,8 @@ std::size_t IONAME(GetIoLength)(Cookie cookie) { } if (const auto *inq{io.get_if()}) { return inq->bytes(); - } else if (!io.get_if()) { + } else if (!io.get_if() && + !io.get_if()) { handler.Crash("GetIoLength() called for an I/O statement that is not " "INQUIRE(IOLENGTH=)"); } diff --git a/flang/runtime/unit.cpp b/flang/runtime/unit.cpp index e4f346ae941f3..18590567c65eb 100644 --- a/flang/runtime/unit.cpp +++ b/flang/runtime/unit.cpp @@ -11,6 +11,7 @@ #include "lock.h" #include "tools.h" #include "unit-map.h" +#include "flang/Runtime/magic-numbers.h" #include #include #include @@ -220,21 +221,24 @@ UnitMap &ExternalFileUnit::CreateUnitMap() { UnitMap &newUnitMap{*New{terminator}().release()}; bool wasExtant{false}; - ExternalFileUnit &out{*newUnitMap.LookUpOrCreate(6, terminator, wasExtant)}; + ExternalFileUnit &out{*newUnitMap.LookUpOrCreate( + FORTRAN_DEFAULT_OUTPUT_UNIT, terminator, wasExtant)}; RUNTIME_CHECK(terminator, !wasExtant); out.Predefine(1); handler.SignalError(out.SetDirection(Direction::Output)); out.isUnformatted = false; defaultOutput = &out; - ExternalFileUnit &in{*newUnitMap.LookUpOrCreate(5, terminator, wasExtant)}; + ExternalFileUnit &in{*newUnitMap.LookUpOrCreate( + FORTRAN_DEFAULT_INPUT_UNIT, terminator, wasExtant)}; RUNTIME_CHECK(terminator, !wasExtant); in.Predefine(0); handler.SignalError(in.SetDirection(Direction::Input)); in.isUnformatted = false; defaultInput = ∈ - ExternalFileUnit &error{*newUnitMap.LookUpOrCreate(0, terminator, wasExtant)}; + ExternalFileUnit &error{ + *newUnitMap.LookUpOrCreate(FORTRAN_ERROR_UNIT, terminator, wasExtant)}; RUNTIME_CHECK(terminator, !wasExtant); error.Predefine(2); handler.SignalError(error.SetDirection(Direction::Output)); diff --git a/flang/test/Lower/HLFIR/calls-f77.f90 b/flang/test/Lower/HLFIR/calls-f77.f90 index 09eebaae76060..ac5be007eb838 100644 --- a/flang/test/Lower/HLFIR/calls-f77.f90 +++ b/flang/test/Lower/HLFIR/calls-f77.f90 @@ -156,7 +156,7 @@ subroutine return_char(n) end subroutine ! CHECK-LABEL: func.func @_QPreturn_char( ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare {{.*}}n -! CHECK: %[[VAL_2:.*]] = arith.constant -1 : i32 +! CHECK: %[[VAL_2:.*]] = arith.constant 6 : i32 ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i64) -> index ! CHECK: %[[VAL_9:.*]] = arith.constant 0 : index diff --git a/flang/test/Lower/HLFIR/convert-mbox-to-value.f90 b/flang/test/Lower/HLFIR/convert-mbox-to-value.f90 index cb9dd2fa7956f..b943cd3225a56 100644 --- a/flang/test/Lower/HLFIR/convert-mbox-to-value.f90 +++ b/flang/test/Lower/HLFIR/convert-mbox-to-value.f90 @@ -8,7 +8,7 @@ end subroutine test_int_allocatable ! CHECK-LABEL: func.func @_QPtest_int_allocatable( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>> {fir.bindc_name = "a"}) { ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest_int_allocatableEa"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) -! CHECK: %[[VAL_2:.*]] = arith.constant -1 : i32 +! CHECK: %[[VAL_2:.*]] = arith.constant 6 : i32 ! CHECK: %[[VAL_3:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref> ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ref>) -> !fir.ref ! CHECK: %[[VAL_5:.*]] = arith.constant {{[0-9]*}} : i32 @@ -28,7 +28,7 @@ end subroutine test_int_pointer ! CHECK-LABEL: func.func @_QPtest_int_pointer( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>> {fir.bindc_name = "p"}) { ! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest_int_pointerEp"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) -! CHECK: %[[VAL_2:.*]] = arith.constant -1 : i32 +! CHECK: %[[VAL_2:.*]] = arith.constant 6 : i32 ! CHECK: %[[VAL_3:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref> ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ref>) -> !fir.ref ! CHECK: %[[VAL_5:.*]] = arith.constant {{[0-9]*}} : i32 diff --git a/flang/test/Lower/OpenMP/FIR/parallel-lastprivate-clause-scalar.f90 b/flang/test/Lower/OpenMP/FIR/parallel-lastprivate-clause-scalar.f90 index 148a7ee31f08d..2060e2062c1a3 100644 --- a/flang/test/Lower/OpenMP/FIR/parallel-lastprivate-clause-scalar.f90 +++ b/flang/test/Lower/OpenMP/FIR/parallel-lastprivate-clause-scalar.f90 @@ -13,11 +13,11 @@ ! Check that we are accessing the clone inside the loop !CHECK-DAG: omp.wsloop for (%[[INDX_WS:.*]]) : {{.*}} { -!CHECK-DAG: %[[NEG_ONE:.*]] = arith.constant -1 : i32 +!CHECK-DAG: %[[UNIT:.*]] = arith.constant 6 : i32 !CHECK-NEXT: %[[ADDR:.*]] = fir.address_of(@_QQclX !CHECK-NEXT: %[[CVT0:.*]] = fir.convert %[[ADDR]] !CHECK-NEXT: %[[CNST:.*]] = arith.constant -!CHECK-NEXT: %[[CALL_BEGIN_IO:.*]] = fir.call @_FortranAioBeginExternalListOutput(%[[NEG_ONE]], %[[CVT0]], %[[CNST]]) {{.*}}: (i32, !fir.ref, i32) -> !fir.ref +!CHECK-NEXT: %[[CALL_BEGIN_IO:.*]] = fir.call @_FortranAioBeginExternalListOutput(%[[UNIT]], %[[CVT0]], %[[CNST]]) {{.*}}: (i32, !fir.ref, i32) -> !fir.ref !CHECK-NEXT: %[[CVT_0_1:.*]] = fir.convert %[[ARG1_PVT]] !CHECK-NEXT: %[[CVT_0_2:.*]] = fir.convert %[[FIVE]] !CHECK-NEXT: %[[CALL_OP_ASCII:.*]] = fir.call @_FortranAioOutputAscii(%[[CALL_BEGIN_IO]], %[[CVT_0_1]], %[[CVT_0_2]]) diff --git a/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90 b/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90 index e6ea5b5c4051a..28f59c95d60bb 100644 --- a/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90 +++ b/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90 @@ -15,11 +15,11 @@ ! Check that we are accessing the clone inside the loop !CHECK-DAG: omp.wsloop for (%[[INDX_WS:.*]]) : {{.*}} { -!CHECK-DAG: %[[NEG_ONE:.*]] = arith.constant -1 : i32 +!CHECK-DAG: %[[UNIT:.*]] = arith.constant 6 : i32 !CHECK-NEXT: %[[ADDR:.*]] = fir.address_of(@_QQclX !CHECK-NEXT: %[[CVT0:.*]] = fir.convert %[[ADDR]] !CHECK-NEXT: %[[CNST:.*]] = arith.constant -!CHECK-NEXT: %[[CALL_BEGIN_IO:.*]] = fir.call @_FortranAioBeginExternalListOutput(%[[NEG_ONE]], %[[CVT0]], %[[CNST]]) {{.*}}: (i32, !fir.ref, i32) -> !fir.ref +!CHECK-NEXT: %[[CALL_BEGIN_IO:.*]] = fir.call @_FortranAioBeginExternalListOutput(%[[UNIT]], %[[CVT0]], %[[CNST]]) {{.*}}: (i32, !fir.ref, i32) -> !fir.ref !CHECK-NEXT: %[[CVT_0_1:.*]] = fir.convert %[[ARG1_PVT_DECL]]#1 !CHECK-NEXT: %[[CVT_0_2:.*]] = fir.convert %[[FIVE]] !CHECK-NEXT: %[[CALL_OP_ASCII:.*]] = fir.call @_FortranAioOutputAscii(%[[CALL_BEGIN_IO]], %[[CVT_0_1]], %[[CVT_0_2]]) diff --git a/flang/test/Lower/array-character.f90 b/flang/test/Lower/array-character.f90 index ee01589f802ec..c93ef4be30823 100644 --- a/flang/test/Lower/array-character.f90 +++ b/flang/test/Lower/array-character.f90 @@ -57,7 +57,7 @@ subroutine issue(c1, c2) program p ! CHECK-DAG: %[[VAL_0:.*]] = arith.constant 4 : index ! CHECK-DAG: %[[VAL_1:.*]] = arith.constant 3 : index - ! CHECK-DAG: %[[VAL_2:.*]] = arith.constant -1 : i32 + ! CHECK-DAG: %[[VAL_2:.*]] = arith.constant 6 : i32 ! 
CHECK: %[[VAL_5:.*]] = fir.alloca !fir.array<3x!fir.char<1,4>> {bindc_name = "c1", uniq_name = "_QFEc1"} ! CHECK: %[[VAL_6:.*]] = fir.address_of(@_QFEc2) : !fir.ref>> ! CHECK: %[[VAL_7:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref) -> (!fir.ref>, index) ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]]#0 : (!fir.ref>) -> !fir.ref>> diff --git a/flang/test/Lower/array-expression.f90 b/flang/test/Lower/array-expression.f90 index f73cd6e5f4f17..75789cd6952ab 100644 --- a/flang/test/Lower/array-expression.f90 +++ b/flang/test/Lower/array-expression.f90 @@ -1113,7 +1113,7 @@ end subroutine test19h ! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]]#0 : (!fir.ref>) -> !fir.ref>> ! CHECK: %[[VAL_7:.*]] = arith.constant 2 : index ! CHECK: %[[VAL_8:.*]] = arith.constant 10 : index -! CHECK: %[[VAL_9:.*]] = arith.constant -1 : i32 +! CHECK: %[[VAL_9:.*]] = arith.constant 6 : i32 ! CHECK: %[[VAL_10:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref>) -> !fir.ref ! CHECK: %[[VAL_12:.*]] = arith.constant {{.*}} : i32 diff --git a/flang/test/Lower/array-temp.f90 b/flang/test/Lower/array-temp.f90 index f8c2ec3e03c54..971b3506fbe34 100644 --- a/flang/test/Lower/array-temp.f90 +++ b/flang/test/Lower/array-temp.f90 @@ -47,7 +47,7 @@ subroutine ss4(N) ! CHECK: %[[C_2:[-0-9a-z_]+]] = arith.constant 2 : index ! CHECK: %[[C_1:[-0-9a-z_]+]] = arith.constant 1 : index ! CHECK: %[[C_27_i32:[-0-9a-z_]+]] = arith.constant 27 : i32 -! CHECK: %[[C_m1_i32:[-0-9a-z_]+]] = arith.constant -1 : i32 +! CHECK: %[[C_6_i32:[-0-9a-z_]+]] = arith.constant 6 : i32 ! CHECK: %[[C_st:[-0-9a-z_]+]] = arith.constant 7.000000e+00 : f32 ! CHECK: %[[C_1_i32:[-0-9a-z_]+]] = arith.constant 1 : i32 ! CHECK: %[[C_st_0:[-0-9a-z_]+]] = arith.constant -2.000000e+00 : f32 @@ -118,7 +118,7 @@ subroutine ss4(N) ! CHECK: cf.br ^bb9(%[[V_42]], %[[V_46:[0-9]+]] : index, index) ! CHECK: ^bb11: // pred: ^bb9 ! CHECK: fir.freemem %[[V_18:[0-9]+]] : !fir.heap> -! CHECK: %[[V_49:[0-9]+]] = fir.call @_FortranAioBeginExternalListOutput(%[[C_m1_i32]], %{{.*}}, %{{.*}}) {{.*}}: (i32, !fir.ref, i32) -> !fir.ref +! CHECK: %[[V_49:[0-9]+]] = fir.call @_FortranAioBeginExternalListOutput(%[[C_6_i32]], %{{.*}}, %{{.*}}) {{.*}}: (i32, !fir.ref, i32) -> !fir.ref ! CHECK: %[[V_50:[0-9]+]] = fir.slice %[[C_1]], %[[C_2]], %[[C_1]] : (index, index, index) -> !fir.slice<1> ! CHECK: %[[V_51:[0-9]+]] = fir.embox %[[V_4]](%[[V_5]]) [%[[V_50]]] : (!fir.ref>, !fir.shape<1>, !fir.slice<1>) -> !fir.box> ! CHECK: %[[V_52:[0-9]+]] = fir.convert %[[V_51:[0-9]+]] : (!fir.box>) -> !fir.box @@ -141,7 +141,7 @@ subroutine ss4(N) ! CHECK: %[[C_2:[-0-9a-z_]+]] = arith.constant 2 : index ! CHECK: %[[C_1:[-0-9a-z_]+]] = arith.constant 1 : index ! CHECK: %[[C_34_i32:[-0-9a-z_]+]] = arith.constant 34 : i32 -! CHECK: %[[C_m1_i32:[-0-9a-z_]+]] = arith.constant -1 : i32 +! CHECK: %[[C_6_i32:[-0-9a-z_]+]] = arith.constant 6 : i32 ! CHECK: %[[C_st:[-0-9a-z_]+]] = arith.constant 7.000000e+00 : f32 ! CHECK: %[[C_1_i32:[-0-9a-z_]+]] = arith.constant 1 : i32 ! CHECK: %[[C_st_0:[-0-9a-z_]+]] = arith.constant -2.000000e+00 : f32 @@ -244,7 +244,7 @@ subroutine ss4(N) ! CHECK: cf.br ^bb15(%[[V_69]], %[[V_70:[0-9]+]] : index, index) ! CHECK: ^bb19: // pred: ^bb15 ! CHECK: fir.freemem %[[V_24:[0-9]+]] : !fir.heap> -! CHECK: %[[V_73:[0-9]+]] = fir.call @_FortranAioBeginExternalListOutput(%[[C_m1_i32]], %{{.*}}, %{{.*}}) {{.*}}: (i32, !fir.ref, i32) -> !fir.ref +! CHECK: %[[V_73:[0-9]+]] = fir.call @_FortranAioBeginExternalListOutput(%[[C_6_i32]], %{{.*}}, %{{.*}}) {{.*}}: (i32, !fir.ref, i32) -> !fir.ref ! 
CHECK: %[[V_74:[0-9]+]] = fir.slice %[[C_1]], %[[C_2]], %[[C_1]], %[[C_1]], %[[C_2]], %[[C_1]] : (index, index, index, index, index, index) -> !fir.slice<2> ! CHECK: %[[V_75:[0-9]+]] = fir.embox %[[V_4]](%[[V_5]]) [%[[V_74]]] : (!fir.ref>, !fir.shape<2>, !fir.slice<2>) -> !fir.box> ! CHECK: %[[V_76:[0-9]+]] = fir.convert %[[V_75:[0-9]+]] : (!fir.box>) -> !fir.box @@ -267,7 +267,7 @@ subroutine ss4(N) ! CHECK: %[[C_m1:[-0-9a-z_]+]] = arith.constant -1 : index ! CHECK: %[[C_1:[-0-9a-z_]+]] = arith.constant 1 : index ! CHECK: %[[C_41_i32:[-0-9a-z_]+]] = arith.constant 41 : i32 -! CHECK: %[[C_m1_i32:[-0-9a-z_]+]] = arith.constant -1 : i32 +! CHECK: %[[C_6_i32:[-0-9a-z_]+]] = arith.constant 6 : i32 ! CHECK: %[[C_st:[-0-9a-z_]+]] = arith.constant 7.000000e+00 : f32 ! CHECK: %[[C_1_i32:[-0-9a-z_]+]] = arith.constant 1 : i32 ! CHECK: %[[C_st_0:[-0-9a-z_]+]] = arith.constant -2.000000e+00 : f32 @@ -370,7 +370,7 @@ subroutine ss4(N) ! CHECK: cf.br ^bb15(%[[V_69]], %[[V_70:[0-9]+]] : index, index) ! CHECK: ^bb19: // pred: ^bb15 ! CHECK: fir.freemem %[[V_24:[0-9]+]] : !fir.heap> -! CHECK: %[[V_73:[0-9]+]] = fir.call @_FortranAioBeginExternalListOutput(%[[C_m1_i32]], %{{.*}}, %{{.*}}) {{.*}}: (i32, !fir.ref, i32) -> !fir.ref +! CHECK: %[[V_73:[0-9]+]] = fir.call @_FortranAioBeginExternalListOutput(%[[C_6_i32]], %{{.*}}, %{{.*}}) {{.*}}: (i32, !fir.ref, i32) -> !fir.ref ! CHECK: %[[V_74:[0-9]+]] = fir.slice %[[C_1]], %[[C_2]], %[[C_1]], %[[C_1]], %[[C_2]], %[[C_1]] : (index, index, index, index, index, index) -> !fir.slice<2> ! CHECK: %[[V_75:[0-9]+]] = fir.embox %[[V_4]](%[[V_5]]) [%[[V_74]]] : (!fir.ref>, !fir.shape<2>, !fir.slice<2>) -> !fir.box> ! CHECK: %[[V_76:[0-9]+]] = fir.convert %[[V_75:[0-9]+]] : (!fir.box>) -> !fir.box diff --git a/flang/test/Lower/host-associated.f90 b/flang/test/Lower/host-associated.f90 index 073493d7fe28a..25e637805e872 100644 --- a/flang/test/Lower/host-associated.f90 +++ b/flang/test/Lower/host-associated.f90 @@ -478,7 +478,7 @@ end subroutine test_proc_dummy_other ! CHECK-DAG: %[[VAL_3:.*]] = arith.constant false ! CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index ! CHECK-DAG: %[[VAL_5:.*]] = arith.constant 32 : i8 -! CHECK-DAG: %[[VAL_6:.*]] = arith.constant -1 : i32 +! CHECK-DAG: %[[VAL_6:.*]] = arith.constant 6 : i32 ! CHECK-DAG: %[[VAL_8:.*]] = arith.constant 10 : i64 ! CHECK-DAG: %[[VAL_9:.*]] = arith.constant 40 : index ! CHECK-DAG: %[[VAL_10:.*]] = arith.constant 0 : index diff --git a/flang/test/Lower/io-statement-2.f90 b/flang/test/Lower/io-statement-2.f90 index 108c64344f30c..dcf7387f8a22d 100644 --- a/flang/test/Lower/io-statement-2.f90 +++ b/flang/test/Lower/io-statement-2.f90 @@ -159,11 +159,11 @@ subroutine loopnest ! CHECK-LABEL: func @_QPimpliedformat subroutine impliedformat - ! CHECK: BeginExternalListInput(%c-1 + ! CHECK: BeginExternalListInput ! CHECK: InputReal32 ! CHECK: EndIoStatement(%3) {{.*}}: (!fir.ref) -> i32 read*, x - ! CHECK: BeginExternalListOutput(%c-1 + ! CHECK: BeginExternalListOutput ! CHECK: OutputReal32 ! CHECK: EndIoStatement print*, x diff --git a/flang/test/Lower/vector-subscript-io.f90 b/flang/test/Lower/vector-subscript-io.f90 index 9030bebd4ac4a..d298609269dae 100644 --- a/flang/test/Lower/vector-subscript-io.f90 +++ b/flang/test/Lower/vector-subscript-io.f90 @@ -9,7 +9,7 @@ subroutine simple(x, y) integer :: x(10) read(*,*) x(y) ! CHECK-DAG: %[[VAL_0:.*]] = arith.constant 10 : index -! CHECK-DAG: %[[VAL_1:.*]] = arith.constant -1 : i32 +! CHECK-DAG: %[[VAL_1:.*]] = arith.constant 5 : i32 ! 
CHECK-DAG: %[[VAL_3:.*]] = arith.constant 4 : i32 ! CHECK-DAG: %[[VAL_4:.*]] = arith.constant 3 : index ! CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index @@ -51,7 +51,7 @@ integer function get_substcript() real :: x(:, :) ! Test subscripts are only evaluated once. read(*,*) x(get_substcript(), get_vector()) -! CHECK-DAG: %[[VAL_26:.*]] = arith.constant -1 : i32 +! CHECK-DAG: %[[VAL_26:.*]] = arith.constant 5 : i32 ! CHECK-DAG: %[[VAL_28:.*]] = arith.constant 0 : i64 ! CHECK-DAG: %[[VAL_29:.*]] = arith.constant 0 : index ! CHECK-DAG: %[[VAL_30:.*]] = arith.constant 1 : index @@ -102,7 +102,7 @@ subroutine with_assumed_shapes(x, y) integer :: y(:) integer :: x(:) read(*,*) x(y) -! CHECK-DAG: %[[VAL_60:.*]] = arith.constant -1 : i32 +! CHECK-DAG: %[[VAL_60:.*]] = arith.constant 5 : i32 ! CHECK-DAG: %[[VAL_62:.*]] = arith.constant 4 : i32 ! CHECK-DAG: %[[VAL_63:.*]] = arith.constant 0 : index ! CHECK-DAG: %[[VAL_64:.*]] = arith.constant 1 : index @@ -138,7 +138,7 @@ subroutine lower_bounds(x, y) read(*,*) x(3, y) ! CHECK-DAG: %[[VAL_84:.*]] = arith.constant 4 : index ! CHECK-DAG: %[[VAL_85:.*]] = arith.constant 6 : index -! CHECK-DAG: %[[VAL_86:.*]] = arith.constant -1 : i32 +! CHECK-DAG: %[[VAL_86:.*]] = arith.constant 5 : i32 ! CHECK-DAG: %[[VAL_88:.*]] = arith.constant 3 : i64 ! CHECK-DAG: %[[VAL_89:.*]] = arith.constant 2 : index ! CHECK-DAG: %[[VAL_90:.*]] = arith.constant 4 : i32 @@ -177,7 +177,7 @@ subroutine two_vectors(x, y1, y2) real :: x(4, 4) read(*,*) x(y1, y2) ! CHECK-DAG: %[[VAL_114:.*]] = arith.constant 4 : index -! CHECK-DAG: %[[VAL_115:.*]] = arith.constant -1 : i32 +! CHECK-DAG: %[[VAL_115:.*]] = arith.constant 5 : i32 ! CHECK-DAG: %[[VAL_117:.*]] = arith.constant 3 : index ! CHECK-DAG: %[[VAL_118:.*]] = arith.constant 0 : index ! CHECK-DAG: %[[VAL_119:.*]] = arith.constant 1 : index @@ -220,7 +220,7 @@ subroutine triplets_and_vector(x, y) integer :: y(3) complex :: x(4, 4) read(*,*) x(1:4:2, y) -! CHECK-DAG: %[[VAL_147:.*]] = arith.constant -1 : i32 +! CHECK-DAG: %[[VAL_147:.*]] = arith.constant 5 : i32 ! CHECK-DAG: %[[VAL_149:.*]] = arith.constant 4 : index ! CHECK-DAG: %[[VAL_150:.*]] = arith.constant 3 : index ! CHECK-DAG: %[[VAL_151:.*]] = arith.constant 2 : index @@ -264,7 +264,7 @@ subroutine simple_char(x, y) character(*) :: x(3:8) read(*,*) x(y) ! CHECK-DAG: %[[VAL_178:.*]] = arith.constant 6 : index -! CHECK-DAG: %[[VAL_179:.*]] = arith.constant -1 : i32 +! CHECK-DAG: %[[VAL_179:.*]] = arith.constant 5 : i32 ! CHECK-DAG: %[[VAL_181:.*]] = arith.constant 3 : index ! CHECK-DAG: %[[VAL_182:.*]] = arith.constant 0 : index ! CHECK-DAG: %[[VAL_183:.*]] = arith.constant 1 : index @@ -301,7 +301,7 @@ subroutine substring(x, y, i, j) integer :: y(3), i, j character(*) :: x(:) read(*,*) x(y)(i:j) -! CHECK-DAG: %[[VAL_206:.*]] = arith.constant -1 : i32 +! CHECK-DAG: %[[VAL_206:.*]] = arith.constant 5 : i32 ! CHECK-DAG: %[[VAL_208:.*]] = arith.constant 3 : index ! CHECK-DAG: %[[VAL_209:.*]] = arith.constant 0 : index ! CHECK-DAG: %[[VAL_210:.*]] = arith.constant 1 : index @@ -347,7 +347,7 @@ subroutine complex_part(z, y) integer :: y(:) complex :: z(:) read(*,*) z(y)%IM -! CHECK-DAG: %[[VAL_244:.*]] = arith.constant -1 : i32 +! CHECK-DAG: %[[VAL_244:.*]] = arith.constant 5 : i32 ! CHECK-DAG: %[[VAL_246:.*]] = arith.constant 1 : i32 ! CHECK-DAG: %[[VAL_247:.*]] = arith.constant 0 : index ! CHECK-DAG: %[[VAL_248:.*]] = arith.constant 1 : index @@ -392,7 +392,7 @@ subroutine simple_derived(x, y) type(t) :: x(3:8) read(*,*) x(y) ! 
CHECK-DAG: %[[VAL_267:.*]] = arith.constant 6 : index -! CHECK-DAG: %[[VAL_268:.*]] = arith.constant -1 : i32 +! CHECK-DAG: %[[VAL_268:.*]] = arith.constant 5 : i32 ! CHECK-DAG: %[[VAL_270:.*]] = arith.constant 3 : index ! CHECK-DAG: %[[VAL_271:.*]] = arith.constant 4 : index ! CHECK-DAG: %[[VAL_272:.*]] = arith.constant 0 : index @@ -430,7 +430,7 @@ subroutine with_path(b, i) integer :: i(:) read (*, *) b(5, i, 8:9:1)%a(4,5)%i ! CHECK-DAG: %[[VAL_294:.*]] = arith.constant 4 : index -! CHECK-DAG: %[[VAL_295:.*]] = arith.constant -1 : i32 +! CHECK-DAG: %[[VAL_295:.*]] = arith.constant 5 : i32 ! CHECK-DAG: %[[VAL_297:.*]] = arith.constant 8 : index ! CHECK-DAG: %[[VAL_298:.*]] = arith.constant 9 : index ! CHECK-DAG: %[[VAL_299:.*]] = arith.constant 4 : i64 @@ -481,7 +481,7 @@ subroutine simple_iostat(x, y, j, stat) integer :: j, y(:), stat real :: x(:) read(*, *, iostat=stat) x(y), j -! CHECK-DAG: %[[VAL_334:.*]] = arith.constant -1 : i32 +! CHECK-DAG: %[[VAL_334:.*]] = arith.constant 5 : i32 ! CHECK-DAG: %[[VAL_336:.*]] = arith.constant false ! CHECK-DAG: %[[VAL_337:.*]] = arith.constant true ! CHECK-DAG: %[[VAL_338:.*]] = arith.constant 1 : index @@ -527,7 +527,7 @@ subroutine iostat_in_io_loop(k, j, stat) integer :: stat read(*, *, iostat=stat) (k(i, j), i=1,3,1) ! CHECK-DAG: %[[VAL_365:.*]] = arith.constant 5 : index -! CHECK-DAG: %[[VAL_366:.*]] = arith.constant -1 : i32 +! CHECK-DAG: %[[VAL_366:.*]] = arith.constant 5 : i32 ! CHECK-DAG: %[[VAL_368:.*]] = arith.constant 3 : index ! CHECK-DAG: %[[VAL_369:.*]] = arith.constant true ! CHECK-DAG: %[[VAL_370:.*]] = arith.constant false From 8fa3184539df441ca325d8b70ae5b573c46d8450 Mon Sep 17 00:00:00 2001 From: Cyndy Ishida Date: Tue, 2 Jan 2024 09:47:38 -0800 Subject: [PATCH 052/313] [TextAPI] Use function_ref where possible, NFCI (#76732) --- llvm/lib/TextAPI/TextStubV5.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/lib/TextAPI/TextStubV5.cpp b/llvm/lib/TextAPI/TextStubV5.cpp index 2f82bc03480b9..aea772dbc4be3 100644 --- a/llvm/lib/TextAPI/TextStubV5.cpp +++ b/llvm/lib/TextAPI/TextStubV5.cpp @@ -201,8 +201,9 @@ Expected getRequiredValue( template Expected getRequiredValue( TBDKey Key, const Object *Obj, - std::function(const Object *, StringRef)> GetValue, - StubT DefaultValue, std::function(JsonT)> Validate) { + std::function(const Object *, StringRef)> const + GetValue, + StubT DefaultValue, function_ref(JsonT)> Validate) { std::optional Val = GetValue(Obj, Keys[Key]); if (!Val) return DefaultValue; @@ -215,7 +216,7 @@ Expected getRequiredValue( } Error collectFromArray(TBDKey Key, const Object *Obj, - std::function Append, + function_ref Append, bool IsRequired = false) { const auto *Values = Obj->getArray(Keys[Key]); if (!Values) { From 9fd03cb6522ac1469512502713bedf8b352e2589 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Tue, 2 Jan 2024 09:49:50 -0800 Subject: [PATCH 053/313] [flang][runtime] Don't prematurely end formatted integer input (#76643) When an input data-list has more items than can be read by a format from the input record (e.g., "(4I5)" reading "1 2"), don't return false from EditIntegerInput() just because nothing was read -- that will prevent later items from being set to zero, as they should be. Return true unless nothing was read and there is some kind of error pending. Fixes llvm-error-tests/Fortran/gfortran/regression/pr478478.f90. 
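For illustration only -- an editor's sketch, not part of the change itself: the
Fortran reproducer below shows the situation described above. The unit number,
the file name 'short.txt', and its single record "1 2" are assumptions made for
the example. With this fix, the trailing list items that the format cannot
satisfy from the short record are treated as blank fields and set to zero,
rather than the READ stopping after the first item.

    ! Hypothetical reproducer; the file name and record contents are assumed.
    program read_short_record
      implicit none
      integer :: n(4)
      n = -1
      open (10, file='short.txt', status='old')  ! 'short.txt' holds one record: "1 2"
      read (10, '(4I5)') n    ! the format wants four fields; the record supplies fewer
      print *, n              ! with this fix the trailing items are set to zero
      close (10)
    end program read_short_record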
--- flang/runtime/edit-input.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/flang/runtime/edit-input.cpp b/flang/runtime/edit-input.cpp index 0c2341a4dfac7..6d4fa588cbf60 100644 --- a/flang/runtime/edit-input.cpp +++ b/flang/runtime/edit-input.cpp @@ -244,7 +244,7 @@ bool EditIntegerInput( if (sign == '-') { value = -value; } - if (any || !io.GetConnectionState().IsAtEOF()) { + if (any || !io.GetIoErrorHandler().InError()) { // The value is stored in the lower order bits on big endian platform. // When memcpy, shift the value to the higher order bit. auto shft{static_cast(sizeof(value.low())) - kind}; @@ -255,8 +255,10 @@ bool EditIntegerInput( } else { std::memcpy(n, &value, kind); // a blank field means zero } + return true; + } else { + return false; } - return any; } // Parses a REAL input number from the input source as a normalized From bf684a97f37b12ccf2c98b007b8222c54c7480f5 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 2 Jan 2024 09:50:06 -0800 Subject: [PATCH 054/313] [RISCV] Don't emit vxrm writes for vnclip(u).wi with shift of 0. (#76578) If there's no shift being performed, the rounding mode doesn't matter. We could do the same for vssra and vssrl, but they are no-ops with a shift of 0 so would be better off being removed earlier. --- .../lib/Target/RISCV/RISCVInsertWriteVXRM.cpp | 14 +++++++++-- .../CodeGen/RISCV/rvv/fpclamptosat_vec.ll | 24 ------------------- llvm/test/CodeGen/RISCV/rvv/trunc-sat-clip.ll | 15 ------------ 3 files changed, 12 insertions(+), 41 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp b/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp index de2227f821923..e487cc8b2e20c 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp @@ -198,13 +198,23 @@ char RISCVInsertWriteVXRM::ID = 0; INITIALIZE_PASS(RISCVInsertWriteVXRM, DEBUG_TYPE, RISCV_INSERT_WRITE_VXRM_NAME, false, false) +static bool ignoresVXRM(const MachineInstr &MI) { + switch (RISCV::getRVVMCOpcode(MI.getOpcode())) { + default: + return false; + case RISCV::VNCLIP_WI: + case RISCV::VNCLIPU_WI: + return MI.getOperand(3).getImm() == 0; + } +} + bool RISCVInsertWriteVXRM::computeVXRMChanges(const MachineBasicBlock &MBB) { BlockData &BBInfo = BlockInfo[MBB.getNumber()]; bool NeedVXRMWrite = false; for (const MachineInstr &MI : MBB) { int VXRMIdx = RISCVII::getVXRMOpNum(MI.getDesc()); - if (VXRMIdx >= 0) { + if (VXRMIdx >= 0 && !ignoresVXRM(MI)) { unsigned NewVXRMImm = MI.getOperand(VXRMIdx).getImm(); if (!BBInfo.VXRMUse.isValid()) @@ -356,7 +366,7 @@ void RISCVInsertWriteVXRM::emitWriteVXRM(MachineBasicBlock &MBB) { for (MachineInstr &MI : MBB) { int VXRMIdx = RISCVII::getVXRMOpNum(MI.getDesc()); - if (VXRMIdx >= 0) { + if (VXRMIdx >= 0 && !ignoresVXRM(MI)) { unsigned NewVXRMImm = MI.getOperand(VXRMIdx).getImm(); if (PendingInsert || !Info.isStatic() || diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll index e1ebf2afda657..f1a82b9e427e2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll @@ -40,7 +40,6 @@ define <2 x i32> @stest_f64i32(<2 x double> %x) { ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-V-NEXT: vfcvt.rtz.x.f.v v8, v8 ; CHECK-V-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-V-NEXT: csrwi vxrm, 0 ; CHECK-V-NEXT: vnclip.wi v8, v8, 0 ; CHECK-V-NEXT: ret entry: @@ -77,7 +76,6 @@ define <2 x i32> @utest_f64i32(<2 x double> %x) { ; 
CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-V-NEXT: vfcvt.rtz.xu.f.v v8, v8 ; CHECK-V-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-V-NEXT: csrwi vxrm, 0 ; CHECK-V-NEXT: vnclipu.wi v8, v8, 0 ; CHECK-V-NEXT: ret entry: @@ -193,7 +191,6 @@ define <4 x i32> @stest_f32i32(<4 x float> %x) { ; CHECK-V: # %bb.0: # %entry ; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-V-NEXT: vfwcvt.rtz.x.f.v v10, v8 -; CHECK-V-NEXT: csrwi vxrm, 0 ; CHECK-V-NEXT: vnclip.wi v8, v10, 0 ; CHECK-V-NEXT: ret entry: @@ -247,7 +244,6 @@ define <4 x i32> @utest_f32i32(<4 x float> %x) { ; CHECK-V: # %bb.0: # %entry ; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-V-NEXT: vfwcvt.rtz.xu.f.v v10, v8 -; CHECK-V-NEXT: csrwi vxrm, 0 ; CHECK-V-NEXT: vnclipu.wi v8, v10, 0 ; CHECK-V-NEXT: ret entry: @@ -497,7 +493,6 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) { ; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload ; CHECK-V-NEXT: vslideup.vi v10, v8, 3 ; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-V-NEXT: csrwi vxrm, 0 ; CHECK-V-NEXT: vnclip.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb ; CHECK-V-NEXT: slli a0, a0, 2 @@ -666,7 +661,6 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) { ; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload ; CHECK-V-NEXT: vslideup.vi v10, v8, 3 ; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-V-NEXT: csrwi vxrm, 0 ; CHECK-V-NEXT: vnclipu.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb ; CHECK-V-NEXT: slli a0, a0, 2 @@ -907,7 +901,6 @@ define <2 x i16> @stest_f64i16(<2 x double> %x) { ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vfncvt.rtz.x.f.w v9, v8 ; CHECK-V-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; CHECK-V-NEXT: csrwi vxrm, 0 ; CHECK-V-NEXT: vnclip.wi v8, v9, 0 ; CHECK-V-NEXT: ret entry: @@ -944,7 +937,6 @@ define <2 x i16> @utest_f64i16(<2 x double> %x) { ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vfncvt.rtz.xu.f.w v9, v8 ; CHECK-V-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; CHECK-V-NEXT: csrwi vxrm, 0 ; CHECK-V-NEXT: vnclipu.wi v8, v9, 0 ; CHECK-V-NEXT: ret entry: @@ -1063,7 +1055,6 @@ define <4 x i16> @stest_f32i16(<4 x float> %x) { ; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-V-NEXT: vfcvt.rtz.x.f.v v8, v8 ; CHECK-V-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-V-NEXT: csrwi vxrm, 0 ; CHECK-V-NEXT: vnclip.wi v8, v8, 0 ; CHECK-V-NEXT: ret entry: @@ -1118,7 +1109,6 @@ define <4 x i16> @utest_f32i16(<4 x float> %x) { ; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-V-NEXT: vfcvt.rtz.xu.f.v v8, v8 ; CHECK-V-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-V-NEXT: csrwi vxrm, 0 ; CHECK-V-NEXT: vnclipu.wi v8, v8, 0 ; CHECK-V-NEXT: ret entry: @@ -1495,7 +1485,6 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) { ; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload ; CHECK-V-NEXT: vslideup.vi v10, v8, 7 ; CHECK-V-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-V-NEXT: csrwi vxrm, 0 ; CHECK-V-NEXT: vnclip.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb ; CHECK-V-NEXT: slli a0, a0, 1 @@ -1774,7 +1763,6 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) { ; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload ; CHECK-V-NEXT: vslideup.vi v10, v8, 7 ; CHECK-V-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-V-NEXT: csrwi vxrm, 0 ; CHECK-V-NEXT: vnclipu.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb ; CHECK-V-NEXT: slli a0, a0, 1 @@ -3349,7 +3337,6 @@ define <2 x i32> @stest_f64i32_mm(<2 x double> %x) { ; CHECK-V-NEXT: vsetivli 
zero, 2, e64, m1, ta, ma ; CHECK-V-NEXT: vfcvt.rtz.x.f.v v8, v8 ; CHECK-V-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-V-NEXT: csrwi vxrm, 0 ; CHECK-V-NEXT: vnclip.wi v8, v8, 0 ; CHECK-V-NEXT: ret entry: @@ -3384,7 +3371,6 @@ define <2 x i32> @utest_f64i32_mm(<2 x double> %x) { ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-V-NEXT: vfcvt.rtz.xu.f.v v8, v8 ; CHECK-V-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-V-NEXT: csrwi vxrm, 0 ; CHECK-V-NEXT: vnclipu.wi v8, v8, 0 ; CHECK-V-NEXT: ret entry: @@ -3497,7 +3483,6 @@ define <4 x i32> @stest_f32i32_mm(<4 x float> %x) { ; CHECK-V: # %bb.0: # %entry ; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-V-NEXT: vfwcvt.rtz.x.f.v v10, v8 -; CHECK-V-NEXT: csrwi vxrm, 0 ; CHECK-V-NEXT: vnclip.wi v8, v10, 0 ; CHECK-V-NEXT: ret entry: @@ -3549,7 +3534,6 @@ define <4 x i32> @utest_f32i32_mm(<4 x float> %x) { ; CHECK-V: # %bb.0: # %entry ; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-V-NEXT: vfwcvt.rtz.xu.f.v v10, v8 -; CHECK-V-NEXT: csrwi vxrm, 0 ; CHECK-V-NEXT: vnclipu.wi v8, v10, 0 ; CHECK-V-NEXT: ret entry: @@ -3796,7 +3780,6 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) { ; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload ; CHECK-V-NEXT: vslideup.vi v10, v8, 3 ; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-V-NEXT: csrwi vxrm, 0 ; CHECK-V-NEXT: vnclip.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb ; CHECK-V-NEXT: slli a0, a0, 2 @@ -3963,7 +3946,6 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) { ; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload ; CHECK-V-NEXT: vslideup.vi v10, v8, 3 ; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-V-NEXT: csrwi vxrm, 0 ; CHECK-V-NEXT: vnclipu.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb ; CHECK-V-NEXT: slli a0, a0, 2 @@ -4201,7 +4183,6 @@ define <2 x i16> @stest_f64i16_mm(<2 x double> %x) { ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vfncvt.rtz.x.f.w v9, v8 ; CHECK-V-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; CHECK-V-NEXT: csrwi vxrm, 0 ; CHECK-V-NEXT: vnclip.wi v8, v9, 0 ; CHECK-V-NEXT: ret entry: @@ -4236,7 +4217,6 @@ define <2 x i16> @utest_f64i16_mm(<2 x double> %x) { ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vfncvt.rtz.xu.f.w v9, v8 ; CHECK-V-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; CHECK-V-NEXT: csrwi vxrm, 0 ; CHECK-V-NEXT: vnclipu.wi v8, v9, 0 ; CHECK-V-NEXT: ret entry: @@ -4352,7 +4332,6 @@ define <4 x i16> @stest_f32i16_mm(<4 x float> %x) { ; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-V-NEXT: vfcvt.rtz.x.f.v v8, v8 ; CHECK-V-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-V-NEXT: csrwi vxrm, 0 ; CHECK-V-NEXT: vnclip.wi v8, v8, 0 ; CHECK-V-NEXT: ret entry: @@ -4405,7 +4384,6 @@ define <4 x i16> @utest_f32i16_mm(<4 x float> %x) { ; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-V-NEXT: vfcvt.rtz.xu.f.v v8, v8 ; CHECK-V-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-V-NEXT: csrwi vxrm, 0 ; CHECK-V-NEXT: vnclipu.wi v8, v8, 0 ; CHECK-V-NEXT: ret entry: @@ -4779,7 +4757,6 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload ; CHECK-V-NEXT: vslideup.vi v10, v8, 7 ; CHECK-V-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-V-NEXT: csrwi vxrm, 0 ; CHECK-V-NEXT: vnclip.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb ; CHECK-V-NEXT: slli a0, a0, 1 @@ -5054,7 +5031,6 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded 
Reload ; CHECK-V-NEXT: vslideup.vi v10, v8, 7 ; CHECK-V-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-V-NEXT: csrwi vxrm, 0 ; CHECK-V-NEXT: vnclipu.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb ; CHECK-V-NEXT: slli a0, a0, 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/trunc-sat-clip.ll b/llvm/test/CodeGen/RISCV/rvv/trunc-sat-clip.ll index e12c9e515a9fd..42577408f71b0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/trunc-sat-clip.ll +++ b/llvm/test/CodeGen/RISCV/rvv/trunc-sat-clip.ll @@ -20,7 +20,6 @@ define void @trunc_sat_i8i16_maxmin(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: csrwi vxrm, 0 ; CHECK-NEXT: vnclip.wi v8, v8, 0 ; CHECK-NEXT: vse8.v v8, (a1) ; CHECK-NEXT: ret @@ -37,7 +36,6 @@ define void @trunc_sat_i8i16_minmax(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: csrwi vxrm, 0 ; CHECK-NEXT: vnclip.wi v8, v8, 0 ; CHECK-NEXT: vse8.v v8, (a1) ; CHECK-NEXT: ret @@ -75,7 +73,6 @@ define void @trunc_sat_u8u16_min(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: csrwi vxrm, 0 ; CHECK-NEXT: vnclipu.wi v8, v8, 0 ; CHECK-NEXT: vse8.v v8, (a1) ; CHECK-NEXT: ret @@ -109,7 +106,6 @@ define void @trunc_sat_u8u16_maxmin(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: csrwi vxrm, 0 ; CHECK-NEXT: vnclipu.wi v8, v8, 0 ; CHECK-NEXT: vse8.v v8, (a1) ; CHECK-NEXT: ret @@ -126,7 +122,6 @@ define void @trunc_sat_u8u16_minmax(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: csrwi vxrm, 0 ; CHECK-NEXT: vnclipu.wi v8, v8, 0 ; CHECK-NEXT: vse8.v v8, (a1) ; CHECK-NEXT: ret @@ -166,7 +161,6 @@ define void @trunc_sat_i16i32_maxmin(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: csrwi vxrm, 0 ; CHECK-NEXT: vnclip.wi v8, v8, 0 ; CHECK-NEXT: vse16.v v8, (a1) ; CHECK-NEXT: ret @@ -183,7 +177,6 @@ define void @trunc_sat_i16i32_minmax(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: csrwi vxrm, 0 ; CHECK-NEXT: vnclip.wi v8, v8, 0 ; CHECK-NEXT: vse16.v v8, (a1) ; CHECK-NEXT: ret @@ -219,7 +212,6 @@ define void @trunc_sat_u16u32_min(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: csrwi vxrm, 0 ; CHECK-NEXT: vnclipu.wi v8, v8, 0 ; CHECK-NEXT: vse16.v v8, (a1) ; CHECK-NEXT: ret @@ -235,7 +227,6 @@ define void @trunc_sat_u16u32_minmax(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: csrwi vxrm, 0 ; CHECK-NEXT: vnclipu.wi v8, v8, 0 ; CHECK-NEXT: vse16.v v8, (a1) ; CHECK-NEXT: ret @@ -252,7 +243,6 @@ define void @trunc_sat_u16u32_maxmin(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: csrwi vxrm, 0 ; CHECK-NEXT: vnclipu.wi v8, v8, 0 ; CHECK-NEXT: vse16.v v8, (a1) ; CHECK-NEXT: ret @@ -293,7 +283,6 @@ define void @trunc_sat_i32i64_maxmin(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: csrwi vxrm, 0 ; CHECK-NEXT: vnclip.wi v10, v8, 0 ; 
CHECK-NEXT: vse32.v v10, (a1) ; CHECK-NEXT: ret @@ -310,7 +299,6 @@ define void @trunc_sat_i32i64_minmax(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: csrwi vxrm, 0 ; CHECK-NEXT: vnclip.wi v10, v8, 0 ; CHECK-NEXT: vse32.v v10, (a1) ; CHECK-NEXT: ret @@ -347,7 +335,6 @@ define void @trunc_sat_u32u64_min(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: csrwi vxrm, 0 ; CHECK-NEXT: vnclipu.wi v10, v8, 0 ; CHECK-NEXT: vse32.v v10, (a1) ; CHECK-NEXT: ret @@ -364,7 +351,6 @@ define void @trunc_sat_u32u64_maxmin(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: csrwi vxrm, 0 ; CHECK-NEXT: vnclipu.wi v10, v8, 0 ; CHECK-NEXT: vse32.v v10, (a1) ; CHECK-NEXT: ret @@ -381,7 +367,6 @@ define void @trunc_sat_u32u64_minmax(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: csrwi vxrm, 0 ; CHECK-NEXT: vnclipu.wi v10, v8, 0 ; CHECK-NEXT: vse32.v v10, (a1) ; CHECK-NEXT: ret From 9c978c94187511326627c34fb04c57f853c488fc Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 2 Jan 2024 09:54:16 -0800 Subject: [PATCH 055/313] [coroutines] Use DILocation from new storage for hoisted dbg.declare (#75402) Make the hoisted dbg.declare inherent the DILocation scope from the new storage. After hoisting, the dbg.declare is moved into the block that defines the new storage. This could create an inconsistency in the debug location scope hierarchy where the scope of hoisted dbg.declare (i.e. DILexicalBlock) is enclosed with the scope of the block (i.e. DISubprogram). This confuses LiveDebugValues pass to think that the hoisted dbg.declare is killed in that block and does not generate DBG_VALUE in other blocks. Debugger won't be able to track its value anymore. We do this for unoptimized binary only. --- llvm/lib/Transforms/Coroutines/CoroFrame.cpp | 6 +- .../coro-debug-frame-variable-O1.ll | 64 +++++++++++++++++++ .../Coroutines/coro-debug-frame-variable.ll | 29 +++++---- 3 files changed, 84 insertions(+), 15 deletions(-) create mode 100644 llvm/test/Transforms/Coroutines/coro-debug-frame-variable-O1.ll diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index f37b4dc938d30..529f7309a1a27 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -2951,9 +2951,11 @@ void coro::salvageDebugInfo( // dbg.declare does. 
if (isa(DVI)) { std::optional InsertPt; - if (auto *I = dyn_cast(Storage)) + if (auto *I = dyn_cast(Storage)) { InsertPt = I->getInsertionPointAfterDef(); - else if (isa(Storage)) + if (!OptimizeFrame && I->getDebugLoc()) + DVI.setDebugLoc(I->getDebugLoc()); + } else if (isa(Storage)) InsertPt = F->getEntryBlock().begin(); if (InsertPt) DVI.moveBefore(*(*InsertPt)->getParent(), *InsertPt); diff --git a/llvm/test/Transforms/Coroutines/coro-debug-frame-variable-O1.ll b/llvm/test/Transforms/Coroutines/coro-debug-frame-variable-O1.ll new file mode 100644 index 0000000000000..acd6a08d7c1b8 --- /dev/null +++ b/llvm/test/Transforms/Coroutines/coro-debug-frame-variable-O1.ll @@ -0,0 +1,64 @@ +; RUN: opt < %s -passes='module(coro-early),cgscc(inline,coro-split)' -S | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators < %s -passes='module(coro-early),cgscc(inline,coro-split)' -S | FileCheck %s + +; Simplified version from pr#75104. +; Make sure we do not update debug location for hosited dbg.declare intrinsics when optimizing coro frame. + +; CHECK-NOT: mismatched subprogram between llvm.dbg.declare variable and !dbg attachment + +%"struct.std::coroutine_handle" = type { i8 } + +define void @_Z1fv() presplitcoroutine { +entry: + %0 = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null) + %1 = call ptr @llvm.coro.begin(token %0, ptr null), !dbg !10 + br label %for.cond + +for.cond: ; preds = %for.cond, %entry + call void @_ZN1BD1Ev() + %2 = call token @llvm.coro.save(ptr null) + %3 = call i8 @llvm.coro.suspend(token none, i1 false) + br label %for.cond +} + +declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) +declare ptr @llvm.coro.begin(token, ptr writeonly) +declare void @llvm.dbg.declare(metadata, metadata, metadata) +declare token @llvm.coro.save(ptr) +declare i8 @llvm.coro.suspend(token, i1) + +define void @_ZN1BD1Ev() { +entry: + %b11 = alloca [0 x [0 x %"struct.std::coroutine_handle"]], i32 0, align 1 + call void @llvm.dbg.declare(metadata ptr %b11, metadata !13, metadata !DIExpression()), !dbg !21 + %call = call i1 @_ZNSt16coroutine_handleIvEcvbEv(ptr %b11), !dbg !21 + ret void +} + +declare i1 @_ZNSt16coroutine_handleIvEcvbEv(ptr) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!9} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 18.0.0git", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: !2, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "", directory: "") +!2 = !{!3} +!3 = !DIGlobalVariableExpression(var: !4, expr: !DIExpression()) +!4 = distinct !DIGlobalVariable(name: "a", scope: !0, file: !5, line: 17, type: !6, isLocal: false, isDefinition: true) +!5 = !DIFile(filename: "bad.cpp", directory: "") +!6 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "coroutine_handle", scope: !7, file: !5, line: 2, size: 8, flags: DIFlagTypePassByValue, elements: !8, templateParams: !8, identifier: "_ZTSSt16coroutine_handleIvE") +!7 = !DINamespace(name: "std", scope: null) +!8 = !{} +!9 = !{i32 2, !"Debug Info Version", i32 3} +!10 = !DILocation(line: 31, column: 7, scope: !11) +!11 = distinct !DISubprogram(name: "f", linkageName: "_Z1fv", scope: !5, file: !5, line: 31, type: !12, scopeLine: 31, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8) +!12 = distinct !DISubroutineType(types: !8) +!13 = !DILocalVariable(name: "b", scope: !14, file: !5, line: 27, type: !6) +!14 = distinct 
!DILexicalBlock(scope: !15, file: !5, line: 27, column: 14) +!15 = distinct !DILexicalBlock(scope: !16, file: !5, line: 26, column: 8) +!16 = distinct !DISubprogram(name: "~B", linkageName: "_ZN1BD2Ev", scope: !17, file: !5, line: 26, type: !18, scopeLine: 26, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, declaration: !20, retainedNodes: !8) +!17 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "B", file: !5, line: 18, size: 8, flags: DIFlagTypePassByReference | DIFlagNonTrivial, elements: !8, identifier: "_ZTS1B") +!18 = !DISubroutineType(types: !19) +!19 = !{null} +!20 = !DISubprogram(name: "~B", scope: !17, file: !5, line: 26, type: !18, scopeLine: 26, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized) +!21 = !DILocation(line: 27, column: 14, scope: !14) diff --git a/llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll b/llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll index 19a89fefd5269..bf51218590c2f 100644 --- a/llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll +++ b/llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll @@ -32,8 +32,8 @@ ; CHECK: entry: ; CHECK: %j = alloca i32, align 4 ; CHECK: call void @llvm.dbg.declare(metadata ptr %j, metadata ![[JVAR:[0-9]+]], metadata !DIExpression()), !dbg ![[JDBGLOC:[0-9]+]] -; CHECK: %[[MEMORY:.*]] = call ptr @new -; CHECK: call void @llvm.dbg.declare(metadata ptr %[[MEMORY]], metadata ![[XVAR:[0-9]+]], metadata !DIExpression(DW_OP_plus_uconst, 32)), !dbg ![[IDBGLOC:[0-9]+]] +; CHECK: %[[MEMORY:.*]] = call ptr @new({{.+}}), !dbg ![[IDBGLOC:[0-9]+]] +; CHECK: call void @llvm.dbg.declare(metadata ptr %[[MEMORY]], metadata ![[XVAR:[0-9]+]], metadata !DIExpression(DW_OP_plus_uconst, 32)), !dbg ![[IDBGLOC]] ; CHECK: call void @llvm.dbg.declare(metadata ptr %[[MEMORY]], metadata ![[IVAR:[0-9]+]], metadata !DIExpression(DW_OP_plus_uconst, 20)), !dbg ![[IDBGLOC]] ; CHECK: await.ready: ; @@ -49,18 +49,20 @@ ; CHECK: await.ready: ; ; CHECK-DAG: ![[IVAR]] = !DILocalVariable(name: "i" -; CHECK-DAG: ![[SCOPE:[0-9]+]] = distinct !DILexicalBlock(scope: !6, file: !1, line: 23, column: 12) -; CHECK-DAG: ![[IDBGLOC]] = !DILocation(line: 24, column: 7, scope: ![[SCOPE]]) +; CHECK-DAG: ![[PROG_SCOPE:[0-9]+]] = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov" +; CHECK-DAG: ![[BLK_SCOPE:[0-9]+]] = distinct !DILexicalBlock(scope: ![[PROG_SCOPE]], file: !1, line: 23, column: 12) +; CHECK-DAG: ![[IDBGLOC]] = !DILocation(line: 23, column: 6, scope: ![[PROG_SCOPE]]) ; CHECK-DAG: ![[XVAR]] = !DILocalVariable(name: "x" ; CHECK-DAG: ![[JVAR]] = !DILocalVariable(name: "j" -; CHECK-DAG: ![[JDBGLOC]] = !DILocation(line: 32, column: 7, scope: ![[SCOPE]]) +; CHECK-DAG: ![[JDBGLOC]] = !DILocation(line: 32, column: 7, scope: ![[BLK_SCOPE]]) ; CHECK-DAG: ![[XVAR_RESUME]] = !DILocalVariable(name: "x" -; CHECK-DAG: ![[IDBGLOC_RESUME]] = !DILocation(line: 24, column: 7, scope: ![[RESUME_SCOPE:[0-9]+]]) -; CHECK-DAG: ![[RESUME_SCOPE]] = distinct !DILexicalBlock(scope: !22, file: !1, line: 23, column: 12) +; CHECK-DAG: ![[RESUME_PROG_SCOPE:[0-9]+]] = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov" +; CHECK-DAG: ![[IDBGLOC_RESUME]] = !DILocation(line: 24, column: 7, scope: ![[RESUME_BLK_SCOPE:[0-9]+]]) +; CHECK-DAG: ![[RESUME_BLK_SCOPE]] = distinct !DILexicalBlock(scope: ![[RESUME_PROG_SCOPE]], file: !1, line: 23, column: 12) ; CHECK-DAG: ![[IVAR_RESUME]] = !DILocalVariable(name: "i" ; CHECK-DAG: ![[JVAR_RESUME]] = !DILocalVariable(name: "j" -; CHECK-DAG: 
![[JDBGLOC_RESUME]] = !DILocation(line: 32, column: 7, scope: ![[RESUME_SCOPE]]) +; CHECK-DAG: ![[JDBGLOC_RESUME]] = !DILocation(line: 32, column: 7, scope: ![[RESUME_BLK_SCOPE]]) define void @f() presplitcoroutine !dbg !8 { entry: %__promise = alloca i8, align 8 @@ -72,13 +74,13 @@ entry: br i1 %alloc, label %coro.alloc, label %coro.init coro.alloc: ; preds = %entry - %size = call i64 @llvm.coro.size.i64() - %memory = call ptr @new(i64 %size) - br label %coro.init + %size = call i64 @llvm.coro.size.i64(), !dbg !23 + %memory = call ptr @new(i64 %size), !dbg !23 + br label %coro.init, !dbg !23 coro.init: ; preds = %coro.alloc, %entry - %phi.entry.alloc = phi ptr [ null, %entry ], [ %memory, %coro.alloc ] - %begin = call ptr @llvm.coro.begin(token %id, ptr %phi.entry.alloc) + %phi.entry.alloc = phi ptr [ null, %entry ], [ %memory, %coro.alloc ], !dbg !23 + %begin = call ptr @llvm.coro.begin(token %id, ptr %phi.entry.alloc), !dbg !23 %ready = call i1 @await_ready() br i1 %ready, label %init.ready, label %init.suspend @@ -240,3 +242,4 @@ declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) !20 = !DILocation(line: 43, column: 3, scope: !7) !21 = !DILocation(line: 43, column: 8, scope: !7) !22 = distinct !DILexicalBlock(scope: !8, file: !1, line: 23, column: 12) +!23 = !DILocation(line: 23, column: 6, scope: !8) From 27091dacdd5f4a8601563f57e21f653d59169d46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= Date: Tue, 2 Jan 2024 18:42:56 +0100 Subject: [PATCH 056/313] [Orc] Temporarily disable OrcLazy debug tests on macOS Test failures were reported after https://github.com/llvm/llvm-project/pull/76244 landed. Let's revisit these tests now that we have native Mach-O debug support as well. --- .../ExecutionEngine/OrcLazy/debug-descriptor-elf-minimal.ll | 2 +- llvm/test/ExecutionEngine/OrcLazy/debug-objects-elf-minimal.ll | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/ExecutionEngine/OrcLazy/debug-descriptor-elf-minimal.ll b/llvm/test/ExecutionEngine/OrcLazy/debug-descriptor-elf-minimal.ll index 079a9c25bfd72..9bac3897864f7 100644 --- a/llvm/test/ExecutionEngine/OrcLazy/debug-descriptor-elf-minimal.ll +++ b/llvm/test/ExecutionEngine/OrcLazy/debug-descriptor-elf-minimal.ll @@ -1,4 +1,4 @@ -; REQUIRES: native && target-x86_64 +; REQUIRES: native && x86_64-linux ; RUN: lli --jit-linker=rtdyld \ ; RUN: --generate=__dump_jit_debug_descriptor %s | FileCheck %s diff --git a/llvm/test/ExecutionEngine/OrcLazy/debug-objects-elf-minimal.ll b/llvm/test/ExecutionEngine/OrcLazy/debug-objects-elf-minimal.ll index d7bc2dc117b7f..31fe730b74036 100644 --- a/llvm/test/ExecutionEngine/OrcLazy/debug-objects-elf-minimal.ll +++ b/llvm/test/ExecutionEngine/OrcLazy/debug-objects-elf-minimal.ll @@ -1,4 +1,4 @@ -; REQUIRES: native && target-x86_64 +; REQUIRES: native && x86_64-linux ; In-memory debug-object contains some basic DWARF ; From 71f8ea3062a6b0a190835853ee77e58469763b9e Mon Sep 17 00:00:00 2001 From: Dmitry Vasilyev Date: Tue, 2 Jan 2024 22:04:43 +0400 Subject: [PATCH 057/313] [llvm-cxxfilt] Added the option --no-params (#75348) Added -p / --no-params flag to skip demangling function parameters similar to how it is supported by GNU c++filt tool. There are cases when users want to demangle a large number of symbols in bulk, for example, at startup, and do not care about function parameters and overloads at that time. Skipping the demangling of parameter types led to a measurable improvement in performance. 
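As a minimal sketch of what the new behavior looks like through the library API (illustrative only, not part of this patch; the mangled name is the one used in the new no-params.test, everything else here is assumed):

  // Hedged sketch, not from the patch: exercising the ParseParams parameter
  // that this change adds to llvm::itaniumDemangle().
  #include "llvm/Demangle/Demangle.h"
  #include <cstdio>
  #include <cstdlib>

  int main() {
    const char *Mangled = "_Z3fooILZ3BarEET_f"; // taken from no-params.test
    // Default behaviour: full demangling ("Bar foo(float)" per the test).
    char *Full = llvm::itaniumDemangle(Mangled);
    // New behaviour: stop after the function name, skipping parameters and
    // the return type ("foo" per the test).
    char *NameOnly = llvm::itaniumDemangle(Mangled, /*ParseParams=*/false);
    if (Full && NameOnly)
      std::printf("%s\n%s\n", Full, NameOnly);
    std::free(Full);
    std::free(NameOnly);
    return 0;
  }
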
Our users reported about 15% speed up with GNU c++filt and we expect similar results with llvm-cxxfilt with this patch. --- libcxxabi/src/demangle/ItaniumDemangle.h | 22 +++++++++---- llvm/docs/CommandGuide/llvm-cxxfilt.rst | 4 +++ llvm/include/llvm/Demangle/Demangle.h | 5 +-- llvm/include/llvm/Demangle/ItaniumDemangle.h | 22 +++++++++---- llvm/lib/Demangle/Demangle.cpp | 5 +-- llvm/lib/Demangle/ItaniumDemangle.cpp | 4 +-- llvm/test/tools/llvm-cxxfilt/no-params.test | 34 ++++++++++++++++++++ llvm/tools/llvm-cxxfilt/Opts.td | 2 ++ llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp | 10 ++++-- 9 files changed, 87 insertions(+), 21 deletions(-) create mode 100644 llvm/test/tools/llvm-cxxfilt/no-params.test diff --git a/libcxxabi/src/demangle/ItaniumDemangle.h b/libcxxabi/src/demangle/ItaniumDemangle.h index 90f25519fad1c..5a53a18bcc5fe 100644 --- a/libcxxabi/src/demangle/ItaniumDemangle.h +++ b/libcxxabi/src/demangle/ItaniumDemangle.h @@ -2794,7 +2794,7 @@ template struct AbstractManglingParser { Node *parseClassEnumType(); Node *parseQualifiedType(); - Node *parseEncoding(); + Node *parseEncoding(bool ParseParams = true); bool parseCallOffset(); Node *parseSpecialName(); @@ -2911,7 +2911,7 @@ template struct AbstractManglingParser { Node *parseDestructorName(); /// Top-level entry point into the parser. - Node *parse(); + Node *parse(bool ParseParams = true); }; const char* parse_discriminator(const char* first, const char* last); @@ -5405,7 +5405,7 @@ Node *AbstractManglingParser::parseSpecialName() { // ::= // ::= template -Node *AbstractManglingParser::parseEncoding() { +Node *AbstractManglingParser::parseEncoding(bool ParseParams) { // The template parameters of an encoding are unrelated to those of the // enclosing context. SaveTemplateParams SaveTemplateParamsScope(this); @@ -5431,6 +5431,16 @@ Node *AbstractManglingParser::parseEncoding() { if (IsEndOfEncoding()) return Name; + // ParseParams may be false at the top level only, when called from parse(). + // For example in the mangled name _Z3fooILZ3BarEET_f, ParseParams may be + // false when demangling 3fooILZ3BarEET_f but is always true when demangling + // 3Bar. + if (!ParseParams) { + while (consume()) + ; + return Name; + } + Node *Attrs = nullptr; if (consumeIf("Ua9enable_ifI")) { size_t BeforeArgs = Names.size(); @@ -5895,9 +5905,9 @@ AbstractManglingParser::parseTemplateArgs(bool TagTemplates) { // extension ::= ___Z _block_invoke+ // extension ::= ___Z _block_invoke_+ template -Node *AbstractManglingParser::parse() { +Node *AbstractManglingParser::parse(bool ParseParams) { if (consumeIf("_Z") || consumeIf("__Z")) { - Node *Encoding = getDerived().parseEncoding(); + Node *Encoding = getDerived().parseEncoding(ParseParams); if (Encoding == nullptr) return nullptr; if (look() == '.') { @@ -5911,7 +5921,7 @@ Node *AbstractManglingParser::parse() { } if (consumeIf("___Z") || consumeIf("____Z")) { - Node *Encoding = getDerived().parseEncoding(); + Node *Encoding = getDerived().parseEncoding(ParseParams); if (Encoding == nullptr || !consumeIf("_block_invoke")) return nullptr; bool RequireNumber = consumeIf('_'); diff --git a/llvm/docs/CommandGuide/llvm-cxxfilt.rst b/llvm/docs/CommandGuide/llvm-cxxfilt.rst index 3f7deb1719511..0933f0b5bed87 100644 --- a/llvm/docs/CommandGuide/llvm-cxxfilt.rst +++ b/llvm/docs/CommandGuide/llvm-cxxfilt.rst @@ -48,6 +48,10 @@ OPTIONS Print a summary of command line options. +.. option:: --no-params, -p + + Do not demangle function parameters or return types. + .. 
option:: --no-strip-underscore, -n Do not strip a leading underscore. This is the default for all platforms diff --git a/llvm/include/llvm/Demangle/Demangle.h b/llvm/include/llvm/Demangle/Demangle.h index 70cfc1418f0c7..fe129603c0785 100644 --- a/llvm/include/llvm/Demangle/Demangle.h +++ b/llvm/include/llvm/Demangle/Demangle.h @@ -32,7 +32,7 @@ enum : int { /// Returns a non-NULL pointer to a NUL-terminated C style string /// that should be explicitly freed, if successful. Otherwise, may return /// nullptr if mangled_name is not a valid mangling or is nullptr. -char *itaniumDemangle(std::string_view mangled_name); +char *itaniumDemangle(std::string_view mangled_name, bool ParseParams = true); enum MSDemangleFlags { MSDF_None = 0, @@ -68,7 +68,8 @@ char *dlangDemangle(std::string_view MangledName); std::string demangle(std::string_view MangledName); bool nonMicrosoftDemangle(std::string_view MangledName, std::string &Result, - bool CanHaveLeadingDot = true); + bool CanHaveLeadingDot = true, + bool ParseParams = true); /// "Partial" demangler. This supports demangling a string into an AST /// (typically an intermediate stage in itaniumDemangle) and querying certain diff --git a/llvm/include/llvm/Demangle/ItaniumDemangle.h b/llvm/include/llvm/Demangle/ItaniumDemangle.h index e0ff035d47cfb..06956f47c1f0b 100644 --- a/llvm/include/llvm/Demangle/ItaniumDemangle.h +++ b/llvm/include/llvm/Demangle/ItaniumDemangle.h @@ -2793,7 +2793,7 @@ template struct AbstractManglingParser { Node *parseClassEnumType(); Node *parseQualifiedType(); - Node *parseEncoding(); + Node *parseEncoding(bool ParseParams = true); bool parseCallOffset(); Node *parseSpecialName(); @@ -2910,7 +2910,7 @@ template struct AbstractManglingParser { Node *parseDestructorName(); /// Top-level entry point into the parser. - Node *parse(); + Node *parse(bool ParseParams = true); }; const char* parse_discriminator(const char* first, const char* last); @@ -5404,7 +5404,7 @@ Node *AbstractManglingParser::parseSpecialName() { // ::= // ::= template -Node *AbstractManglingParser::parseEncoding() { +Node *AbstractManglingParser::parseEncoding(bool ParseParams) { // The template parameters of an encoding are unrelated to those of the // enclosing context. SaveTemplateParams SaveTemplateParamsScope(this); @@ -5430,6 +5430,16 @@ Node *AbstractManglingParser::parseEncoding() { if (IsEndOfEncoding()) return Name; + // ParseParams may be false at the top level only, when called from parse(). + // For example in the mangled name _Z3fooILZ3BarEET_f, ParseParams may be + // false when demangling 3fooILZ3BarEET_f but is always true when demangling + // 3Bar. 
+ if (!ParseParams) { + while (consume()) + ; + return Name; + } + Node *Attrs = nullptr; if (consumeIf("Ua9enable_ifI")) { size_t BeforeArgs = Names.size(); @@ -5894,9 +5904,9 @@ AbstractManglingParser::parseTemplateArgs(bool TagTemplates) { // extension ::= ___Z _block_invoke+ // extension ::= ___Z _block_invoke_+ template -Node *AbstractManglingParser::parse() { +Node *AbstractManglingParser::parse(bool ParseParams) { if (consumeIf("_Z") || consumeIf("__Z")) { - Node *Encoding = getDerived().parseEncoding(); + Node *Encoding = getDerived().parseEncoding(ParseParams); if (Encoding == nullptr) return nullptr; if (look() == '.') { @@ -5910,7 +5920,7 @@ Node *AbstractManglingParser::parse() { } if (consumeIf("___Z") || consumeIf("____Z")) { - Node *Encoding = getDerived().parseEncoding(); + Node *Encoding = getDerived().parseEncoding(ParseParams); if (Encoding == nullptr || !consumeIf("_block_invoke")) return nullptr; bool RequireNumber = consumeIf('_'); diff --git a/llvm/lib/Demangle/Demangle.cpp b/llvm/lib/Demangle/Demangle.cpp index 83f3cdc88c01e..117b849d1c784 100644 --- a/llvm/lib/Demangle/Demangle.cpp +++ b/llvm/lib/Demangle/Demangle.cpp @@ -47,7 +47,8 @@ static bool isRustEncoding(std::string_view S) { return starts_with(S, "_R"); } static bool isDLangEncoding(std::string_view S) { return starts_with(S, "_D"); } bool llvm::nonMicrosoftDemangle(std::string_view MangledName, - std::string &Result, bool CanHaveLeadingDot) { + std::string &Result, bool CanHaveLeadingDot, + bool ParseParams) { char *Demangled = nullptr; // Do not consider the dot prefix as part of the demangled symbol name. @@ -57,7 +58,7 @@ bool llvm::nonMicrosoftDemangle(std::string_view MangledName, } if (isItaniumEncoding(MangledName)) - Demangled = itaniumDemangle(MangledName); + Demangled = itaniumDemangle(MangledName, ParseParams); else if (isRustEncoding(MangledName)) Demangled = rustDemangle(MangledName); else if (isDLangEncoding(MangledName)) diff --git a/llvm/lib/Demangle/ItaniumDemangle.cpp b/llvm/lib/Demangle/ItaniumDemangle.cpp index e3f208f0adf8d..5c21b06a1d095 100644 --- a/llvm/lib/Demangle/ItaniumDemangle.cpp +++ b/llvm/lib/Demangle/ItaniumDemangle.cpp @@ -366,13 +366,13 @@ class DefaultAllocator { using Demangler = itanium_demangle::ManglingParser; -char *llvm::itaniumDemangle(std::string_view MangledName) { +char *llvm::itaniumDemangle(std::string_view MangledName, bool ParseParams) { if (MangledName.empty()) return nullptr; Demangler Parser(MangledName.data(), MangledName.data() + MangledName.length()); - Node *AST = Parser.parse(); + Node *AST = Parser.parse(ParseParams); if (!AST) return nullptr; diff --git a/llvm/test/tools/llvm-cxxfilt/no-params.test b/llvm/test/tools/llvm-cxxfilt/no-params.test new file mode 100644 index 0000000000000..17cbbbe2242bc --- /dev/null +++ b/llvm/test/tools/llvm-cxxfilt/no-params.test @@ -0,0 +1,34 @@ +RUN: llvm-cxxfilt _Z3fooILZ3BarEET_f _Z3fooIPFcfEET_d _ZN1f2baC2ERKNS_2baIT_EE _Z3foov.123 | FileCheck %s --check-prefix=CHECK-PARAMS +RUN: llvm-cxxfilt -p _Z3fooILZ3BarEET_f _Z3fooIPFcfEET_d _ZN1f2baC2ERKNS_2baIT_EE _Z3foov.123 | FileCheck %s --check-prefix=CHECK-NO-PARAMS --match-full-lines +RUN: llvm-cxxfilt --no-params _Z3fooILZ3BarEET_f _Z3fooIPFcfEET_d _ZN1f2baC2ERKNS_2baIT_EE _Z3foov.123 | FileCheck %s --check-prefix=CHECK-NO-PARAMS --match-full-lines + +# Check that -p or --no-params flag omits function parameters and the return +# type. 
+ +CHECK-PARAMS: Bar foo(float) +CHECK-NO-PARAMS: foo + +# Check that only the top-level function is impacted by the switch, and that +# nested function types in the encoding (e.g. where a function type is being +# used as a template parameter) still include their parameters. +# +# template T foo(double); +# typedef char (*F)(float); +# F foo(double) + +CHECK-PARAMS: char (*foo(double))(float) +CHECK-NO-PARAMS: foo + +# Use an invalid mangled name broken in the function parameters to check how -p +# or --no-params flag works. If the option is given we should be able to +# demangle the function name just fine. If it is not given, demangling will fail +# because of the invalid params. + +CHECK-PARAMS: _ZN1f2baC2ERKNS_2baIT_EE +CHECK-NO-PARAMS: f::ba::ba + +# Check that a vendor specific suffix is also omitted when --no-params is +# specified. This matches c++filt's behaviour. + +CHECK-PARAMS: foo() (.123) +CHECK-NO-PARAMS: foo diff --git a/llvm/tools/llvm-cxxfilt/Opts.td b/llvm/tools/llvm-cxxfilt/Opts.td index f652a1a7f88bb..034cb267aab80 100644 --- a/llvm/tools/llvm-cxxfilt/Opts.td +++ b/llvm/tools/llvm-cxxfilt/Opts.td @@ -17,6 +17,7 @@ multiclass Eq { def help : FF<"help", "Display this help">; defm strip_underscore : BB<"strip-underscore", "Strip the leading underscore", "Don't strip the leading underscore">; def types : FF<"types", "Attempt to demangle types as well as function names">; +def no_params : FF<"no-params", "Skip function parameters and return types">; def version : FF<"version", "Display the version">; defm : Eq<"format", "Specify mangling format. Currently ignored because only 'gnu' is supported">; @@ -25,4 +26,5 @@ def : F<"s", "Alias for --format">; def : F<"_", "Alias for --strip-underscore">, Alias; def : F<"h", "Alias for --help">, Alias; def : F<"n", "Alias for --no-strip-underscore">, Alias; +def : F<"p", "Alias for --no-params">, Alias; def : F<"t", "Alias for --types">, Alias; diff --git a/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp b/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp index 4b9d88a650666..26a1f2f4afebd 100644 --- a/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp +++ b/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp @@ -54,6 +54,7 @@ class CxxfiltOptTable : public opt::GenericOptTable { }; } // namespace +static bool ParseParams; static bool StripUnderscore; static bool Types; @@ -74,18 +75,19 @@ static std::string demangle(const std::string &Mangled) { } std::string Result; - if (nonMicrosoftDemangle(DecoratedStr, Result, CanHaveLeadingDot)) + if (nonMicrosoftDemangle(DecoratedStr, Result, CanHaveLeadingDot, + ParseParams)) return Result; std::string Prefix; char *Undecorated = nullptr; if (Types) - Undecorated = itaniumDemangle(DecoratedStr); + Undecorated = itaniumDemangle(DecoratedStr, ParseParams); if (!Undecorated && starts_with(DecoratedStr, "__imp_")) { Prefix = "import thunk for "; - Undecorated = itaniumDemangle(DecoratedStr.substr(6)); + Undecorated = itaniumDemangle(DecoratedStr.substr(6), ParseParams); } Result = Undecorated ? 
Prefix + Undecorated : Mangled; @@ -173,6 +175,8 @@ int llvm_cxxfilt_main(int argc, char **argv, const llvm::ToolContext &) { else StripUnderscore = Triple(sys::getProcessTriple()).isOSBinFormatMachO(); + ParseParams = !Args.hasArg(OPT_no_params); + Types = Args.hasArg(OPT_types); std::vector Decorated = Args.getAllArgValues(OPT_INPUT); From ffd173ba0b4a6d84f45308e78cea4af611bec10e Mon Sep 17 00:00:00 2001 From: Walter Erquinigo Date: Tue, 2 Jan 2024 13:06:13 -0500 Subject: [PATCH 058/313] [lldb-dap] Emit more structured info along with variables (#75244) In order to allow smarter vscode extensions, it's useful to send additional structured information of SBValues to the client. Specifically, I'm now sending error, summary, autoSummary and inMemoryValue in addition to the existing properties being sent. This is cheap because these properties have to be calculated anyway to generate the display value of the variable, but they are now available for extensions to better analyze variables. For example, if the error field is not present, the extension might be able to provide cool features, and the current way to do that is to look for the `" TryCreateAutoSummary(lldb::SBValue value) { return TryCreateAutoSummaryForContainer(value); } -std::string ValueToString(lldb::SBValue v) { - std::string result; - llvm::raw_string_ostream strm(result); - - lldb::SBError error = v.GetError(); - if (!error.Success()) { - strm << ""; - } else { - llvm::StringRef value = v.GetValue(); - llvm::StringRef nonAutoSummary = v.GetSummary(); - std::optional summary = !nonAutoSummary.empty() - ? nonAutoSummary.str() - : TryCreateAutoSummary(v); - if (!value.empty()) { - strm << value; - if (summary) - strm << ' ' << *summary; - } else if (summary) { - strm << *summary; - - // As last resort, we print its type and address if available. - } else { - if (llvm::StringRef type_name = v.GetType().GetDisplayTypeName(); - !type_name.empty()) { - strm << type_name; - lldb::addr_t address = v.GetLoadAddress(); - if (address != LLDB_INVALID_ADDRESS) - strm << " @ " << llvm::format_hex(address, 0); - } - } - } - return result; -} - -void SetValueForKey(lldb::SBValue &v, llvm::json::Object &object, - llvm::StringRef key) { - std::string result = ValueToString(v); - EmplaceSafeString(object, key, result); -} - void FillResponse(const llvm::json::Object &request, llvm::json::Object &response) { // Fill in all of the needed response fields to a "request" and set "success" @@ -1045,6 +1005,92 @@ std::string CreateUniqueVariableNameForDisplay(lldb::SBValue v, return name_builder.GetData(); } +VariableDescription::VariableDescription(lldb::SBValue v, bool format_hex, + bool is_name_duplicated, + std::optional custom_name) + : v(v) { + name = custom_name + ? *custom_name + : CreateUniqueVariableNameForDisplay(v, is_name_duplicated); + + type_obj = v.GetType(); + std::string raw_display_type_name = + llvm::StringRef(type_obj.GetDisplayTypeName()).str(); + display_type_name = + !raw_display_type_name.empty() ? raw_display_type_name : NO_TYPENAME; + + if (format_hex) + v.SetFormat(lldb::eFormatHex); + + llvm::raw_string_ostream os_display_value(display_value); + + if (lldb::SBError sb_error = v.GetError(); sb_error.Fail()) { + error = sb_error.GetCString(); + os_display_value << ""; + } else { + value = llvm::StringRef(v.GetValue()).str(); + summary = llvm::StringRef(v.GetSummary()).str(); + if (summary.empty()) + auto_summary = TryCreateAutoSummary(v); + + std::optional effective_summary = + !summary.empty() ? 
summary : auto_summary; + + if (!value.empty()) { + os_display_value << value; + if (effective_summary) + os_display_value << " " << *effective_summary; + } else if (effective_summary) { + os_display_value << *effective_summary; + + // As last resort, we print its type and address if available. + } else { + if (!raw_display_type_name.empty()) { + os_display_value << raw_display_type_name; + lldb::addr_t address = v.GetLoadAddress(); + if (address != LLDB_INVALID_ADDRESS) + os_display_value << " @ " << llvm::format_hex(address, 0); + } + } + } + + lldb::SBStream evaluateStream; + v.GetExpressionPath(evaluateStream); + evaluate_name = llvm::StringRef(evaluateStream.GetData()).str(); +} + +llvm::json::Object VariableDescription::GetVariableExtensionsJSON() { + llvm::json::Object extensions; + if (error) + EmplaceSafeString(extensions, "error", *error); + if (!value.empty()) + EmplaceSafeString(extensions, "value", value); + if (!summary.empty()) + EmplaceSafeString(extensions, "summary", summary); + if (auto_summary) + EmplaceSafeString(extensions, "autoSummary", *auto_summary); + + if (lldb::SBDeclaration decl = v.GetDeclaration(); decl.IsValid()) { + llvm::json::Object decl_obj; + if (lldb::SBFileSpec file = decl.GetFileSpec(); file.IsValid()) { + char path[PATH_MAX] = ""; + if (file.GetPath(path, sizeof(path)) && + lldb::SBFileSpec::ResolvePath(path, path, PATH_MAX)) { + decl_obj.try_emplace("path", std::string(path)); + } + } + + if (int line = decl.GetLine()) + decl_obj.try_emplace("line", line); + if (int column = decl.GetColumn()) + decl_obj.try_emplace("column", column); + + if (!decl_obj.empty()) + extensions.try_emplace("declaration", std::move(decl_obj)); + } + return extensions; +} + // "Variable": { // "type": "object", // "description": "A Variable is a name/value pair. Optionally a variable @@ -1104,27 +1150,56 @@ std::string CreateUniqueVariableNameForDisplay(lldb::SBValue v, // can use this optional information to present the // children in a paged UI and fetch them in chunks." // } -// "declaration": { -// "type": "object | undefined", -// "description": "Extension to the protocol that indicates the source -// location where the variable was declared. This value -// might not be present if no declaration is available.", +// +// +// "$__lldb_extensions": { +// "description": "Unofficial extensions to the protocol", // "properties": { -// "path": { -// "type": "string | undefined", -// "description": "The source file path where the variable was -// declared." -// }, -// "line": { -// "type": "number | undefined", -// "description": "The 1-indexed source line where the variable was -// declared." -// }, -// "column": { -// "type": "number | undefined", -// "description": "The 1-indexed source column where the variable was -// declared." +// "declaration": { +// "type": "object", +// "description": "The source location where the variable was declared. +// This value won't be present if no declaration is +// available.", +// "properties": { +// "path": { +// "type": "string", +// "description": "The source file path where the variable was +// declared." +// }, +// "line": { +// "type": "number", +// "description": "The 1-indexed source line where the variable was +// declared." +// }, +// "column": { +// "type": "number", +// "description": "The 1-indexed source column where the variable +// was declared." +// } // } +// }, +// "value": +// "type": "string", +// "description": "The internal value of the variable as returned by +// This is effectively SBValue.GetValue(). 
The other +// `value` entry in the top-level variable response is, +// on the other hand, just a display string for the +// variable." +// }, +// "summary": +// "type": "string", +// "description": "The summary string of the variable. This is +// effectively SBValue.GetSummary()." +// }, +// "autoSummary": +// "type": "string", +// "description": "The auto generated summary if using +// `enableAutoVariableSummaries`." +// }, +// "error": +// "type": "string", +// "description": "An error message generated if LLDB couldn't inspect +// the variable." // } // } // }, @@ -1134,81 +1209,57 @@ llvm::json::Value CreateVariable(lldb::SBValue v, int64_t variablesReference, int64_t varID, bool format_hex, bool is_name_duplicated, std::optional custom_name) { + VariableDescription desc(v, format_hex, is_name_duplicated, custom_name); llvm::json::Object object; - EmplaceSafeString( - object, "name", - custom_name ? *custom_name - : CreateUniqueVariableNameForDisplay(v, is_name_duplicated)); + EmplaceSafeString(object, "name", desc.name); + EmplaceSafeString(object, "value", desc.display_value); + + if (!desc.evaluate_name.empty()) + EmplaceSafeString(object, "evaluateName", desc.evaluate_name); - if (format_hex) - v.SetFormat(lldb::eFormatHex); - SetValueForKey(v, object, "value"); - auto type_obj = v.GetType(); - auto type_cstr = type_obj.GetDisplayTypeName(); // If we have a type with many children, we would like to be able to // give a hint to the IDE that the type has indexed children so that the - // request can be broken up in grabbing only a few children at a time. We want - // to be careful and only call "v.GetNumChildren()" if we have an array type - // or if we have a synthetic child provider. We don't want to call - // "v.GetNumChildren()" on all objects as class, struct and union types don't - // need to be completed if they are never expanded. So we want to avoid - // calling this to only cases where we it makes sense to keep performance high - // during normal debugging. - - // If we have an array type, say that it is indexed and provide the number of - // children in case we have a huge array. If we don't do this, then we might - // take a while to produce all children at onces which can delay your debug - // session. - const bool is_array = type_obj.IsArrayType(); + // request can be broken up in grabbing only a few children at a time. We + // want to be careful and only call "v.GetNumChildren()" if we have an array + // type or if we have a synthetic child provider. We don't want to call + // "v.GetNumChildren()" on all objects as class, struct and union types + // don't need to be completed if they are never expanded. So we want to + // avoid calling this to only cases where we it makes sense to keep + // performance high during normal debugging. + + // If we have an array type, say that it is indexed and provide the number + // of children in case we have a huge array. If we don't do this, then we + // might take a while to produce all children at onces which can delay your + // debug session. + const bool is_array = desc.type_obj.IsArrayType(); const bool is_synthetic = v.IsSynthetic(); if (is_array || is_synthetic) { const auto num_children = v.GetNumChildren(); // We create a "[raw]" fake child for each synthetic type, so we have to - // account for it when returning indexed variables. We don't need to do this - // for non-indexed ones. + // account for it when returning indexed variables. We don't need to do + // this for non-indexed ones. 
bool has_raw_child = is_synthetic && g_dap.enable_synthetic_child_debugging; int actual_num_children = num_children + (has_raw_child ? 1 : 0); if (is_array) { object.try_emplace("indexedVariables", actual_num_children); } else if (num_children > 0) { - // If a type has a synthetic child provider, then the SBType of "v" won't - // tell us anything about what might be displayed. So we can check if the - // first child's name is "[0]" and then we can say it is indexed. + // If a type has a synthetic child provider, then the SBType of "v" + // won't tell us anything about what might be displayed. So we can check + // if the first child's name is "[0]" and then we can say it is indexed. const char *first_child_name = v.GetChildAtIndex(0).GetName(); if (first_child_name && strcmp(first_child_name, "[0]") == 0) object.try_emplace("indexedVariables", actual_num_children); } } - EmplaceSafeString(object, "type", type_cstr ? type_cstr : NO_TYPENAME); + EmplaceSafeString(object, "type", desc.display_type_name); if (varID != INT64_MAX) object.try_emplace("id", varID); if (v.MightHaveChildren()) object.try_emplace("variablesReference", variablesReference); else object.try_emplace("variablesReference", (int64_t)0); - lldb::SBStream evaluateStream; - v.GetExpressionPath(evaluateStream); - const char *evaluateName = evaluateStream.GetData(); - if (evaluateName && evaluateName[0]) - EmplaceSafeString(object, "evaluateName", std::string(evaluateName)); - - if (lldb::SBDeclaration decl = v.GetDeclaration(); decl.IsValid()) { - llvm::json::Object decl_obj; - if (lldb::SBFileSpec file = decl.GetFileSpec(); file.IsValid()) { - char path[PATH_MAX] = ""; - if (file.GetPath(path, sizeof(path)) && - lldb::SBFileSpec::ResolvePath(path, path, PATH_MAX)) { - decl_obj.try_emplace("path", std::string(path)); - } - } - if (int line = decl.GetLine()) - decl_obj.try_emplace("line", line); - if (int column = decl.GetColumn()) - decl_obj.try_emplace("column", column); - - object.try_emplace("declaration", std::move(decl_obj)); - } + object.try_emplace("$__lldb_extensions", desc.GetVariableExtensionsJSON()); return llvm::json::Value(std::move(object)); } diff --git a/lldb/tools/lldb-dap/JSONUtils.h b/lldb/tools/lldb-dap/JSONUtils.h index 02ee499445fa1..7f9fc7037c63d 100644 --- a/lldb/tools/lldb-dap/JSONUtils.h +++ b/lldb/tools/lldb-dap/JSONUtils.h @@ -167,33 +167,6 @@ std::vector GetStrings(const llvm::json::Object *obj, void FillResponse(const llvm::json::Object &request, llvm::json::Object &response); -/// Utility function to convert SBValue \v into a string. -std::string ValueToString(lldb::SBValue v); - -/// Emplace the string value from an SBValue into the supplied object -/// using \a key as the key that will contain the value. -/// -/// The value is what we will display in VS Code. Some SBValue objects -/// can have a value and/or a summary. If a value has both, we -/// combine the value and the summary into one string. If we only have a -/// value or summary, then that is considered the value. If there is -/// no value and no summary then the value is the type name followed by -/// the address of the type if it has an address. 
-/// -/// -/// \param[in] v -/// A lldb::SBValue object to extract the string value from -/// -/// -/// \param[in] object -/// The object to place the value object into -/// -/// -/// \param[in] key -/// The key name to use when inserting the value object we create -void SetValueForKey(lldb::SBValue &v, llvm::json::Object &object, - llvm::StringRef key); - /// Converts \a bp to a JSON value and appends the first valid location to the /// \a breakpoints array. /// @@ -401,6 +374,39 @@ const char *GetNonNullVariableName(lldb::SBValue value); std::string CreateUniqueVariableNameForDisplay(lldb::SBValue v, bool is_name_duplicated); +/// Helper struct that parses the metadata of an \a lldb::SBValue and produces +/// a canonical set of properties that can be sent to DAP clients. +struct VariableDescription { + // The error message if SBValue.GetValue() fails. + std::optional error; + // The display description to show on the IDE. + std::string display_value; + // The display name to show on the IDE. + std::string name; + // The variable path for this variable. + std::string evaluate_name; + // The output of SBValue.GetValue() if it doesn't fail. It might be empty. + std::string value; + // The summary string of this variable. It might be empty. + std::string summary; + // The auto summary if using `enableAutoVariableSummaries`. + std::optional auto_summary; + // The type of this variable. + lldb::SBType type_obj; + // The display type name of this variable. + std::string display_type_name; + /// The SBValue for this variable. + lldb::SBValue v; + + VariableDescription(lldb::SBValue v, bool format_hex = false, + bool is_name_duplicated = false, + std::optional custom_name = {}); + + /// Create a JSON object that represents these extensions to the DAP variable + /// response. + llvm::json::Object GetVariableExtensionsJSON(); +}; + /// Create a "Variable" object for a LLDB thread object. /// /// This function will fill in the following keys in the returned diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp index 75b3948b5efb7..0d45b0379c852 100644 --- a/lldb/tools/lldb-dap/lldb-dap.cpp +++ b/lldb/tools/lldb-dap/lldb-dap.cpp @@ -1316,10 +1316,9 @@ void request_evaluate(const llvm::json::Object &request) { else EmplaceSafeString(response, "message", "evaluate failed"); } else { - SetValueForKey(value, body, "result"); - auto value_typename = value.GetType().GetDisplayTypeName(); - EmplaceSafeString(body, "type", - value_typename ? value_typename : NO_TYPENAME); + VariableDescription desc(value); + EmplaceSafeString(body, "result", desc.display_value); + EmplaceSafeString(body, "type", desc.display_type_name); if (value.MightHaveChildren()) { auto variableReference = g_dap.variables.InsertExpandableVariable( value, /*is_permanent=*/context == "repl"); @@ -3109,8 +3108,9 @@ void request_setVariable(const llvm::json::Object &request) { lldb::SBError error; bool success = variable.SetValueFromCString(value.data(), error); if (success) { - SetValueForKey(variable, body, "value"); - EmplaceSafeString(body, "type", variable.GetType().GetDisplayTypeName()); + VariableDescription desc(variable); + EmplaceSafeString(body, "result", desc.display_value); + EmplaceSafeString(body, "type", desc.display_type_name); // We don't know the index of the variable in our g_dap.variables // so always insert a new one to get its variablesReference. 
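For reference, a single entry in a `variables` response produced with these extensions could look roughly like the following. All concrete values (names, paths, numbers) are invented for illustration; only the key names follow the schema documented in the JSONUtils comments above:

  {
    "name": "pt",
    "value": "{x:1, y:2}",
    "type": "Point",
    "evaluateName": "pt",
    "variablesReference": 5,
    "$__lldb_extensions": {
      "autoSummary": "{x:1, y:2}",
      "declaration": { "path": "/tmp/main.cpp", "line": 4, "column": 9 }
    }
  }

Here the variable has neither a value string nor a summary of its own, so only the auto-generated summary and the declaration site are emitted, matching the conditional emplaces in GetVariableExtensionsJSON().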
From d34901f30b8b7ac8fe1437eeb588da6805b27cfb Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Tue, 2 Jan 2024 18:04:36 +0000 Subject: [PATCH 059/313] Revert "[clang][Darwin] Remove legacy framework search path logic in the frontend (#75841)" This reverts commit 61999b18c407b9f5c07577e63057d41c65240e61. See comments on https://github.com/llvm/llvm-project/pull/75841. This was intended to be NFC but actually isn't. --- clang/lib/Driver/ToolChains/Darwin.cpp | 35 +++++++------------ clang/lib/Lex/InitHeaderSearch.cpp | 18 ++++++++-- clang/test/Driver/driverkit-path.c | 1 + .../test/Preprocessor/cuda-macos-includes.cu | 13 +++++++ 4 files changed, 42 insertions(+), 25 deletions(-) create mode 100644 clang/test/Preprocessor/cuda-macos-includes.cu diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp index f76a42d2d8e7e..65846cace461e 100644 --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -758,14 +758,9 @@ void darwin::Linker::ConstructJob(Compilation &C, const JobAction &JA, } } - // Add framework include paths and library search paths. - // There are two flavors: - // 1. The "non-standard" paths, e.g. for DriverKit: - // -L/System/DriverKit/usr/lib - // -F/System/DriverKit/System/Library/Frameworks - // 2. The "standard" paths, e.g. for macOS and iOS: - // -F/System/Library/Frameworks - // -F/Library/Frameworks + // Add non-standard, platform-specific search paths, e.g., for DriverKit: + // -L/System/DriverKit/usr/lib + // -F/System/DriverKit/System/Library/Framework { bool NonStandardSearchPath = false; const auto &Triple = getToolChain().getTriple(); @@ -776,22 +771,18 @@ void darwin::Linker::ConstructJob(Compilation &C, const JobAction &JA, (Version.getMajor() == 605 && Version.getMinor().value_or(0) < 1); } - if (auto *Sysroot = Args.getLastArg(options::OPT_isysroot)) { - auto AddSearchPath = [&](StringRef Flag, StringRef SearchPath) { - SmallString<128> P(Sysroot->getValue()); - AppendPlatformPrefix(P, Triple); - llvm::sys::path::append(P, SearchPath); - if (getToolChain().getVFS().exists(P)) { - CmdArgs.push_back(Args.MakeArgString(Flag + P)); - } - }; - - if (NonStandardSearchPath) { + if (NonStandardSearchPath) { + if (auto *Sysroot = Args.getLastArg(options::OPT_isysroot)) { + auto AddSearchPath = [&](StringRef Flag, StringRef SearchPath) { + SmallString<128> P(Sysroot->getValue()); + AppendPlatformPrefix(P, Triple); + llvm::sys::path::append(P, SearchPath); + if (getToolChain().getVFS().exists(P)) { + CmdArgs.push_back(Args.MakeArgString(Flag + P)); + } + }; AddSearchPath("-L", "/usr/lib"); AddSearchPath("-F", "/System/Library/Frameworks"); - } else if (!Triple.isDriverKit()) { - AddSearchPath("-F", "/System/Library/Frameworks"); - AddSearchPath("-F", "/Library/Frameworks"); } } } diff --git a/clang/lib/Lex/InitHeaderSearch.cpp b/clang/lib/Lex/InitHeaderSearch.cpp index 1350fa5f01a57..2218db15013d9 100644 --- a/clang/lib/Lex/InitHeaderSearch.cpp +++ b/clang/lib/Lex/InitHeaderSearch.cpp @@ -324,9 +324,6 @@ bool InitHeaderSearch::ShouldAddDefaultIncludePaths( break; } - if (triple.isOSDarwin()) - return false; - return true; // Everything else uses AddDefaultIncludePaths(). } @@ -341,6 +338,21 @@ void InitHeaderSearch::AddDefaultIncludePaths( if (!ShouldAddDefaultIncludePaths(triple)) return; + // NOTE: some additional header search logic is handled in the driver for + // Darwin. 
+ if (triple.isOSDarwin()) { + if (HSOpts.UseStandardSystemIncludes) { + // Add the default framework include paths on Darwin. + if (triple.isDriverKit()) { + AddPath("/System/DriverKit/System/Library/Frameworks", System, true); + } else { + AddPath("/System/Library/Frameworks", System, true); + AddPath("/Library/Frameworks", System, true); + } + } + return; + } + if (Lang.CPlusPlus && !Lang.AsmPreprocessor && HSOpts.UseStandardCXXIncludes && HSOpts.UseStandardSystemIncludes) { if (HSOpts.UseLibcxx) { diff --git a/clang/test/Driver/driverkit-path.c b/clang/test/Driver/driverkit-path.c index 43e5aa40fc6f3..9699b9c01f4e8 100644 --- a/clang/test/Driver/driverkit-path.c +++ b/clang/test/Driver/driverkit-path.c @@ -31,3 +31,4 @@ int main() { return 0; } // INC: [[PATH]]/System/DriverKit/usr/local/include // INC: /lib{{(64)?}}/clang/{{[^/ ]+}}/include // INC: [[PATH]]/System/DriverKit/usr/include +// INC: [[PATH]]/System/DriverKit/System/Library/Frameworks (framework directory) diff --git a/clang/test/Preprocessor/cuda-macos-includes.cu b/clang/test/Preprocessor/cuda-macos-includes.cu new file mode 100644 index 0000000000000..6ef94b0e45352 --- /dev/null +++ b/clang/test/Preprocessor/cuda-macos-includes.cu @@ -0,0 +1,13 @@ +// RUN: %clang -cc1 -fcuda-is-device -isysroot /var/empty \ +// RUN: -triple nvptx-nvidia-cuda -aux-triple i386-apple-macosx \ +// RUN: -E -fcuda-is-device -v -o /dev/null -x cuda %s 2>&1 | FileCheck %s + +// RUN: %clang -cc1 -isysroot /var/empty \ +// RUN: -triple i386-apple-macosx -aux-triple nvptx-nvidia-cuda \ +// RUN: -E -fcuda-is-device -v -o /dev/null -x cuda %s 2>&1 | FileCheck %s + +// Check that when we do CUDA host and device compiles on MacOS, we check for +// includes in /System/Library/Frameworks and /Library/Frameworks. + +// CHECK-DAG: ignoring nonexistent directory "/var/empty/System/Library/Frameworks" +// CHECK-DAG: ignoring nonexistent directory "/var/empty/Library/Frameworks" From e512df3ecca4b2328ac1efd7de6c6efefde249d7 Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Tue, 2 Jan 2024 19:14:16 +0100 Subject: [PATCH 060/313] [LV] Fix crash when vectorizing function calls with linear args. (#76274) llvm/lib/IR/Type.cpp:694: Assertion `isValidElementType(ElementType) && "Element type of a VectorType must be an integer, floating point, or pointer type."' failed. Stack dump: llvm::FixedVectorType::get(llvm::Type*, unsigned int) llvm::VPWidenCallRecipe::execute(llvm::VPTransformState&) llvm::VPBasicBlock::execute(llvm::VPTransformState*) llvm::VPRegionBlock::execute(llvm::VPTransformState*) llvm::VPlan::execute(llvm::VPTransformState*) ... Happens with function calls of void return type. 
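For reference, a minimal reproducer sketch in C++/OpenMP (an assumption on our part; the original report is not included here, and the pattern simply mirrors the new `test_linear8_return_void` IR test added below). The key ingredients are a call to a `void` function with a linear pointer argument whose vector variants come from `declare simd`, so the widened call targets a vector function variant rather than an intrinsic:

```
// Hedged sketch; the function names are illustrative. Assumes something like
// -O2 -fopenmp-simd on an SVE/NEON target so vector-function-abi-variant
// attributes get attached to the call site.
#pragma omp declare simd linear(out) notinbranch
void goo(long num, long *out); // void return type: no element type to widen

void caller(long *in, long *out, long n) {
  #pragma omp simd
  for (long i = 0; i < n; ++i)
    goo(in[i], &out[i]); // widened to a vector-function call, not an intrinsic
}
```

The fix guards the `isVectorIntrinsicWithOverloadTypeAtArg` queries with `UseIntrinsic` (and asserts in VectorUtils.cpp that a real intrinsic ID is passed), so calls that are widened to vector function variants never ask for an intrinsic overload/return type.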
--- llvm/lib/Analysis/VectorUtils.cpp | 2 + .../Transforms/Vectorize/SLPVectorizer.cpp | 12 ++-- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 12 ++-- .../AArch64/vector-call-linear-args.ll | 70 ++++++++++++++----- 4 files changed, 67 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index f90fca9d937fc..5b57f0a25cec8 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -123,6 +123,8 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx) { + assert(ID != Intrinsic::not_intrinsic && "Not an intrinsic!"); + switch (ID) { case Intrinsic::fptosi_sat: case Intrinsic::fptoui_sat: diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index ced081d429164..3049915260646 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -11596,10 +11596,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { CallInst *CI = cast(VL0); setInsertPointAfterBundle(E); - Intrinsic::ID IID = Intrinsic::not_intrinsic; - if (Function *FI = CI->getCalledFunction()) - IID = FI->getIntrinsicID(); - Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI); @@ -11610,18 +11606,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { SmallVector OpVecs; SmallVector TysForDecl; // Add return type if intrinsic is overloaded on it. - if (isVectorIntrinsicWithOverloadTypeAtArg(IID, -1)) + if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1)) TysForDecl.push_back( FixedVectorType::get(CI->getType(), E->Scalars.size())); for (unsigned I : seq(0, CI->arg_size())) { ValueList OpVL; // Some intrinsics have scalar arguments. This argument should not be // vectorized. - if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(IID, I)) { + if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) { CallInst *CEI = cast(VL0); ScalarArg = CEI->getArgOperand(I); OpVecs.push_back(CEI->getArgOperand(I)); - if (isVectorIntrinsicWithOverloadTypeAtArg(IID, I)) + if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I)) TysForDecl.push_back(ScalarArg->getType()); continue; } @@ -11633,7 +11629,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { } LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n"); OpVecs.push_back(OpVec); - if (isVectorIntrinsicWithOverloadTypeAtArg(IID, I)) + if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I)) TysForDecl.push_back(OpVec->getType()); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 991491de20892..76961629aeceb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -498,16 +498,17 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { "DbgInfoIntrinsic should have been dropped during VPlan construction"); State.setDebugLocFrom(CI.getDebugLoc()); + bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic; FunctionType *VFTy = nullptr; if (Variant) VFTy = Variant->getFunctionType(); for (unsigned Part = 0; Part < State.UF; ++Part) { SmallVector TysForDecl; // Add return type if intrinsic is overloaded on it. 
- if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1)) { + if (UseIntrinsic && + isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1)) TysForDecl.push_back( VectorType::get(CI.getType()->getScalarType(), State.VF)); - } SmallVector Args; for (const auto &I : enumerate(operands())) { // Some intrinsics have a scalar argument - don't replace it with a @@ -516,18 +517,19 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { // e.g. linear parameters for pointers. Value *Arg; if ((VFTy && !VFTy->getParamType(I.index())->isVectorTy()) || - (VectorIntrinsicID != Intrinsic::not_intrinsic && + (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))) Arg = State.get(I.value(), VPIteration(0, 0)); else Arg = State.get(I.value(), Part); - if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index())) + if (UseIntrinsic && + isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index())) TysForDecl.push_back(Arg->getType()); Args.push_back(Arg); } Function *VectorF; - if (VectorIntrinsicID != Intrinsic::not_intrinsic) { + if (UseIntrinsic) { // Use vector version of the intrinsic. Module *M = State.Builder.GetInsertBlock()->getModule(); VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll index 876d58131bd7a..16506b3a57573 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call.*(foo|bar|baz|quux)" --version 2 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call.*(foo|bar|baz|quux|goo)" --version 2 ; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -S | FileCheck %s --check-prefixes=NEON ; RUN: opt < %s -mattr=+sve -passes=loop-vectorize -force-vector-interleave=1 -S | FileCheck %s --check-prefixes=SVE_OR_NEON ; RUN: opt < %s -mattr=+sve -passes=loop-vectorize -force-vector-interleave=1 -S -prefer-predicate-over-epilogue=predicate-dont-vectorize | FileCheck %s --check-prefixes=SVE_TF @@ -15,13 +15,13 @@ define void @test_linear8(ptr noalias %a, ptr readnone %b, i64 %n) { ; ; SVE_OR_NEON-LABEL: define void @test_linear8 ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { -; SVE_OR_NEON: [[TMP13:%.*]] = call @vec_foo_linear8_nomask_sve(ptr [[TMP12:%.*]]) +; SVE_OR_NEON: [[TMP15:%.*]] = call @vec_foo_linear8_nomask_sve(ptr [[TMP14:%.*]]) ; SVE_OR_NEON: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR2:[0-9]+]] ; ; SVE_TF-LABEL: define void @test_linear8 ; SVE_TF-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { -; SVE_TF: [[TMP19:%.*]] = call @vec_foo_linear8_mask_sve(ptr [[TMP18:%.*]], [[ACTIVE_LANE_MASK:%.*]]) -; SVE_TF: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR3:[0-9]+]] +; SVE_TF: [[TMP21:%.*]] = call @vec_foo_linear8_mask_sve(ptr [[TMP20:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE_TF: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]] ; entry: br label %for.body @@ -48,12 +48,12 @@ define void @test_vector_linear4(ptr noalias %a, ptr readnone %b, ptr readonly % ; ; SVE_OR_NEON-LABEL: define void @test_vector_linear4 ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], ptr 
readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_OR_NEON: [[TMP15:%.*]] = call @vec_baz_vector_linear4_nomask_sve( [[WIDE_LOAD:%.*]], ptr [[TMP14:%.*]]) +; SVE_OR_NEON: [[TMP17:%.*]] = call @vec_baz_vector_linear4_nomask_sve( [[WIDE_LOAD:%.*]], ptr [[TMP16:%.*]]) ; SVE_OR_NEON: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR3:[0-9]+]] ; ; SVE_TF-LABEL: define void @test_vector_linear4 ; SVE_TF-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_TF: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]] +; SVE_TF: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR5:[0-9]+]] ; entry: br label %for.body @@ -85,7 +85,7 @@ define void @test_linear8_bad_stride(ptr noalias %a, ptr readnone %b, i64 %n) { ; ; SVE_TF-LABEL: define void @test_linear8_bad_stride ; SVE_TF-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_TF: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR5:[0-9]+]] +; SVE_TF: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR6:[0-9]+]] ; entry: br label %for.body @@ -112,12 +112,12 @@ define void @test_linear16_wide_stride(ptr noalias %a, ptr readnone %b, i64 %n) ; ; SVE_OR_NEON-LABEL: define void @test_linear16_wide_stride ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_OR_NEON: [[TMP14:%.*]] = call @vec_foo_linear16_nomask_sve(ptr [[TMP13:%.*]]) +; SVE_OR_NEON: [[TMP16:%.*]] = call @vec_foo_linear16_nomask_sve(ptr [[TMP15:%.*]]) ; SVE_OR_NEON: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR4]] ; ; SVE_TF-LABEL: define void @test_linear16_wide_stride ; SVE_TF-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_TF: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR5]] +; SVE_TF: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR6]] ; entry: br label %for.body @@ -145,13 +145,13 @@ define void @test_linear4_linear8(ptr noalias %a, ptr readnone %b, ptr readonly ; ; SVE_OR_NEON-LABEL: define void @test_linear4_linear8 ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_OR_NEON: [[TMP15:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP13:%.*]], ptr [[TMP14:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE_OR_NEON: [[TMP17:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP15:%.*]], ptr [[TMP16:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; SVE_OR_NEON: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR5:[0-9]+]] ; ; SVE_TF-LABEL: define void @test_linear4_linear8 ; SVE_TF-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_TF: [[TMP21:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP19:%.*]], ptr [[TMP20:%.*]], [[ACTIVE_LANE_MASK:%.*]]) -; SVE_TF: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR6:[0-9]+]] +; SVE_TF: [[TMP23:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP21:%.*]], ptr [[TMP22:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE_TF: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR7:[0-9]+]] ; entry: br label %for.body @@ -179,12 +179,12 @@ define void @test_linear3_non_ptr(ptr noalias %a, i64 %n) { ; ; SVE_OR_NEON-LABEL: define void 
@test_linear3_non_ptr ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_OR_NEON: [[TMP13:%.*]] = call @vec_bar_linear3_nomask_sve(i32 [[TMP12:%.*]]) +; SVE_OR_NEON: [[TMP15:%.*]] = call @vec_bar_linear3_nomask_sve(i32 [[TMP14:%.*]]) ; SVE_OR_NEON: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR6:[0-9]+]] ; ; SVE_TF-LABEL: define void @test_linear3_non_ptr ; SVE_TF-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_TF: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR7:[0-9]+]] +; SVE_TF: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR8:[0-9]+]] ; entry: br label %for.body @@ -212,12 +212,12 @@ define void @test_linearn5_non_ptr_neg_stride(ptr noalias %a, i64 %n) { ; ; SVE_OR_NEON-LABEL: define void @test_linearn5_non_ptr_neg_stride ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_OR_NEON: [[TMP13:%.*]] = call @vec_bar_linearn5_nomask_sve(i32 [[TMP12:%.*]]) +; SVE_OR_NEON: [[TMP15:%.*]] = call @vec_bar_linearn5_nomask_sve(i32 [[TMP14:%.*]]) ; SVE_OR_NEON: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR7:[0-9]+]] ; ; SVE_TF-LABEL: define void @test_linearn5_non_ptr_neg_stride ; SVE_TF-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_TF: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR8:[0-9]+]] +; SVE_TF: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR9:[0-9]+]] ; entry: br label %for.body @@ -237,10 +237,44 @@ for.cond.cleanup: ret void } +define void @test_linear8_return_void(ptr noalias %in, ptr noalias %out, i64 %n) { +; NEON-LABEL: define void @test_linear8_return_void +; NEON-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) { +; NEON: call void @vec_goo_linear8_nomask_neon(<2 x i64> [[WIDE_LOAD:%.*]], ptr [[TMP4:%.*]]) +; NEON: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR6:[0-9]+]] +; +; SVE_OR_NEON-LABEL: define void @test_linear8_return_void +; SVE_OR_NEON-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; SVE_OR_NEON: call void @vec_goo_linear8_nomask_sve( [[WIDE_LOAD:%.*]], ptr [[TMP16:%.*]]) +; SVE_OR_NEON: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR8:[0-9]+]] +; +; SVE_TF-LABEL: define void @test_linear8_return_void +; SVE_TF-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; SVE_TF: call void @vec_goo_linear8_mask_sve( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP22:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE_TF: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR10:[0-9]+]] +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %gep.in = getelementptr i64, ptr %in, i64 %indvars.iv + %num = load i64, ptr %gep.in, align 8 + %gep.out = getelementptr i64, ptr %out, i64 %indvars.iv + call void @goo(i64 %num, ptr %gep.out) #6 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void +} + declare i64 @foo(ptr) declare i32 @baz(i32, ptr) declare i32 @quux(ptr, ptr) declare i32 @bar(i32) +declare void @goo(i64, ptr) ; neon vector variants of foo declare <2 x i64> @vec_foo_linear8_nomask_neon(ptr) @@ -249,6 +283,7 @@ declare <4 x i32> @vec_baz_vector_linear4_nomask_neon(<4 x i32>, ptr) declare <4 x i32> @vec_quux_linear4_linear8_nomask_neon(ptr, ptr) declare <4 x i32> @vec_bar_linear3_nomask_neon(i32) declare <4 x i32> 
@vec_bar_linearn5_nomask_neon(i32) +declare void @vec_goo_linear8_nomask_neon(<2 x i64>, ptr) ; scalable vector variants of foo declare @vec_foo_linear8_mask_sve(ptr, ) @@ -258,6 +293,8 @@ declare @vec_baz_vector_linear4_nomask_sve( declare @vec_quux_linear4_linear8_mask_sve(ptr, ptr, ) declare @vec_bar_linear3_nomask_sve(i32) declare @vec_bar_linearn5_nomask_sve(i32) +declare void @vec_goo_linear8_nomask_sve(, ptr) +declare void @vec_goo_linear8_mask_sve(, ptr, ) attributes #0 = { nounwind "vector-function-abi-variant"="_ZGVsNxl8_foo(vec_foo_linear8_nomask_sve),_ZGVsMxl8_foo(vec_foo_linear8_mask_sve),_ZGVnN2l8_foo(vec_foo_linear8_nomask_neon)" } attributes #1 = { nounwind "vector-function-abi-variant"="_ZGVsNxvl4_baz(vec_baz_vector_linear4_nomask_sve),_ZGVnN4vl4_baz(vec_baz_vector_linear4_nomask_neon)" } @@ -265,3 +302,4 @@ attributes #2 = { nounwind "vector-function-abi-variant"="_ZGVsNxl16_foo(vec_foo attributes #3 = { nounwind "vector-function-abi-variant"="_ZGVsMxl4l8_quux(vec_quux_linear4_linear8_mask_sve),_ZGVnN4l4l8_quux(vec_quux_linear4_linear8_nomask_neon)" } attributes #4 = { nounwind "vector-function-abi-variant"="_ZGVsNxl3_bar(vec_bar_linear3_nomask_sve),_ZGVnN4l3_bar(vec_bar_linear3_nomask_neon)" } attributes #5 = { nounwind "vector-function-abi-variant"="_ZGVsNxln5_bar(vec_bar_linearn5_nomask_sve),_ZGVnN4ln5_bar(vec_bar_linearn5_nomask_neon)" } +attributes #6 = { nounwind "vector-function-abi-variant"="_ZGVsNxvl8_goo(vec_goo_linear8_nomask_sve),_ZGVsMxvl8_goo(vec_goo_linear8_mask_sve),_ZGVsN2vl8_goo(vec_goo_linear8_nomask_neon)" } From eba2b789d3b91ae1cefebe112fec6c667b86a904 Mon Sep 17 00:00:00 2001 From: Mingming Liu Date: Tue, 2 Jan 2024 10:23:29 -0800 Subject: [PATCH 061/313] [RawProfReader]When constructing symbol table, read the MD5 of function name in the proper byte order (#76312) Before this patch, when the field `NameRef` is generated in little-endian systems and read back in big-endian systems, the information gets dropped. - The bug gets caught by a buildbot https://lab.llvm.org/buildbot/#/builders/94/builds/17931. In the error message (pasted below), two indirect call targets are not imported. ``` ; IMPORTS-DAG: Import _Z7callee1v ^ :1:1: note: scanning from here main.ll: Import _Z11global_funcv from lib.cc ^ :1:10: note: possible intended match here main.ll: Import _Z11global_funcv from lib.cc ^ Input file: Check file: /home/uweigand/sandbox/buildbot/clang-s390x-linux/llvm/llvm/test/Transforms/PGOProfile/thinlto_indirect_call_promotion.ll -dump-input=help explains the following input dump. Input was: <<<<<< 1: main.ll: Import _Z11global_funcv from lib.cc dag:34'0 X~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ error: no match found dag:34'1 ? possible intended match ``` [This commit](https://github.com/llvm/llvm-project/commit/b3999246b1a93bd8e6dc7acac36a7d57fac96542#diff-b196b796c5a396c7cdf93b347fe47e2b29b72d0b7dd0e2b88abb964d376ee50e) gates the fix by flag and provide test data by creating big-endian profiles (rather than reading the little-endian data on a big-endian system that might require a VM). - [This](https://github.com/llvm/llvm-project/commit/b3999246b1a93bd8e6dc7acac36a7d57fac96542#diff-643176077ddbe537bd0a05d2a8a53bdff6339420a30e8511710bf232afdda8b9) is a hexdump of little-endian profile data, and [this](https://github.com/llvm/llvm-project/commit/b3999246b1a93bd8e6dc7acac36a7d57fac96542#diff-1736a3ee25dde02bba55d670df78988fdb227e5a85b94b8707cf182cf70b28f0) is the big-endian version of it. 
- The [README.md](https://github.com/llvm/llvm-project/commit/b3999246b1a93bd8e6dc7acac36a7d57fac96542#diff-6717b6a385de3ae60ab3aec9638af2a43b55adaf6784b6f0393ebe1a6639438b) shows the result of `llvm-profdata show -ic-targets` before and after the fix when the profile is in big-endian. --- llvm/lib/ProfileData/InstrProfReader.cpp | 2 +- .../Transforms/PGOProfile/thinlto_indirect_call_promotion.ll | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index 8f62df79d5b7e..b547cf7181b16 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -539,7 +539,7 @@ Error RawInstrProfReader::createSymtab(InstrProfSymtab &Symtab) { const IntPtrT FPtr = swap(I->FunctionPointer); if (!FPtr) continue; - Symtab.mapAddress(FPtr, I->NameRef); + Symtab.mapAddress(FPtr, swap(I->NameRef)); } return success(); } diff --git a/llvm/test/Transforms/PGOProfile/thinlto_indirect_call_promotion.ll b/llvm/test/Transforms/PGOProfile/thinlto_indirect_call_promotion.ll index b24effed7024c..d2f4696ccf41d 100644 --- a/llvm/test/Transforms/PGOProfile/thinlto_indirect_call_promotion.ll +++ b/llvm/test/Transforms/PGOProfile/thinlto_indirect_call_promotion.ll @@ -9,9 +9,6 @@ ; The raw profiles storesd compressed function names, so profile reader should ; be built with zlib support to decompress them. ; REQUIRES: zlib -; REQUIRES: host-byteorder-little-endian -; Raw profiles are generate on 64-bit systems. -; REQUIRES: llvm-64-bits ; RUN: rm -rf %t && split-file %s %t && cd %t From 0b3a89f121eaf23b93b9b9b2a9410ae1f6e2fc44 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Tue, 2 Jan 2024 19:23:56 +0100 Subject: [PATCH 062/313] [builtins] Avoid using long double in FreeBSD standalone environment (#76175) After 05a4212cc76d a number of long double related declarations are enabled in `int_types.h`, whenever the CPU architecture and platform support it. However, this does not work with FreeBSD's standalone environment, which disallows any use of floating point. In add98b246290 this was made conditional with `CRT_HAS_FLOATING_POINT`, so extend the block guarded by that define to include all floating point related declarations. --- compiler-rt/lib/builtins/int_types.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/compiler-rt/lib/builtins/int_types.h b/compiler-rt/lib/builtins/int_types.h index 18bf0a7f3bf99..7624c72806151 100644 --- a/compiler-rt/lib/builtins/int_types.h +++ b/compiler-rt/lib/builtins/int_types.h @@ -139,7 +139,6 @@ typedef union { udwords u; double f; } double_bits; -#endif typedef struct { #if _YUGA_LITTLE_ENDIAN @@ -220,7 +219,6 @@ typedef union { #define CRT_HAS_TF_MODE #endif -#if CRT_HAS_FLOATING_POINT #if __STDC_VERSION__ >= 199901L typedef float _Complex Fcomplex; typedef double _Complex Dcomplex; @@ -270,5 +268,5 @@ typedef struct { #define COMPLEXTF_IMAGINARY(x) (x).imaginary #endif -#endif +#endif // CRT_HAS_FLOATING_POINT #endif // INT_TYPES_H From 4b7707bfd9b24e972212c6e40d91c1b21f6f65b6 Mon Sep 17 00:00:00 2001 From: Piotr Zegar Date: Tue, 2 Jan 2024 18:23:08 +0000 Subject: [PATCH 063/313] [clang-tidy][DOC] Generate documentation for new Clang-Analyzer checks Some checks were recently moved from alpha stage. Manualy running gen-static-analyzer-docs.py to generate missing documentation for Clang-tidy. 
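For context, each newly documented alias simply redirects to the corresponding Clang Static Analyzer checker. A small illustration (our own example, not taken from the generated docs) of the kind of code `clang-analyzer-optin.core.EnumCastOutOfRange` is meant to catch, plus a typical invocation:

```
// Hypothetical example; the enum and the value 7 are ours.
// Run with, e.g.:
//   clang-tidy -checks=-*,clang-analyzer-optin.core.EnumCastOutOfRange test.cpp --
enum class Severity { Low = 1, Medium = 2, High = 3 };

Severity fromConfig() {
  int raw = 7;                       // no Severity enumerator has this value
  return static_cast<Severity>(raw); // expected to be flagged as out of range
}
```

The other new aliases (security.cert.env.InvalidPtr, unix.Errno, unix.StdCLibraryFunctions) follow the same redirect pattern to their static analyzer documentation.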
--- .../optin.core.EnumCastOutOfRange.rst | 13 +++++++++++++ .../security.cert.env.InvalidPtr.rst | 13 +++++++++++++ .../checks/clang-analyzer/unix.Errno.rst | 13 +++++++++++++ .../clang-analyzer/unix.StdCLibraryFunctions.rst | 14 ++++++++++++++ clang-tools-extra/docs/clang-tidy/checks/list.rst | 4 ++++ 5 files changed, 57 insertions(+) create mode 100644 clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/optin.core.EnumCastOutOfRange.rst create mode 100644 clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/security.cert.env.InvalidPtr.rst create mode 100644 clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/unix.Errno.rst create mode 100644 clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/unix.StdCLibraryFunctions.rst diff --git a/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/optin.core.EnumCastOutOfRange.rst b/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/optin.core.EnumCastOutOfRange.rst new file mode 100644 index 0000000000000..99dd1adf38f6b --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/optin.core.EnumCastOutOfRange.rst @@ -0,0 +1,13 @@ +.. title:: clang-tidy - clang-analyzer-optin.core.EnumCastOutOfRange +.. meta:: + :http-equiv=refresh: 5;URL=https://clang.llvm.org/docs/analyzer/checkers.html#optin-core-enumcastoutofrange + +clang-analyzer-optin.core.EnumCastOutOfRange +============================================ + +Check integer to enumeration casts for out of range values. + +The `clang-analyzer-optin.core.EnumCastOutOfRange` check is an alias, please see +`Clang Static Analyzer Available Checkers +`_ +for more information. diff --git a/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/security.cert.env.InvalidPtr.rst b/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/security.cert.env.InvalidPtr.rst new file mode 100644 index 0000000000000..8986fa0e684f5 --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/security.cert.env.InvalidPtr.rst @@ -0,0 +1,13 @@ +.. title:: clang-tidy - clang-analyzer-security.cert.env.InvalidPtr +.. meta:: + :http-equiv=refresh: 5;URL=https://clang.llvm.org/docs/analyzer/checkers.html#security-cert-env-invalidptr + +clang-analyzer-security.cert.env.InvalidPtr +=========================================== + +Finds usages of possibly invalidated pointers. + +The `clang-analyzer-security.cert.env.InvalidPtr` check is an alias, please see +`Clang Static Analyzer Available Checkers +`_ +for more information. diff --git a/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/unix.Errno.rst b/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/unix.Errno.rst new file mode 100644 index 0000000000000..67a2d05811dc1 --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/unix.Errno.rst @@ -0,0 +1,13 @@ +.. title:: clang-tidy - clang-analyzer-unix.Errno +.. meta:: + :http-equiv=refresh: 5;URL=https://clang.llvm.org/docs/analyzer/checkers.html#unix-errno + +clang-analyzer-unix.Errno +========================= + +Check for improper use of 'errno'. + +The `clang-analyzer-unix.Errno` check is an alias, please see +`Clang Static Analyzer Available Checkers +`_ +for more information. 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/unix.StdCLibraryFunctions.rst b/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/unix.StdCLibraryFunctions.rst new file mode 100644 index 0000000000000..17906732c86f7 --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/unix.StdCLibraryFunctions.rst @@ -0,0 +1,14 @@ +.. title:: clang-tidy - clang-analyzer-unix.StdCLibraryFunctions +.. meta:: + :http-equiv=refresh: 5;URL=https://clang.llvm.org/docs/analyzer/checkers.html#unix-stdclibraryfunctions + +clang-analyzer-unix.StdCLibraryFunctions +======================================== + +Check for invalid arguments of C standard library functions, and apply relations +between arguments and return value. + +The `clang-analyzer-unix.StdCLibraryFunctions` check is an alias, please see +`Clang Static Analyzer Available Checkers +`_ +for more information. diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst index 39d8b490d927c..b36bf7d497b9d 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/list.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst @@ -440,6 +440,7 @@ Clang-Tidy Checks :doc:`clang-analyzer-nullability.NullableDereferenced `, `Clang Static Analyzer nullability.NullableDereferenced `_, :doc:`clang-analyzer-nullability.NullablePassedToNonnull `, `Clang Static Analyzer nullability.NullablePassedToNonnull `_, :doc:`clang-analyzer-nullability.NullableReturnedFromNonnull `, `Clang Static Analyzer nullability.NullableReturnedFromNonnull `_, + :doc:`clang-analyzer-optin.core.EnumCastOutOfRange `, `Clang Static Analyzer optin.core.EnumCastOutOfRange `_, :doc:`clang-analyzer-optin.cplusplus.UninitializedObject `, `Clang Static Analyzer optin.cplusplus.UninitializedObject `_, :doc:`clang-analyzer-optin.cplusplus.VirtualCall `, `Clang Static Analyzer optin.cplusplus.VirtualCall `_, :doc:`clang-analyzer-optin.mpi.MPI-Checker `, `Clang Static Analyzer optin.mpi.MPI-Checker `_, @@ -479,6 +480,7 @@ Clang-Tidy Checks :doc:`clang-analyzer-osx.coreFoundation.containers.OutOfBounds `, `Clang Static Analyzer osx.coreFoundation.containers.OutOfBounds `_, :doc:`clang-analyzer-osx.coreFoundation.containers.PointerSizedValues `, `Clang Static Analyzer osx.coreFoundation.containers.PointerSizedValues `_, :doc:`clang-analyzer-security.FloatLoopCounter `, `Clang Static Analyzer security.FloatLoopCounter `_, + :doc:`clang-analyzer-security.cert.env.InvalidPtr `, `Clang Static Analyzer security.cert.env.InvalidPtr `_, :doc:`clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling `, `Clang Static Analyzer security.insecureAPI.DeprecatedOrUnsafeBufferHandling `_, :doc:`clang-analyzer-security.insecureAPI.UncheckedReturn `, `Clang Static Analyzer security.insecureAPI.UncheckedReturn `_, :doc:`clang-analyzer-security.insecureAPI.bcmp `, `Clang Static Analyzer security.insecureAPI.bcmp `_, @@ -493,9 +495,11 @@ Clang-Tidy Checks :doc:`clang-analyzer-security.insecureAPI.strcpy `, `Clang Static Analyzer security.insecureAPI.strcpy `_, :doc:`clang-analyzer-security.insecureAPI.vfork `, `Clang Static Analyzer security.insecureAPI.vfork `_, :doc:`clang-analyzer-unix.API `, `Clang Static Analyzer unix.API `_, + :doc:`clang-analyzer-unix.Errno `, `Clang Static Analyzer unix.Errno `_, :doc:`clang-analyzer-unix.Malloc `, `Clang Static Analyzer unix.Malloc `_, :doc:`clang-analyzer-unix.MallocSizeof `, `Clang Static Analyzer unix.MallocSizeof `_, :doc:`clang-analyzer-unix.MismatchedDeallocator `, 
`Clang Static Analyzer unix.MismatchedDeallocator `_, + :doc:`clang-analyzer-unix.StdCLibraryFunctions `, `Clang Static Analyzer unix.StdCLibraryFunctions `_, :doc:`clang-analyzer-unix.Vfork `, `Clang Static Analyzer unix.Vfork `_, :doc:`clang-analyzer-unix.cstring.BadSizeArg `, `Clang Static Analyzer unix.cstring.BadSizeArg `_, :doc:`clang-analyzer-unix.cstring.NullArg `, `Clang Static Analyzer unix.cstring.NullArg `_, From 0d19a8983c05de321d8ab592995e7a36bca448ee Mon Sep 17 00:00:00 2001 From: walter erquinigo Date: Tue, 2 Jan 2024 13:30:05 -0500 Subject: [PATCH 064/313] Fix builtbot https://lab.llvm.org/buildbot/#/builders/96/builds/50702/steps/6/logs/stdio requires checking for multiple error messages --- .../test/API/tools/lldb-dap/optimized/TestDAP_optimized.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lldb/test/API/tools/lldb-dap/optimized/TestDAP_optimized.py b/lldb/test/API/tools/lldb-dap/optimized/TestDAP_optimized.py index 418a6225febd0..90b130d3af4d5 100644 --- a/lldb/test/API/tools/lldb-dap/optimized/TestDAP_optimized.py +++ b/lldb/test/API/tools/lldb-dap/optimized/TestDAP_optimized.py @@ -47,7 +47,8 @@ def test_optimized_variable(self): optimized_variable = self.dap_server.get_local_variable("argc") self.assertTrue(optimized_variable["value"].startswith(" Date: Tue, 2 Jan 2024 10:40:47 -0800 Subject: [PATCH 065/313] [flang] Add notify-type and notify-wait-stmt (#76594) Add `notify-type` to `iso_fortran_env` module. Add `notify-wait-stmt` to the parser and add checks for constraints on the statement, `C1177` and `C1178`, from the Fortran 2023 standard. Add three semantics tests for `notify-wait-stmt`. --- flang/examples/FeatureList/FeatureList.cpp | 3 +- flang/include/flang/Evaluate/tools.h | 1 + flang/include/flang/Lower/PFTBuilder.h | 15 +-- flang/include/flang/Lower/Runtime.h | 3 + flang/include/flang/Parser/dump-parse-tree.h | 3 +- flang/include/flang/Parser/parse-tree.h | 30 +++-- flang/lib/Evaluate/tools.cpp | 4 + flang/lib/Lower/Bridge.cpp | 4 + flang/lib/Lower/Runtime.cpp | 6 + flang/lib/Parser/executable-parsers.cpp | 21 ++-- flang/lib/Parser/unparse.cpp | 9 +- flang/lib/Semantics/check-coarray.cpp | 77 ++++++++---- flang/lib/Semantics/check-coarray.h | 2 + flang/module/__fortran_builtins.f90 | 4 + flang/module/iso_fortran_env.f90 | 1 + flang/test/Semantics/notifywait01.f90 | 26 ++++ flang/test/Semantics/notifywait02.f90 | 74 +++++++++++ flang/test/Semantics/notifywait03.f90 | 123 +++++++++++++++++++ 18 files changed, 352 insertions(+), 54 deletions(-) create mode 100644 flang/test/Semantics/notifywait01.f90 create mode 100644 flang/test/Semantics/notifywait02.f90 create mode 100644 flang/test/Semantics/notifywait03.f90 diff --git a/flang/examples/FeatureList/FeatureList.cpp b/flang/examples/FeatureList/FeatureList.cpp index 6f10553cdcb4c..2338fa1b14a37 100644 --- a/flang/examples/FeatureList/FeatureList.cpp +++ b/flang/examples/FeatureList/FeatureList.cpp @@ -281,7 +281,7 @@ struct NodeVisitor { READ_FEATURE(ErrorRecovery) READ_FEATURE(EventPostStmt) READ_FEATURE(EventWaitStmt) - READ_FEATURE(EventWaitStmt::EventWaitSpec) + READ_FEATURE(EventWaitSpec) READ_FEATURE(ExecutableConstruct) READ_FEATURE(ExecutionPart) READ_FEATURE(ExecutionPartConstruct) @@ -438,6 +438,7 @@ struct NodeVisitor { READ_FEATURE(NamelistStmt::Group) READ_FEATURE(NonLabelDoStmt) READ_FEATURE(NoPass) + READ_FEATURE(NotifyWaitStmt) READ_FEATURE(NullifyStmt) READ_FEATURE(NullInit) READ_FEATURE(ObjectDecl) diff --git a/flang/include/flang/Evaluate/tools.h 
b/flang/include/flang/Evaluate/tools.h index 51414d61785f0..c0cbb05c009d6 100644 --- a/flang/include/flang/Evaluate/tools.h +++ b/flang/include/flang/Evaluate/tools.h @@ -1232,6 +1232,7 @@ bool IsBuiltinDerivedType(const DerivedTypeSpec *derived, const char *name); bool IsBuiltinCPtr(const Symbol &); bool IsEventType(const DerivedTypeSpec *); bool IsLockType(const DerivedTypeSpec *); +bool IsNotifyType(const DerivedTypeSpec *); // Is this derived type TEAM_TYPE from module ISO_FORTRAN_ENV? bool IsTeamType(const DerivedTypeSpec *); // Is this derived type TEAM_TYPE, C_PTR, or C_FUNPTR? diff --git a/flang/include/flang/Lower/PFTBuilder.h b/flang/include/flang/Lower/PFTBuilder.h index 9c6696ff79dae..8d32c32352916 100644 --- a/flang/include/flang/Lower/PFTBuilder.h +++ b/flang/include/flang/Lower/PFTBuilder.h @@ -100,13 +100,14 @@ using ActionStmts = std::tuple< parser::EventPostStmt, parser::EventWaitStmt, parser::ExitStmt, parser::FailImageStmt, parser::FlushStmt, parser::FormTeamStmt, parser::GotoStmt, parser::IfStmt, parser::InquireStmt, parser::LockStmt, - parser::NullifyStmt, parser::OpenStmt, parser::PointerAssignmentStmt, - parser::PrintStmt, parser::ReadStmt, parser::ReturnStmt, parser::RewindStmt, - parser::StopStmt, parser::SyncAllStmt, parser::SyncImagesStmt, - parser::SyncMemoryStmt, parser::SyncTeamStmt, parser::UnlockStmt, - parser::WaitStmt, parser::WhereStmt, parser::WriteStmt, - parser::ComputedGotoStmt, parser::ForallStmt, parser::ArithmeticIfStmt, - parser::AssignStmt, parser::AssignedGotoStmt, parser::PauseStmt>; + parser::NotifyWaitStmt, parser::NullifyStmt, parser::OpenStmt, + parser::PointerAssignmentStmt, parser::PrintStmt, parser::ReadStmt, + parser::ReturnStmt, parser::RewindStmt, parser::StopStmt, + parser::SyncAllStmt, parser::SyncImagesStmt, parser::SyncMemoryStmt, + parser::SyncTeamStmt, parser::UnlockStmt, parser::WaitStmt, + parser::WhereStmt, parser::WriteStmt, parser::ComputedGotoStmt, + parser::ForallStmt, parser::ArithmeticIfStmt, parser::AssignStmt, + parser::AssignedGotoStmt, parser::PauseStmt>; using OtherStmts = std::tuple; diff --git a/flang/include/flang/Lower/Runtime.h b/flang/include/flang/Lower/Runtime.h index e71496edad9ba..77e98a1e019e7 100644 --- a/flang/include/flang/Lower/Runtime.h +++ b/flang/include/flang/Lower/Runtime.h @@ -34,6 +34,7 @@ namespace parser { struct EventPostStmt; struct EventWaitStmt; struct LockStmt; +struct NotifyWaitStmt; struct PauseStmt; struct StopStmt; struct SyncAllStmt; @@ -49,6 +50,8 @@ class AbstractConverter; // Lowering of Fortran statement related runtime (other than IO and maths) +void genNotifyWaitStatement(AbstractConverter &, + const parser::NotifyWaitStmt &); void genEventPostStatement(AbstractConverter &, const parser::EventPostStmt &); void genEventWaitStatement(AbstractConverter &, const parser::EventWaitStmt &); void genLockStatement(AbstractConverter &, const parser::LockStmt &); diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h index 7c479a2334ea5..1defbf132327c 100644 --- a/flang/include/flang/Parser/dump-parse-tree.h +++ b/flang/include/flang/Parser/dump-parse-tree.h @@ -301,8 +301,8 @@ class ParseTreeDumper { NODE(parser, ErrLabel) NODE(parser, ErrorRecovery) NODE(parser, EventPostStmt) + NODE(parser, EventWaitSpec) NODE(parser, EventWaitStmt) - NODE(EventWaitStmt, EventWaitSpec) NODE(parser, ExecutableConstruct) NODE(parser, ExecutionPart) NODE(parser, ExecutionPartConstruct) @@ -462,6 +462,7 @@ class ParseTreeDumper { NODE(NamelistStmt, 
Group) NODE(parser, NonLabelDoStmt) NODE(parser, NoPass) + NODE(parser, NotifyWaitStmt) NODE(parser, NullifyStmt) NODE(parser, NullInit) NODE(parser, ObjectDecl) diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index 393e0e24ec5cb..71195f2bb9ddc 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -209,11 +209,13 @@ struct ExitStmt; // R1156 struct GotoStmt; // R1157 struct ComputedGotoStmt; // R1158 struct StopStmt; // R1160, R1161 +struct NotifyWaitStmt; // F2023: R1166 struct SyncAllStmt; // R1164 struct SyncImagesStmt; // R1166 struct SyncMemoryStmt; // R1168 struct SyncTeamStmt; // R1169 struct EventPostStmt; // R1170, R1171 +struct EventWaitSpec; // F2023: R1177 struct EventWaitStmt; // R1172, R1173, R1174 struct FormTeamStmt; // R1175, R1176, R1177 struct LockStmt; // R1178 @@ -477,9 +479,9 @@ EMPTY_CLASS(FailImageStmt); // close-stmt | continue-stmt | cycle-stmt | deallocate-stmt | // endfile-stmt | error-stop-stmt | event-post-stmt | event-wait-stmt | // exit-stmt | fail-image-stmt | flush-stmt | form-team-stmt | -// goto-stmt | if-stmt | inquire-stmt | lock-stmt | nullify-stmt | -// open-stmt | pointer-assignment-stmt | print-stmt | read-stmt | -// return-stmt | rewind-stmt | stop-stmt | sync-all-stmt | +// goto-stmt | if-stmt | inquire-stmt | lock-stmt | notify-wait-stmt | +// nullify-stmt | open-stmt | pointer-assignment-stmt | print-stmt | +// read-stmt | return-stmt | rewind-stmt | stop-stmt | sync-all-stmt | // sync-images-stmt | sync-memory-stmt | sync-team-stmt | unlock-stmt | // wait-stmt | where-stmt | write-stmt | computed-goto-stmt | forall-stmt struct ActionStmt { @@ -494,8 +496,8 @@ struct ActionStmt { common::Indirection, common::Indirection, common::Indirection, common::Indirection, common::Indirection, common::Indirection, - common::Indirection, common::Indirection, - common::Indirection, + common::Indirection, common::Indirection, + common::Indirection, common::Indirection, common::Indirection, common::Indirection, common::Indirection, common::Indirection, common::Indirection, common::Indirection, @@ -2492,6 +2494,13 @@ struct StopStmt { std::tuple, std::optional> t; }; +// F2023: R1166 notify-wait-stmt -> NOTIFY WAIT ( notify-variable [, +// event-wait-spec-list] ) +struct NotifyWaitStmt { + TUPLE_CLASS_BOILERPLATE(NotifyWaitStmt); + std::tuple, std::list> t; +}; + // R1164 sync-all-stmt -> SYNC ALL [( [sync-stat-list] )] WRAPPER_CLASS(SyncAllStmt, std::list); @@ -2524,15 +2533,16 @@ struct EventPostStmt { std::tuple> t; }; +// R1173 event-wait-spec -> until-spec | sync-stat +struct EventWaitSpec { + UNION_CLASS_BOILERPLATE(EventWaitSpec); + std::variant u; +}; + // R1172 event-wait-stmt -> // EVENT WAIT ( event-variable [, event-wait-spec-list] ) -// R1173 event-wait-spec -> until-spec | sync-stat // R1174 until-spec -> UNTIL_COUNT = scalar-int-expr struct EventWaitStmt { - struct EventWaitSpec { - UNION_CLASS_BOILERPLATE(EventWaitSpec); - std::variant u; - }; TUPLE_CLASS_BOILERPLATE(EventWaitStmt); std::tuple> t; }; diff --git a/flang/lib/Evaluate/tools.cpp b/flang/lib/Evaluate/tools.cpp index 44a6fa4333cf3..7834364bccc40 100644 --- a/flang/lib/Evaluate/tools.cpp +++ b/flang/lib/Evaluate/tools.cpp @@ -1765,6 +1765,10 @@ bool IsLockType(const DerivedTypeSpec *derived) { return IsBuiltinDerivedType(derived, "lock_type"); } +bool IsNotifyType(const DerivedTypeSpec *derived) { + return IsBuiltinDerivedType(derived, "notify_type"); +} + bool IsTeamType(const 
DerivedTypeSpec *derived) { return IsBuiltinDerivedType(derived, "team_type"); } diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index e1d406e3cf319..2bceee09b4f0f 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -3092,6 +3092,10 @@ class FirConverter : public Fortran::lower::AbstractConverter { //===--------------------------------------------------------------------===// + void genFIR(const Fortran::parser::NotifyWaitStmt &stmt) { + genNotifyWaitStatement(*this, stmt); + } + void genFIR(const Fortran::parser::EventPostStmt &stmt) { genEventPostStatement(*this, stmt); } diff --git a/flang/lib/Lower/Runtime.cpp b/flang/lib/Lower/Runtime.cpp index 8855cab8b5174..e7695929623f6 100644 --- a/flang/lib/Lower/Runtime.cpp +++ b/flang/lib/Lower/Runtime.cpp @@ -137,6 +137,12 @@ void Fortran::lower::genFailImageStatement( genUnreachable(builder, loc); } +void Fortran::lower::genNotifyWaitStatement( + Fortran::lower::AbstractConverter &converter, + const Fortran::parser::NotifyWaitStmt &) { + TODO(converter.getCurrentLocation(), "coarray: NOTIFY WAIT runtime"); +} + void Fortran::lower::genEventPostStatement( Fortran::lower::AbstractConverter &converter, const Fortran::parser::EventPostStmt &) { diff --git a/flang/lib/Parser/executable-parsers.cpp b/flang/lib/Parser/executable-parsers.cpp index 892c612d0c4dc..de2be017508c3 100644 --- a/flang/lib/Parser/executable-parsers.cpp +++ b/flang/lib/Parser/executable-parsers.cpp @@ -92,9 +92,9 @@ TYPE_CONTEXT_PARSER("execution part"_en_US, // close-stmt | continue-stmt | cycle-stmt | deallocate-stmt | // endfile-stmt | error-stop-stmt | event-post-stmt | event-wait-stmt | // exit-stmt | fail-image-stmt | flush-stmt | form-team-stmt | -// goto-stmt | if-stmt | inquire-stmt | lock-stmt | nullify-stmt | -// open-stmt | pointer-assignment-stmt | print-stmt | read-stmt | -// return-stmt | rewind-stmt | stop-stmt | sync-all-stmt | +// goto-stmt | if-stmt | inquire-stmt | lock-stmt | notify-wait-stmt | +// nullify-stmt | open-stmt | pointer-assignment-stmt | print-stmt | +// read-stmt | return-stmt | rewind-stmt | stop-stmt | sync-all-stmt | // sync-images-stmt | sync-memory-stmt | sync-team-stmt | unlock-stmt | // wait-stmt | where-stmt | write-stmt | computed-goto-stmt | forall-stmt // R1159 continue-stmt -> CONTINUE @@ -119,6 +119,7 @@ TYPE_PARSER(first(construct(indirect(Parser{})), construct(indirect(Parser{})), construct(indirect(Parser{})), construct(indirect(Parser{})), + construct(indirect(Parser{})), construct(indirect(Parser{})), construct(indirect(Parser{})), construct(indirect(Parser{})), @@ -453,6 +454,13 @@ TYPE_CONTEXT_PARSER("STOP statement"_en_US, // parse time. 
TYPE_PARSER(construct(scalar(expr))) +// F2030: R1166 notify-wait-stmt -> +// NOTIFY WAIT ( notify-variable [, event-wait-spec-list] ) +TYPE_CONTEXT_PARSER("NOTIFY WAIT statement"_en_US, + construct( + "NOTIFY WAIT"_sptok >> "("_tok >> scalar(variable), + defaulted("," >> nonemptyList(Parser{})) / ")")) + // R1164 sync-all-stmt -> SYNC ALL [( [sync-stat-list] )] TYPE_CONTEXT_PARSER("SYNC ALL statement"_en_US, construct("SYNC ALL"_sptok >> @@ -486,15 +494,14 @@ TYPE_CONTEXT_PARSER("EVENT POST statement"_en_US, // EVENT WAIT ( event-variable [, event-wait-spec-list] ) TYPE_CONTEXT_PARSER("EVENT WAIT statement"_en_US, construct("EVENT WAIT"_sptok >> "("_tok >> scalar(variable), - defaulted("," >> nonemptyList(Parser{})) / - ")")) + defaulted("," >> nonemptyList(Parser{})) / ")")) // R1174 until-spec -> UNTIL_COUNT = scalar-int-expr constexpr auto untilSpec{"UNTIL_COUNT =" >> scalarIntExpr}; // R1173 event-wait-spec -> until-spec | sync-stat -TYPE_PARSER(construct(untilSpec) || - construct(statOrErrmsg)) +TYPE_PARSER(construct(untilSpec) || + construct(statOrErrmsg)) // R1177 team-variable -> scalar-variable constexpr auto teamVariable{scalar(variable)}; diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index 6d9d176216325..1df49a688a12a 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -1150,6 +1150,11 @@ class UnparseVisitor { void Unparse(const FailImageStmt &) { // R1163 Word("FAIL IMAGE"); } + void Unparse(const NotifyWaitStmt &x) { // F2023: R1166 + Word("NOTIFY WAIT ("), Walk(std::get>(x.t)); + Walk(", ", std::get>(x.t), ", "); + Put(')'); + } void Unparse(const SyncAllStmt &x) { // R1164 Word("SYNC ALL ("), Walk(x.v, ", "), Put(')'); } @@ -1169,7 +1174,7 @@ class UnparseVisitor { Word("EVENT POST ("), Walk(std::get(x.t)); Walk(", ", std::get>(x.t), ", "), Put(')'); } - void Before(const EventWaitStmt::EventWaitSpec &x) { // R1173, R1174 + void Before(const EventWaitSpec &x) { // R1173, R1174 common::visit(common::visitors{ [&](const ScalarIntExpr &) { Word("UNTIL_COUNT="); }, [](const StatOrErrmsg &) {}, @@ -1178,7 +1183,7 @@ class UnparseVisitor { } void Unparse(const EventWaitStmt &x) { // R1170 Word("EVENT WAIT ("), Walk(std::get(x.t)); - Walk(", ", std::get>(x.t), ", "); + Walk(", ", std::get>(x.t), ", "); Put(')'); } void Unparse(const FormTeamStmt &x) { // R1175, R1177 diff --git a/flang/lib/Semantics/check-coarray.cpp b/flang/lib/Semantics/check-coarray.cpp index 77b198284e050..106af7960fa94 100644 --- a/flang/lib/Semantics/check-coarray.cpp +++ b/flang/lib/Semantics/check-coarray.cpp @@ -177,32 +177,15 @@ void CoarrayChecker::Leave(const parser::SyncTeamStmt &x) { CheckSyncStatList(context_, std::get>(x.t)); } -void CoarrayChecker::Leave(const parser::EventPostStmt &x) { - CheckSyncStatList(context_, std::get>(x.t)); - CheckEventVariable(context_, std::get(x.t)); -} - -void CoarrayChecker::Leave(const parser::EventWaitStmt &x) { - const auto &eventVar{std::get(x.t)}; - - if (const auto *expr{GetExpr(context_, eventVar)}) { - if (ExtractCoarrayRef(expr)) { - context_.Say(parser::FindSourceLocation(eventVar), // C1177 - "A event-variable in a EVENT WAIT statement may not be a coindexed object"_err_en_US); - } else { - CheckEventVariable(context_, eventVar); - } - } - +static void CheckEventWaitSpecList(SemanticsContext &context, + const std::list &eventWaitSpecList) { bool gotStat{false}, gotMsg{false}, gotUntil{false}; - using EventWaitSpec = parser::EventWaitStmt::EventWaitSpec; - for (const EventWaitSpec &eventWaitSpec : - 
std::get>(x.t)) { + for (const parser::EventWaitSpec &eventWaitSpec : eventWaitSpecList) { common::visit( common::visitors{ [&](const parser::ScalarIntExpr &untilCount) { if (gotUntil) { - context_.Say( // C1178 + context.Say( // C1178 "Until-spec in a event-wait-spec-list may not be repeated"_err_en_US); } gotUntil = true; @@ -212,17 +195,17 @@ void CoarrayChecker::Leave(const parser::EventWaitStmt &x) { common::visitors{ [&](const parser::StatVariable &stat) { if (gotStat) { - context_.Say( // C1178 + context.Say( // C1178 "A stat-variable in a event-wait-spec-list may not be repeated"_err_en_US); } gotStat = true; }, [&](const parser::MsgVariable &var) { - WarnOnDeferredLengthCharacterScalar(context_, - GetExpr(context_, var), + WarnOnDeferredLengthCharacterScalar(context, + GetExpr(context, var), var.v.thing.thing.GetSource(), "ERRMSG="); if (gotMsg) { - context_.Say( // C1178 + context.Say( // C1178 "A errmsg-variable in a event-wait-spec-list may not be repeated"_err_en_US); } gotMsg = true; @@ -230,7 +213,7 @@ void CoarrayChecker::Leave(const parser::EventWaitStmt &x) { }, statOrErrmsg.u); CheckCoindexedStatOrErrmsg( - context_, statOrErrmsg, "event-wait-spec-list"); + context, statOrErrmsg, "event-wait-spec-list"); }, }, @@ -238,6 +221,48 @@ void CoarrayChecker::Leave(const parser::EventWaitStmt &x) { } } +void CoarrayChecker::Leave(const parser::NotifyWaitStmt &x) { + const auto ¬ifyVar{std::get>(x.t)}; + + if (const auto *expr{GetExpr(context_, notifyVar)}) { + if (ExtractCoarrayRef(expr)) { + context_.Say(parser::FindSourceLocation(notifyVar), // F2023 - C1178 + "A notify-variable in a NOTIFY WAIT statement may not be a coindexed object"_err_en_US); + } else if (!IsNotifyType(evaluate::GetDerivedTypeSpec( + expr->GetType()))) { // F2023 - C1177 + context_.Say(parser::FindSourceLocation(notifyVar), + "The notify-variable must be of type NOTIFY_TYPE from module ISO_FORTRAN_ENV"_err_en_US); + } else if (!evaluate::IsCoarray(*expr)) { // F2023 - C1612 + context_.Say(parser::FindSourceLocation(notifyVar), + "The notify-variable must be a coarray"_err_en_US); + } + } + + CheckEventWaitSpecList( + context_, std::get>(x.t)); +} + +void CoarrayChecker::Leave(const parser::EventPostStmt &x) { + CheckSyncStatList(context_, std::get>(x.t)); + CheckEventVariable(context_, std::get(x.t)); +} + +void CoarrayChecker::Leave(const parser::EventWaitStmt &x) { + const auto &eventVar{std::get(x.t)}; + + if (const auto *expr{GetExpr(context_, eventVar)}) { + if (ExtractCoarrayRef(expr)) { + context_.Say(parser::FindSourceLocation(eventVar), // C1177 + "A event-variable in a EVENT WAIT statement may not be a coindexed object"_err_en_US); + } else { + CheckEventVariable(context_, eventVar); + } + } + + CheckEventWaitSpecList( + context_, std::get>(x.t)); +} + void CoarrayChecker::Leave(const parser::UnlockStmt &x) { CheckSyncStatList(context_, std::get>(x.t)); } diff --git a/flang/lib/Semantics/check-coarray.h b/flang/lib/Semantics/check-coarray.h index 251ee980d8a52..0af9a880fd31a 100644 --- a/flang/lib/Semantics/check-coarray.h +++ b/flang/lib/Semantics/check-coarray.h @@ -23,6 +23,7 @@ struct EventPostStmt; struct EventWaitStmt; struct FormTeamStmt; struct ImageSelector; +struct NotifyWaitStmt; struct SyncAllStmt; struct SyncImagesStmt; struct SyncMemoryStmt; @@ -41,6 +42,7 @@ class CoarrayChecker : public virtual BaseChecker { void Leave(const parser::SyncImagesStmt &); void Leave(const parser::SyncMemoryStmt &); void Leave(const parser::SyncTeamStmt &); + void Leave(const parser::NotifyWaitStmt 
&); void Leave(const parser::EventPostStmt &); void Leave(const parser::EventWaitStmt &); void Leave(const parser::UnlockStmt &); diff --git a/flang/module/__fortran_builtins.f90 b/flang/module/__fortran_builtins.f90 index 0bc66def847ed..0566ae6327d76 100644 --- a/flang/module/__fortran_builtins.f90 +++ b/flang/module/__fortran_builtins.f90 @@ -32,6 +32,10 @@ integer(kind=int64), private :: __count end type + type :: __builtin_notify_type + integer(kind=int64), private :: __count + end type + type :: __builtin_lock_type integer(kind=int64), private :: __count end type diff --git a/flang/module/iso_fortran_env.f90 b/flang/module/iso_fortran_env.f90 index 6ee153592e1c6..cd3c06f8c7566 100644 --- a/flang/module/iso_fortran_env.f90 +++ b/flang/module/iso_fortran_env.f90 @@ -15,6 +15,7 @@ module iso_fortran_env use __fortran_builtins, only: & event_type => __builtin_event_type, & + notify_type => __builtin_notify_type, & lock_type => __builtin_lock_type, & team_type => __builtin_team_type, & atomic_int_kind => __builtin_atomic_int_kind, & diff --git a/flang/test/Semantics/notifywait01.f90 b/flang/test/Semantics/notifywait01.f90 new file mode 100644 index 0000000000000..83a58ba792881 --- /dev/null +++ b/flang/test/Semantics/notifywait01.f90 @@ -0,0 +1,26 @@ +! RUN: %python %S/test_errors.py %s %flang_fc1 +! This test checks the acceptance of standard-conforming notify-wait-stmts based +! on the statement specification in section 11.6 of the Fortran 2023 standard. + +program test_notify_wait + use iso_fortran_env, only: notify_type + implicit none + + type(notify_type) :: notify_var[*] + integer :: count, count_array(1), sync_status, coindexed_integer[*] + character(len=128) :: error_message + + !_______________________ standard-conforming statements ___________________________ + + notify wait(notify_var) + notify wait(notify_var, until_count=count) + notify wait(notify_var, until_count=count_array(1)) + notify wait(notify_var, until_count=coindexed_integer[1]) + notify wait(notify_var, stat=sync_status) + notify wait(notify_var, until_count=count, stat=sync_status) + notify wait(notify_var, errmsg=error_message) + notify wait(notify_var, until_count=count, errmsg=error_message) + notify wait(notify_var, stat=sync_status, errmsg=error_message) + notify wait(notify_var, until_count=count, stat=sync_status, errmsg=error_message) + +end program test_notify_wait diff --git a/flang/test/Semantics/notifywait02.f90 b/flang/test/Semantics/notifywait02.f90 new file mode 100644 index 0000000000000..eebf3d05edaf6 --- /dev/null +++ b/flang/test/Semantics/notifywait02.f90 @@ -0,0 +1,74 @@ +! RUN: %python %S/test_errors.py %s %flang_fc1 +! This test checks for semantic errors in notify wait statements based on the +! statement specification in section 11.6 of the Fortran 2023 standard + +program test_notify_wait + use iso_fortran_env, only: notify_type + implicit none + + ! notify_type variables must be coarrays + type(notify_type) :: non_coarray + + type(notify_type) :: notify_var[*], redundant_notify[*] + integer :: count, sync_status + character(len=128) :: error_message + + !____________________ non-standard-conforming statements __________________________ + + !_________________________ invalid notify-variable ________________________________ + + ! notify-variable has an unknown expression + !ERROR: expected '(' + notify wait(notify=notify_var) + + !_____________ invalid event-wait-spec-lists: invalid until-spec _________________ + + ! 
Invalid until-spec keyword + !ERROR: expected '(' + notify wait(notify_var, until_amount=count) + + ! Invalid until-spec: missing until-spec variable + !ERROR: expected '(' + notify wait(notify_var, until_count) + + ! Invalid until-spec: missing 'until_count=' + !ERROR: expected '(' + notify wait(notify_var, count) + + !_________________ invalid sync-stat-lists: invalid stat= ________________________ + + ! Invalid stat-variable keyword + !ERROR: expected '(' + notify wait(notify_var, status=sync_status) + + ! Invalid sync-stat-list: missing stat-variable + !ERROR: expected '(' + notify wait(notify_var, stat) + + ! Invalid sync-stat-list: missing 'stat=' + !ERROR: expected '(' + notify wait(notify_var, sync_status) + + !________________ invalid sync-stat-lists: invalid errmsg= _______________________ + + ! Invalid errmsg-variable keyword + !ERROR: expected '(' + notify wait(notify_var, errormsg=error_message) + + ! Invalid sync-stat-list: missing 'errmsg=' + !ERROR: expected '(' + notify wait(notify_var, error_message) + + ! Invalid sync-stat-list: missing errmsg-variable + !ERROR: expected '(' + notify wait(notify_var, errmsg) + + !______________ invalid notify-variable: redundant notify-variable _________________ + + !ERROR: expected '(' + notify wait(notify_var, redundant_notify) + + !ERROR: expected '(' + notify wait(notify_var, redundant_notify, stat=sync_status, errmsg=error_message) + +end program test_notify_wait diff --git a/flang/test/Semantics/notifywait03.f90 b/flang/test/Semantics/notifywait03.f90 new file mode 100644 index 0000000000000..0fc56f66ad32d --- /dev/null +++ b/flang/test/Semantics/notifywait03.f90 @@ -0,0 +1,123 @@ +! RUN: %python %S/test_errors.py %s %flang_fc1 +! This test checks for semantic errors in notify wait statements based on the +! statement specification in section 11.6 of the Fortran 2023 standard. +! Some of the errors in this test would be hidden by the errors in +! the test notify02.f90 if they were included in that file, +! and are thus tested here. + +program test_notify_wait + use iso_fortran_env, only : notify_type + implicit none + + ! 
notify_type variables must be coarrays + type(notify_type) :: non_coarray + + type(notify_type) :: notify_var[*], notify_array(2)[*] + integer :: count, count_array(1), non_notify[*], sync_status, coindexed_integer[*], superfluous_stat, non_scalar(1) + character(len=128) :: error_message, non_scalar_char(1), coindexed_character[*], superfluous_errmsg + logical :: invalid_type + + !____________________ non-standard-conforming statements __________________________ + + !_________________________ invalid notify-variable ________________________________ + + !ERROR: The notify-variable must be of type NOTIFY_TYPE from module ISO_FORTRAN_ENV + notify wait(non_notify) + + !ERROR: The notify-variable must be a coarray + notify wait(non_coarray) + + !ERROR: A notify-variable in a NOTIFY WAIT statement may not be a coindexed object + notify wait(notify_var[1]) + + !ERROR: A notify-variable in a NOTIFY WAIT statement may not be a coindexed object + notify wait(notify_array(1)[1]) + + !ERROR: Must be a scalar value, but is a rank-1 array + notify wait(notify_array) + + !_____________ invalid event-wait-spec-lists: invalid until-spec _________________ + + !ERROR: Must have INTEGER type, but is LOGICAL(4) + notify wait(notify_var, until_count=invalid_type) + + !ERROR: Must be a scalar value, but is a rank-1 array + notify wait(notify_var, until_count=non_scalar) + + !_________________ invalid sync-stat-lists: invalid stat= ________________________ + + !ERROR: Must have INTEGER type, but is LOGICAL(4) + notify wait(notify_var, stat=invalid_type) + + !ERROR: Must be a scalar value, but is a rank-1 array + notify wait(notify_var, stat=non_scalar) + + !________________ invalid sync-stat-lists: invalid errmsg= _______________________ + + !ERROR: Must have CHARACTER type, but is LOGICAL(4) + notify wait(notify_var, errmsg=invalid_type) + + !ERROR: Must be a scalar value, but is a rank-1 array + notify wait(notify_var, errmsg=non_scalar_char) + + !______ invalid event-wait-spec-lists: redundant event-wait-spec-list ____________ + + !ERROR: Until-spec in a event-wait-spec-list may not be repeated + notify wait(notify_var, until_count=count, until_count=count_array(1)) + + !ERROR: Until-spec in a event-wait-spec-list may not be repeated + notify wait(notify_var, until_count=count, stat=sync_status, until_count=count_array(1)) + + !ERROR: Until-spec in a event-wait-spec-list may not be repeated + notify wait(notify_var, until_count=count, errmsg=error_message, until_count=count_array(1)) + + !ERROR: Until-spec in a event-wait-spec-list may not be repeated + notify wait(notify_var, until_count=count, stat=sync_status, errmsg=error_message, until_count=count_array(1)) + + !ERROR: A stat-variable in a event-wait-spec-list may not be repeated + notify wait(notify_var, stat=sync_status, stat=superfluous_stat) + + !ERROR: A stat-variable in a event-wait-spec-list may not be repeated + notify wait(notify_var, stat=sync_status, until_count=count, stat=superfluous_stat) + + !ERROR: A stat-variable in a event-wait-spec-list may not be repeated + notify wait(notify_var, stat=sync_status, errmsg=error_message, stat=superfluous_stat) + + !ERROR: A stat-variable in a event-wait-spec-list may not be repeated + notify wait(notify_var, stat=sync_status, until_count=count, errmsg=error_message, stat=superfluous_stat) + + !ERROR: A errmsg-variable in a event-wait-spec-list may not be repeated + notify wait(notify_var, errmsg=error_message, errmsg=superfluous_errmsg) + + !ERROR: A errmsg-variable in a event-wait-spec-list may not be 
repeated + notify wait(notify_var, errmsg=error_message, until_count=count, errmsg=superfluous_errmsg) + + !ERROR: A errmsg-variable in a event-wait-spec-list may not be repeated + notify wait(notify_var, errmsg=error_message, stat=superfluous_stat, errmsg=superfluous_errmsg) + + !ERROR: A errmsg-variable in a event-wait-spec-list may not be repeated + notify wait(notify_var, errmsg=error_message, until_count=count, stat=superfluous_stat, errmsg=superfluous_errmsg) + + !_____________ invalid sync-stat-lists: coindexed stat-variable - C1173 __________________ + + !ERROR: The stat-variable or errmsg-variable in a event-wait-spec-list may not be a coindexed object + notify wait(notify_var, stat=coindexed_integer[1]) + + !ERROR: The stat-variable or errmsg-variable in a event-wait-spec-list may not be a coindexed object + notify wait(notify_var, errmsg=coindexed_character[1]) + + !ERROR: The stat-variable or errmsg-variable in a event-wait-spec-list may not be a coindexed object + notify wait(notify_var, stat=coindexed_integer[1], errmsg=error_message) + + !ERROR: The stat-variable or errmsg-variable in a event-wait-spec-list may not be a coindexed object + notify wait(notify_var, stat=sync_status, errmsg=coindexed_character[1]) + + !ERROR: The stat-variable or errmsg-variable in a event-wait-spec-list may not be a coindexed object + !ERROR: The stat-variable or errmsg-variable in a event-wait-spec-list may not be a coindexed object + notify wait(notify_var, stat=coindexed_integer[1], errmsg=coindexed_character[1]) + + !ERROR: The stat-variable or errmsg-variable in a event-wait-spec-list may not be a coindexed object + !ERROR: The stat-variable or errmsg-variable in a event-wait-spec-list may not be a coindexed object + notify wait(notify_var, errmsg=coindexed_character[1], stat=coindexed_integer[1]) + +end program test_notify_wait From 3caef46631190fe24b16704fd22185a3083949a5 Mon Sep 17 00:00:00 2001 From: lntue <35648136+lntue@users.noreply.github.com> Date: Tue, 2 Jan 2024 13:47:13 -0500 Subject: [PATCH 066/313] [libc][NFC] Enforce internal linkage for exp* support functions. (#76250) --- libc/src/math/generic/exp.cpp | 4 ++++ libc/src/math/generic/exp10.cpp | 4 ++++ libc/src/math/generic/exp2.cpp | 4 ++++ libc/src/math/generic/expm1.cpp | 4 ++++ 4 files changed, 16 insertions(+) diff --git a/libc/src/math/generic/exp.cpp b/libc/src/math/generic/exp.cpp index 5428a04430887..c8e36404a7812 100644 --- a/libc/src/math/generic/exp.cpp +++ b/libc/src/math/generic/exp.cpp @@ -51,6 +51,8 @@ constexpr double MLOG_2_EXP2_M12_MID = 0x1.718432a1b0e26p-47; constexpr double MLOG_2_EXP2_M12_MID_30 = 0x1.718432ap-47; constexpr double MLOG_2_EXP2_M12_LO = 0x1.b0e2633fe0685p-79; +namespace { + // Polynomial approximations with double precision: // Return expm1(dx) / x ~ 1 + dx / 2 + dx^2 / 6 + dx^3 / 24. // For |dx| < 2^-13 + 2^-30: @@ -218,6 +220,8 @@ double set_exceptional(double x) { return x + static_cast(FPBits::inf()); } +} // namespace + LLVM_LIBC_FUNCTION(double, exp, (double x)) { using FPBits = typename fputil::FPBits; using FloatProp = typename fputil::FloatProperties; diff --git a/libc/src/math/generic/exp10.cpp b/libc/src/math/generic/exp10.cpp index aa66d4f17a3a0..92b3a468a9cc9 100644 --- a/libc/src/math/generic/exp10.cpp +++ b/libc/src/math/generic/exp10.cpp @@ -52,6 +52,8 @@ constexpr double ERR_D = 0x1.8p-63; // Errors when using double-double precision. constexpr double ERR_DD = 0x1.8p-99; +namespace { + // Polynomial approximations with double precision. 
Generated by Sollya with: // > P = fpminimax((10^x - 1)/x, 3, [|D...|], [-2^-14, 2^-14]); // > P; @@ -268,6 +270,8 @@ double set_exceptional(double x) { return x + static_cast(FPBits::inf()); } +} // namespace + LLVM_LIBC_FUNCTION(double, exp10, (double x)) { using FPBits = typename fputil::FPBits; using FloatProp = typename fputil::FloatProperties; diff --git a/libc/src/math/generic/exp2.cpp b/libc/src/math/generic/exp2.cpp index 3e9f9c6855c43..44aeb14e231bf 100644 --- a/libc/src/math/generic/exp2.cpp +++ b/libc/src/math/generic/exp2.cpp @@ -42,6 +42,8 @@ constexpr double ERR_D = 0x1.8p-63; // Errors when using double-double precision. constexpr double ERR_DD = 0x1.0p-100; +namespace { + // Polynomial approximations with double precision. Generated by Sollya with: // > P = fpminimax((2^x - 1)/x, 3, [|D...|], [-2^-13 - 2^-30, 2^-13 + 2^-30]); // > P; @@ -243,6 +245,8 @@ double set_exceptional(double x) { return x + static_cast(FPBits::inf()); } +} // namespace + LLVM_LIBC_FUNCTION(double, exp2, (double x)) { using FPBits = typename fputil::FPBits; using FloatProp = typename fputil::FloatProperties; diff --git a/libc/src/math/generic/expm1.cpp b/libc/src/math/generic/expm1.cpp index e7cc240839759..deb3b0adc0ead 100644 --- a/libc/src/math/generic/expm1.cpp +++ b/libc/src/math/generic/expm1.cpp @@ -61,6 +61,8 @@ constexpr double MLOG_2_EXP2_M12_MID = 0x1.718432a1b0e26p-47; constexpr double MLOG_2_EXP2_M12_MID_30 = 0x1.718432ap-47; constexpr double MLOG_2_EXP2_M12_LO = 0x1.b0e2633fe0685p-79; +namespace { + // Polynomial approximations with double precision: // Return expm1(dx) / x ~ 1 + dx / 2 + dx^2 / 6 + dx^3 / 24. // For |dx| < 2^-13 + 2^-30: @@ -269,6 +271,8 @@ double set_exceptional(double x) { return x + static_cast(FPBits::inf()); } +} // namespace + LLVM_LIBC_FUNCTION(double, expm1, (double x)) { using FPBits = typename fputil::FPBits; using FloatProp = typename fputil::FloatProperties; From d6c4d4c9b910e8ad5ed7cd4825a143742041c1f4 Mon Sep 17 00:00:00 2001 From: Karthika Devi C Date: Wed, 3 Jan 2024 00:23:29 +0530 Subject: [PATCH 067/313] [polly][ScheduleOptimizer] Fix long compile time(hang) reported in polly (#75141) There is no upper cap set on current Schedule Optimizer to compute schedule. In some cases a very long compile time taken to compute the schedule resulting in hang kind of behavior. This patch introduces a flag 'polly-schedule-computeout' to pass the capwhich is initialized to 300000. This patch handles the compute out cases by bailing out and exiting gracefully. Fixes #69090 --- polly/lib/Transform/ScheduleOptimizer.cpp | 18 +++- .../ScheduleOptimizer/schedule_computeout.ll | 94 +++++++++++++++++++ 2 files changed, 111 insertions(+), 1 deletion(-) create mode 100644 polly/test/ScheduleOptimizer/schedule_computeout.ll diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp index 35a0a4def0403..5a0ea3b406754 100644 --- a/polly/lib/Transform/ScheduleOptimizer.cpp +++ b/polly/lib/Transform/ScheduleOptimizer.cpp @@ -96,6 +96,13 @@ static cl::opt cl::desc("Maximize the band depth (yes/no)"), cl::Hidden, cl::init("yes"), cl::cat(PollyCategory)); +static cl::opt + ScheduleComputeOut("polly-schedule-computeout", + cl::desc("Bound the scheduler by maximal amount" + "of computational steps. 
"), + cl::Hidden, cl::init(300000), cl::ZeroOrMore, + cl::cat(PollyCategory)); + static cl::opt GreedyFusion("polly-loopfusion-greedy", cl::desc("Aggressively try to fuse everything"), cl::Hidden, @@ -860,7 +867,16 @@ static void runIslScheduleOptimizer( SC = SC.set_proximity(Proximity); SC = SC.set_validity(Validity); SC = SC.set_coincidence(Validity); - Schedule = SC.compute_schedule(); + + { + IslMaxOperationsGuard MaxOpGuard(Ctx, ScheduleComputeOut); + Schedule = SC.compute_schedule(); + + if (MaxOpGuard.hasQuotaExceeded()) + LLVM_DEBUG( + dbgs() << "Schedule optimizer calculation exceeds ISL quota\n"); + } + isl_options_set_on_error(Ctx, OnErrorStatus); ScopsRescheduled++; diff --git a/polly/test/ScheduleOptimizer/schedule_computeout.ll b/polly/test/ScheduleOptimizer/schedule_computeout.ll new file mode 100644 index 0000000000000..db46ef5021932 --- /dev/null +++ b/polly/test/ScheduleOptimizer/schedule_computeout.ll @@ -0,0 +1,94 @@ +; RUN: opt -S -polly-optree -polly-delicm -polly-opt-isl -polly-schedule-computeout=100000 -debug-only="polly-opt-isl" < %s 2>&1 | FileCheck %s +; Bailout if the computations of schedule compute exceeds the max scheduling quota. +; Max compute out is initialized to 300000, Here it is set to 100000 for test purpose. + +@a = dso_local local_unnamed_addr global ptr null, align 8 +@b = dso_local local_unnamed_addr global ptr null, align 8 +@c = dso_local local_unnamed_addr global ptr null, align 8 + +define dso_local void @foo(i32 noundef %I, i32 noundef %J, i32 noundef %K1, i32 noundef %K2, i32 noundef %L1, i32 noundef %L2) local_unnamed_addr { +entry: + %j = alloca i32, align 4 + store volatile i32 0, ptr %j, align 4 + %j.0.j.0.j.0.54 = load volatile i32, ptr %j, align 4 + %cmp55 = icmp slt i32 %j.0.j.0.j.0.54, %J + br i1 %cmp55, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %0 = load ptr, ptr @a, align 8 + %1 = load ptr, ptr @b, align 8 + %2 = load ptr, ptr %1, align 8 + %cmp352 = icmp slt i32 %L1, %L2 + %cmp750 = icmp slt i32 %K1, %K2 + %3 = sext i32 %K1 to i64 + %4 = sext i32 %L1 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup4, %entry + ret void + +for.body: ; preds = %for.cond.cleanup4, %for.body.lr.ph + br i1 %cmp352, label %for.cond6.preheader.preheader, label %for.cond.cleanup4 + +for.cond6.preheader.preheader: ; preds = %for.body + %wide.trip.count66 = sext i32 %L2 to i64 + br label %for.cond6.preheader + +for.cond6.preheader: ; preds = %for.cond.cleanup8, %for.cond6.preheader.preheader + %indvars.iv61 = phi i64 [ %4, %for.cond6.preheader.preheader ], [ %indvars.iv.next62, %for.cond.cleanup8 ] + br i1 %cmp750, label %for.cond10.preheader.lr.ph, label %for.cond.cleanup8 + +for.cond10.preheader.lr.ph: ; preds = %for.cond6.preheader + %5 = mul nsw i64 %indvars.iv61, 516 + %6 = mul nsw i64 %indvars.iv61, 516 + %wide.trip.count = sext i32 %K2 to i64 + br label %for.cond10.preheader + +for.cond.cleanup4: ; preds = %for.cond.cleanup8, %for.body + %j.0.j.0.j.0.45 = load volatile i32, ptr %j, align 4 + %inc34 = add nsw i32 %j.0.j.0.j.0.45, 1 + store volatile i32 %inc34, ptr %j, align 4 + %j.0.j.0.j.0. 
= load volatile i32, ptr %j, align 4 + %cmp = icmp slt i32 %j.0.j.0.j.0., %J + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond10.preheader: ; preds = %for.cond.cleanup12, %for.cond10.preheader.lr.ph + %indvars.iv = phi i64 [ %3, %for.cond10.preheader.lr.ph ], [ %indvars.iv.next, %for.cond.cleanup12 ] + %7 = getelementptr float, ptr %0, i64 %indvars.iv + %arrayidx18 = getelementptr float, ptr %7, i64 %5 + %8 = load float, ptr %arrayidx18, align 4 + br label %for.cond14.preheader + +for.cond.cleanup8: ; preds = %for.cond.cleanup12, %for.cond6.preheader + %indvars.iv.next62 = add nsw i64 %indvars.iv61, 1 + %exitcond67.not = icmp eq i64 %indvars.iv.next62, %wide.trip.count66 + br i1 %exitcond67.not, label %for.cond.cleanup4, label %for.cond6.preheader + +for.cond14.preheader: ; preds = %for.cond.cleanup16, %for.cond10.preheader + %m.049 = phi i32 [ -2, %for.cond10.preheader ], [ %inc21, %for.cond.cleanup16 ] + %sum.048 = phi float [ 0.000000e+00, %for.cond10.preheader ], [ %add19, %for.cond.cleanup16 ] + br label %for.body17 + +for.cond.cleanup12: ; preds = %for.cond.cleanup16 + %9 = getelementptr float, ptr %2, i64 %indvars.iv + %arrayidx26 = getelementptr float, ptr %9, i64 %6 + store float %add19, ptr %arrayidx26, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, 1 + %exitcond60.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond60.not, label %for.cond.cleanup8, label %for.cond10.preheader + +for.cond.cleanup16: ; preds = %for.body17 + %inc21 = add nsw i32 %m.049, 1 + %exitcond56.not = icmp eq i32 %inc21, 3 + br i1 %exitcond56.not, label %for.cond.cleanup12, label %for.cond14.preheader + +for.body17: ; preds = %for.body17, %for.cond14.preheader + %n.047 = phi i32 [ -2, %for.cond14.preheader ], [ %inc, %for.body17 ] + %sum.146 = phi float [ %sum.048, %for.cond14.preheader ], [ %add19, %for.body17 ] + %add19 = fadd float %sum.146, %8 + %inc = add nsw i32 %n.047, 1 + %exitcond.not = icmp eq i32 %inc, 3 + br i1 %exitcond.not, label %for.cond.cleanup16, label %for.body17 +} + +; CHECK: Schedule optimizer calculation exceeds ISL quota From 3af59cfe0b5a319c165e3c74300aacdd42827c2d Mon Sep 17 00:00:00 2001 From: Alexander Shaposhnikov <6532716+alexander-shaposhnikov@users.noreply.github.com> Date: Tue, 2 Jan 2024 11:00:03 -0800 Subject: [PATCH 068/313] [ConstraintElim] Add facts implied by llvm.abs (#73189) Add "abs(x) >=s x" fact. https://alive2.llvm.org/ce/z/gOrrU3 Test plan: ninja check-all --- .../Scalar/ConstraintElimination.cpp | 47 ++++++++++++------- .../Transforms/ConstraintElimination/abs.ll | 21 +++------ 2 files changed, 38 insertions(+), 30 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 899d7e0a11e6f..49ac1e96e2554 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -1010,22 +1010,14 @@ void State::addInfoFor(BasicBlock &BB) { continue; } - if (match(&I, m_Intrinsic())) { - WorkList.push_back( - FactOrCheck::getCheck(DT.getNode(&BB), cast(&I))); - continue; - } - - if (isa(&I)) { - WorkList.push_back(FactOrCheck::getInstFact(DT.getNode(&BB), &I)); - continue; - } - - Value *A, *B; - CmpInst::Predicate Pred; - // For now, just handle assumes with a single compare as condition. - if (match(&I, m_Intrinsic( - m_ICmp(Pred, m_Value(A), m_Value(B))))) { + auto *II = dyn_cast(&I); + Intrinsic::ID ID = II ? 
II->getIntrinsicID() : Intrinsic::not_intrinsic; + switch (ID) { + case Intrinsic::assume: { + Value *A, *B; + CmpInst::Predicate Pred; + if (!match(I.getOperand(0), m_ICmp(Pred, m_Value(A), m_Value(B)))) + break; if (GuaranteedToExecute) { // The assume is guaranteed to execute when BB is entered, hence Cond // holds on entry to BB. @@ -1035,7 +1027,23 @@ void State::addInfoFor(BasicBlock &BB) { WorkList.emplace_back( FactOrCheck::getInstFact(DT.getNode(I.getParent()), &I)); } + break; } + // Enqueue ssub_with_overflow for simplification. + case Intrinsic::ssub_with_overflow: + WorkList.push_back( + FactOrCheck::getCheck(DT.getNode(&BB), cast(&I))); + break; + // Enqueue the intrinsics to add extra info. + case Intrinsic::abs: + case Intrinsic::umin: + case Intrinsic::umax: + case Intrinsic::smin: + case Intrinsic::smax: + WorkList.push_back(FactOrCheck::getInstFact(DT.getNode(&BB), &I)); + break; + } + GuaranteedToExecute &= isGuaranteedToTransferExecutionToSuccessor(&I); } @@ -1693,6 +1701,13 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI, ICmpInst::Predicate Pred; if (!CB.isConditionFact()) { + Value *X; + if (match(CB.Inst, m_Intrinsic(m_Value(X)))) { + // TODO: Add CB.Inst >= 0 fact. + AddFact(CmpInst::ICMP_SGE, CB.Inst, X); + continue; + } + if (auto *MinMax = dyn_cast(CB.Inst)) { Pred = ICmpInst::getNonStrictPredicate(MinMax->getPredicate()); AddFact(Pred, MinMax, MinMax->getLHS()); diff --git a/llvm/test/Transforms/ConstraintElimination/abs.ll b/llvm/test/Transforms/ConstraintElimination/abs.ll index b6d5e20302996..c3162ba48c2e4 100644 --- a/llvm/test/Transforms/ConstraintElimination/abs.ll +++ b/llvm/test/Transforms/ConstraintElimination/abs.ll @@ -5,8 +5,7 @@ define i1 @abs_int_min_is_not_poison(i32 %arg) { ; CHECK-LABEL: define i1 @abs_int_min_is_not_poison( ; CHECK-SAME: i32 [[ARG:%.*]]) { ; CHECK-NEXT: [[ABS:%.*]] = tail call i32 @llvm.abs.i32(i32 [[ARG]], i1 false) -; CHECK-NEXT: [[CMP:%.*]] = icmp sge i32 [[ABS]], [[ARG]] -; CHECK-NEXT: ret i1 [[CMP]] +; CHECK-NEXT: ret i1 true ; %abs = tail call i32 @llvm.abs.i32(i32 %arg, i1 false) %cmp = icmp sge i32 %abs, %arg @@ -17,8 +16,7 @@ define i1 @abs_int_min_is_poison(i32 %arg) { ; CHECK-LABEL: define i1 @abs_int_min_is_poison( ; CHECK-SAME: i32 [[ARG:%.*]]) { ; CHECK-NEXT: [[ABS:%.*]] = tail call i32 @llvm.abs.i32(i32 [[ARG]], i1 true) -; CHECK-NEXT: [[CMP:%.*]] = icmp sge i32 [[ABS]], [[ARG]] -; CHECK-NEXT: ret i1 [[CMP]] +; CHECK-NEXT: ret i1 true ; %abs = tail call i32 @llvm.abs.i32(i32 %arg, i1 true) %cmp = icmp sge i32 %abs, %arg @@ -30,8 +28,7 @@ define i1 @abs_plus_one(i32 %arg) { ; CHECK-SAME: i32 [[ARG:%.*]]) { ; CHECK-NEXT: [[ABS:%.*]] = tail call i32 @llvm.abs.i32(i32 [[ARG]], i1 true) ; CHECK-NEXT: [[ABS_PLUS_ONE:%.*]] = add nsw i32 [[ABS]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp sge i32 [[ABS_PLUS_ONE]], [[ARG]] -; CHECK-NEXT: ret i1 [[CMP]] +; CHECK-NEXT: ret i1 true ; %abs = tail call i32 @llvm.abs.i32(i32 %arg, i1 true) %abs_plus_one = add nsw i32 %abs, 1 @@ -44,8 +41,7 @@ define i1 @arg_minus_one_strict_less(i32 %arg) { ; CHECK-SAME: i32 [[ARG:%.*]]) { ; CHECK-NEXT: [[ABS:%.*]] = tail call i32 @llvm.abs.i32(i32 [[ARG]], i1 true) ; CHECK-NEXT: [[ARG_MINUS_ONE:%.*]] = add nsw i32 [[ARG]], -1 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[ARG_MINUS_ONE]], [[ABS]] -; CHECK-NEXT: ret i1 [[CMP]] +; CHECK-NEXT: ret i1 true ; %abs = tail call i32 @llvm.abs.i32(i32 %arg, i1 true) %arg_minus_one = add nsw i32 %arg, -1 @@ -58,8 +54,7 @@ define i1 @arg_minus_one_strict_greater(i32 %arg) { ; 
CHECK-SAME: i32 [[ARG:%.*]]) { ; CHECK-NEXT: [[ABS:%.*]] = tail call i32 @llvm.abs.i32(i32 [[ARG]], i1 true) ; CHECK-NEXT: [[ARG_MINUS_ONE:%.*]] = add nsw i32 [[ARG]], -1 -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[ARG_MINUS_ONE]], [[ABS]] -; CHECK-NEXT: ret i1 [[CMP]] +; CHECK-NEXT: ret i1 false ; %abs = tail call i32 @llvm.abs.i32(i32 %arg, i1 true) %arg_minus_one = add nsw i32 %arg, -1 @@ -74,8 +69,7 @@ define i1 @abs_plus_one_unsigned_greater_or_equal_nonnegative_arg(i32 %arg) { ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_ARG_NONNEGATIVE]]) ; CHECK-NEXT: [[ABS:%.*]] = tail call i32 @llvm.abs.i32(i32 [[ARG]], i1 true) ; CHECK-NEXT: [[ABS_PLUS_ONE:%.*]] = add nuw i32 [[ABS]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp uge i32 [[ABS_PLUS_ONE]], [[ARG]] -; CHECK-NEXT: ret i1 [[CMP]] +; CHECK-NEXT: ret i1 true ; %cmp_arg_nonnegative = icmp sge i32 %arg, 0 call void @llvm.assume(i1 %cmp_arg_nonnegative) @@ -113,8 +107,7 @@ define i1 @abs_constant_negative_arg() { define i1 @abs_constant_positive_arg() { ; CHECK-LABEL: define i1 @abs_constant_positive_arg() { ; CHECK-NEXT: [[ABS:%.*]] = tail call i32 @llvm.abs.i32(i32 3, i1 false) -; CHECK-NEXT: [[CMP:%.*]] = icmp sge i32 [[ABS]], 3 -; CHECK-NEXT: ret i1 [[CMP]] +; CHECK-NEXT: ret i1 true ; %abs = tail call i32 @llvm.abs.i32(i32 3, i1 false) %cmp = icmp sge i32 %abs, 3 From 11ac97c67a9315dce48fd938d68ae991e3559f10 Mon Sep 17 00:00:00 2001 From: Felix Schneider Date: Tue, 2 Jan 2024 20:04:01 +0100 Subject: [PATCH 069/313] [mlir][tosa] Move lowering of `tosa.transpose` to `tosa-to-linalg-named` (#75738) Currently, there exists a pattern lowering `tosa.transpose` to `linalg.generic` in `tosa-to-linalg`. This patch removes that and instead adds a pattern lowering `tosa.transpose` to `linalg.transpose` in `tosa-to-linalg-named`. Lowering to the named linalg Op has the advantage that following optimization passes can easily identify transposition without having to perform pattern matching on linalg.generic Ops. The `linalg.transpose` can simply be generalized to a `linalg.generic` in a second step. 
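
As a non-authoritative sketch of the new lowering (the function name @example is made up; the shapes and permutation mirror the updated CHECK lines in the tests below), a tosa.transpose such as

  func.func @example(%arg0: tensor<1x2x3xi32>) -> tensor<2x3x1xi32> {
    %perms = arith.constant dense<[1, 2, 0]> : tensor<3xi32>
    %0 = tosa.transpose %arg0, %perms : (tensor<1x2x3xi32>, tensor<3xi32>) -> tensor<2x3x1xi32>
    return %0 : tensor<2x3x1xi32>
  }

is now expected to come out of tosa-to-linalg-named roughly as

  func.func @example(%arg0: tensor<1x2x3xi32>) -> tensor<2x3x1xi32> {
    %init = tensor.empty() : tensor<2x3x1xi32>
    %0 = linalg.transpose ins(%arg0 : tensor<1x2x3xi32>) outs(%init : tensor<2x3x1xi32>) permutation = [1, 2, 0]
    return %0 : tensor<2x3x1xi32>
  }

so later passes can read the transposition directly from the named op's permutation attribute instead of reverse-engineering it from a linalg.generic's indexing maps.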
--- .../Conversion/TosaToLinalg/TosaToLinalg.cpp | 52 +----------- .../TosaToLinalg/TosaToLinalgNamed.cpp | 30 ++++++- .../TosaToLinalg/TosaToLinalgNamedPass.cpp | 1 + .../TosaToLinalg/tosa-to-linalg-named.mlir | 75 ++++++++++++++-- .../TosaToLinalg/tosa-to-linalg-pipeline.mlir | 10 --- .../TosaToLinalg/tosa-to-linalg.mlir | 85 ------------------- 6 files changed, 99 insertions(+), 154 deletions(-) diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp index beed71d93d9b0..7c35389f1e929 100644 --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp @@ -1052,55 +1052,6 @@ class PointwiseConverter : public OpRewritePattern { } }; -class TransposeConverter : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(tosa::TransposeOp op, - PatternRewriter &rewriter) const final { - DenseIntElementsAttr perms; - if (!matchPattern(op.getPerms(), m_Constant(&perms))) { - return rewriter.notifyMatchFailure(op, "unmatched permutation tensor"); - } - - auto loc = op.getLoc(); - auto input = op->getOperand(0); - auto resultTy = cast(op.getType()); - - SmallVector dynDims; - dynDims.resize(cast(op->getResult(0).getType()).getRank()); - - SmallVector inputExprs; - inputExprs.resize(resultTy.getRank()); - for (const auto &permutation : llvm::enumerate(perms.getValues())) { - auto index = permutation.index(); - auto value = permutation.value().getZExtValue(); - if (!resultTy.hasRank() || resultTy.isDynamicDim(index)) { - dynDims[index] = rewriter.create(loc, input, value); - } - inputExprs[value] = rewriter.getAffineDimExpr(index); - } - - SmallVector filteredDims = condenseValues(dynDims); - - auto emptyTensor = rewriter.create( - loc, resultTy.getShape(), resultTy.getElementType(), filteredDims); - - SmallVector affineMaps = { - AffineMap::get(resultTy.getRank(), /*symbolCount=*/0, inputExprs, - rewriter.getContext()), - rewriter.getMultiDimIdentityMap(resultTy.getRank())}; - - rewriter.replaceOpWithNewOp( - op, resultTy, op.getInput1(), ValueRange{emptyTensor}, affineMaps, - getNParallelLoopsAttrs(resultTy.getRank()), - [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange args) { - nestedBuilder.create(loc, *args.begin()); - }); - return success(); - } -}; - class RescaleConverter : public OpRewritePattern { public: using OpRewritePattern::OpRewritePattern; @@ -2454,7 +2405,6 @@ void mlir::tosa::populateTosaToLinalgConversionPatterns( ReverseConverter, RFFT2dConverter, TableConverter, - TileConverter, - TransposeConverter>(patterns->getContext()); + TileConverter>(patterns->getContext()); // clang-format on } diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp index b3fbc7dd0b22c..8dc2d27bd545f 100644 --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp @@ -19,6 +19,7 @@ #include "mlir/Dialect/Tensor/Utils/Utils.h" #include "mlir/Dialect/Tosa/IR/TosaOps.h" #include "mlir/Dialect/Tosa/Utils/ConversionUtils.h" +#include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/Utils/ReshapeOpsUtils.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/PatternMatch.h" @@ -984,6 +985,31 @@ class AvgPool2dConverter : public OpRewritePattern { } }; +class TransposeConverter : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult 
matchAndRewrite(tosa::TransposeOp op, + PatternRewriter &rewriter) const final { + SmallVector constantPerms; + if (failed(op.getConstantPerms(constantPerms))) + return failure(); + + Location loc = op.getLoc(); + // The verifier should have made sure we have a valid permutation tensor. + assert(isPermutationVector(constantPerms) && "Expected valid permutation"); + SmallVector inputSizes = + tensor::getMixedSizes(rewriter, loc, op.getInput1()); + auto permutedSizes = + applyPermutation(inputSizes, constantPerms); + + auto permutedInit = rewriter.create( + loc, permutedSizes, op.getInput1().getType().getElementType()); + rewriter.replaceOpWithNewOp( + op, op.getInput1(), permutedInit, constantPerms); + return success(); + } +}; } // namespace void mlir::tosa::populateTosaToLinalgNamedConversionPatterns( @@ -1004,6 +1030,8 @@ void mlir::tosa::populateTosaToLinalgNamedConversionPatterns( MatMulConverter, MaxPool2dConverter, AvgPool2dConverter, - FullyConnectedConverter>(patterns->getContext()); + FullyConnectedConverter, + TransposeConverter + >(patterns->getContext()); // clang-format on } diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamedPass.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamedPass.cpp index 5312dc164c26c..096969391e51b 100644 --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamedPass.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamedPass.cpp @@ -60,6 +60,7 @@ struct TosaToLinalgNamed target.addIllegalOp(); target.addIllegalOp(); target.addIllegalOp(); + target.addIllegalOp(); target.markUnknownOpDynamicallyLegal([](Operation *) { return true; }); diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir index aa010e759a0f2..6616ea7cf699f 100644 --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir @@ -88,7 +88,8 @@ func.func @matmul_dyn_output(%arg0: tensor<1x1x8xf32>, %arg1: tensor<1x8x1xf32>) // CHECK-LABEL: @fully_connected func.func @fully_connected(%arg0: tensor<5x3xf32>, %arg1: tensor<6x3xf32>, %arg2: tensor<6xf32>) -> (tensor<5x6xf32>) { // CHECK: %[[PERM:.+]] = arith.constant dense<[1, 0]> : tensor<2xi64> - // CHECK: %[[TRANSPOSED:.+]] = tosa.transpose %arg1, %[[PERM]] : (tensor<6x3xf32>, tensor<2xi64>) -> tensor<3x6xf32> + // CHECK: %[[TRANSPOSEDINIT:.+]] = tensor.empty() : tensor<3x6xf32> + // CHECK: %[[TRANSPOSED:.+]] = linalg.transpose ins(%arg1 : tensor<6x3xf32>) outs(%[[TRANSPOSEDINIT]] : tensor<3x6xf32>) permutation = [1, 0] // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<5x6xf32> // CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%arg2 : tensor<6xf32>) outs(%[[INIT]] : tensor<5x6xf32>) { @@ -110,7 +111,7 @@ func.func @fully_connected(%arg0: tensor<5x3xf32>, %arg1: tensor<6x3xf32>, %arg2 // CHECK-LABEL: @quantized_fully_connected func.func @quantized_fully_connected(%arg0: tensor<5x3xi8>, %arg1: tensor<6x3xi8>, %arg2: tensor<6xi32>) -> (tensor<5x6xi32>) { // CHECK: %[[PERM:.+]] = arith.constant dense<[1, 0]> : tensor<2xi64> - // CHECK: %[[TRANSPOSE:.+]] = tosa.transpose %arg1, %[[PERM]] : (tensor<6x3xi8>, tensor<2xi64>) -> tensor<3x6xi8> + // CHECK: %[[TRANSPOSE:.+]] = linalg.transpose ins(%arg1 : tensor<6x3xi8>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<3x6xi8>) permutation = [1, 0] // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<5x6xi32> // CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps 
= [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%arg2 : tensor<6xi32>) outs(%[[INIT]] : tensor<5x6xi32>) { @@ -136,7 +137,7 @@ func.func @fully_connected_dyn(%arg0: tensor, %arg1: tensor<6x3xf32>, % // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: %[[DIM0:.+]] = tensor.dim %arg0, %c0 : tensor // CHECK: %[[PERM:.+]] = arith.constant dense<[1, 0]> : tensor<2xi64> - // CHECK: %[[TRANSPOSED:.+]] = tosa.transpose %arg1, %[[PERM]] : (tensor<6x3xf32>, tensor<2xi64>) -> tensor<3x6xf32> + // CHECK: %[[TRANSPOSED:.+]] = linalg.transpose ins(%arg1 : tensor<6x3xf32>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<3x6xf32>) permutation = [1, 0] // CHECK: %[[INIT:.+]] = tensor.empty(%[[DIM0]]) : tensor // CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%arg2 : tensor<6xf32>) outs(%[[INIT]] : tensor) { @@ -377,7 +378,7 @@ func.func @avg_pool_dyn(%arg0: tensor) -> (tensor) // CHECK-LABEL: @conv2d_i8 func.func @conv2d_i8(%input: tensor<1x49x42x27xi8>, %weights: tensor<28x1x1x27xi8>, %bias: tensor<28xi8>) -> () { // HWCF: %[[TRANSPOSE_DIMS:.+]] = arith.constant dense<[1, 2, 3, 0]> : tensor<4xi64> - // HWCF: %[[TRANSPOSE:.+]] = tosa.transpose %arg1, %[[TRANSPOSE_DIMS]] : (tensor<28x1x1x27xi8>, tensor<4xi64>) -> tensor<1x1x27x28xi8> + // HWCF: %[[TRANSPOSE:.+]] = linalg.transpose ins(%arg1 : tensor<28x1x1x27xi8>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<1x1x27x28xi8>) permutation = [1, 2, 3, 0] // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x45x40x28xi32> // CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2 : tensor<28xi8>) outs(%[[INIT]] : tensor<1x45x40x28xi32>) { // CHECK: arith.extsi @@ -398,7 +399,7 @@ func.func @conv2d_i8(%input: tensor<1x49x42x27xi8>, %weights: tensor<28x1x1x27xi // CHECK-LABEL: @conv2d_f32 func.func @conv2d_f32(%input: tensor<1x49x42x27xf32>, %weights: tensor<28x3x3x27xf32>, %bias: tensor<28xf32>) -> () { // HWCF: %[[TRANSPOSE_DIMS:.+]] = arith.constant dense<[1, 2, 3, 0]> : tensor<4xi64> - // HWCF: %[[TRANSPOSE:.+]] = tosa.transpose %arg1, %[[TRANSPOSE_DIMS]] : (tensor<28x3x3x27xf32>, tensor<4xi64>) -> tensor<3x3x27x28xf32> + // HWCF: %[[TRANSPOSE:.+]] = linalg.transpose ins(%arg1 : tensor<28x3x3x27xf32>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<3x3x27x28xf32>) permutation = [1, 2, 3, 0] // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x45x40x28xf32> // CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2 : tensor<28xf32>) outs(%[[INIT]] : tensor<1x45x40x28xf32>) { @@ -677,7 +678,7 @@ func.func @depthwise_conv2d_dyn_w_h(%arg0: tensor<2x?x?x3xf32>, %arg1: tensor<3x // CHECK-LABEL: @conv3d_f32 func.func @conv3d_f32(%input: tensor<1x49x48x47x27xf32>, %weights: tensor<28x3x4x5x27xf32>, %bias: tensor<28xf32>) -> () { // CHECK-DAG: %[[PERMS:.+]] = arith.constant dense<[1, 2, 3, 4, 0]> - // CHECK-DAG: %[[TRANSPOSE:.+]] = tosa.transpose %arg1, %[[PERMS]] + // CHECK-DAG: %[[TRANSPOSE:.+]] = linalg.transpose ins(%arg1 : tensor<28x3x4x5x27xf32>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<3x4x5x27x28xf32>) permutation = [1, 2, 3, 4, 0] // CHECK-DAG: %[[INIT:.+]] = tensor.empty() : tensor<1x47x45x43x28xf32> // CHECK: %[[BROADCAST:.+]] = linalg.generic // CHECK-SAME: {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel", "parallel", 
"parallel"]} @@ -701,7 +702,7 @@ func.func @conv3d_f32(%input: tensor<1x49x48x47x27xf32>, %weights: tensor<28x3x4 // CHECK-LABEL: @conv3d_i8 func.func @conv3d_i8(%input: tensor<1x49x48x47x27xi8>, %weights: tensor<28x3x4x5x27xi8>, %bias: tensor<28xi32>) -> () { // CHECK-DAG: %[[PERMS:.+]] = arith.constant dense<[1, 2, 3, 4, 0]> - // CHECK-DAG: %[[TRANSPOSE:.+]] = tosa.transpose %arg1, %[[PERMS]] + // CHECK-DAG: %[[TRANSPOSE:.+]] = linalg.transpose ins(%arg1 : tensor<28x3x4x5x27xi8>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<3x4x5x27x28xi8>) permutation = [1, 2, 3, 4, 0] // CHECK-DAG: %[[INIT:.+]] = tensor.empty() : tensor<1x47x45x43x28xi32> // CHECK: %[[BROADCAST:.+]] = linalg.generic // CHECK-SAME: {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} @@ -720,3 +721,63 @@ func.func @conv3d_i8(%input: tensor<1x49x48x47x27xi8>, %weights: tensor<28x3x4x5 %0 = tosa.conv3d %input, %weights, %bias {pad = array, quantization_info = #tosa.conv_quant, stride = array, dilation = array} : (tensor<1x49x48x47x27xi8>, tensor<28x3x4x5x27xi8>, tensor<28xi32>) -> tensor<1x47x45x43x28xi32> return } + +// ----- + +// CHECK-LABEL: @test_transpose +// CHECK-SAME: (%[[ARG0:.+]]: tensor<1x2x3xi32>) +func.func @test_transpose(%arg0: tensor<1x2x3xi32>) -> () { + %0 = arith.constant dense<[1, 2, 0]> : tensor<3xi32> + // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<2x3x1xi32> + // CHECK: %[[TRANSPOSE:.+]] = linalg.transpose ins(%[[ARG0]] : tensor<1x2x3xi32>) outs(%[[INIT]] : tensor<2x3x1xi32>) permutation = [1, 2, 0] + %1 = tosa.transpose %arg0, %0 : (tensor<1x2x3xi32>, tensor<3xi32>) -> tensor<2x3x1xi32> + return +} + +// ----- + +// CHECK-LABEL: @test_transpose_dyn +// CHECK-SAME: (%[[ARG0:.+]]: tensor<1x?x3x4xi32>) +func.func @test_transpose_dyn(%arg0: tensor<1x?x3x4xi32>) -> () { + %0 = arith.constant dense<[1, 3, 0, 2]> : tensor<4xi32> + // CHECK: %[[C1:.+]] = arith.constant 1 + // CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C1]] + // CHECK: %[[INIT:.+]] = tensor.empty(%[[DIM]]) : tensor + // CHECK: %[[TRANSPOSE:.+]] = linalg.transpose ins(%[[ARG0]] : tensor<1x?x3x4xi32>) outs(%[[INIT]] : tensor) permutation = [1, 3, 0, 2] + %1 = tosa.transpose %arg0, %0 : (tensor<1x?x3x4xi32>, tensor<4xi32>) -> tensor + return +} + +// ----- + +// CHECK-LABEL: @test_transpose_dyn_multiple_2d +// CHECK-SAME: (%[[ARG0:.+]]: tensor) +func.func @test_transpose_dyn_multiple_2d(%arg0: tensor) -> () { + %0 = arith.constant dense<[1, 0]> : tensor<2xi32> + // CHECK-DAG: %[[C0:.+]] = arith.constant 0 + // CHECK-DAG: %[[DIM0:.+]] = tensor.dim %[[ARG0]], %[[C0]] + // CHECK-DAG: %[[C1:.+]] = arith.constant 1 + // CHECK-DAG: %[[DIM1:.+]] = tensor.dim %[[ARG0]], %[[C1]] + // CHECK: %[[INIT:.+]] = tensor.empty(%[[DIM1]], %[[DIM0]]) + // CHECK: %[[TRANSPOSE:.+]] = linalg.transpose ins(%[[ARG0]] : tensor) outs(%[[INIT]] : tensor) permutation = [1, 0] + %1 = tosa.transpose %arg0, %0 : (tensor, tensor<2xi32>) -> tensor + return +} + +// ----- + +// CHECK-LABEL: @test_transpose_dyn_multiple_3d +// CHECK-SAME: (%[[ARG0:.+]]: tensor) +func.func @test_transpose_dyn_multiple_3d(%arg0: tensor) { + %0 = arith.constant dense<[2, 0, 1]> : tensor<3xi32> + // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index + // CHECK-DAG: %[[DIM0:.*]] = tensor.dim %[[ARG0]], %[[C0]] : tensor + // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index + // CHECK-DAG: %[[DIM1:.*]] = tensor.dim %[[ARG0]], %[[C1]] : tensor + // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index + // CHECK-DAG: %[[DIM2:.*]] = tensor.dim 
%[[ARG0]], %[[C2]] : tensor + // CHECK: %[[INIT:.*]] = tensor.empty(%[[DIM2]], %[[DIM0]], %[[DIM1]]) : tensor + // CHECK: %[[TRANSPOSE:.*]] = linalg.transpose ins(%[[ARG0]] : tensor) outs(%[[INIT]] : tensor) permutation = [2, 0, 1] + %1 = "tosa.transpose"(%arg0, %0) : (tensor, tensor<3xi32>) -> tensor + return +} diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-pipeline.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-pipeline.mlir index e009cab4b1bf5..c2bbfd5130ebc 100644 --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-pipeline.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-pipeline.mlir @@ -38,13 +38,3 @@ func.func @avg_pool2d_with_unsupported_quant_type(%arg0: tensor<1x7x7x9x!quant.u %0 = "tosa.avg_pool2d"(%arg0) {acc_type = i32, kernel = array, pad = array, stride = array} : (tensor<1x7x7x9x!quant.uniform>) -> tensor<1x7x7x9x!quant.uniform> return %0 : tensor<1x7x7x9x!quant.uniform> } - -// ----- - -// check that --tosa-validate=strict-op-spec-alignment does not kick in because tosa-to-linalg-named comes before tosa-validate -// this would have failed tosa strict-op-spec-alignment because perms of transpose is not constant -// but tosa.transpose is lowered by tosa-to-linalg-named pass which is earlier than tosa-validate pass in the pipeline -func.func @test_transpose_non_const(%arg0: tensor<13x21x3xf32>, %arg1: tensor<3xi32>) -> tensor<3x13x21xf32> { - %0 = tosa.transpose %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<3xi32>) -> tensor<3x13x21xf32> - return %0 : tensor<3x13x21xf32> -} diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir index e0e041139fe4d..535316e5a3721 100644 --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir @@ -820,7 +820,6 @@ func.func @test_negate_quantized(%arg0: tensor<1xi8>) -> () { return } - // ----- // CHECK-LABEL: @test_identity @@ -836,90 +835,6 @@ func.func @test_identity(%arg0: tensor<1xf32>, %arg1: tensor<1xi32>) -> (tensor< // ----- -// CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1, d2) -> (d2, d0, d1)> -// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)> - -// CHECK-LABEL: @test_transpose -// CHECK-SAME: ([[ARG0:%.+]]: tensor<1x2x3xi32>) -func.func @test_transpose(%arg0: tensor<1x2x3xi32>) -> () { - %0 = arith.constant dense<[1, 2, 0]> : tensor<3xi32> - // CHECK: [[INIT:%.+]] = tensor.empty() : tensor<2x3x1xi32> - // CHECK: [[GENERIC:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel"]} ins([[ARG0]] : tensor<1x2x3xi32>) outs([[OUT:%.+]] : tensor<2x3x1xi32>) - // CHECK: ^bb0([[ARG1:%.+]]: i32, [[ARG2:%.+]]: i32) - // CHECK: linalg.yield [[ARG1]] - // CHECK: } - %1 = tosa.transpose %arg0, %0 : (tensor<1x2x3xi32>, tensor<3xi32>) -> tensor<2x3x1xi32> - return -} - -// ----- - -// CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1, d2, d3) -> (d2, d0, d3, d1)> -// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> - -// CHECK-LABEL: @test_transpose_dyn -// CHECK-SAME: (%[[ARG0:.+]]: tensor<1x?x3x4xi32>) -func.func @test_transpose_dyn(%arg0: tensor<1x?x3x4xi32>) -> () { - %0 = arith.constant dense<[1, 3, 0, 2]> : tensor<4xi32> - // CHECK: %[[C1:.+]] = arith.constant 1 - // CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C1]] - // CHECK: %[[INIT:.+]] = tensor.empty(%[[DIM]]) : tensor - // CHECK: %[[GENERIC:.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = 
["parallel", "parallel", "parallel", "parallel"]} ins(%[[ARG0]] : tensor<1x?x3x4xi32>) outs([[OUT:%.+]] : tensor) - // CHECK: ^bb0([[ARG1:%.+]]: i32, [[ARG2:%.+]]: i32) - // CHECK: linalg.yield [[ARG1]] - // CHECK: } - %1 = tosa.transpose %arg0, %0 : (tensor<1x?x3x4xi32>, tensor<4xi32>) -> tensor - return -} - -// ----- - -// CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1) -> (d1, d0)> -// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1) -> (d0, d1)> - -// CHECK-LABEL: @test_transpose_dyn_multiple_2d -// CHECK-SAME: (%[[ARG0:.+]]: tensor) -func.func @test_transpose_dyn_multiple_2d(%arg0: tensor) -> () { - %0 = arith.constant dense<[1, 0]> : tensor<2xi32> - // CHECK-DAG: %[[C0:.+]] = arith.constant 0 - // CHECK-DAG: %[[DIM0:.+]] = tensor.dim %[[ARG0]], %[[C0]] - // CHECK-DAG: %[[C1:.+]] = arith.constant 1 - // CHECK-DAG: %[[DIM1:.+]] = tensor.dim %[[ARG0]], %[[C1]] - // CHECK: %[[INIT:.+]] = tensor.empty(%[[DIM1]], %[[DIM0]]) - // CHECK: %[[GENERIC:.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%[[ARG0]] : tensor) outs([[OUT:%.+]] : tensor) - // CHECK: ^bb0([[ARG1:%.+]]: f32, [[ARG2:%.+]]: f32) - // CHECK: linalg.yield [[ARG1]] - // CHECK: } - %1 = tosa.transpose %arg0, %0 : (tensor, tensor<2xi32>) -> tensor - return -} - -// ----- - -// CHECK: #[[$MAP0:.+]] = affine_map<(d0, d1, d2) -> (d1, d2, d0)> -// CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)> - -// CHECK-LABEL: @test_transpose_dyn_multiple_3d -// CHECK-SAME: (%[[ARG0:.+]]: tensor) -func.func @test_transpose_dyn_multiple_3d(%arg0: tensor) { - %0 = arith.constant dense<[2, 0, 1]> : tensor<3xi32> - // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index - // CHECK-DAG: %[[DIM0:.*]] = tensor.dim %[[ARG0]], %[[C0]] : tensor - // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index - // CHECK-DAG: %[[DIM1:.*]] = tensor.dim %[[ARG0]], %[[C1]] : tensor - // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index - // CHECK-DAG: %[[DIM2:.*]] = tensor.dim %[[ARG0]], %[[C2]] : tensor - // CHECK: %[[INIT:.*]] = tensor.empty(%[[DIM2]], %[[DIM0]], %[[DIM1]]) : tensor - // CHECK: %[[GENERIC:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel"]} ins(%[[ARG0]] : tensor) outs(%[[INIT]] : tensor) { - // CHECK: ^bb0(%[[IN0:.*]]: f32, %[[OUT0:.*]]: f32): - // CHECK: linalg.yield %[[IN0]] : f32 - // CHECK: } -> tensor - %1 = "tosa.transpose"(%arg0, %0) : (tensor, tensor<3xi32>) -> tensor - return -} - -// ----- - // CHECK-LABEL: @reduce_float // CHECK-SAME: [[ARG0:%.+]]: tensor<5x4xf32> func.func @reduce_float(%arg0: tensor<5x4xf32>) -> () { From e775ba384efe31928bac796b2a4d388a1c298c5e Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 2 Jan 2024 11:03:41 -0800 Subject: [PATCH 070/313] [SLP][NFC]Add some extra values to avoid constant expressions in the test. 
--- .../AArch64/reorder-fmuladd-crash.ll | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll index b20dd0c41e174..0a68996410448 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll @@ -1,9 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=aarch64-w32-windows-gnu | FileCheck %s -define i32 @foo() { +define i32 @foo(i32 %v1, double %v2) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[V1:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = sitofp <2 x i32> [[TMP0]] to <2 x double> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> ; CHECK-NEXT: br label [[FOR_COND15_PREHEADER:%.*]] ; CHECK: for.cond15.preheader: ; CHECK-NEXT: br label [[IF_END:%.*]] @@ -13,21 +16,24 @@ define i32 @foo() { ; CHECK-NEXT: br label [[FOR_COND15:%.*]] ; CHECK: for.end39: ; CHECK-NEXT: switch i32 undef, label [[DO_BODY:%.*]] [ -; CHECK-NEXT: i32 0, label [[SW_BB:%.*]] -; CHECK-NEXT: i32 1, label [[SW_BB195:%.*]] +; CHECK-NEXT: i32 0, label [[SW_BB:%.*]] +; CHECK-NEXT: i32 1, label [[SW_BB195:%.*]] ; CHECK-NEXT: ] ; CHECK: sw.bb: ; CHECK-NEXT: [[ARRAYIDX43:%.*]] = getelementptr inbounds [4 x [2 x double]], ptr undef, i32 0, i64 1, i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[ARRAYIDX43]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> zeroinitializer, <4 x double> [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> , double [[V2:%.*]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x double>, ptr [[ARRAYIDX43]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x double> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> [[TMP2]], <4 x double> [[TMP7]]) ; CHECK-NEXT: br label [[SW_EPILOG:%.*]] ; CHECK: sw.bb195: ; CHECK-NEXT: br label [[SW_EPILOG]] ; CHECK: do.body: ; CHECK-NEXT: unreachable ; CHECK: sw.epilog: -; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x double> [ undef, [[SW_BB195]] ], [ [[TMP3]], [[SW_BB]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x double> [ undef, [[SW_BB195]] ], [ [[TMP8]], [[SW_BB]] ] ; CHECK-NEXT: ret i32 undef ; CHECK: if.end.1: ; CHECK-NEXT: br label [[FOR_COND15_1:%.*]] @@ -36,7 +42,7 @@ define i32 @foo() { ; entry: %conv = sitofp i32 undef to double - %conv2 = sitofp i32 undef to double + %conv2 = sitofp i32 %v1 to double br label %for.cond15.preheader for.cond15.preheader: ; preds = %for.cond15.1, %entry @@ -63,7 +69,7 @@ sw.bb: ; preds = %for.end39 %2 = load double, ptr %arrayidx51, align 8 %arrayidx58 = getelementptr inbounds [4 x [2 x double]], ptr undef, i32 0, i64 1, i64 1 %3 = load double, ptr %arrayidx58, align 8 - %mul = fmul double undef, %conv2 + %mul = fmul double %v2, %conv2 %mul109 = fmul double undef, %conv %mul143 = fmul double %0, %mul %4 = call double @llvm.fmuladd.f64(double undef, double %conv2, double %mul143) From 273aefec66cea947787f053adee25175f4168713 Mon 
Sep 17 00:00:00 2001 From: Aart Bik <39774503+aartbik@users.noreply.github.com> Date: Tue, 2 Jan 2024 11:57:00 -0800 Subject: [PATCH 071/313] [mlir][sparse] enable rt path for transpose COO (#76747) COO has been supported for a while now on both lib and codegen path. So enabling this test for all paths and removing obsolete FIXME --- .../Dialect/SparseTensor/CPU/sparse_transpose_coo.mlir | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose_coo.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose_coo.mlir index 95030fdf258bf..dba897334830a 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose_coo.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose_coo.mlir @@ -17,8 +17,7 @@ // DEFINE: %{env} = //-------------------------------------------------------------------------------------------------- -// FIXME: lib path does not support all of COO yet -// R_U_N: %{compile} | %{run} | FileCheck %s +// RUN: %{compile} | %{run} | FileCheck %s // // Do the same run, but now with direct IR generation. // REDEFINE: %{sparsifier_opts} = enable-runtime-library=false From 619a0069240b0b55ab3e7dc280a8ca31e805747a Mon Sep 17 00:00:00 2001 From: David CARLIER Date: Tue, 2 Jan 2024 20:01:54 +0000 Subject: [PATCH 072/313] [compiler-rt] eventfd api interception on freebsd. (#76564) --- .../lib/sanitizer_common/sanitizer_common_interceptors.inc | 4 ++-- .../lib/sanitizer_common/sanitizer_platform_interceptors.h | 2 +- .../lib/sanitizer_common/sanitizer_platform_limits_freebsd.h | 2 ++ .../lib/sanitizer_common/sanitizer_platform_limits_posix.h | 2 ++ compiler-rt/test/msan/{Linux => }/eventfd.cpp | 5 ++++- 5 files changed, 11 insertions(+), 4 deletions(-) rename compiler-rt/test/msan/{Linux => }/eventfd.cpp (75%) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc index ba46707516971..77fa1b4965a7a 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc @@ -6785,7 +6785,7 @@ INTERCEPTOR(SSIZE_T, sendto, int fd, void *buf, SIZE_T len, int flags, #endif #if SANITIZER_INTERCEPT_EVENTFD_READ_WRITE -INTERCEPTOR(int, eventfd_read, int fd, u64 *value) { +INTERCEPTOR(int, eventfd_read, int fd, __sanitizer_eventfd_t *value) { void *ctx; COMMON_INTERCEPTOR_ENTER(ctx, eventfd_read, fd, value); COMMON_INTERCEPTOR_FD_ACCESS(ctx, fd); @@ -6796,7 +6796,7 @@ INTERCEPTOR(int, eventfd_read, int fd, u64 *value) { } return res; } -INTERCEPTOR(int, eventfd_write, int fd, u64 value) { +INTERCEPTOR(int, eventfd_write, int fd, __sanitizer_eventfd_t value) { void *ctx; COMMON_INTERCEPTOR_ENTER(ctx, eventfd_write, fd, value); if (fd >= 0) { diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h index c1b416eb8340e..289ae661c343f 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h @@ -463,7 +463,7 @@ (SI_LINUX || SI_MAC || SI_WINDOWS || SI_FREEBSD || SI_NETBSD || SI_SOLARIS) #define SANITIZER_INTERCEPT_RECV_RECVFROM SI_POSIX #define SANITIZER_INTERCEPT_SEND_SENDTO SI_POSIX -#define SANITIZER_INTERCEPT_EVENTFD_READ_WRITE SI_LINUX +#define SANITIZER_INTERCEPT_EVENTFD_READ_WRITE (SI_LINUX || SI_FREEBSD) #define 
SI_STAT_LINUX (SI_LINUX && __GLIBC_PREREQ(2, 33)) #define SANITIZER_INTERCEPT_STAT \ diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.h index b119f059007d8..43b8a38f39be1 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.h @@ -726,6 +726,8 @@ struct __sanitizer_cpuset { typedef struct __sanitizer_cpuset __sanitizer_cpuset_t; extern unsigned struct_cpuset_sz; + +typedef unsigned long long __sanitizer_eventfd_t; } // namespace __sanitizer # define CHECK_TYPE_SIZE(TYPE) \ diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h index 58244c9944a03..1ee26e0cbbb99 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h @@ -938,6 +938,8 @@ struct __sanitizer_cookie_io_functions_t { __sanitizer_cookie_io_seek seek; __sanitizer_cookie_io_close close; }; + +typedef unsigned long long __sanitizer_eventfd_t; #endif #define IOC_NRBITS 8 diff --git a/compiler-rt/test/msan/Linux/eventfd.cpp b/compiler-rt/test/msan/eventfd.cpp similarity index 75% rename from compiler-rt/test/msan/Linux/eventfd.cpp rename to compiler-rt/test/msan/eventfd.cpp index 4399211258fff..638b883fd2c1c 100644 --- a/compiler-rt/test/msan/Linux/eventfd.cpp +++ b/compiler-rt/test/msan/eventfd.cpp @@ -1,4 +1,7 @@ -// RUN: %clangxx_msan -O0 %s -o %t && %run %t 2>&1 +/* RUN: %clangxx_msan -O0 %s -o %t && %run %t 2>&1 + + REQUIRES: target={{.*(linux|freebsd).*}} +*/ #include #include From 9191ac0bdb07643eefcc161c88b66d4e7058db9c Mon Sep 17 00:00:00 2001 From: Michael Holman Date: Tue, 2 Jan 2024 12:18:21 -0800 Subject: [PATCH 073/313] [mlir] Consider mlir-linalg-ods-gen as a tablegen tool in build (#75093) There is a bit of an issue with how `mlir-linalg-ods-yaml-gen` is classified in the MLIR build. Due to it being a tool, it is excluded from the install when using `-DLLVM_BUILD_TOOLS=OFF`. However, it is a necessary component of the build, so it can cause build issues with users of the installed LLVM, and so I think it should not be excluded. It is a tablegen-like tool, so my solution is to reclassify it that way in the build. 
--- llvm/cmake/modules/TableGen.cmake | 12 ++++--- mlir/CMakeLists.txt | 2 ++ mlir/cmake/modules/AddMLIR.cmake | 25 +++++++++++++ mlir/cmake/modules/CMakeLists.txt | 2 ++ mlir/cmake/modules/MLIRConfig.cmake.in | 1 + .../mlir/Dialect/Linalg/IR/CMakeLists.txt | 35 +------------------ mlir/lib/Dialect/Linalg/IR/CMakeLists.txt | 1 + mlir/tools/mlir-linalg-ods-gen/CMakeLists.txt | 28 +++++++-------- 8 files changed, 54 insertions(+), 52 deletions(-) diff --git a/llvm/cmake/modules/TableGen.cmake b/llvm/cmake/modules/TableGen.cmake index 1d18fdde2bb98..359d5217a22cb 100644 --- a/llvm/cmake/modules/TableGen.cmake +++ b/llvm/cmake/modules/TableGen.cmake @@ -165,14 +165,18 @@ function(add_public_tablegen_target target) endfunction() macro(add_tablegen target project) - cmake_parse_arguments(ADD_TABLEGEN "" "DESTINATION;EXPORT" "" ${ARGN}) + cmake_parse_arguments(ADD_TABLEGEN "SKIP_COMPONENT_LINK" "DESTINATION;EXPORT" "" ${ARGN}) - set(${target}_OLD_LLVM_LINK_COMPONENTS ${LLVM_LINK_COMPONENTS}) - set(LLVM_LINK_COMPONENTS ${LLVM_LINK_COMPONENTS} TableGen) + if(NOT ADD_TABLEGEN_SKIP_COMPONENT_LINK) + set(${target}_OLD_LLVM_LINK_COMPONENTS ${LLVM_LINK_COMPONENTS}) + set(LLVM_LINK_COMPONENTS ${LLVM_LINK_COMPONENTS} TableGen) + endif() add_llvm_executable(${target} DISABLE_LLVM_LINK_LLVM_DYLIB ${ADD_TABLEGEN_UNPARSED_ARGUMENTS}) - set(LLVM_LINK_COMPONENTS ${${target}_OLD_LLVM_LINK_COMPONENTS}) + if(NOT ADD_TABLEGEN_SKIP_COMPONENT_LINK) + set(LLVM_LINK_COMPONENTS ${${target}_OLD_LLVM_LINK_COMPONENTS}) + endif() set(${project}_TABLEGEN_DEFAULT "${target}") if (LLVM_NATIVE_TOOL_DIR) diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt index 16ff950089734..dcc068e4097c5 100644 --- a/mlir/CMakeLists.txt +++ b/mlir/CMakeLists.txt @@ -184,6 +184,8 @@ add_subdirectory(tools/mlir-pdll) set(MLIR_TABLEGEN_EXE "${MLIR_TABLEGEN_EXE}" CACHE INTERNAL "") set(MLIR_TABLEGEN_TARGET "${MLIR_TABLEGEN_TARGET}" CACHE INTERNAL "") +set(MLIR_LINALG_ODS_YAML_GEN_TABLEGEN_EXE "${MLIR_LINALG_ODS_YAML_GEN_TABLEGEN_EXE}" CACHE INTERNAL "") +set(MLIR_LINALG_ODS_YAML_GEN_TABLEGEN_TARGET "${MLIR_LINALG_ODS_YAML_GEN_TABLEGEN_TARGET}" CACHE INTERNAL "") set(MLIR_PDLL_TABLEGEN_EXE "${MLIR_PDLL_TABLEGEN_EXE}" CACHE INTERNAL "") set(MLIR_PDLL_TABLEGEN_TARGET "${MLIR_PDLL_TABLEGEN_TARGET}" CACHE INTERNAL "") diff --git a/mlir/cmake/modules/AddMLIR.cmake b/mlir/cmake/modules/AddMLIR.cmake index 1d2ed748bc2f1..971ea7eceff4c 100644 --- a/mlir/cmake/modules/AddMLIR.cmake +++ b/mlir/cmake/modules/AddMLIR.cmake @@ -136,6 +136,31 @@ function(add_mlir_pdll_library target inputFile ofn) add_public_tablegen_target(${target}) endfunction() +# Declare a function to generate ODS with mlir-linalg-ods-yaml-gen +function(add_linalg_ods_yaml_gen yaml_ast_file output_file) + set(YAML_AST_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/${yaml_ast_file}) + set(GEN_ODS_FILE ${CMAKE_CURRENT_BINARY_DIR}/${output_file}.yamlgen.td) + set(GEN_CPP_FILE ${CMAKE_CURRENT_BINARY_DIR}/${output_file}.yamlgen.cpp.inc) + set_source_files_properties( + ${GEN_ODS_FILE} + PROPERTIES GENERATED TRUE) + set_source_files_properties( + ${GEN_CPP_FILE} + PROPERTIES GENERATED TRUE) + add_custom_command( + OUTPUT ${GEN_ODS_FILE} ${GEN_CPP_FILE} + COMMAND ${MLIR_LINALG_ODS_YAML_GEN_TABLEGEN_EXE} ${YAML_AST_SOURCE} -o-ods-decl=${GEN_ODS_FILE} -o-impl=${GEN_CPP_FILE} + MAIN_DEPENDENCY + ${YAML_AST_SOURCE} + DEPENDS + ${MLIR_LINALG_ODS_YAML_GEN_TABLEGEN_EXE} + ${MLIR_LINALG_ODS_YAML_GEN_TABLEGEN_TARGET} + ${LLVM_TARGET_DEPENDS}) + + set(TABLEGEN_OUTPUT ${TABLEGEN_OUTPUT} ${GEN_ODS_FILE} 
${GEN_CPP_FILE} + PARENT_SCOPE) +endfunction() + # Declare a dialect in the include directory function(add_mlir_dialect dialect dialect_namespace) set(LLVM_TARGET_DEFINITIONS ${dialect}.td) diff --git a/mlir/cmake/modules/CMakeLists.txt b/mlir/cmake/modules/CMakeLists.txt index 8d2904ef46dfe..2b72beff89385 100644 --- a/mlir/cmake/modules/CMakeLists.txt +++ b/mlir/cmake/modules/CMakeLists.txt @@ -38,6 +38,7 @@ set(MLIR_CONFIG_INCLUDE_DIRS ) # Refer to the best host mlir-tbgen, which might be a host-optimized version set(MLIR_CONFIG_TABLEGEN_EXE "${MLIR_TABLEGEN_EXE}") +set(MLIR_CONFIG_LINALG_ODS_YAML_GEN_TABLEGEN_EXE "${MLIR_LINALG_ODS_YAML_GEN_TABLEGEN_EXE}") set(MLIR_CONFIG_PDLL_TABLEGEN_EXE "${MLIR_PDLL_TABLEGEN_EXE}") configure_file( @@ -76,6 +77,7 @@ set(MLIR_CONFIG_INCLUDE_DIRS # Ensure that we are using the installed mlir-tblgen. This might not be MLIR_TABLEGEN_EXE # if we're building with a host-optimized mlir-tblgen (with LLVM_OPTIMIZED_TABLEGEN). set(MLIR_CONFIG_TABLEGEN_EXE mlir-tblgen) +set(MLIR_CONFIG_LINALG_ODS_YAML_GEN_TABLEGEN_EXE mlir-linalg-ods-yaml-gen) set(MLIR_CONFIG_PDLL_TABLEGEN_EXE mlir-pdll) configure_file( diff --git a/mlir/cmake/modules/MLIRConfig.cmake.in b/mlir/cmake/modules/MLIRConfig.cmake.in index d4da3cd98cce9..0922b9accea5d 100644 --- a/mlir/cmake/modules/MLIRConfig.cmake.in +++ b/mlir/cmake/modules/MLIRConfig.cmake.in @@ -10,6 +10,7 @@ set(MLIR_EXPORTED_TARGETS "@MLIR_EXPORTS@") set(MLIR_CMAKE_DIR "@MLIR_CONFIG_CMAKE_DIR@") set(MLIR_INCLUDE_DIRS "@MLIR_CONFIG_INCLUDE_DIRS@") set(MLIR_TABLEGEN_EXE "@MLIR_CONFIG_TABLEGEN_EXE@") +set(MLIR_LINALG_ODS_YAML_GEN_TABLEGEN_EXE "@MLIR_CONFIG_LINALG_ODS_YAML_GEN_TABLEGEN_EXE@") set(MLIR_PDLL_TABLEGEN_EXE "@MLIR_CONFIG_PDLL_TABLEGEN_EXE@") set(MLIR_INSTALL_AGGREGATE_OBJECTS "@MLIR_INSTALL_AGGREGATE_OBJECTS@") set(MLIR_ENABLE_BINDINGS_PYTHON "@MLIR_ENABLE_BINDINGS_PYTHON@") diff --git a/mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt index f5d48b2ebcefe..c30665d4d118b 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt @@ -1,38 +1,5 @@ -# Declare a function to generate ODS with mlir-linalg-ods-yaml-gen -function(add_linalg_ods_yaml_gen yaml_ast_file output_file) - set(YAML_AST_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/${yaml_ast_file}) - set(GEN_ODS_FILE ${CMAKE_CURRENT_BINARY_DIR}/${output_file}.yamlgen.td) - set(GEN_CPP_FILE ${CMAKE_CURRENT_BINARY_DIR}/${output_file}.yamlgen.cpp.inc) - set_source_files_properties( - ${GEN_ODS_FILE} - PROPERTIES GENERATED TRUE) - set_source_files_properties( - ${GEN_CPP_FILE} - PROPERTIES GENERATED TRUE) - add_custom_command( - OUTPUT ${GEN_ODS_FILE} ${GEN_CPP_FILE} - COMMAND ${MLIR_LINALG_ODS_YAML_GEN_EXE} ${YAML_AST_SOURCE} -o-ods-decl=${GEN_ODS_FILE} -o-impl=${GEN_CPP_FILE} - MAIN_DEPENDENCY - ${YAML_AST_SOURCE} - DEPENDS - ${MLIR_LINALG_ODS_YAML_GEN_EXE} - ${MLIR_LINALG_ODS_YAML_GEN_TARGET}) - add_custom_target( - MLIR${output_file}YamlIncGen - DEPENDS - ${MLIR_LINALG_ODS_YAML_GEN_EXE} - ${MLIR_LINALG_ODS_YAML_GEN_TARGET} - ${GEN_ODS_FILE} ${GEN_CPP_FILE}) - list(APPEND LLVM_TARGET_DEPENDS ${GEN_ODS_FILE}) - set(LLVM_TARGET_DEPENDS ${LLVM_TARGET_DEPENDS} PARENT_SCOPE) -endfunction() - -# NOTE: LLVM_TARGET_DEPENDS gets picked up by tablegen targets to add file -# level dependencies. This is gross but CMake requires depending on both -# targets and generated files, and it must be done when the custom target is -# declared (there is no way to add after the fact). 
-set(LLVM_TARGET_DEPENDS "") add_linalg_ods_yaml_gen(LinalgNamedStructuredOps.yaml LinalgNamedStructuredOps) +add_public_tablegen_target(MLIRLinalgNamedStructuredOpsYamlIncGen) # Provide a short name for all external dependency that needs to # include Linalg in ODS diff --git a/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt b/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt index f0ac1899bb02a..65b65014b23cf 100644 --- a/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt +++ b/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt @@ -9,6 +9,7 @@ add_mlir_dialect_library(MLIRLinalgDialect DEPENDS MLIRLinalgInterfacesIncGen + MLIRLinalgNamedStructuredOpsYamlIncGen MLIRLinalgOpsAttributesIncGen MLIRLinalgOpsEnumsIncGen MLIRLinalgOpsIncGen diff --git a/mlir/tools/mlir-linalg-ods-gen/CMakeLists.txt b/mlir/tools/mlir-linalg-ods-gen/CMakeLists.txt index 787a0bb35d7b1..2b2024da6409a 100644 --- a/mlir/tools/mlir-linalg-ods-gen/CMakeLists.txt +++ b/mlir/tools/mlir-linalg-ods-gen/CMakeLists.txt @@ -3,26 +3,26 @@ set(LLVM_LINK_COMPONENTS Support ) -# New mlir-linalg-ods-yaml-gen. -add_mlir_tool(mlir-linalg-ods-yaml-gen - mlir-linalg-ods-yaml-gen.cpp -) -llvm_update_compile_flags(mlir-linalg-ods-yaml-gen) -target_link_libraries(mlir-linalg-ods-yaml-gen PRIVATE +set(LIBS MLIRIR MLIRSupport MLIRParser - ) +) -setup_host_tool(mlir-linalg-ods-yaml-gen MLIR_LINALG_ODS_YAML_GEN MLIR_LINALG_ODS_YAML_GEN_EXE MLIR_LINALG_ODS_YAML_GEN_TARGET) +# New mlir-linalg-ods-yaml-gen. +add_tablegen(mlir-linalg-ods-yaml-gen MLIR_LINALG_ODS_YAML_GEN + DESTINATION "${MLIR_TOOLS_INSTALL_DIR}" + EXPORT MLIR + SKIP_COMPONENT_LINK + mlir-linalg-ods-yaml-gen.cpp -if(NOT ${MLIR_LINALG_ODS_YAML_GEN_EXE} STREQUAL "mlir-linalg-ods-yaml-gen") - add_custom_target(mlir-linalg-ods-yaml-gen-host DEPENDS ${MLIR_LINALG_ODS_YAML_GEN_EXE}) + DEPENDS + ${LIBS} +) +set_target_properties(mlir-linalg-ods-yaml-gen PROPERTIES FOLDER "Tablegenning") +target_link_libraries(mlir-linalg-ods-yaml-gen PRIVATE ${LIBS}) - if(NOT LLVM_BUILD_UTILS) - set_target_properties(mlir-linalg-ods-yaml-gen PROPERTIES EXCLUDE_FROM_ALL ON) - endif() -endif() +mlir_check_all_link_libraries(mlir-linalg-ods-yaml-gen) configure_file( update_core_linalg_named_ops.sh.in From 1fa18fee72639944291b1300a6995abde9b9f51a Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 2 Jan 2024 20:21:37 +0000 Subject: [PATCH 074/313] [ConstraintElim] Add test case for #76713 See https://github.com/llvm/llvm-project/issues/76713. --- .../ConstraintElimination/sub-nuw.ll | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/llvm/test/Transforms/ConstraintElimination/sub-nuw.ll b/llvm/test/Transforms/ConstraintElimination/sub-nuw.ll index 0d90bc2fb3435..10b314c87f694 100644 --- a/llvm/test/Transforms/ConstraintElimination/sub-nuw.ll +++ b/llvm/test/Transforms/ConstraintElimination/sub-nuw.ll @@ -379,3 +379,40 @@ entry: %c = icmp ugt i64 %neg2, 0 ret i1 %c } + +; FIXME: currently this incorrectly simplifies %c4 to true. 
+define i1 @pr76713(i16 %i1, i16 %i3) { +; CHECK-LABEL: @pr76713( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C1:%.*]] = icmp ult i16 [[I1:%.*]], -1 +; CHECK-NEXT: [[C2:%.*]] = icmp uge i16 [[I1]], -3 +; CHECK-NEXT: [[C3:%.*]] = icmp ult i16 [[I3:%.*]], 2 +; CHECK-NEXT: [[AND:%.*]] = and i1 [[C1]], [[C2]] +; CHECK-NEXT: [[AND_2:%.*]] = and i1 [[AND]], [[C3]] +; CHECK-NEXT: br i1 [[AND]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i16 [[I1]], -3 +; CHECK-NEXT: [[ARRAYIDX_IDX:%.*]] = mul nuw nsw i16 [[I3]], 4 +; CHECK-NEXT: [[I6:%.*]] = add nuw nsw i16 [[ARRAYIDX_IDX]], [[SUB]] +; CHECK-NEXT: ret i1 true +; CHECK: else: +; CHECK-NEXT: ret i1 false +; +entry: + %c1 = icmp ult i16 %i1, -1 + %c2 = icmp uge i16 %i1, -3 + %c3 = icmp ult i16 %i3, 2 + %and = and i1 %c1, %c2 + %and.2 = and i1 %and, %c3 + br i1 %and, label %then, label %else + +then: + %sub = sub nuw nsw i16 %i1, -3 + %arrayidx.idx = mul nuw nsw i16 %i3, 4 + %i6 = add nuw nsw i16 %arrayidx.idx, %sub + %c4 = icmp ult i16 12, %i6 + ret i1 %c4 + +else: + ret i1 0 +} From c91fab50410eb740ad1318164c820f9750f9a1bf Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Tue, 2 Jan 2024 20:27:01 +0000 Subject: [PATCH 075/313] Revert "[llvm-cxxfilt] Added the option --no-params (#75348)" This reverts commit 71f8ea3062a6b0a190835853ee77e58469763b9e. Test doesn't pass on mac. See comments on https://github.com/llvm/llvm-project/pull/75348. --- libcxxabi/src/demangle/ItaniumDemangle.h | 22 ++++--------- llvm/docs/CommandGuide/llvm-cxxfilt.rst | 4 --- llvm/include/llvm/Demangle/Demangle.h | 5 ++- llvm/include/llvm/Demangle/ItaniumDemangle.h | 22 ++++--------- llvm/lib/Demangle/Demangle.cpp | 5 ++- llvm/lib/Demangle/ItaniumDemangle.cpp | 4 +-- llvm/test/tools/llvm-cxxfilt/no-params.test | 34 -------------------- llvm/tools/llvm-cxxfilt/Opts.td | 2 -- llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp | 10 ++---- 9 files changed, 21 insertions(+), 87 deletions(-) delete mode 100644 llvm/test/tools/llvm-cxxfilt/no-params.test diff --git a/libcxxabi/src/demangle/ItaniumDemangle.h b/libcxxabi/src/demangle/ItaniumDemangle.h index 5a53a18bcc5fe..90f25519fad1c 100644 --- a/libcxxabi/src/demangle/ItaniumDemangle.h +++ b/libcxxabi/src/demangle/ItaniumDemangle.h @@ -2794,7 +2794,7 @@ template struct AbstractManglingParser { Node *parseClassEnumType(); Node *parseQualifiedType(); - Node *parseEncoding(bool ParseParams = true); + Node *parseEncoding(); bool parseCallOffset(); Node *parseSpecialName(); @@ -2911,7 +2911,7 @@ template struct AbstractManglingParser { Node *parseDestructorName(); /// Top-level entry point into the parser. - Node *parse(bool ParseParams = true); + Node *parse(); }; const char* parse_discriminator(const char* first, const char* last); @@ -5405,7 +5405,7 @@ Node *AbstractManglingParser::parseSpecialName() { // ::= // ::= template -Node *AbstractManglingParser::parseEncoding(bool ParseParams) { +Node *AbstractManglingParser::parseEncoding() { // The template parameters of an encoding are unrelated to those of the // enclosing context. SaveTemplateParams SaveTemplateParamsScope(this); @@ -5431,16 +5431,6 @@ Node *AbstractManglingParser::parseEncoding(bool ParseParams) { if (IsEndOfEncoding()) return Name; - // ParseParams may be false at the top level only, when called from parse(). - // For example in the mangled name _Z3fooILZ3BarEET_f, ParseParams may be - // false when demangling 3fooILZ3BarEET_f but is always true when demangling - // 3Bar. 
- if (!ParseParams) { - while (consume()) - ; - return Name; - } - Node *Attrs = nullptr; if (consumeIf("Ua9enable_ifI")) { size_t BeforeArgs = Names.size(); @@ -5905,9 +5895,9 @@ AbstractManglingParser::parseTemplateArgs(bool TagTemplates) { // extension ::= ___Z _block_invoke+ // extension ::= ___Z _block_invoke_+ template -Node *AbstractManglingParser::parse(bool ParseParams) { +Node *AbstractManglingParser::parse() { if (consumeIf("_Z") || consumeIf("__Z")) { - Node *Encoding = getDerived().parseEncoding(ParseParams); + Node *Encoding = getDerived().parseEncoding(); if (Encoding == nullptr) return nullptr; if (look() == '.') { @@ -5921,7 +5911,7 @@ Node *AbstractManglingParser::parse(bool ParseParams) { } if (consumeIf("___Z") || consumeIf("____Z")) { - Node *Encoding = getDerived().parseEncoding(ParseParams); + Node *Encoding = getDerived().parseEncoding(); if (Encoding == nullptr || !consumeIf("_block_invoke")) return nullptr; bool RequireNumber = consumeIf('_'); diff --git a/llvm/docs/CommandGuide/llvm-cxxfilt.rst b/llvm/docs/CommandGuide/llvm-cxxfilt.rst index 0933f0b5bed87..3f7deb1719511 100644 --- a/llvm/docs/CommandGuide/llvm-cxxfilt.rst +++ b/llvm/docs/CommandGuide/llvm-cxxfilt.rst @@ -48,10 +48,6 @@ OPTIONS Print a summary of command line options. -.. option:: --no-params, -p - - Do not demangle function parameters or return types. - .. option:: --no-strip-underscore, -n Do not strip a leading underscore. This is the default for all platforms diff --git a/llvm/include/llvm/Demangle/Demangle.h b/llvm/include/llvm/Demangle/Demangle.h index fe129603c0785..70cfc1418f0c7 100644 --- a/llvm/include/llvm/Demangle/Demangle.h +++ b/llvm/include/llvm/Demangle/Demangle.h @@ -32,7 +32,7 @@ enum : int { /// Returns a non-NULL pointer to a NUL-terminated C style string /// that should be explicitly freed, if successful. Otherwise, may return /// nullptr if mangled_name is not a valid mangling or is nullptr. -char *itaniumDemangle(std::string_view mangled_name, bool ParseParams = true); +char *itaniumDemangle(std::string_view mangled_name); enum MSDemangleFlags { MSDF_None = 0, @@ -68,8 +68,7 @@ char *dlangDemangle(std::string_view MangledName); std::string demangle(std::string_view MangledName); bool nonMicrosoftDemangle(std::string_view MangledName, std::string &Result, - bool CanHaveLeadingDot = true, - bool ParseParams = true); + bool CanHaveLeadingDot = true); /// "Partial" demangler. This supports demangling a string into an AST /// (typically an intermediate stage in itaniumDemangle) and querying certain diff --git a/llvm/include/llvm/Demangle/ItaniumDemangle.h b/llvm/include/llvm/Demangle/ItaniumDemangle.h index 06956f47c1f0b..e0ff035d47cfb 100644 --- a/llvm/include/llvm/Demangle/ItaniumDemangle.h +++ b/llvm/include/llvm/Demangle/ItaniumDemangle.h @@ -2793,7 +2793,7 @@ template struct AbstractManglingParser { Node *parseClassEnumType(); Node *parseQualifiedType(); - Node *parseEncoding(bool ParseParams = true); + Node *parseEncoding(); bool parseCallOffset(); Node *parseSpecialName(); @@ -2910,7 +2910,7 @@ template struct AbstractManglingParser { Node *parseDestructorName(); /// Top-level entry point into the parser. 
- Node *parse(bool ParseParams = true); + Node *parse(); }; const char* parse_discriminator(const char* first, const char* last); @@ -5404,7 +5404,7 @@ Node *AbstractManglingParser::parseSpecialName() { // ::= // ::= template -Node *AbstractManglingParser::parseEncoding(bool ParseParams) { +Node *AbstractManglingParser::parseEncoding() { // The template parameters of an encoding are unrelated to those of the // enclosing context. SaveTemplateParams SaveTemplateParamsScope(this); @@ -5430,16 +5430,6 @@ Node *AbstractManglingParser::parseEncoding(bool ParseParams) { if (IsEndOfEncoding()) return Name; - // ParseParams may be false at the top level only, when called from parse(). - // For example in the mangled name _Z3fooILZ3BarEET_f, ParseParams may be - // false when demangling 3fooILZ3BarEET_f but is always true when demangling - // 3Bar. - if (!ParseParams) { - while (consume()) - ; - return Name; - } - Node *Attrs = nullptr; if (consumeIf("Ua9enable_ifI")) { size_t BeforeArgs = Names.size(); @@ -5904,9 +5894,9 @@ AbstractManglingParser::parseTemplateArgs(bool TagTemplates) { // extension ::= ___Z _block_invoke+ // extension ::= ___Z _block_invoke_+ template -Node *AbstractManglingParser::parse(bool ParseParams) { +Node *AbstractManglingParser::parse() { if (consumeIf("_Z") || consumeIf("__Z")) { - Node *Encoding = getDerived().parseEncoding(ParseParams); + Node *Encoding = getDerived().parseEncoding(); if (Encoding == nullptr) return nullptr; if (look() == '.') { @@ -5920,7 +5910,7 @@ Node *AbstractManglingParser::parse(bool ParseParams) { } if (consumeIf("___Z") || consumeIf("____Z")) { - Node *Encoding = getDerived().parseEncoding(ParseParams); + Node *Encoding = getDerived().parseEncoding(); if (Encoding == nullptr || !consumeIf("_block_invoke")) return nullptr; bool RequireNumber = consumeIf('_'); diff --git a/llvm/lib/Demangle/Demangle.cpp b/llvm/lib/Demangle/Demangle.cpp index 117b849d1c784..83f3cdc88c01e 100644 --- a/llvm/lib/Demangle/Demangle.cpp +++ b/llvm/lib/Demangle/Demangle.cpp @@ -47,8 +47,7 @@ static bool isRustEncoding(std::string_view S) { return starts_with(S, "_R"); } static bool isDLangEncoding(std::string_view S) { return starts_with(S, "_D"); } bool llvm::nonMicrosoftDemangle(std::string_view MangledName, - std::string &Result, bool CanHaveLeadingDot, - bool ParseParams) { + std::string &Result, bool CanHaveLeadingDot) { char *Demangled = nullptr; // Do not consider the dot prefix as part of the demangled symbol name. 
@@ -58,7 +57,7 @@ bool llvm::nonMicrosoftDemangle(std::string_view MangledName, } if (isItaniumEncoding(MangledName)) - Demangled = itaniumDemangle(MangledName, ParseParams); + Demangled = itaniumDemangle(MangledName); else if (isRustEncoding(MangledName)) Demangled = rustDemangle(MangledName); else if (isDLangEncoding(MangledName)) diff --git a/llvm/lib/Demangle/ItaniumDemangle.cpp b/llvm/lib/Demangle/ItaniumDemangle.cpp index 5c21b06a1d095..e3f208f0adf8d 100644 --- a/llvm/lib/Demangle/ItaniumDemangle.cpp +++ b/llvm/lib/Demangle/ItaniumDemangle.cpp @@ -366,13 +366,13 @@ class DefaultAllocator { using Demangler = itanium_demangle::ManglingParser; -char *llvm::itaniumDemangle(std::string_view MangledName, bool ParseParams) { +char *llvm::itaniumDemangle(std::string_view MangledName) { if (MangledName.empty()) return nullptr; Demangler Parser(MangledName.data(), MangledName.data() + MangledName.length()); - Node *AST = Parser.parse(ParseParams); + Node *AST = Parser.parse(); if (!AST) return nullptr; diff --git a/llvm/test/tools/llvm-cxxfilt/no-params.test b/llvm/test/tools/llvm-cxxfilt/no-params.test deleted file mode 100644 index 17cbbbe2242bc..0000000000000 --- a/llvm/test/tools/llvm-cxxfilt/no-params.test +++ /dev/null @@ -1,34 +0,0 @@ -RUN: llvm-cxxfilt _Z3fooILZ3BarEET_f _Z3fooIPFcfEET_d _ZN1f2baC2ERKNS_2baIT_EE _Z3foov.123 | FileCheck %s --check-prefix=CHECK-PARAMS -RUN: llvm-cxxfilt -p _Z3fooILZ3BarEET_f _Z3fooIPFcfEET_d _ZN1f2baC2ERKNS_2baIT_EE _Z3foov.123 | FileCheck %s --check-prefix=CHECK-NO-PARAMS --match-full-lines -RUN: llvm-cxxfilt --no-params _Z3fooILZ3BarEET_f _Z3fooIPFcfEET_d _ZN1f2baC2ERKNS_2baIT_EE _Z3foov.123 | FileCheck %s --check-prefix=CHECK-NO-PARAMS --match-full-lines - -# Check that -p or --no-params flag omits function parameters and the return -# type. - -CHECK-PARAMS: Bar foo(float) -CHECK-NO-PARAMS: foo - -# Check that only the top-level function is impacted by the switch, and that -# nested function types in the encoding (e.g. where a function type is being -# used as a template parameter) still include their parameters. -# -# template T foo(double); -# typedef char (*F)(float); -# F foo(double) - -CHECK-PARAMS: char (*foo(double))(float) -CHECK-NO-PARAMS: foo - -# Use an invalid mangled name broken in the function parameters to check how -p -# or --no-params flag works. If the option is given we should be able to -# demangle the function name just fine. If it is not given, demangling will fail -# because of the invalid params. - -CHECK-PARAMS: _ZN1f2baC2ERKNS_2baIT_EE -CHECK-NO-PARAMS: f::ba::ba - -# Check that a vendor specific suffix is also omitted when --no-params is -# specified. This matches c++filt's behaviour. - -CHECK-PARAMS: foo() (.123) -CHECK-NO-PARAMS: foo diff --git a/llvm/tools/llvm-cxxfilt/Opts.td b/llvm/tools/llvm-cxxfilt/Opts.td index 034cb267aab80..f652a1a7f88bb 100644 --- a/llvm/tools/llvm-cxxfilt/Opts.td +++ b/llvm/tools/llvm-cxxfilt/Opts.td @@ -17,7 +17,6 @@ multiclass Eq { def help : FF<"help", "Display this help">; defm strip_underscore : BB<"strip-underscore", "Strip the leading underscore", "Don't strip the leading underscore">; def types : FF<"types", "Attempt to demangle types as well as function names">; -def no_params : FF<"no-params", "Skip function parameters and return types">; def version : FF<"version", "Display the version">; defm : Eq<"format", "Specify mangling format. 
Currently ignored because only 'gnu' is supported">; @@ -26,5 +25,4 @@ def : F<"s", "Alias for --format">; def : F<"_", "Alias for --strip-underscore">, Alias; def : F<"h", "Alias for --help">, Alias; def : F<"n", "Alias for --no-strip-underscore">, Alias; -def : F<"p", "Alias for --no-params">, Alias; def : F<"t", "Alias for --types">, Alias; diff --git a/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp b/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp index 26a1f2f4afebd..4b9d88a650666 100644 --- a/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp +++ b/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp @@ -54,7 +54,6 @@ class CxxfiltOptTable : public opt::GenericOptTable { }; } // namespace -static bool ParseParams; static bool StripUnderscore; static bool Types; @@ -75,19 +74,18 @@ static std::string demangle(const std::string &Mangled) { } std::string Result; - if (nonMicrosoftDemangle(DecoratedStr, Result, CanHaveLeadingDot, - ParseParams)) + if (nonMicrosoftDemangle(DecoratedStr, Result, CanHaveLeadingDot)) return Result; std::string Prefix; char *Undecorated = nullptr; if (Types) - Undecorated = itaniumDemangle(DecoratedStr, ParseParams); + Undecorated = itaniumDemangle(DecoratedStr); if (!Undecorated && starts_with(DecoratedStr, "__imp_")) { Prefix = "import thunk for "; - Undecorated = itaniumDemangle(DecoratedStr.substr(6), ParseParams); + Undecorated = itaniumDemangle(DecoratedStr.substr(6)); } Result = Undecorated ? Prefix + Undecorated : Mangled; @@ -175,8 +173,6 @@ int llvm_cxxfilt_main(int argc, char **argv, const llvm::ToolContext &) { else StripUnderscore = Triple(sys::getProcessTriple()).isOSBinFormatMachO(); - ParseParams = !Args.hasArg(OPT_no_params); - Types = Args.hasArg(OPT_types); std::vector Decorated = Args.getAllArgValues(OPT_INPUT); From 8c7dfafa0a5cd2fb001b2f59243b63e1ce2bfafa Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 2 Jan 2024 20:51:21 +0000 Subject: [PATCH 076/313] [ConstraintElim] Add extra tests with chained subs. --- .../ConstraintElimination/sub-nuw.ll | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/llvm/test/Transforms/ConstraintElimination/sub-nuw.ll b/llvm/test/Transforms/ConstraintElimination/sub-nuw.ll index 10b314c87f694..ba468789af5c6 100644 --- a/llvm/test/Transforms/ConstraintElimination/sub-nuw.ll +++ b/llvm/test/Transforms/ConstraintElimination/sub-nuw.ll @@ -416,3 +416,86 @@ then: else: ret i1 0 } + +; FIXME: Currently gets mis-compiled. 
+define void @sub_nuw_chained_positive_constants(i16 %a) { +; CHECK-LABEL: @sub_nuw_chained_positive_constants( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SUB1:%.*]] = sub nuw i16 [[A:%.*]], 10 +; CHECK-NEXT: [[SUB2:%.*]] = sub nuw i16 [[SUB1]], 20 +; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i16 [[SUB2]], 90 +; CHECK-NEXT: br i1 [[C_1]], label [[EXIT_1:%.*]], label [[EXIT_2:%.*]] +; CHECK: exit.1: +; CHECK-NEXT: [[C_2:%.*]] = icmp ugt i16 [[A]], 120 +; CHECK-NEXT: call void @use(i1 [[C_2]]) +; CHECK-NEXT: [[C_3:%.*]] = icmp ugt i16 [[A]], 121 +; CHECK-NEXT: call void @use(i1 [[C_3]]) +; CHECK-NEXT: ret void +; CHECK: exit.2: +; CHECK-NEXT: [[C_4:%.*]] = icmp ugt i16 [[A]], 120 +; CHECK-NEXT: call void @use(i1 [[C_4]]) +; CHECK-NEXT: [[C_5:%.*]] = icmp ugt i16 [[A]], 121 +; CHECK-NEXT: call void @use(i1 [[C_5]]) +; CHECK-NEXT: ret void +; +entry: + %sub1 = sub nuw i16 %a, 10 + %sub2 = sub nuw i16 %sub1, 20 + %c.1 = icmp ugt i16 %sub2, 90 + br i1 %c.1, label %exit.1, label %exit.2 + +exit.1: + %c.2 = icmp ugt i16 %a, 120 + call void @use(i1 %c.2) + %c.3 = icmp ugt i16 %a, 121 + call void @use(i1 %c.3) + ret void + +exit.2: + %c.4 = icmp ugt i16 %a, 120 + call void @use(i1 %c.4) + %c.5 = icmp ugt i16 %a, 121 + call void @use(i1 %c.5) + ret void +} + +; FIXME: Currently gets mis-compiled. +define void @sub_nuw_chained_negative_constants(i8 %a) { +; CHECK-LABEL: @sub_nuw_chained_negative_constants( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SUB1:%.*]] = sub nuw i8 [[A:%.*]], 10 +; CHECK-NEXT: [[SUB2:%.*]] = sub nuw i8 [[SUB1]], -126 +; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i8 [[SUB2]], 20 +; CHECK-NEXT: br i1 [[C_1]], label [[EXIT_1:%.*]], label [[EXIT_2:%.*]] +; CHECK: exit.1: +; CHECK-NEXT: [[C_2:%.*]] = icmp ugt i8 [[A]], -96 +; CHECK-NEXT: call void @use(i1 [[C_2]]) +; CHECK-NEXT: [[C_3:%.*]] = icmp ugt i8 [[A]], -95 +; CHECK-NEXT: call void @use(i1 [[C_3]]) +; CHECK-NEXT: ret void +; CHECK: exit.2: +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: ret void +; +entry: + %sub1 = sub nuw i8 %a, 10 + %sub2 = sub nuw i8 %sub1, 130 + %c.1 = icmp ugt i8 %sub2, 20 + br i1 %c.1, label %exit.1, label %exit.2 + +exit.1: + %c.2 = icmp ugt i8 %a, 160 + call void @use(i1 %c.2) + %c.3 = icmp ugt i8 %a, 161 + call void @use(i1 %c.3) + ret void + + +exit.2: + %c.4 = icmp ugt i8 %a, 160 + call void @use(i1 %c.4) + %c.5 = icmp ugt i8 %a, 161 + call void @use(i1 %c.5) + ret void +} From c0345b4648608b44071bbb6b012b24bd07c3d29e Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Tue, 2 Jan 2024 16:14:22 -0500 Subject: [PATCH 077/313] [mlir][gpu] Add subgroup_reduce to shuffle lowering (#76530) This supports both the scalar and the vector multi-reduction cases. 
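As a usage sketch, a pass combines the existing break-down patterns with the new shuffle expansion the same way the test pass updated below does; the subgroup and shuffle widths are illustrative defaults, not requirements:

```cpp
// Fragment of a pass body; assumes the usual MLIR pass boilerplate around it.
void runOnOperation() override {
  RewritePatternSet patterns(&getContext());
  // Break wide vector reductions into shuffle-sized pieces first; the higher
  // benefit makes these patterns match before the shuffle expansion.
  populateGpuBreakDownSubgrupReducePatterns(patterns,
                                            /*maxShuffleBitwidth=*/32,
                                            PatternBenefit(2));
  // Then expand each remaining gpu.subgroup_reduce into a butterfly sequence
  // of gpu.shuffle xor + arith ops across the 32 lanes of the subgroup.
  populateGpuLowerSubgroupReduceToShufflePattenrs(patterns, /*subgroupSize=*/32,
                                                  /*shuffleBitwidth=*/32);
  (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
}
```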
--- .../mlir/Dialect/GPU/Transforms/Passes.h | 7 + .../mlir/Dialect/GPU/Transforms/Utils.h | 5 + mlir/lib/Dialect/GPU/CMakeLists.txt | 1 + .../GPU/Transforms/AllReduceLowering.cpp | 27 --- .../GPU/Transforms/SubgroupReduceLowering.cpp | 174 +++++++++++++++- mlir/lib/Dialect/GPU/Transforms/Utils.cpp | 44 ++++ .../Dialect/GPU/subgroup-redule-lowering.mlir | 190 ++++++++++++++---- mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp | 24 ++- 8 files changed, 408 insertions(+), 64 deletions(-) create mode 100644 mlir/lib/Dialect/GPU/Transforms/Utils.cpp diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h index 6c5bf75d21247..5885facd07541 100644 --- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h @@ -70,6 +70,13 @@ void populateGpuBreakDownSubgrupReducePatterns(RewritePatternSet &patterns, unsigned maxShuffleBitwidth = 32, PatternBenefit benefit = 1); +/// Collect a set of patterns to lower `gpu.subgroup_reduce` into `gpu.shuffle` +/// ops over `shuffleBitwidth` scalar types. Assumes that the subgroup has +/// `subgroupSize` lanes. Uses the butterfly shuffle algorithm. +void populateGpuLowerSubgroupReduceToShufflePattenrs( + RewritePatternSet &patterns, unsigned subgroupSize, + unsigned shuffleBitwidth = 32, PatternBenefit benefit = 1); + /// Collect all patterns to rewrite ops within the GPU dialect. inline void populateGpuRewritePatterns(RewritePatternSet &patterns) { populateGpuAllReducePatterns(patterns); diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Utils.h b/mlir/include/mlir/Dialect/GPU/Transforms/Utils.h index a426bee7686db..f25c506fd638d 100644 --- a/mlir/include/mlir/Dialect/GPU/Transforms/Utils.h +++ b/mlir/include/mlir/Dialect/GPU/Transforms/Utils.h @@ -13,6 +13,8 @@ #ifndef MLIR_DIALECT_GPU_TRANSFORMS_UTILS_H_ #define MLIR_DIALECT_GPU_TRANSFORMS_UTILS_H_ +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Support/LLVM.h" #include @@ -28,6 +30,9 @@ class LaunchOp; /// Returns the default annotation name for GPU binary blobs. std::string getDefaultGpuBinaryAnnotation(); + +/// Returns the matching vector combining kind. 
+vector::CombiningKind convertReductionKind(gpu::AllReduceOperation mode); } // namespace gpu /// Get a gpu.func created from outlining the region of a gpu.launch op with the diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt index 8383e06e6d247..8f289ce9452e8 100644 --- a/mlir/lib/Dialect/GPU/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/CMakeLists.txt @@ -64,6 +64,7 @@ add_mlir_dialect_library(MLIRGPUTransforms Transforms/ShuffleRewriter.cpp Transforms/SPIRVAttachTarget.cpp Transforms/SubgroupReduceLowering.cpp + Transforms/Utils.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU diff --git a/mlir/lib/Dialect/GPU/Transforms/AllReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/AllReduceLowering.cpp index 608d801ee9bbb..a75598afe8c72 100644 --- a/mlir/lib/Dialect/GPU/Transforms/AllReduceLowering.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/AllReduceLowering.cpp @@ -27,33 +27,6 @@ using namespace mlir; namespace { -static vector::CombiningKind -convertReductionKind(gpu::AllReduceOperation mode) { - switch (mode) { -#define MAP_CASE(X) \ - case gpu::AllReduceOperation::X: \ - return vector::CombiningKind::X - - MAP_CASE(ADD); - MAP_CASE(MUL); - MAP_CASE(MINUI); - MAP_CASE(MINSI); - MAP_CASE(MINNUMF); - MAP_CASE(MAXSI); - MAP_CASE(MAXUI); - MAP_CASE(MAXNUMF); - MAP_CASE(AND); - MAP_CASE(OR); - MAP_CASE(XOR); - MAP_CASE(MINIMUMF); - MAP_CASE(MAXIMUMF); - -#undef MAP_CASE - } - - llvm_unreachable("Vector and GPU reduction kinds should match 1:1"); -} - struct GpuAllReduceRewriter { using AccumulatorFactory = std::function; diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp index 61edce5e2a086..b00c65c4c6248 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp @@ -13,13 +13,17 @@ #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/GPU/Transforms/Passes.h" +#include "mlir/Dialect/GPU/Transforms/Utils.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/Location.h" #include "mlir/IR/PatternMatch.h" +#include "mlir/IR/TypeUtilities.h" #include "mlir/Support/LogicalResult.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/MathExtras.h" #include +#include using namespace mlir; @@ -58,7 +62,7 @@ struct BreakDownSubgroupReduce final : OpRewritePattern { unsigned elemBitwidth = elemTy.getIntOrFloatBitWidth(); if (elemBitwidth >= maxShuffleBitwidth) return rewriter.notifyMatchFailure( - op, llvm::formatv("element type too large {0}, cannot break down " + op, llvm::formatv("element type too large ({0}), cannot break down " "into vectors of bitwidth {1} or less", elemBitwidth, maxShuffleBitwidth)); @@ -139,6 +143,167 @@ struct ScalarizeSingleElementReduce final } }; +/// Emits a subgroup reduction using a sequence of shuffles. Uses the `packFn` +/// and `unpackFn` to convert to the native shuffle type and to the reduction +/// type, respectively. For example, with `input` of type `f16`, `packFn` could +/// build ops to cast the value to `i32` to perform shuffles, while `unpackFn` +/// would cast it back to `f16` to perform arithmetic reduction on. Assumes that +/// the subgroup is `subgroupSize` lanes wide and reduces across all of them. 
+static Value createSubgroupShuffleReduction( + OpBuilder &builder, Location loc, Value input, gpu::AllReduceOperation mode, + unsigned subgroupSize, function_ref packFn, + function_ref unpackFn) { + assert(llvm::isPowerOf2_32(subgroupSize)); + // Lane value always stays in the original type. We use it to perform arith + // reductions. + Value laneVal = input; + // Parallel reduction using butterfly shuffles. + for (unsigned i = 1; i < subgroupSize; i <<= 1) { + Value shuffled = builder + .create(loc, packFn(laneVal), i, + /*width=*/subgroupSize, + /*mode=*/gpu::ShuffleMode::XOR) + .getShuffleResult(); + laneVal = vector::makeArithReduction(builder, loc, + gpu::convertReductionKind(mode), + laneVal, unpackFn(shuffled)); + assert(laneVal.getType() == input.getType()); + } + + return laneVal; +} + +/// Lowers scalar gpu subgroup reductions to a series of shuffles. +struct ScalarSubgroupReduceToShuffles final + : OpRewritePattern { + ScalarSubgroupReduceToShuffles(MLIRContext *ctx, unsigned subgroupSize, + unsigned shuffleBitwidth, + PatternBenefit benefit) + : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize), + shuffleBitwidth(shuffleBitwidth) {} + + LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op, + PatternRewriter &rewriter) const override { + Type valueTy = op.getType(); + unsigned elemBitwidth = + getElementTypeOrSelf(valueTy).getIntOrFloatBitWidth(); + if (!valueTy.isIntOrFloat() || elemBitwidth > shuffleBitwidth) + return rewriter.notifyMatchFailure( + op, "value type is not a compatible scalar"); + + Location loc = op.getLoc(); + // Since this is already a native shuffle scalar, no packing is necessary. + if (elemBitwidth == shuffleBitwidth) { + auto identityFn = [](Value v) { return v; }; + rewriter.replaceOp(op, createSubgroupShuffleReduction( + rewriter, loc, op.getValue(), op.getOp(), + subgroupSize, identityFn, identityFn)); + return success(); + } + + auto shuffleIntType = rewriter.getIntegerType(shuffleBitwidth); + auto equivIntType = rewriter.getIntegerType(elemBitwidth); + auto packFn = [loc, &rewriter, equivIntType, + shuffleIntType](Value unpackedVal) -> Value { + auto asInt = + rewriter.create(loc, equivIntType, unpackedVal); + return rewriter.create(loc, shuffleIntType, asInt); + }; + auto unpackFn = [loc, &rewriter, equivIntType, + valueTy](Value packedVal) -> Value { + auto asInt = + rewriter.create(loc, equivIntType, packedVal); + return rewriter.create(loc, valueTy, asInt); + }; + + rewriter.replaceOp(op, createSubgroupShuffleReduction( + rewriter, loc, op.getValue(), op.getOp(), + subgroupSize, packFn, unpackFn)); + return success(); + } + +private: + unsigned subgroupSize = 0; + unsigned shuffleBitwidth = 0; +}; + +/// Lowers vector gpu subgroup reductions to a series of shuffles. 
+struct VectorSubgroupReduceToShuffles final + : OpRewritePattern { + VectorSubgroupReduceToShuffles(MLIRContext *ctx, unsigned subgroupSize, + unsigned shuffleBitwidth, + PatternBenefit benefit) + : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize), + shuffleBitwidth(shuffleBitwidth) {} + + LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op, + PatternRewriter &rewriter) const override { + auto vecTy = dyn_cast(op.getType()); + if (!vecTy) + return rewriter.notifyMatchFailure(op, "value type is not a vector"); + + unsigned vecBitwidth = + vecTy.getNumElements() * vecTy.getElementTypeBitWidth(); + if (vecBitwidth > shuffleBitwidth) + return rewriter.notifyMatchFailure( + op, + llvm::formatv("vector type bitwidth too large ({0}), cannot lower " + "to shuffles of size {1}", + vecBitwidth, shuffleBitwidth)); + + unsigned elementsPerShuffle = + shuffleBitwidth / vecTy.getElementTypeBitWidth(); + if (elementsPerShuffle * vecTy.getElementTypeBitWidth() != shuffleBitwidth) + return rewriter.notifyMatchFailure( + op, "shuffle bitwidth is not a multiple of the element bitwidth"); + + Location loc = op.getLoc(); + + // If the reduced type is smaller than the native shuffle size, extend it, + // perform the shuffles, and extract at the end. + auto extendedVecTy = VectorType::get( + static_cast(elementsPerShuffle), vecTy.getElementType()); + Value extendedInput = op.getValue(); + if (vecBitwidth < shuffleBitwidth) { + auto zero = rewriter.create( + loc, rewriter.getZeroAttr(extendedVecTy)); + extendedInput = rewriter.create( + loc, extendedInput, zero, /*offsets=*/0, /*strides=*/1); + } + + auto shuffleIntType = rewriter.getIntegerType(shuffleBitwidth); + auto shuffleVecType = VectorType::get(1, shuffleIntType); + + auto packFn = [loc, &rewriter, shuffleVecType](Value unpackedVal) -> Value { + auto asIntVec = + rewriter.create(loc, shuffleVecType, unpackedVal); + return rewriter.create(loc, asIntVec, 0); + }; + auto unpackFn = [loc, &rewriter, shuffleVecType, + extendedVecTy](Value packedVal) -> Value { + auto asIntVec = + rewriter.create(loc, shuffleVecType, packedVal); + return rewriter.create(loc, extendedVecTy, asIntVec); + }; + + Value res = + createSubgroupShuffleReduction(rewriter, loc, extendedInput, op.getOp(), + subgroupSize, packFn, unpackFn); + + if (vecBitwidth < shuffleBitwidth) { + res = rewriter.create( + loc, res, /*offsets=*/0, /*sizes=*/vecTy.getNumElements(), + /*strides=*/1); + } + + rewriter.replaceOp(op, res); + return success(); + } + +private: + unsigned subgroupSize = 0; + unsigned shuffleBitwidth = 0; +}; } // namespace void mlir::populateGpuBreakDownSubgrupReducePatterns( @@ -148,3 +313,10 @@ void mlir::populateGpuBreakDownSubgrupReducePatterns( maxShuffleBitwidth, benefit); patterns.add(patterns.getContext(), benefit); } + +void mlir::populateGpuLowerSubgroupReduceToShufflePattenrs( + RewritePatternSet &patterns, unsigned subgroupSize, + unsigned shuffleBitwidth, PatternBenefit benefit) { + patterns.add( + patterns.getContext(), subgroupSize, shuffleBitwidth, benefit); +} diff --git a/mlir/lib/Dialect/GPU/Transforms/Utils.cpp b/mlir/lib/Dialect/GPU/Transforms/Utils.cpp new file mode 100644 index 0000000000000..e91aa18128c7b --- /dev/null +++ b/mlir/lib/Dialect/GPU/Transforms/Utils.cpp @@ -0,0 +1,44 @@ +//===- Utils.cpp - GPU transforms utils -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implements GPU dialect transforms utils. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/GPU/Transforms/Utils.h" +#include "llvm/Support/ErrorHandling.h" + +namespace mlir::gpu { + +vector::CombiningKind convertReductionKind(gpu::AllReduceOperation mode) { + switch (mode) { +#define MAP_CASE(X) \ + case gpu::AllReduceOperation::X: \ + return vector::CombiningKind::X + + MAP_CASE(ADD); + MAP_CASE(MUL); + MAP_CASE(MINUI); + MAP_CASE(MINSI); + MAP_CASE(MINNUMF); + MAP_CASE(MAXSI); + MAP_CASE(MAXUI); + MAP_CASE(MAXNUMF); + MAP_CASE(AND); + MAP_CASE(OR); + MAP_CASE(XOR); + MAP_CASE(MINIMUMF); + MAP_CASE(MAXIMUMF); + +#undef MAP_CASE + } + + llvm_unreachable("Vector and GPU reduction kinds should match 1:1"); +} + +} // namespace mlir::gpu diff --git a/mlir/test/Dialect/GPU/subgroup-redule-lowering.mlir b/mlir/test/Dialect/GPU/subgroup-redule-lowering.mlir index b7146071bf2fd..f04a01ffe75d3 100644 --- a/mlir/test/Dialect/GPU/subgroup-redule-lowering.mlir +++ b/mlir/test/Dialect/GPU/subgroup-redule-lowering.mlir @@ -1,71 +1,191 @@ -// RUN: mlir-opt --allow-unregistered-dialect --test-gpu-subgroup-reduce-lowering %s | FileCheck %s +// RUN: mlir-opt --allow-unregistered-dialect \ +// RUN: --test-gpu-subgroup-reduce-lowering %s \ +// RUN: | FileCheck %s --check-prefix=CHECK-SUB -// CHECK: gpu.module @kernels { +// RUN: mlir-opt --allow-unregistered-dialect \ +// RUN: --test-gpu-subgroup-reduce-lowering="expand-to-shuffles" %s \ +// RUN: | FileCheck %s --check-prefix=CHECK-SHFL + +// CHECK-SUB: gpu.module @kernels { +// CHECK-SHFL: gpu.module @kernels { gpu.module @kernels { - // CHECK-LABEL: gpu.func @kernel0( - // CHECK-SAME: %[[ARG0:.+]]: vector<5xf16>) + // CHECK-SUB-LABEL: gpu.func @kernel0( + // CHECK-SUB-SAME: %[[ARG0:.+]]: vector<5xf16>) + // + // CHECK-SHFL-LABEL: gpu.func @kernel0( gpu.func @kernel0(%arg0: vector<5xf16>) kernel { - // CHECK: %[[VZ:.+]] = arith.constant dense<0.0{{.*}}> : vector<5xf16> - // CHECK: %[[E0:.+]] = vector.extract_strided_slice %[[ARG0]] {offsets = [0], sizes = [2], strides = [1]} : vector<5xf16> to vector<2xf16> - // CHECK: %[[R0:.+]] = gpu.subgroup_reduce add %[[E0]] : (vector<2xf16>) -> vector<2xf16> - // CHECK: %[[V0:.+]] = vector.insert_strided_slice %[[R0]], %[[VZ]] {offsets = [0], strides = [1]} : vector<2xf16> into vector<5xf16> - // CHECK: %[[E1:.+]] = vector.extract_strided_slice %[[ARG0]] {offsets = [2], sizes = [2], strides = [1]} : vector<5xf16> to vector<2xf16> - // CHECK: %[[R1:.+]] = gpu.subgroup_reduce add %[[E1]] : (vector<2xf16>) -> vector<2xf16> - // CHECK: %[[V1:.+]] = vector.insert_strided_slice %[[R1]], %[[V0]] {offsets = [2], strides = [1]} : vector<2xf16> into vector<5xf16> - // CHECK: %[[E2:.+]] = vector.extract %[[ARG0]][4] : f16 from vector<5xf16> - // CHECK: %[[R2:.+]] = gpu.subgroup_reduce add %[[E2]] : (f16) -> f16 - // CHECK: %[[V2:.+]] = vector.insert %[[R2]], %[[V1]] [4] : f16 into vector<5xf16> - // CHECK: "test.consume"(%[[V2]]) : (vector<5xf16>) -> () + // CHECK-SUB: %[[VZ:.+]] = arith.constant dense<0.0{{.*}}> : vector<5xf16> + // CHECK-SUB: %[[E0:.+]] = vector.extract_strided_slice %[[ARG0]] {offsets = [0], sizes = [2], strides = [1]} : vector<5xf16> to vector<2xf16> + // CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[E0]] : (vector<2xf16>) -> vector<2xf16> + // CHECK-SUB: %[[V0:.+]] = 
vector.insert_strided_slice %[[R0]], %[[VZ]] {offsets = [0], strides = [1]} : vector<2xf16> into vector<5xf16> + // CHECK-SUB: %[[E1:.+]] = vector.extract_strided_slice %[[ARG0]] {offsets = [2], sizes = [2], strides = [1]} : vector<5xf16> to vector<2xf16> + // CHECK-SUB: %[[R1:.+]] = gpu.subgroup_reduce add %[[E1]] : (vector<2xf16>) -> vector<2xf16> + // CHECK-SUB: %[[V1:.+]] = vector.insert_strided_slice %[[R1]], %[[V0]] {offsets = [2], strides = [1]} : vector<2xf16> into vector<5xf16> + // CHECK-SUB: %[[E2:.+]] = vector.extract %[[ARG0]][4] : f16 from vector<5xf16> + // CHECK-SUB: %[[R2:.+]] = gpu.subgroup_reduce add %[[E2]] : (f16) -> f16 + // CHECK-SUB: %[[V2:.+]] = vector.insert %[[R2]], %[[V1]] [4] : f16 into vector<5xf16> + // CHECK-SUB: "test.consume"(%[[V2]]) : (vector<5xf16>) -> () %sum0 = gpu.subgroup_reduce add %arg0 : (vector<5xf16>) -> (vector<5xf16>) "test.consume"(%sum0) : (vector<5xf16>) -> () - - // CHECK-COUNT-3: gpu.subgroup_reduce mul {{.+}} uniform - // CHECK: "test.consume" + // CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} uniform + // CHECK-SUB: "test.consume" %sum1 = gpu.subgroup_reduce mul %arg0 uniform : (vector<5xf16>) -> (vector<5xf16>) "test.consume"(%sum1) : (vector<5xf16>) -> () - // CHECK: gpu.return + // CHECK-SUB: gpu.return gpu.return } - // CHECK-LABEL: gpu.func @kernel1( - // CHECK-SAME: %[[ARG0:.+]]: vector<1xf32>) + // CHECK-SUB-LABEL: gpu.func @kernel1( + // CHECK-SUB-SAME: %[[ARG0:.+]]: vector<1xf32>) + // + // CHECK-SHFL-LABEL: gpu.func @kernel1( gpu.func @kernel1(%arg0: vector<1xf32>) kernel { - // CHECK: %[[E0:.+]] = vector.extract %[[ARG0]][0] : f32 from vector<1xf32> - // CHECK: %[[R0:.+]] = gpu.subgroup_reduce add %[[E0]] : (f32) -> f32 - // CHECK: %[[V0:.+]] = vector.broadcast %[[R0]] : f32 to vector<1xf32> - // CHECK: "test.consume"(%[[V0]]) : (vector<1xf32>) -> () + // CHECK-SUB: %[[E0:.+]] = vector.extract %[[ARG0]][0] : f32 from vector<1xf32> + // CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[E0]] : (f32) -> f32 + // CHECK-SUB: %[[V0:.+]] = vector.broadcast %[[R0]] : f32 to vector<1xf32> + // CHECK-SUB: "test.consume"(%[[V0]]) : (vector<1xf32>) -> () %sum0 = gpu.subgroup_reduce add %arg0 : (vector<1xf32>) -> (vector<1xf32>) "test.consume"(%sum0) : (vector<1xf32>) -> () - // CHECK: gpu.subgroup_reduce add {{.+}} uniform : (f32) -> f32 - // CHECK: "test.consume" + // CHECK-SUB: gpu.subgroup_reduce add {{.+}} uniform : (f32) -> f32 + // CHECK-SUB: "test.consume" %sum1 = gpu.subgroup_reduce add %arg0 uniform : (vector<1xf32>) -> (vector<1xf32>) "test.consume"(%sum1) : (vector<1xf32>) -> () - // CHECK: gpu.return + // CHECK-SUB: gpu.return gpu.return } // These vectors fit the native shuffle size and should not be broken down. 
// - // CHECK-LABEL: gpu.func @kernel2( - // CHECK-SAME: %[[ARG0:.+]]: vector<3xi8>, %[[ARG1:.+]]: vector<4xi8>) + // CHECK-SUB-LABEL: gpu.func @kernel2( + // CHECK-SUB-SAME: %[[ARG0:.+]]: vector<3xi8>, %[[ARG1:.+]]: vector<4xi8>) + // + // CHECK-SHFL-LABEL: gpu.func @kernel2( gpu.func @kernel2(%arg0: vector<3xi8>, %arg1: vector<4xi8>) kernel { - // CHECK: %[[R0:.+]] = gpu.subgroup_reduce add %[[ARG0]] : (vector<3xi8>) -> vector<3xi8> - // CHECK: "test.consume"(%[[R0]]) : (vector<3xi8>) -> () + // CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[ARG0]] : (vector<3xi8>) -> vector<3xi8> + // CHECK-SUB: "test.consume"(%[[R0]]) : (vector<3xi8>) -> () %sum0 = gpu.subgroup_reduce add %arg0 : (vector<3xi8>) -> (vector<3xi8>) "test.consume"(%sum0) : (vector<3xi8>) -> () - // CHECK: %[[R1:.+]] = gpu.subgroup_reduce add %[[ARG1]] : (vector<4xi8>) -> vector<4xi8> - // CHECK: "test.consume"(%[[R1]]) : (vector<4xi8>) -> () + // CHECK-SUB: %[[R1:.+]] = gpu.subgroup_reduce add %[[ARG1]] : (vector<4xi8>) -> vector<4xi8> + // CHECK-SUB: "test.consume"(%[[R1]]) : (vector<4xi8>) -> () %sum1 = gpu.subgroup_reduce add %arg1 : (vector<4xi8>) -> (vector<4xi8>) "test.consume"(%sum1) : (vector<4xi8>) -> () - // CHECK: gpu.return + // CHECK-SUB: gpu.return + gpu.return + } + + // CHECK-SHFL-LABEL: gpu.func @kernel3( + // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32) + gpu.func @kernel3(%arg0: i32) kernel { + // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32 + // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32 + // CHECK-SHFL-DAG: %[[C4:.+]] = arith.constant 4 : i32 + // CHECK-SHFL-DAG: %[[C8:.+]] = arith.constant 8 : i32 + // CHECK-SHFL-DAG: %[[C16:.+]] = arith.constant 16 : i32 + // CHECK-SHFL-DAG: %[[C32:.+]] = arith.constant 32 : i32 + + // CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[ARG0]], %[[C1]], %[[C32]] : i32 + // CHECK-SHFL: %[[A0:.+]] = arith.addi %[[ARG0]], %[[S0]] : i32 + // CHECK-SHFL: %[[S1:.+]], %{{.+}} = gpu.shuffle xor %[[A0]], %[[C2]], %[[C32]] : i32 + // CHECK-SHFL: %[[A1:.+]] = arith.addi %[[A0]], %[[S1]] : i32 + // CHECK-SHFL: %[[S2:.+]], %{{.+}} = gpu.shuffle xor %[[A1]], %[[C4]], %[[C32]] : i32 + // CHECK-SHFL: %[[A2:.+]] = arith.addi %[[A1]], %[[S2]] : i32 + // CHECK-SHFL: %[[S3:.+]], %{{.+}} = gpu.shuffle xor %[[A2]], %[[C8]], %[[C32]] : i32 + // CHECK-SHFL: %[[A3:.+]] = arith.addi %[[A2]], %[[S3]] : i32 + // CHECK-SHFL: %[[S4:.+]], %{{.+}} = gpu.shuffle xor %[[A3]], %[[C16]], %[[C32]] : i32 + // CHECK-SHFL: %[[A4:.+]] = arith.addi %[[A3]], %[[S4]] : i32 + // CHECK-SHFL: "test.consume"(%[[A4]]) : (i32) -> () + %sum0 = gpu.subgroup_reduce add %arg0 : (i32) -> i32 + "test.consume"(%sum0) : (i32) -> () + + // CHECK-SHFL: gpu.return + gpu.return + } + + // CHECK-SHFL-LABEL: gpu.func @kernel4( + // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<2xf16>) + gpu.func @kernel4(%arg0: vector<2xf16>) kernel { + // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32 + // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32 + // CHECK-SHFL-DAG: %[[C4:.+]] = arith.constant 4 : i32 + // CHECK-SHFL-DAG: %[[C8:.+]] = arith.constant 8 : i32 + // CHECK-SHFL-DAG: %[[C16:.+]] = arith.constant 16 : i32 + // CHECK-SHFL-DAG: %[[C32:.+]] = arith.constant 32 : i32 + + // CHECK-SHFL: %[[V0:.+]] = vector.bitcast %[[ARG0]] : vector<2xf16> to vector<1xi32> + // CHECK-SHFL: %[[I0:.+]] = vector.extract %[[V0]][0] : i32 from vector<1xi32> + // CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[I0]], %[[C1]], %[[C32]] : i32 + // CHECK-SHFL: %[[BR0:.+]] = vector.broadcast %[[S0]] : i32 to vector<1xi32> + // CHECK-SHFL: 
%[[BC0:.+]] = vector.bitcast %[[BR0]] : vector<1xi32> to vector<2xf16> + // CHECK-SHFL: %[[ADD0:.+]] = arith.addf %[[ARG0]], %[[BC0]] : vector<2xf16> + // CHECK-SHFL: %[[BC1:.+]] = vector.bitcast %[[ADD0]] : vector<2xf16> to vector<1xi32> + // CHECK-SHFL: %[[I1:.+]] = vector.extract %[[BC1]][0] : i32 from vector<1xi32> + // CHECK-SHFL: gpu.shuffle xor %[[I1]], %[[C2]], %[[C32]] : i32 + // CHECK-SHFL: arith.addf {{.+}} : vector<2xf16> + // CHECK-SHFL: gpu.shuffle xor %{{.+}}, %[[C4]], %[[C32]] : i32 + // CHECK-SHFL: arith.addf {{.+}} : vector<2xf16> + // CHECK-SHFL: gpu.shuffle xor %{{.+}}, %[[C8]], %[[C32]] : i32 + // CHECK-SHFL: arith.addf {{.+}} : vector<2xf16> + // CHECK-SHFL: %[[SL:.+]], %{{.+}} = gpu.shuffle xor %{{.+}}, %[[C16]], %[[C32]] : i32 + // CHECK-SHFL: %[[BRL:.+]] = vector.broadcast %[[SL]] : i32 to vector<1xi32> + // CHECK-SHFL: %[[BCL:.+]] = vector.bitcast %[[BRL]] : vector<1xi32> to vector<2xf16> + // CHECK-SHFL: %[[ADDL:.+]] = arith.addf %{{.+}}, %[[BCL]] : vector<2xf16> + // CHECK-SHFL: "test.consume"(%[[ADDL]]) : (vector<2xf16>) -> () + %sum0 = gpu.subgroup_reduce add %arg0 : (vector<2xf16>) -> (vector<2xf16>) + "test.consume"(%sum0) : (vector<2xf16>) -> () + + // CHECK-SHFL: gpu.return + gpu.return + } + + // CHECK-SHFL-LABEL: gpu.func @kernel5( + // CHECK-SHFL-SAME: %[[ARG0:.+]]: i16) + gpu.func @kernel5(%arg0: i16) kernel { + // CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32 + // CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32 + // CHECK-SHFL: %[[T0:.+]] = arith.trunci %[[S0]] : i32 to i16 + // CHECK-SHFL: %[[A0:.+]] = arith.addi %[[ARG0]], %[[T0]] : i16 + // CHECK-SHFL: %[[E1:.+]] = arith.extui %[[A0]] : i16 to i32 + // CHECK-SHFL: %{{.+}}, %{{.+}} = gpu.shuffle xor %[[E1]], {{.+}} : i32 + // CHECK-SHFL-COUNT-3: gpu.shuffle xor + // CHECK-SHFL: arith.trunci {{.+}} : i32 to i16 + // CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16 + // CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> () + %sum0 = gpu.subgroup_reduce add %arg0 : (i16) -> i16 + "test.consume"(%sum0) : (i16) -> () + + // CHECK-SHFL: gpu.return + gpu.return + } + + // CHECK-SHFL-LABEL: gpu.func @kernel6( + // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<3xi8>) + gpu.func @kernel6(%arg0: vector<3xi8>) kernel { + // CHECK-SHFL: %[[CZ:.+]] = arith.constant dense<0> : vector<4xi8> + // CHECK-SHFL: %[[V0:.+]] = vector.insert_strided_slice %[[ARG0]], %[[CZ]] {offsets = [0], strides = [1]} : vector<3xi8> into vector<4xi8> + // CHECK-SHFL: %[[BC0:.+]] = vector.bitcast %[[V0]] : vector<4xi8> to vector<1xi32> + // CHECK-SHFL: %[[I0:.+]] = vector.extract %[[BC0]][0] : i32 from vector<1xi32> + // CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[I0]], {{.+}} : i32 + // CHECK-SHFL: %[[BR0:.+]] = vector.broadcast %[[S0]] : i32 to vector<1xi32> + // CHECK-SHFL: %[[BC1:.+]] = vector.bitcast %[[BR0]] : vector<1xi32> to vector<4xi8> + // CHECK-SHFL: %[[ADD0:.+]] = arith.addi %[[V0]], %[[BC1]] : vector<4xi8> + // CHECK-SHFL: %[[BC2:.+]] = vector.bitcast %[[ADD0]] : vector<4xi8> to vector<1xi32> + // CHECK-SHFL: %[[I1:.+]] = vector.extract %[[BC2]][0] : i32 from vector<1xi32> + // CHECK-SHFL-COUNT-4: gpu.shuffle xor + // CHECK-SHFL: %[[ESS:.+]] = vector.extract_strided_slice %{{.+}} {offsets = [0], sizes = [3], strides = [1]} : vector<4xi8> to vector<3xi8> + // CHECK-SHFL: "test.consume"(%[[ESS]]) : (vector<3xi8>) -> () + %sum0 = gpu.subgroup_reduce add %arg0 : (vector<3xi8>) -> (vector<3xi8>) + "test.consume"(%sum0) : (vector<3xi8>) -> () + + // CHECK-SHFL: gpu.return gpu.return } } + diff 
--git a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp index 21cc89c0d89b0..8c13c0e2575c9 100644 --- a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp +++ b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp @@ -16,6 +16,7 @@ #include "mlir/Dialect/Index/IR/IndexDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/IR/PatternMatch.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" @@ -47,19 +48,40 @@ struct TestGpuSubgroupReduceLoweringPass MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID( TestGpuSubgroupReduceLoweringPass) + TestGpuSubgroupReduceLoweringPass() = default; + TestGpuSubgroupReduceLoweringPass( + const TestGpuSubgroupReduceLoweringPass &pass) + : PassWrapper(pass) {} + void getDependentDialects(DialectRegistry ®istry) const override { registry.insert(); } + StringRef getArgument() const final { return "test-gpu-subgroup-reduce-lowering"; } + StringRef getDescription() const final { return "Applies gpu.subgroup_reduce lowering patterns."; } + + Option expandToShuffles{ + *this, "expand-to-shuffles", + llvm::cl::desc("Expand subgroup_reduce ops to shuffle ops."), + llvm::cl::init(false)}; + void runOnOperation() override { RewritePatternSet patterns(&getContext()); + + // Since both pattern sets match on the same ops, set higher benefit to + // perform fewer failing matches. populateGpuBreakDownSubgrupReducePatterns(patterns, - /*maxShuffleBitwidth=*/32); + /*maxShuffleBitwidth=*/32, + PatternBenefit(2)); + if (expandToShuffles) + populateGpuLowerSubgroupReduceToShufflePattenrs( + patterns, /*subgroupSize=*/32, /*shuffleBitwidth=*/32); + (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)); } }; From 78f0991abdae723f535fda58a4bb16b53bea7010 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Tue, 2 Jan 2024 13:57:05 -0800 Subject: [PATCH 078/313] [hwasan] Workaround unsupported AssignmentTrackingPass (#76547) Temporarily fix for issue #76545 Hwasan does not attach tags to @llvm.dbg.assign. It's not clear if we can attach tags to @llvm.dbg.assign. For now we just disable the path replacing llvm.dbg.declare with llvm.dbg.assign. It may reduce the quality of interactive debugging with HWASAN, but usually it's a smaller priority for sanitizers than the quality if reports. --- .../test/hwasan/TestCases/stack-overflow.c | 3 +- compiler-rt/test/hwasan/TestCases/stack-uar.c | 3 +- compiler-rt/test/hwasan/TestCases/stack-uas.c | 3 +- llvm/lib/IR/DebugInfo.cpp | 4 +++ .../declare-to-assign/hwasan.ll | 35 +++++++++++++++++++ 5 files changed, 45 insertions(+), 3 deletions(-) create mode 100644 llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll diff --git a/compiler-rt/test/hwasan/TestCases/stack-overflow.c b/compiler-rt/test/hwasan/TestCases/stack-overflow.c index 4af506e3ecf45..98d75189e146d 100644 --- a/compiler-rt/test/hwasan/TestCases/stack-overflow.c +++ b/compiler-rt/test/hwasan/TestCases/stack-overflow.c @@ -1,4 +1,5 @@ -// RUN: %clang_hwasan -g %s -o %t && not %run %t 2>&1 | FileCheck %s +// RUN: %clang_hwasan -O0 -g %s -o %t && not %run %t 2>&1 | FileCheck %s +// RUN: %clang_hwasan -O3 -g %s -o %t && not %run %t 2>&1 | FileCheck %s // Stack histories currently are not recorded on x86. 
// XFAIL: target=x86_64{{.*}} diff --git a/compiler-rt/test/hwasan/TestCases/stack-uar.c b/compiler-rt/test/hwasan/TestCases/stack-uar.c index 9fd4381a8049e..76f20f37d734e 100644 --- a/compiler-rt/test/hwasan/TestCases/stack-uar.c +++ b/compiler-rt/test/hwasan/TestCases/stack-uar.c @@ -1,5 +1,6 @@ // Tests use-after-return detection and reporting. -// RUN: %clang_hwasan -g %s -o %t && not %run %t 2>&1 | FileCheck %s +// RUN: %clang_hwasan -O0 -g %s -o %t && not %run %t 2>&1 | FileCheck %s +// RUN: %clang_hwasan -O3 -g %s -o %t && not %run %t 2>&1 | FileCheck %s // RUN: %clang_hwasan -g %s -o %t && not %env_hwasan_opts=symbolize=0 %run %t 2>&1 | FileCheck %s --check-prefix=NOSYM // Run the same test as above, but using the __hwasan_add_frame_record libcall. diff --git a/compiler-rt/test/hwasan/TestCases/stack-uas.c b/compiler-rt/test/hwasan/TestCases/stack-uas.c index a0e4eb02dd226..c8a8ee4f7e809 100644 --- a/compiler-rt/test/hwasan/TestCases/stack-uas.c +++ b/compiler-rt/test/hwasan/TestCases/stack-uas.c @@ -1,5 +1,6 @@ // Tests use-after-scope detection and reporting. -// RUN: %clang_hwasan -g %s -o %t && not %run %t 2>&1 | FileCheck %s +// RUN: %clang_hwasan -O0 -g %s -o %t && not %run %t 2>&1 | FileCheck %s +// RUN: %clang_hwasan -O2 -g %s -o %t && not %run %t 2>&1 | FileCheck %s // RUN: %clang_hwasan -g %s -o %t && not %env_hwasan_opts=symbolize=0 %run %t 2>&1 | FileCheck %s --check-prefix=NOSYM // RUN: %clang_hwasan -mllvm -hwasan-use-after-scope=false -g %s -o %t && %run %t 2>&1 diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index eab05eed428e4..c6dc42e8ac88c 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -2115,6 +2115,10 @@ bool AssignmentTrackingPass::runOnFunction(Function &F) { if (F.hasFnAttribute(Attribute::OptimizeNone)) return /*Changed*/ false; + // FIXME: https://github.com/llvm/llvm-project/issues/76545 + if (F.hasFnAttribute(Attribute::SanitizeHWAddress)) + return /*Changed*/ false; + bool Changed = false; auto *DL = &F.getParent()->getDataLayout(); // Collect a map of {backing storage : dbg.declares} (currently "backing diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll new file mode 100644 index 0000000000000..c4b209de77017 --- /dev/null +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll @@ -0,0 +1,35 @@ +; RUN: opt %s -S -passes=declare-to-assign -o - | FileCheck %s + +; CHECK: call void @llvm.dbg.declare + +define dso_local void @f() sanitize_hwaddress !dbg !9 { +entry: + %a = alloca i32, align 4 + call void @llvm.dbg.declare(metadata ptr %a, metadata !13, metadata !DIExpression()), !dbg !16 + ret void, !dbg !17 +} + +declare void @llvm.dbg.declare(metadata, metadata, metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5, !6, !7} +!llvm.ident = !{!8} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 17.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "test.c", directory: "/") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 8, !"PIC Level", i32 2} +!6 = !{i32 7, !"PIE Level", i32 2} +!7 = !{i32 7, !"uwtable", i32 2} +!8 = !{!"clang version 17.0.0"} +!9 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 1, type: !10, scopeLine: 1, 
flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12) +!10 = !DISubroutineType(types: !11) +!11 = !{null} +!12 = !{!13} +!13 = !DILocalVariable(name: "a", scope: !9, file: !1, line: 1, type: !14) +!14 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!15 = !DILocation(line: 1, column: 12, scope: !9) +!16 = !DILocation(line: 1, column: 16, scope: !9) +!17 = !DILocation(line: 1, column: 19, scope: !9) From 71bcef0b0bf55a96f85b2f323b0beb13ad8e2caa Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht Date: Tue, 2 Jan 2024 14:04:56 -0800 Subject: [PATCH 079/313] [bazel] Add VectorDialect build dep Added in c0345b4648608b44071bbb6b012b24bd07c3d29e --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 0850f7095458c..dabf274a6d9a0 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -5461,6 +5461,7 @@ cc_library( ":Pass", ":SPIRVDialect", ":Support", + ":VectorDialect", "//llvm:Support", ], ) @@ -9361,7 +9362,7 @@ cc_binary( ":MlirJitRunner", ":Pass", ":ReconcileUnrealizedCasts", - ":SCFDialect", + ":SCFDialect", ":SPIRVDialect", ":SPIRVTransforms", ":ToLLVMIRTranslation", From 3c127e83c07c3791e86413c22a414a030d4302e3 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 2 Jan 2024 22:05:57 +0000 Subject: [PATCH 080/313] [ConstraintElim] Replace NUWSub decomp with recursive decomp of ops. The current patterns for NUWSub decompositions do not handle negative constants correctly at the moment (causing #76713). Replace the incorrect pattern by more general code that recursively decomposes the operands and then combines the results. This is already done in most other places that handle operators like add/mul. This means we fall back to the general constant handling code (fixes the mis-compile) while also being able to support reasoning about decomposable expressions in the SUB operands. Fixes https://github.com/llvm/llvm-project/issues/76713. 
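To make the failure mode concrete, the reproducer (already checked in as @pr76713 in sub-nuw.ll) boils down to the lines below; the numeric reading in the comments is an informal sketch of the bug, not text from the patch:

```llvm
; The dominating checks pin %i1 to 65533 or 65534 (-3 or -2 as signed i16).
%sub = sub nuw nsw i16 %i1, -3         ; unsigned value: %i1 - 65533, i.e. 0 or 1
%i6  = add nuw nsw i16 %arrayidx.idx, %sub
%c4  = icmp ult i16 12, %i6            ; must not be folded to true
```

Roughly, the old constant-only pattern modeled %sub as %i1 + 3 (sign-extending the -3), i.e. a value of at least 65536, which is how %c4 ended up simplified to true; decomposing both operands and subtracting the results routes such constants through the ordinary handling instead.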
--- .../Scalar/ConstraintElimination.cpp | 17 +++++++--- .../ConstraintElimination/sub-nuw.ll | 31 ++++++++----------- 2 files changed, 26 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 49ac1e96e2554..06c87bd6dc374 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -366,6 +366,13 @@ struct Decomposition { append_range(Vars, Other.Vars); } + void sub(const Decomposition &Other) { + Decomposition Tmp = Other; + Tmp.mul(-1); + add(Tmp.Offset); + append_range(Vars, Tmp.Vars); + } + void mul(int64_t Factor) { Offset = multiplyWithOverflow(Offset, Factor); for (auto &Var : Vars) @@ -569,10 +576,12 @@ static Decomposition decompose(Value *V, return Result; } - if (match(V, m_NUWSub(m_Value(Op0), m_ConstantInt(CI))) && canUseSExt(CI)) - return {-1 * CI->getSExtValue(), {{1, Op0}}}; - if (match(V, m_NUWSub(m_Value(Op0), m_Value(Op1)))) - return {0, {{1, Op0}, {-1, Op1}}}; + if (match(V, m_NUWSub(m_Value(Op0), m_Value(Op1)))) { + auto ResA = decompose(Op0, Preconditions, IsSigned, DL); + auto ResB = decompose(Op1, Preconditions, IsSigned, DL); + ResA.sub(ResB); + return ResA; + } return {V, IsKnownNonNegative}; } diff --git a/llvm/test/Transforms/ConstraintElimination/sub-nuw.ll b/llvm/test/Transforms/ConstraintElimination/sub-nuw.ll index ba468789af5c6..5ae559dc2b1b6 100644 --- a/llvm/test/Transforms/ConstraintElimination/sub-nuw.ll +++ b/llvm/test/Transforms/ConstraintElimination/sub-nuw.ll @@ -316,12 +316,13 @@ define i1 @sub_nuw_neg_i16(i16 %a) { ; CHECK-LABEL: @sub_nuw_neg_i16( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[NEG2:%.*]] = sub nuw i16 [[A:%.*]], -305 -; CHECK-NEXT: br i1 false, label [[EXIT_1:%.*]], label [[EXIT_2:%.*]] +; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i16 0, [[NEG2]] +; CHECK-NEXT: br i1 [[C_1]], label [[EXIT_1:%.*]], label [[EXIT_2:%.*]] ; CHECK: exit.1: -; CHECK-NEXT: ret i1 true +; CHECK-NEXT: [[C_2:%.*]] = icmp ugt i16 [[A]], 0 +; CHECK-NEXT: ret i1 [[C_2]] ; CHECK: exit.2: -; CHECK-NEXT: [[C_3:%.*]] = icmp ugt i16 [[A]], 0 -; CHECK-NEXT: ret i1 [[C_3]] +; CHECK-NEXT: ret i1 true ; entry: %neg2 = sub nuw i16 %a, -305 @@ -380,7 +381,6 @@ entry: ret i1 %c } -; FIXME: currently this incorrectly simplifies %c4 to true. define i1 @pr76713(i16 %i1, i16 %i3) { ; CHECK-LABEL: @pr76713( ; CHECK-NEXT: entry: @@ -394,7 +394,8 @@ define i1 @pr76713(i16 %i1, i16 %i3) { ; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i16 [[I1]], -3 ; CHECK-NEXT: [[ARRAYIDX_IDX:%.*]] = mul nuw nsw i16 [[I3]], 4 ; CHECK-NEXT: [[I6:%.*]] = add nuw nsw i16 [[ARRAYIDX_IDX]], [[SUB]] -; CHECK-NEXT: ret i1 true +; CHECK-NEXT: [[C4:%.*]] = icmp ult i16 12, [[I6]] +; CHECK-NEXT: ret i1 [[C4]] ; CHECK: else: ; CHECK-NEXT: ret i1 false ; @@ -417,7 +418,6 @@ else: ret i1 0 } -; FIXME: Currently gets mis-compiled. 
define void @sub_nuw_chained_positive_constants(i16 %a) { ; CHECK-LABEL: @sub_nuw_chained_positive_constants( ; CHECK-NEXT: entry: @@ -426,16 +426,13 @@ define void @sub_nuw_chained_positive_constants(i16 %a) { ; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i16 [[SUB2]], 90 ; CHECK-NEXT: br i1 [[C_1]], label [[EXIT_1:%.*]], label [[EXIT_2:%.*]] ; CHECK: exit.1: -; CHECK-NEXT: [[C_2:%.*]] = icmp ugt i16 [[A]], 120 -; CHECK-NEXT: call void @use(i1 [[C_2]]) +; CHECK-NEXT: call void @use(i1 true) ; CHECK-NEXT: [[C_3:%.*]] = icmp ugt i16 [[A]], 121 ; CHECK-NEXT: call void @use(i1 [[C_3]]) ; CHECK-NEXT: ret void ; CHECK: exit.2: -; CHECK-NEXT: [[C_4:%.*]] = icmp ugt i16 [[A]], 120 -; CHECK-NEXT: call void @use(i1 [[C_4]]) -; CHECK-NEXT: [[C_5:%.*]] = icmp ugt i16 [[A]], 121 -; CHECK-NEXT: call void @use(i1 [[C_5]]) +; CHECK-NEXT: call void @use(i1 false) +; CHECK-NEXT: call void @use(i1 false) ; CHECK-NEXT: ret void ; entry: @@ -459,7 +456,6 @@ exit.2: ret void } -; FIXME: Currently gets mis-compiled. define void @sub_nuw_chained_negative_constants(i8 %a) { ; CHECK-LABEL: @sub_nuw_chained_negative_constants( ; CHECK-NEXT: entry: @@ -468,14 +464,13 @@ define void @sub_nuw_chained_negative_constants(i8 %a) { ; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i8 [[SUB2]], 20 ; CHECK-NEXT: br i1 [[C_1]], label [[EXIT_1:%.*]], label [[EXIT_2:%.*]] ; CHECK: exit.1: -; CHECK-NEXT: [[C_2:%.*]] = icmp ugt i8 [[A]], -96 -; CHECK-NEXT: call void @use(i1 [[C_2]]) +; CHECK-NEXT: call void @use(i1 true) ; CHECK-NEXT: [[C_3:%.*]] = icmp ugt i8 [[A]], -95 ; CHECK-NEXT: call void @use(i1 [[C_3]]) ; CHECK-NEXT: ret void ; CHECK: exit.2: -; CHECK-NEXT: call void @use(i1 true) -; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: call void @use(i1 false) +; CHECK-NEXT: call void @use(i1 false) ; CHECK-NEXT: ret void ; entry: From 9943cd7c7de7ff36d23775e4fb2a449208d41569 Mon Sep 17 00:00:00 2001 From: David Carlier Date: Tue, 2 Jan 2024 22:08:45 +0000 Subject: [PATCH 081/313] [compiler-rt] unbreak android build --- .../lib/sanitizer_common/sanitizer_platform_limits_posix.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h index 1ee26e0cbbb99..34bfef1f7ef45 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h @@ -523,6 +523,7 @@ typedef long __sanitizer_clock_t; #if SANITIZER_LINUX typedef int __sanitizer_clockid_t; +typedef unsigned long long __sanitizer_eventfd_t; #endif #if SANITIZER_LINUX @@ -938,8 +939,6 @@ struct __sanitizer_cookie_io_functions_t { __sanitizer_cookie_io_seek seek; __sanitizer_cookie_io_close close; }; - -typedef unsigned long long __sanitizer_eventfd_t; #endif #define IOC_NRBITS 8 From 67c2e354c3f0326947914c097bf68997ac5065b2 Mon Sep 17 00:00:00 2001 From: Max191 <44243577+Max191@users.noreply.github.com> Date: Tue, 2 Jan 2024 17:25:56 -0500 Subject: [PATCH 082/313] [mlir] Add inferContractionDims util for indexing map inputs (#76081) This PR adds a util function to infer contraction dimensions given only the indexing maps of a linalg operation. 
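To make the intended use concrete, here is a rough, hypothetical caller (illustration only; the only API relied on is the `inferContractionDims(ArrayRef<AffineMap>)` overload and the `ContractionDimensions` struct declared in the header below):

```cpp
#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
#include "llvm/Support/raw_ostream.h"

using namespace mlir;

// Classify a (C += A * B)-style computation purely from its indexing maps,
// without having a LinalgOp in hand. maps[0]/maps[1] index the two inputs and
// maps[2] indexes the init/output; iterator types are inferred from the output
// map (dims present in it are parallel, the remaining dims are reductions).
void reportContractionDims(ArrayRef<AffineMap> maps) {
  FailureOr<linalg::ContractionDimensions> dims =
      linalg::inferContractionDims(maps);
  if (failed(dims)) {
    llvm::errs() << "not recognizable as a contraction from the maps alone\n";
    return;
  }
  llvm::errs() << "batch=" << dims->batch.size() << " m=" << dims->m.size()
               << " n=" << dims->n.size() << " k=" << dims->k.size() << "\n";
}
```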
--- .../mlir/Dialect/Linalg/IR/LinalgInterfaces.h | 2 + .../Dialect/Linalg/IR/LinalgInterfaces.cpp | 91 +++++++++++++------ 2 files changed, 65 insertions(+), 28 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.h b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.h index 6c8240267e7d0..f92843a1dcb98 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.h +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.h @@ -62,6 +62,8 @@ struct ContractionDimensions { /// `k`, indices are returned in sorted order. /// Returns a failure if any of `m`, `n` or `k` is empty. FailureOr inferContractionDims(LinalgOp linalgOp); +FailureOr +inferContractionDims(ArrayRef indexingMaps); /// Checks whether `linalgOp` conforms to ContractionOpInterface. // TODO: embed within `isa` if possible / natural. diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp index ba419d32f22a3..291784072896c 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp @@ -176,22 +176,22 @@ static bool isContractionBody(Block &block) { return linalg::detail::isContractionBody(block, &isPairTemplateImpl); } -/// Given a `linalgOp` and one of its `opOperand`, returns the positions of the -/// iterators of type `iter` that index the `opOperand` as a permutation. -/// This is useful to infer various subcomputations on a given `linalgOp`. -/// This is performed by looking up each result in the matching indexing map and -/// determining whether: +/// Given an `indexingMap` and its corresponding `iterators`, returns +/// the positions of the iterators of type `iter` that are indexed by +/// the `indexingMap` as a permutation. This is useful to infer various +/// subcomputations on a `LinalgOp`. This is performed by looking up +/// each result in the `indexingMap` and determining whether: /// - It is a single AffineDimExpr. /// - It is the only result involving this AffineDimExpr. static llvm::SmallDenseSet -findPermutationsIndexingOperand(LinalgOp linalgOp, OpOperand *opOperand, +findPermutationsIndexingOperand(AffineMap indexingMap, + ArrayRef iterators, utils::IteratorType iter) { + assert(iterators.size() == indexingMap.getNumDims()); llvm::SmallDenseSet res; - assert(linalgOp == opOperand->getOwner() && "expected linalgOp owner"); - AffineMap indexingMap = linalgOp.getMatchingIndexingMap(opOperand); for (AffineExpr e : indexingMap.getResults()) { if (auto d = dyn_cast(e)) { - if (linalgOp.getIteratorTypesArray()[d.getPosition()] == iter && + if (iterators[d.getPosition()] == iter && llvm::count_if(indexingMap.getResults(), [d](AffineExpr e) { return e.isFunctionOfDim(d.getPosition()); }) == 1) @@ -206,6 +206,21 @@ auto par = utils::IteratorType::parallel; auto red = utils::IteratorType::reduction; } // namespace +/// Infer the iterator types from the init affine map. This looks at which dims +/// are present in the map results, and returns an iterator types array with +/// parallel types for dims that are present, and reduction types for dims that +/// are not present. 
+static FailureOr> +inferIteratorsFromOutMap(AffineMap map) { + if (!map.isProjectedPermutation()) + return failure(); + SmallVector iterators(map.getNumDims(), red); + for (auto expr : map.getResults()) + if (auto dim = dyn_cast(expr)) + iterators[dim.getPosition()] = par; + return iterators; +} + /// Find 2 parallel (m and n) and 1 reduction (k) dimension candidates that form /// a matmul subcomputation within `linalgOp`. These dimensions are such that: /// 1. The m dimension is involved in an outer-product along LHS @@ -217,17 +232,15 @@ auto red = utils::IteratorType::reduction; /// 5. Optional batch dimensions that appear in all operands are captured. /// This allows e.g. detecting that some contraction is embedded within /// `linalgOp` with some orthogonal heuristic. -FailureOr -mlir::linalg::inferContractionDims(LinalgOp linalgOp) { - if (linalgOp.getNumDpsInits() != 1 || linalgOp.getNumDpsInputs() != 2) - return failure(); - - llvm::SmallDenseSet a = findPermutationsIndexingOperand( - linalgOp, linalgOp.getDpsInputOperand(0), par); - llvm::SmallDenseSet b = findPermutationsIndexingOperand( - linalgOp, linalgOp.getDpsInputOperand(1), par); - llvm::SmallDenseSet c = findPermutationsIndexingOperand( - linalgOp, linalgOp.getDpsInitOperand(0), par); +static FailureOr +inferContractionDimsImpl(ArrayRef indexingMaps, + ArrayRef iterators) { + llvm::SmallDenseSet a = + findPermutationsIndexingOperand(indexingMaps[0], iterators, par); + llvm::SmallDenseSet b = + findPermutationsIndexingOperand(indexingMaps[1], iterators, par); + llvm::SmallDenseSet c = + findPermutationsIndexingOperand(indexingMaps[2], iterators, par); // A & C - B are the iterators involved in an outer-product along A (the LHS). llvm::SmallDenseSet ac = a; @@ -243,10 +256,10 @@ mlir::linalg::inferContractionDims(LinalgOp linalgOp) { llvm::set_intersect(batches, c); // A & B red are the reduction dimensions. - llvm::SmallDenseSet ra = findPermutationsIndexingOperand( - linalgOp, linalgOp.getDpsInputOperand(0), red); - llvm::SmallDenseSet rb = findPermutationsIndexingOperand( - linalgOp, linalgOp.getDpsInputOperand(1), red); + llvm::SmallDenseSet ra = + findPermutationsIndexingOperand(indexingMaps[0], iterators, red); + llvm::SmallDenseSet rb = + findPermutationsIndexingOperand(indexingMaps[1], iterators, red); llvm::set_intersect(ra, rb); // Return each set in sorted order. 
@@ -262,6 +275,24 @@ mlir::linalg::inferContractionDims(LinalgOp linalgOp) { return dimensions; } +FailureOr +mlir::linalg::inferContractionDims(LinalgOp linalgOp) { + if (linalgOp.getNumDpsInits() != 1 || linalgOp.getNumDpsInputs() != 2) + return failure(); + return inferContractionDimsImpl(linalgOp.getIndexingMapsArray(), + linalgOp.getIteratorTypesArray()); +} + +FailureOr +mlir::linalg::inferContractionDims(ArrayRef indexingMaps) { + if (indexingMaps.size() != 3) + return failure(); + auto iterators = inferIteratorsFromOutMap(indexingMaps[2]); + if (failed(iterators)) + return failure(); + return inferContractionDimsImpl(indexingMaps, iterators.value()); +} + namespace mlir::linalg::detail { enum class MatchContractionResult { Success = 0, @@ -504,10 +535,14 @@ static FailureOr inferConvolutionDimsImpl(LinalgOp linalgOp, ConvAccessExprWalker &inputExprWalker, bool allowEmptyConvolvedDims) { + auto filterMap = + linalgOp.getMatchingIndexingMap(linalgOp.getDpsInputOperand(1)); + auto outputMap = + linalgOp.getMatchingIndexingMap(linalgOp.getDpsInitOperand(0)); llvm::SmallDenseSet filterDims = findPermutationsIndexingOperand( - linalgOp, linalgOp.getDpsInputOperand(1), par); + filterMap, linalgOp.getIteratorTypesArray(), par); llvm::SmallDenseSet outputDims = findPermutationsIndexingOperand( - linalgOp, linalgOp.getDpsInitOperand(0), par); + outputMap, linalgOp.getIteratorTypesArray(), par); // unConvolvedDims & outputDims - filterDims are the batch iterators. llvm::SmallDenseSet batch = inputExprWalker.unConvolvedDims; @@ -529,8 +564,8 @@ inferConvolutionDimsImpl(LinalgOp linalgOp, llvm::set_intersect(depth, inputExprWalker.unConvolvedDims); llvm::SmallDenseSet filterReducedDims = - findPermutationsIndexingOperand(linalgOp, linalgOp.getDpsInputOperand(1), - red); + findPermutationsIndexingOperand(filterMap, + linalgOp.getIteratorTypesArray(), red); // convolvedDims & filterReducedDims are the filter loop iterators. llvm::SmallDenseSet fl = inputExprWalker.convolvedDims; From 41a07e668c29e219ed2f26d61da8b6b3295ff967 Mon Sep 17 00:00:00 2001 From: Aart Bik <39774503+aartbik@users.noreply.github.com> Date: Tue, 2 Jan 2024 14:44:24 -0800 Subject: [PATCH 083/313] [mlir][sparse] recognize NVidia 2:4 type for matmul (#76758) This removes the temporary DENSE24 attribute and replaces it with proper recognition of dense to 2:4 conversion. The compression will be performed on the device prior to performing the matrix mult. Note that we no longer need to start with the linalg version, we can lift this to the proper named linalg op. Also renames some files into more consistent names.
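For readers unfamiliar with the format, a short standalone C++ sketch (background only, not code from this patch) of the 2:4 structured-sparsity property that the device-side pruning and compression rely on, namely at most two nonzeros in every aligned group of four consecutive elements:

```cpp
#include <cstddef>

// Returns true when a row already satisfies NVIDIA's 2:4 structured sparsity,
// i.e. each aligned group of four elements holds at most two nonzeros.
template <typename T>
bool rowIsTwoOutOfFour(const T *row, std::size_t numCols) {
  for (std::size_t group = 0; group + 4 <= numCols; group += 4) {
    int nonZeros = 0;
    for (std::size_t j = 0; j < 4; ++j)
      if (row[group + j] != T(0))
        ++nonZeros;
    if (nonZeros > 2)
      return false; // Would need pruning before the 2:4 matmul can run.
  }
  return true;
}
```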
--- .../Transforms/SparseGPUCodegen.cpp | 28 +++- .../ExecutionEngine/CudaRuntimeWrappers.cpp | 2 +- ...ul_lib_2to4.mlir => gpu_matmul24_lib.mlir} | 28 ++-- ...inalg.mlir => sparse-matmul-2-4-hand.mlir} | 139 +++++++++-------- .../CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir | 141 ++++++++---------- .../CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir | 41 +++-- 6 files changed, 204 insertions(+), 175 deletions(-) rename mlir/test/Dialect/SparseTensor/GPU/{gpu_matmul_lib_2to4.mlir => gpu_matmul24_lib.mlir} (88%) rename mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/{sparse-matmul-2-4-lib-from-linalg.mlir => sparse-matmul-2-4-hand.mlir} (66%) diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp index 8af3b694c4d97..87a37a7926e9e 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp @@ -448,6 +448,23 @@ static bool isAdmissibleBSR(SparseTensorType &aTp) { return false; } +/// Test for 2:4 matrix with suitable metadata. +static bool isAdmissible24(SparseTensorType &aTp) { + return aTp.getDimRank() == 2 && aTp.getLvlRank() == 3 && aTp.isDenseLvl(0) && + aTp.isDenseLvl(1) && aTp.is2OutOf4Lvl(2) && isAdmissibleMetaData(aTp); +} + +/// Test for conversion into 2:4 matrix. +static bool isConversionInto24(Value v) { + if (auto cnv = v.getDefiningOp()) { + Value a = cnv.getResult(); + Value d = cnv.getSource(); + SparseTensorType aTp = getSparseTensorType(a); + return isDenseTensor(d) && isAdmissible24(aTp); + } + return false; +} + /// Returns a suitable sparse format for the operation and given operand /// types with cuSparse, or kNone if none is available. static CuSparseFormat getCuSparseFormat(SparseTensorType aTp, @@ -925,6 +942,15 @@ static LogicalResult rewrite2To4SpMM(PatternRewriter &rewriter, Value C = op.getOperand(2); // we have C = AB SmallVector tokens; + // The cuSparselt API currently only allows pruning and compression + // to occur on the device. So we recognize the pattern + // A' = convert A ; dense to 2:4 + // C = A'B ; 2:4 matrix mult + // and then perform compression and matrix multiplication on device. + auto cnv = A.getDefiningOp(); + assert(cnv); + A = cnv.getSource(); + // All input should be dense tensors. if (!isDenseTensor(A) || !isDenseTensor(B) || !isDenseTensor(C)) return failure(); @@ -1260,7 +1286,7 @@ struct LinalgOpRewriter : public OpRewritePattern { maps == infer({{i, k}, {k, j}, {i, j}}) && matchSumOfMultOfArgs(op)) { if (!isDenseTensor(op.getOperand(0)) && !isDenseTensor(op.getOperand(1))) return rewriteSpGEMM(rewriter, op, enableRT); - if (op->getAttr("DENSE24")) + if (isConversionInto24(op.getOperand(0))) return rewrite2To4SpMM(rewriter, op); return rewriteSpMM(rewriter, op, enableRT); } diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp index c45320a674568..b9a3429e37b88 100644 --- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp @@ -970,7 +970,7 @@ mgpuCuSparseLtSpMMBufferSize(void *bs, int32_t ma, int32_t mb, void *a, void *b, // Note that this adds a synchronization on the stream. // TODO: Do we want that? 
if (prune_flag == 2) { - int *dvalid = (int *)mgpuMemAlloc(sizeof(int), stream); + int *dvalid = (int *)mgpuMemAlloc(sizeof(int), stream, false); CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMAPruneCheck( &cusparseLt_env, &(matA->matmul), matA->values, dvalid, stream)) int valid = 0; diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib_2to4.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul24_lib.mlir similarity index 88% rename from mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib_2to4.mlir rename to mlir/test/Dialect/SparseTensor/GPU/gpu_matmul24_lib.mlir index f584977e96415..6fe7ec906f30e 100644 --- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib_2to4.mlir +++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul24_lib.mlir @@ -1,5 +1,13 @@ // RUN: mlir-opt %s --linalg-generalize-named-ops --sparse-gpu-codegen="num-threads=0" | FileCheck %s +#NV_24 = #sparse_tensor.encoding<{ + map = ( i, j ) -> + ( i : dense, + j floordiv 4 : dense, + j mod 4 : block2_4 + ) +}> + // CHECK-LABEL: func.func @matmul( // CHECK-SAME: %[[VAL_0:.*0]]: tensor, // CHECK-SAME: %[[VAL_1:.*1]]: tensor, @@ -51,18 +59,14 @@ // CHECK: %[[VAL_55:.*]] = bufferization.to_tensor %[[VAL_19]] : memref // CHECK: return %[[VAL_55]] : tensor // CHECK: } - -#map = affine_map<(d0, d1, d2) -> (d0, d2)> -#map1 = affine_map<(d0, d1, d2) -> (d2, d1)> -#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> module { - func.func @matmul(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { - %0 = linalg.generic { DENSE24, indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor, tensor) outs(%arg2 : tensor) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %1 = arith.mulf %in, %in_0 : f16 - %2 = arith.addf %out, %1 : f16 - linalg.yield %2 : f16 - } -> tensor - return %0 : tensor + func.func @matmul(%Ad: tensor, + %B: tensor, + %Cin: tensor) -> tensor { + %A = sparse_tensor.convert %Ad : tensor to tensor + %C = linalg.matmul + ins(%A, %B: tensor, tensor) + outs(%Cin: tensor) -> tensor + return %C : tensor } } diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-hand.mlir similarity index 66% rename from mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir rename to mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-hand.mlir index d7e9cedec4ccd..117832df95b46 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-hand.mlir @@ -1,40 +1,58 @@ // NOTE: this test requires gpu-sm80 and cusparselt // -// DEFINE: %{compile} = mlir-opt %s \ -// DEFINE: --sparsifier="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format +// DEFINE: %{compile} = mlir-opt --convert-vector-to-scf --convert-scf-to-cf -convert-cf-to-llvm --convert-vector-to-llvm \ +// DEFINE: --convert-arith-to-llvm --gpu-to-llvm --reconcile-unrealized-casts \ +// DEFINE: %s // DEFINE: %{run} = mlir-cpu-runner \ // DEFINE: --shared-libs=%mlir_cuda_runtime \ // DEFINE: --shared-libs=%mlir_c_runner_utils \ // DEFINE: --e main --entry-point-result=void \ // DEFINE: | FileCheck %s // -// with RT lib: -// -// RUN: %{compile} enable-runtime-library=true" | %{run} -// -// without RT lib: -// -// RUN: %{compile} 
enable-runtime-library=false" | %{run} - -#map = affine_map<(d0, d1, d2) -> (d0, d2)> -#map1 = affine_map<(d0, d1, d2) -> (d2, d1)> -#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +// RUN: %{compile} | %{run} module { llvm.func @mgpuCreateSparseLtEnv() llvm.func @mgpuDestroySparseLtEnv() - // - // TODO: This uses our temporary ATTRIBUTE, replace with 2:4 type! - // - func.func @matmul_2to4(%arg0: tensor<16x32xf16>, %arg1: tensor<32x16xf16>, %arg2: tensor<16x16xf16>) -> tensor<16x16xf16> { - %0 = linalg.generic { DENSE24, indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<16x32xf16>, tensor<32x16xf16>) outs(%arg2 : tensor<16x16xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %1 = arith.mulf %in, %in_0 : f16 - %2 = arith.addf %out, %1 : f16 - linalg.yield %2 : f16 - } -> tensor<16x16xf16> - return %0 : tensor<16x16xf16> + // cuSparselt version for matmul coded by hand. + func.func @matmul24(%a : memref<16x32xf16>, + %b : memref<32x16xf16>, + %c : memref<16x16xf16>) { + %c0 = arith.constant 0.0 : f16 + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c1048576 = arith.constant 1048576 : index + %token0 = gpu.wait async + %d_a, %token1 = gpu.alloc async [%token0] () : memref<16x32xf16> + %d_b, %token2 = gpu.alloc async [%token1] () : memref<32x16xf16> + %d_c, %token3 = gpu.alloc async [%token2] () : memref<16x16xf16> + %token4 = gpu.memcpy async [%token3] %d_a, %a : memref<16x32xf16>, memref<16x32xf16> + %token5 = gpu.memcpy async [%token4] %d_b, %b : memref<32x16xf16>, memref<32x16xf16> + %token6 = gpu.memcpy async [%token5] %d_c, %c : memref<16x16xf16>, memref<16x16xf16> + %spmat, %token8 = gpu.create_2to4_spmat async [%token6]{PRUNE_AND_CHECK} %c16, %c32, %d_a: memref<16x32xf16> + %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %d_b, %c32, %c16: index, index into memref<32x16xf16> + %dnmat2, %token10 = gpu.create_dn_tensor async [%token9] %d_c, %c16, %c16: index, index into memref<16x16xf16> + %bufferSz0, %bufferSz1, %bufferSz2, %token11 = gpu.spmm_buffer_size async [%token10] %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2 : index, index,index into f16 + %mem1, %token12 = gpu.alloc async [%token11] (%bufferSz0) : memref + %mem2, %token13 = gpu.alloc async [%token12] (%bufferSz1) : memref + %mem3, %token14 = gpu.alloc async [%token13] (%bufferSz2) : memref + %token15 = gpu.spmm async [%token14] %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2, %mem1, %mem2, %mem3 : memref, memref,memref into f16 + %token16 = gpu.destroy_sp_mat async [%token15] %spmat + %token17 = gpu.destroy_dn_tensor async [%token16] %dnmat + %token18 = gpu.destroy_dn_tensor async [%token17] %dnmat2 + %token19 = gpu.memcpy async [%token18] %c, %d_c : memref<16x16xf16>, memref<16x16xf16> + %token20 = gpu.dealloc async [%token19] %d_c : memref<16x16xf16> + %token21 = gpu.dealloc async [%token20] %d_b : memref<32x16xf16> + %token22 = gpu.dealloc async [%token21] %d_a : memref<16x32xf16> + %token23 = gpu.dealloc async [%token22] %mem3 : memref + %token24 = gpu.dealloc async [%token23] %mem2 : memref + %token25 = gpu.dealloc async [%token24] %mem1 : memref + gpu.wait [%token25] + return } // @@ -54,50 +72,49 @@ module { %c64 = arith.constant 64 : index // Matrices A, B, C (16x32, 32x16, 16x16). 
+ %a = memref.alloc() : memref<16x32xf16> // 16x32 with 2:4, row-major + %b = memref.alloc() : memref<32x16xf16> // regular dense column-major + %c = memref.alloc() : memref<16x16xf16> // accumulator row-major // // Setup matrix A. // - %DA = tensor.generate { - ^bb0(%i: index, %j: index): - // (i+ j/2 + 1) if j %2 == 0 else 0 - %cf0 = arith.constant 0.0 : f16 - %cf1 = arith.constant 1.0 : f16 - %j_2 = arith.floordivsi %j, %c2 : index - %quotient = arith.remsi %j, %c2 : index - %sum = arith.addi %i, %j_2 : index - %sum_i = arith.index_cast %sum : index to i64 - %sum_f = arith.uitofp %sum_i : i64 to f16 - %sum_f_plus1 = arith.addf %sum_f, %cf1 : f16 - %is_zero = arith.cmpi "eq", %quotient, %c0 : index - %s = arith.select %is_zero, %sum_f_plus1, %cf0 : f16 - tensor.yield %s : f16 - } : tensor<16x32xf16> + scf.for %ai = %c0 to %c16 step %c1 { + scf.for %aj = %c0 to %c16 step %c1 { + %cf0 = arith.constant 0.0: f16 + %a0 = arith.addi %ai, %aj : index + %a1 = arith.addi %a0, %c1 : index + %a2 = arith.index_cast %a1 : index to i32 + %a3 = arith.sitofp %a2 : i32 to f16 + %ajj = arith.muli %aj, %c2 : index + %ajj2 = arith.addi %ajj, %c1 : index + memref.store %a3, %a[%ai, %ajj] : memref<16x32xf16> + memref.store %cf0, %a[%ai, %ajj2] : memref<16x32xf16> + } + } // // Setup matrix B. // - %DB = tensor.generate { - ^bb0(%i: index, %j: index): - // if j_i >=8, j_i - 8 else 0 - %is_ge8 = arith.cmpi "sge", %j, %c8 : index - %j_minus8 = arith.subi %j, %c8 : index - %j2 = arith.select %is_ge8, %j_minus8, %j : index - %r_i = arith.subi %j2, %i : index - %r_i64 = arith.index_cast %r_i : index to i64 - %r_f = arith.sitofp %r_i64 : i64 to f16 - tensor.yield %r_f : f16 - } : tensor<32x16xf16> + scf.for %bi = %c0 to %c8 step %c1 { + scf.for %bj = %c0 to %c32 step %c1 { + %b0 = arith.subi %bi, %bj : index + %b1 = arith.index_cast %b0 : index to i32 + %b2 = arith.sitofp %b1 : i32 to f16 + %bii = arith.addi %bi, %c8 : index + memref.store %b2, %b[%bj, %bi] : memref<32x16xf16> + memref.store %b2, %b[%bj, %bii] : memref<32x16xf16> + } + } // // Reset matrix C. // - %DC = tensor.generate { - ^bb0(%i: index, %j: index): - %cf0 = arith.constant 0.0 : f16 - tensor.yield %cf0 : f16 - } : tensor<16x16xf16> - + scf.for %ci = %c0 to %c16 step %c1 { + scf.for %cj = %c0 to %c16 step %c1 { + memref.store %f0, %c[%ci, %cj] : memref<16x16xf16> + } + } // // Sanity check on 16x32 full 2:4 input matrix A. @@ -121,7 +138,7 @@ module { // CHECK-NEXT: ( 16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, 0, 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31, 0 ) // scf.for %pai = %c0 to %c16 step %c1 { - %pa0 = vector.transfer_read %DA[%pai, %c0], %f0 : tensor<16x32xf16>, vector<32xf16> + %pa0 = vector.transfer_read %a[%pai, %c0], %f0 : memref<16x32xf16>, vector<32xf16> vector.print %pa0 : vector<32xf16> } @@ -163,14 +180,12 @@ module { // // scf.for %pbi = %c0 to %c32 step %c1 { - %pb0 = vector.transfer_read %DB[%pbi, %c0], %f0 : tensor<32x16xf16>, vector<16xf16> + %pb0 = vector.transfer_read %b[%pbi, %c0], %f0 : memref<32x16xf16>, vector<16xf16> vector.print %pb0 : vector<16xf16> } // Call the kernel. - %t1 = arith.constant 1 : index - %t32 = arith.constant 32 : index - %c_out = call @matmul_2to4 (%DA, %DB, %DC): (tensor<16x32xf16>, tensor<32x16xf16>, tensor<16x16xf16>) -> tensor<16x16xf16> + call @matmul24(%a, %b, %c): (memref<16x32xf16>, memref<32x16xf16>, memref<16x16xf16>) -> () // // Verify computed matrix C. 
@@ -193,7 +208,7 @@ module { // CHECK-NEXT: ( -6320, -5944, -5568, -5192, -4816, -4440, -4064, -3688, -6320, -5944, -5568, -5192, -4816, -4440, -4064, -3688 ) // scf.for %pci = %c0 to %c16 step %c1 { - %pc0 = vector.transfer_read %c_out[%pci, %c0], %f0 : tensor<16x16xf16>, vector<16xf16> + %pc0 = vector.transfer_read %c[%pci, %c0], %f0 : memref<16x16xf16>, vector<16xf16> vector.print %pc0 : vector<16xf16> } diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir index daf29d5290bab..17b50b46d073a 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir @@ -1,57 +1,41 @@ // NOTE: this test requires gpu-sm80 and cusparselt // -// DEFINE: %{compile} = mlir-opt --convert-vector-to-scf --convert-scf-to-cf -convert-cf-to-llvm --convert-vector-to-llvm \ -// DEFINE: --convert-arith-to-llvm --gpu-to-llvm --reconcile-unrealized-casts \ -// DEFINE: %s +// DEFINE: %{compile} = mlir-opt %s \ +// DEFINE: --sparsifier="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format // DEFINE: %{run} = mlir-cpu-runner \ // DEFINE: --shared-libs=%mlir_cuda_runtime \ // DEFINE: --shared-libs=%mlir_c_runner_utils \ // DEFINE: --e main --entry-point-result=void \ // DEFINE: | FileCheck %s // -// RUN: %{compile} | %{run} +// with RT lib: +// +// RUN: %{compile} enable-runtime-library=true" | %{run} +// +// without RT lib: +// +// RUN: %{compile} enable-runtime-library=false" | %{run} + +#NV_24 = #sparse_tensor.encoding<{ + map = ( i, j ) -> + ( i : dense, + j floordiv 4 : dense, + j mod 4 : block2_4 + ) +}> module { llvm.func @mgpuCreateSparseLtEnv() llvm.func @mgpuDestroySparseLtEnv() - func.func @sampled_matmul(%a : memref<16x32xf16>, - %b : memref<32x16xf16>, - %c : memref<16x16xf16>) { - %c0 = arith.constant 0.0 : f16 - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %c8 = arith.constant 8 : index - %c16 = arith.constant 16 : index - %c32 = arith.constant 32 : index - %c1048576 = arith.constant 1048576 : index - %token0 = gpu.wait async - %d_a, %token1 = gpu.alloc async [%token0] () : memref<16x32xf16> - %d_b, %token2 = gpu.alloc async [%token1] () : memref<32x16xf16> - %d_c, %token3 = gpu.alloc async [%token2] () : memref<16x16xf16> - %token4 = gpu.memcpy async [%token3] %d_a, %a : memref<16x32xf16>, memref<16x32xf16> - %token5 = gpu.memcpy async [%token4] %d_b, %b : memref<32x16xf16>, memref<32x16xf16> - %token6 = gpu.memcpy async [%token5] %d_c, %c : memref<16x16xf16>, memref<16x16xf16> - %spmat, %token8 = gpu.create_2to4_spmat async [%token6]{PRUNE_AND_CHECK} %c16, %c32, %d_a: memref<16x32xf16> - %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %d_b, %c32, %c16: index, index into memref<32x16xf16> - %dnmat2, %token10 = gpu.create_dn_tensor async [%token9] %d_c, %c16, %c16: index, index into memref<16x16xf16> - %bufferSz0, %bufferSz1, %bufferSz2, %token11 = gpu.spmm_buffer_size async [%token10] %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2 : index, index,index into f16 - %mem1, %token12 = gpu.alloc async [%token11] (%bufferSz0) : memref - %mem2, %token13 = gpu.alloc async [%token12] (%bufferSz1) : memref - %mem3, %token14 = gpu.alloc async [%token13] (%bufferSz2) : memref - %token15 = gpu.spmm async [%token14] %spmat{NON_TRANSPOSE}, 
%dnmat{NON_TRANSPOSE}, %dnmat2, %mem1, %mem2, %mem3 : memref, memref,memref into f16 - %token16 = gpu.destroy_sp_mat async [%token15] %spmat - %token17 = gpu.destroy_dn_tensor async [%token16] %dnmat - %token18 = gpu.destroy_dn_tensor async [%token17] %dnmat2 - %token19 = gpu.memcpy async [%token18] %c, %d_c : memref<16x16xf16>, memref<16x16xf16> - %token20 = gpu.dealloc async [%token19] %d_c : memref<16x16xf16> - %token21 = gpu.dealloc async [%token20] %d_b : memref<32x16xf16> - %token22 = gpu.dealloc async [%token21] %d_a : memref<16x32xf16> - %token23 = gpu.dealloc async [%token22] %mem3 : memref - %token24 = gpu.dealloc async [%token23] %mem2 : memref - %token25 = gpu.dealloc async [%token24] %mem1 : memref - gpu.wait [%token25] - return + func.func @matmul24(%Ad: tensor<16x32xf16>, + %B: tensor<32x16xf16>, + %Cin: tensor<16x16xf16>) -> tensor<16x16xf16> { + %A = sparse_tensor.convert %Ad : tensor<16x32xf16> to tensor<16x32xf16, #NV_24> + %C = linalg.matmul + ins(%A, %B: tensor<16x32xf16, #NV_24>, tensor<32x16xf16>) + outs(%Cin: tensor<16x16xf16>) -> tensor<16x16xf16> + return %C : tensor<16x16xf16> } // @@ -71,49 +55,50 @@ module { %c64 = arith.constant 64 : index // Matrices A, B, C (16x32, 32x16, 16x16). - %a = memref.alloc() : memref<16x32xf16> // 16x32 with 2:4, row-major - %b = memref.alloc() : memref<32x16xf16> // regular dense column-major - %c = memref.alloc() : memref<16x16xf16> // accumulator row-major // // Setup matrix A. // - scf.for %ai = %c0 to %c16 step %c1 { - scf.for %aj = %c0 to %c16 step %c1 { - %cf0 = arith.constant 0.0: f16 - %a0 = arith.addi %ai, %aj : index - %a1 = arith.addi %a0, %c1 : index - %a2 = arith.index_cast %a1 : index to i32 - %a3 = arith.sitofp %a2 : i32 to f16 - %ajj = arith.muli %aj, %c2 : index - %ajj2 = arith.addi %ajj, %c1 : index - memref.store %a3, %a[%ai, %ajj] : memref<16x32xf16> - memref.store %cf0, %a[%ai, %ajj2] : memref<16x32xf16> - } - } + %DA = tensor.generate { + ^bb0(%i: index, %j: index): + // (i+ j/2 + 1) if j %2 == 0 else 0 + %cf0 = arith.constant 0.0 : f16 + %cf1 = arith.constant 1.0 : f16 + %j_2 = arith.floordivsi %j, %c2 : index + %quotient = arith.remsi %j, %c2 : index + %sum = arith.addi %i, %j_2 : index + %sum_i = arith.index_cast %sum : index to i64 + %sum_f = arith.uitofp %sum_i : i64 to f16 + %sum_f_plus1 = arith.addf %sum_f, %cf1 : f16 + %is_zero = arith.cmpi "eq", %quotient, %c0 : index + %s = arith.select %is_zero, %sum_f_plus1, %cf0 : f16 + tensor.yield %s : f16 + } : tensor<16x32xf16> // // Setup matrix B. // - scf.for %bi = %c0 to %c8 step %c1 { - scf.for %bj = %c0 to %c32 step %c1 { - %b0 = arith.subi %bi, %bj : index - %b1 = arith.index_cast %b0 : index to i32 - %b2 = arith.sitofp %b1 : i32 to f16 - %bii = arith.addi %bi, %c8 : index - memref.store %b2, %b[%bj, %bi] : memref<32x16xf16> - memref.store %b2, %b[%bj, %bii] : memref<32x16xf16> - } - } + %DB = tensor.generate { + ^bb0(%i: index, %j: index): + // if j_i >=8, j_i - 8 else 0 + %is_ge8 = arith.cmpi "sge", %j, %c8 : index + %j_minus8 = arith.subi %j, %c8 : index + %j2 = arith.select %is_ge8, %j_minus8, %j : index + %r_i = arith.subi %j2, %i : index + %r_i64 = arith.index_cast %r_i : index to i64 + %r_f = arith.sitofp %r_i64 : i64 to f16 + tensor.yield %r_f : f16 + } : tensor<32x16xf16> // // Reset matrix C. 
// - scf.for %ci = %c0 to %c16 step %c1 { - scf.for %cj = %c0 to %c16 step %c1 { - memref.store %f0, %c[%ci, %cj] : memref<16x16xf16> - } - } + %DC = tensor.generate { + ^bb0(%i: index, %j: index): + %cf0 = arith.constant 0.0 : f16 + tensor.yield %cf0 : f16 + } : tensor<16x16xf16> + // // Sanity check on 16x32 full 2:4 input matrix A. @@ -137,7 +122,7 @@ module { // CHECK-NEXT: ( 16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, 0, 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31, 0 ) // scf.for %pai = %c0 to %c16 step %c1 { - %pa0 = vector.transfer_read %a[%pai, %c0], %f0 : memref<16x32xf16>, vector<32xf16> + %pa0 = vector.transfer_read %DA[%pai, %c0], %f0 : tensor<16x32xf16>, vector<32xf16> vector.print %pa0 : vector<32xf16> } @@ -179,12 +164,16 @@ module { // // scf.for %pbi = %c0 to %c32 step %c1 { - %pb0 = vector.transfer_read %b[%pbi, %c0], %f0 : memref<32x16xf16>, vector<16xf16> + %pb0 = vector.transfer_read %DB[%pbi, %c0], %f0 : tensor<32x16xf16>, vector<16xf16> vector.print %pb0 : vector<16xf16> } // Call the kernel. - call @sampled_matmul (%a, %b, %c): (memref<16x32xf16>, memref<32x16xf16>, memref<16x16xf16>) -> () + %t1 = arith.constant 1 : index + %t32 = arith.constant 32 : index + %c_out = call @matmul24(%DA, %DB, %DC): (tensor<16x32xf16>, + tensor<32x16xf16>, + tensor<16x16xf16>) -> tensor<16x16xf16> // // Verify computed matrix C. @@ -207,7 +196,7 @@ module { // CHECK-NEXT: ( -6320, -5944, -5568, -5192, -4816, -4440, -4064, -3688, -6320, -5944, -5568, -5192, -4816, -4440, -4064, -3688 ) // scf.for %pci = %c0 to %c16 step %c1 { - %pc0 = vector.transfer_read %c[%pci, %c0], %f0 : memref<16x16xf16>, vector<16xf16> + %pc0 = vector.transfer_read %c_out[%pci, %c0], %f0 : tensor<16x16xf16>, vector<16xf16> vector.print %pc0 : vector<16xf16> } diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir index e307286002e39..eb99a027a8860 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir @@ -16,34 +16,27 @@ // // RUN: %{compile} enable-runtime-library=false" | %{run} -#map0 = affine_map<(d0, d1, d2) -> (d0, d2)> -#map1 = affine_map<(d0, d1, d2) -> (d2, d1)> -#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#NV_24 = #sparse_tensor.encoding<{ + map = ( i, j ) -> + ( i : dense, + j floordiv 4 : dense, + j mod 4 : block2_4 + ) +}> module { llvm.func @mgpuCreateSparseLtEnv() llvm.func @mgpuDestroySparseLtEnv() - // - // TODO: This uses our temporary ATTRIBUTE, replace with 2:4 type! 
- // - func.func @matmul(%arg0: tensor<16x16xf16>, - %arg1: tensor<16x16xf16>, - %arg2: tensor<16x16xf16>) -> tensor<16x16xf16> { - %0 = linalg.generic { - DENSE24, - indexing_maps = [#map0, #map1, #map2], - iterator_types = ["parallel", "parallel", "reduction"] - } - ins(%arg0, %arg1 : tensor<16x16xf16>, tensor<16x16xf16>) - outs(%arg2 : tensor<16x16xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %1 = arith.mulf %in, %in_0 : f16 - %2 = arith.addf %out, %1 : f16 - linalg.yield %2 : f16 - } -> tensor<16x16xf16> - return %0 : tensor<16x16xf16> + func.func @matmul24(%Ad: tensor<16x16xf16>, + %B: tensor<16x16xf16>, + %Cin: tensor<16x16xf16>) -> tensor<16x16xf16> { + %A = sparse_tensor.convert %Ad : tensor<16x16xf16> to tensor<16x16xf16, #NV_24> + %C = linalg.matmul + ins(%A, %B: tensor<16x16xf16, #NV_24>, tensor<16x16xf16>) + outs(%Cin: tensor<16x16xf16>) -> tensor<16x16xf16> + return %C : tensor<16x16xf16> } func.func @main() { @@ -81,7 +74,9 @@ module { // By effectively computing D = A B + C with id(B) and zero(C) // the resulting matrix returns the pruned A back to the caller. // - %D = call @matmul(%A, %B, %C): (tensor<16x16xf16>, tensor<16x16xf16>, tensor<16x16xf16>) -> (tensor<16x16xf16>) + %D = call @matmul24(%A, %B, %C): (tensor<16x16xf16>, + tensor<16x16xf16>, + tensor<16x16xf16>) -> (tensor<16x16xf16>) // // This was the original matrix. From fb32977ac768f27890af28308a6968c30af2aa3e Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Tue, 2 Jan 2024 16:53:53 -0600 Subject: [PATCH 084/313] [Libomptarget] Fix RPC-based malloc on NVPTX (#72440) Summary: The device allocator on NVPTX architectures is enqueued to a stream that the kernel is potentially executing on. This can lead to deadlocks as the kernel will not proceed until the allocation is complete and the allocation will not proceed until the kernel is complete. CUDA 11.2 introduced async allocations that we can manually place on separate streams to combat this. This patch makes a new allocation type that's guaranteed to be non-blocking so it will actually make progress; only Nvidia needs to care about this, as the others are not blocking in this way by default. I had originally tried to make the `alloc` and `free` methods take a `__tgt_async_info`. However, I observed that with the large volume of streams being created by a parallel test, it quickly locked up the system as presumably too many streams were being created. This implementation now just creates a new stream and immediately destroys it. This obviously isn't very fast, but it at least gets the cases to stop deadlocking for now. --- openmp/libomptarget/include/omptarget.h | 4 ++- .../plugins-nextgen/amdgpu/src/rtl.cpp | 2 ++ .../plugins-nextgen/common/src/RPC.cpp | 7 +++-- .../cuda/dynamic_cuda/cuda.cpp | 3 +++ .../plugins-nextgen/cuda/dynamic_cuda/cuda.h | 2 ++ .../plugins-nextgen/cuda/src/rtl.cpp | 27 +++++++++++++++++++ .../generic-elf-64bit/src/rtl.cpp | 1 + openmp/libomptarget/test/libc/malloc.c | 10 ++++++- 8 files changed, 50 insertions(+), 6 deletions(-) diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h index 476a158019d3c..d5602eec0d07c 100644 --- a/openmp/libomptarget/include/omptarget.h +++ b/openmp/libomptarget/include/omptarget.h @@ -103,7 +103,9 @@ enum TargetAllocTy : int32_t { TARGET_ALLOC_DEVICE = 0, TARGET_ALLOC_HOST, TARGET_ALLOC_SHARED, - TARGET_ALLOC_DEFAULT + TARGET_ALLOC_DEFAULT, + /// The allocation will not block on other streams.
+ TARGET_ALLOC_DEVICE_NON_BLOCKING, }; inline KernelArgsTy CTorDTorKernelArgs = {1, 0, nullptr, nullptr, diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp index fe435a3f55855..0411c67013342 100644 --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2112,6 +2112,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { switch (Kind) { case TARGET_ALLOC_DEFAULT: case TARGET_ALLOC_DEVICE: + case TARGET_ALLOC_DEVICE_NON_BLOCKING: MemoryPool = CoarseGrainedMemoryPools[0]; break; case TARGET_ALLOC_HOST: @@ -3315,6 +3316,7 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) { switch (Kind) { case TARGET_ALLOC_DEFAULT: case TARGET_ALLOC_DEVICE: + case TARGET_ALLOC_DEVICE_NON_BLOCKING: MemoryPool = CoarseGrainedMemoryPools[0]; break; case TARGET_ALLOC_HOST: diff --git a/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp b/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp index 60e0540e96022..54aced11b31c3 100644 --- a/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp @@ -62,15 +62,14 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device, "Failed to initialize RPC server for device %d: %d", DeviceId, Err); // Register a custom opcode handler to perform plugin specific allocation. - // FIXME: We need to make sure this uses asynchronous allocations on CUDA. auto MallocHandler = [](rpc_port_t Port, void *Data) { rpc_recv_and_send( Port, [](rpc_buffer_t *Buffer, void *Data) { plugin::GenericDeviceTy &Device = *reinterpret_cast(Data); - Buffer->data[0] = reinterpret_cast( - Device.allocate(Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE)); + Buffer->data[0] = reinterpret_cast(Device.allocate( + Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE_NON_BLOCKING)); }, Data); }; @@ -88,7 +87,7 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device, plugin::GenericDeviceTy &Device = *reinterpret_cast(Data); Device.free(reinterpret_cast(Buffer->data[0]), - TARGET_ALLOC_DEVICE); + TARGET_ALLOC_DEVICE_NON_BLOCKING); }, Data); }; diff --git a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp index 56c4404ac2d5c..5ec3adb9e4e3a 100644 --- a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp +++ b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp @@ -43,6 +43,7 @@ DLWRAP(cuLaunchKernel, 11) DLWRAP(cuMemAlloc, 2) DLWRAP(cuMemAllocHost, 2) DLWRAP(cuMemAllocManaged, 3) +DLWRAP(cuMemAllocAsync, 3) DLWRAP(cuMemcpyDtoDAsync, 4) DLWRAP(cuMemcpyDtoH, 3) @@ -52,6 +53,8 @@ DLWRAP(cuMemcpyHtoDAsync, 4) DLWRAP(cuMemFree, 1) DLWRAP(cuMemFreeHost, 1) +DLWRAP(cuMemFreeAsync, 2) + DLWRAP(cuModuleGetFunction, 3) DLWRAP(cuModuleGetGlobal, 4) diff --git a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h index 3e0307759924b..32031c28f8797 100644 --- a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h +++ b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h @@ -293,6 +293,7 @@ CUresult cuLaunchKernel(CUfunction, unsigned, unsigned, unsigned, unsigned, CUresult cuMemAlloc(CUdeviceptr *, size_t); CUresult cuMemAllocHost(void **, size_t); CUresult cuMemAllocManaged(CUdeviceptr *, size_t, unsigned int); +CUresult cuMemAllocAsync(CUdeviceptr *, size_t, CUstream); CUresult 
cuMemcpyDtoDAsync(CUdeviceptr, CUdeviceptr, size_t, CUstream); CUresult cuMemcpyDtoH(void *, CUdeviceptr, size_t); @@ -302,6 +303,7 @@ CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream); CUresult cuMemFree(CUdeviceptr); CUresult cuMemFreeHost(void *); +CUresult cuMemFreeAsync(CUdeviceptr, CUstream); CUresult cuModuleGetFunction(CUfunction *, CUmodule, const char *); CUresult cuModuleGetGlobal(CUdeviceptr *, size_t *, CUmodule, const char *); diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp index b0dff917dd0be..0005bff7a8035 100644 --- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp @@ -63,6 +63,14 @@ cuMemGetAllocationGranularity(size_t *granularity, CUmemAllocationGranularity_flags option) {} #endif +#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11020)) +// Forward declarations of asynchronous memory management functions. This is +// necessary for older versions of CUDA. +CUresult cuMemAllocAsync(CUdeviceptr *ptr, size_t, CUstream) { *ptr = nullptr; } + +CUresult cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream) {} +#endif + /// Class implementing the CUDA device images properties. struct CUDADeviceImageTy : public DeviceImageTy { /// Create the CUDA image with the id and the target image pointer. @@ -488,6 +496,16 @@ struct CUDADeviceTy : public GenericDeviceTy { Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL); MemAlloc = (void *)DevicePtr; break; + case TARGET_ALLOC_DEVICE_NON_BLOCKING: { + CUstream Stream; + if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING))) + break; + if ((Res = cuMemAllocAsync(&DevicePtr, Size, Stream))) + break; + cuStreamSynchronize(Stream); + Res = cuStreamDestroy(Stream); + MemAlloc = (void *)DevicePtr; + } } if (auto Err = @@ -518,6 +536,15 @@ struct CUDADeviceTy : public GenericDeviceTy { case TARGET_ALLOC_HOST: Res = cuMemFreeHost(TgtPtr); break; + case TARGET_ALLOC_DEVICE_NON_BLOCKING: { + CUstream Stream; + if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING))) + break; + cuMemFreeAsync(reinterpret_cast(TgtPtr), Stream); + cuStreamSynchronize(Stream); + if ((Res = cuStreamDestroy(Stream))) + break; + } } if (auto Err = Plugin::check(Res, "Error in cuMemFree[Host]: %s")) { diff --git a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp index 88b5236d31f48..43569f2505559 100644 --- a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp @@ -215,6 +215,7 @@ struct GenELF64DeviceTy : public GenericDeviceTy { case TARGET_ALLOC_DEVICE: case TARGET_ALLOC_HOST: case TARGET_ALLOC_SHARED: + case TARGET_ALLOC_DEVICE_NON_BLOCKING: MemAlloc = std::malloc(Size); break; } diff --git a/openmp/libomptarget/test/libc/malloc.c b/openmp/libomptarget/test/libc/malloc.c index c18a724930f41..b587b618472e4 100644 --- a/openmp/libomptarget/test/libc/malloc.c +++ b/openmp/libomptarget/test/libc/malloc.c @@ -13,7 +13,7 @@ int main() { unsigned *d_x; #pragma omp target map(from : d_x) { - d_x = malloc(sizeof(unsigned)); + d_x = (unsigned *)malloc(sizeof(unsigned)); *d_x = 1; } @@ -23,6 +23,14 @@ int main() { #pragma omp target is_device_ptr(d_x) { free(d_x); } +#pragma omp target teams num_teams(64) +#pragma omp parallel num_threads(32) + { + int *ptr = (int *)malloc(sizeof(int)); + *ptr = 42; + free(ptr); + } + // CHECK: PASS if 
(h_x == 1) fputs("PASS\n", stdout); From fc5f51cf5af4364b38bf22e491d46e1e892ade0c Mon Sep 17 00:00:00 2001 From: Derek Schuff Date: Tue, 2 Jan 2024 14:54:54 -0800 Subject: [PATCH 085/313] [WebAssembly][Object]Use file offset as function symbol address for linked files (#76198) WebAssembly doesn't have a single virtual memory space the way other object formats or architectures do, so "addresses" mean different things depending on the context. Function symbol addresses in object files are offsets from the start of the code section. This is good for linking and relocation. However when dealing with linked binaries, offsets from the start of the file/module are more often used (e.g. for stack traces in browsers), and are more useful for use cases like binary size attribution. This PR changes Object to use the file offset instead of the section offset for function symbols, but only for linked (non-DSO) files. This implements item number 4 from #76107 --- llvm/lib/Object/WasmObjectFile.cpp | 16 +++- llvm/test/tools/llvm-nm/wasm/linked.yaml | 74 ++++++++++++++++++ .../wasm/linked-symbol-table.yaml | 75 +++++++++++++++++++ 3 files changed, 161 insertions(+), 4 deletions(-) create mode 100644 llvm/test/tools/llvm-nm/wasm/linked.yaml create mode 100644 llvm/test/tools/llvm-objdump/wasm/linked-symbol-table.yaml diff --git a/llvm/lib/Object/WasmObjectFile.cpp b/llvm/lib/Object/WasmObjectFile.cpp index 40665d686cf93..ccc29d0cb73d6 100644 --- a/llvm/lib/Object/WasmObjectFile.cpp +++ b/llvm/lib/Object/WasmObjectFile.cpp @@ -1667,10 +1667,18 @@ Expected WasmObjectFile::getSymbolName(DataRefImpl Symb) const { Expected WasmObjectFile::getSymbolAddress(DataRefImpl Symb) const { auto &Sym = getWasmSymbol(Symb); if (Sym.Info.Kind == wasm::WASM_SYMBOL_TYPE_FUNCTION && - isDefinedFunctionIndex(Sym.Info.ElementIndex)) - return getDefinedFunction(Sym.Info.ElementIndex).CodeSectionOffset; - else - return getSymbolValue(Symb); + isDefinedFunctionIndex(Sym.Info.ElementIndex)) { + // For object files, use the section offset. The linker relies on this. + // For linked files, use the file offset. This behavior matches the way + // browsers print stack traces and is useful for binary size analysis. + // (see https://webassembly.github.io/spec/web-api/index.html#conventions) + uint32_t Adjustment = isRelocatableObject() || isSharedObject() + ? 
0 + : Sections[CodeSection].Offset; + return getDefinedFunction(Sym.Info.ElementIndex).CodeSectionOffset + + Adjustment; + } + return getSymbolValue(Symb); } uint64_t WasmObjectFile::getWasmSymbolValue(const WasmSymbol &Sym) const { diff --git a/llvm/test/tools/llvm-nm/wasm/linked.yaml b/llvm/test/tools/llvm-nm/wasm/linked.yaml new file mode 100644 index 0000000000000..992c1811743b7 --- /dev/null +++ b/llvm/test/tools/llvm-nm/wasm/linked.yaml @@ -0,0 +1,74 @@ +# RUN: yaml2obj %s -o %t.wasm +# RUN: llvm-nm %t.wasm | FileCheck %s + +# CHECK: 0000009f T my_func_export +# CHECK-NEXT: 0000002a D my_global_export +# CHECK-NEXT: 00000000 D my_table_export + +--- !WASM +FileHeader: + Version: 0x1 +Sections: + - Type: TYPE + Signatures: + - Index: 0 + ParamTypes: [] + ReturnTypes: [] + - Type: IMPORT + Imports: + - Module: env + Field: foo + Kind: FUNCTION + SigIndex: 0 + - Module: env + Field: bar + Kind: GLOBAL + GlobalType: I32 + GlobalMutable: true + - Module: env + Field: memory + Kind: MEMORY + Memory: + Minimum: 0x1 + - Type: FUNCTION + FunctionTypes: [ 0 ] + - Type: TABLE + Tables: + - Index: 0 + ElemType: FUNCREF + Limits: + Flags: [ HAS_MAX ] + Minimum: 0x1 + Maximum: 0x1 + - Type: GLOBAL + Globals: + - Index: 1 + Mutable: false + Type: I32 + InitExpr: + Opcode: I32_CONST + Value: 42 + - Type: EXPORT + Exports: + - Name: my_func_export + Kind: FUNCTION + Index: 1 + - Name: my_global_export + Kind: GLOBAL + Index: 1 + - Name: my_table_export + Kind: TABLE + Index: 0 + - Type: CODE + Functions: + - Index: 1 + Locals: + Body: 00 + - Type: DATA + Segments: + - SectionOffset: 0 + InitFlags: 0 + Offset: + Opcode: I32_CONST + Value: 0 + Content: '' diff --git a/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table.yaml b/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table.yaml new file mode 100644 index 0000000000000..6dd949a441496 --- /dev/null +++ b/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table.yaml @@ -0,0 +1,75 @@ +# RUN: yaml2obj %s -o %t.wasm +# RUN: llvm-objdump -t %t.wasm | FileCheck %s +# +# CHECK: SYMBOL TABLE: +# CHECK-NEXT: 0000009f g F CODE my_func_export +# CHECK-NEXT: 0000002a g O DATA my_global_export +# CHECK-NEXT: 00000000 g TABLE my_table_export + +--- !WASM +FileHeader: + Version: 0x1 +Sections: + - Type: TYPE + Signatures: + - Index: 0 + ParamTypes: [] + ReturnTypes: [] + - Type: IMPORT + Imports: + - Module: env + Field: foo + Kind: FUNCTION + SigIndex: 0 + - Module: env + Field: bar + Kind: GLOBAL + GlobalType: I32 + GlobalMutable: true + - Module: env + Field: memory + Kind: MEMORY + Memory: + Minimum: 0x1 + - Type: FUNCTION + FunctionTypes: [ 0 ] + - Type: TABLE + Tables: + - Index: 0 + ElemType: FUNCREF + Limits: + Flags: [ HAS_MAX ] + Minimum: 0x1 + Maximum: 0x1 + - Type: GLOBAL + Globals: + - Index: 1 + Mutable: false + Type: I32 + InitExpr: + Opcode: I32_CONST + Value: 42 + - Type: EXPORT + Exports: + - Name: my_func_export + Kind: FUNCTION + Index: 1 + - Name: my_global_export + Kind: GLOBAL + Index: 1 + - Name: my_table_export + Kind: TABLE + Index: 0 + - Type: CODE + Functions: + - Index: 1 + Locals: + Body: 00 + - Type: DATA + Segments: + - SectionOffset: 0 + InitFlags: 0 + Offset: + Opcode: I32_CONST + Value: 0 + Content: '' From b64992a367efbbea0b7ae3dd7711b2be26128f99 Mon Sep 17 00:00:00 2001 From: Cyndy Ishida Date: Tue, 2 Jan 2024 15:22:26 -0800 Subject: [PATCH 086/313] [readtapi] Cleanup usages of ExitOnError, NFCI (#76745) --- llvm/tools/llvm-readtapi/llvm-readtapi.cpp | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) 
diff --git a/llvm/tools/llvm-readtapi/llvm-readtapi.cpp b/llvm/tools/llvm-readtapi/llvm-readtapi.cpp index 193a281d6341a..4bd47223ea91f 100644 --- a/llvm/tools/llvm-readtapi/llvm-readtapi.cpp +++ b/llvm/tools/llvm-readtapi/llvm-readtapi.cpp @@ -108,20 +108,12 @@ getInterfaceFile(const StringRef Filename, bool ResetBanner = true) { LLVM_FALLTHROUGH; case file_magic::macho_dynamically_linked_shared_lib_stub: LLVM_FALLTHROUGH; - case file_magic::macho_universal_binary: { - auto IFOrErr = DylibReader::get(Buffer->getMemBufferRef()); - if (!IFOrErr) - ExitOnErr(IFOrErr.takeError()); - IF = std::move(*IFOrErr); + case file_magic::macho_universal_binary: + IF = ExitOnErr(DylibReader::get(Buffer->getMemBufferRef())); break; - } - case file_magic::tapi_file: { - auto IFOrErr = TextAPIReader::get(Buffer->getMemBufferRef()); - if (!IFOrErr) - ExitOnErr(IFOrErr.takeError()); - IF = std::move(*IFOrErr); + case file_magic::tapi_file: + IF = ExitOnErr(TextAPIReader::get(Buffer->getMemBufferRef())); break; - } default: reportError(Filename + ": unsupported file type"); } @@ -170,10 +162,7 @@ static bool handleMergeAction(const Context &Ctx) { Out = std::move(IF); continue; } - auto ResultIF = Out->merge(IF.get()); - if (!ResultIF) - ExitOnErr(ResultIF.takeError()); - Out = std::move(ResultIF.get()); + Out = ExitOnErr(Out->merge(IF.get())); } return handleWriteAction(Ctx, std::move(Out)); } From e32b1d15f7a23ccd271764bb31c84d91c9dcddbb Mon Sep 17 00:00:00 2001 From: David CARLIER Date: Tue, 2 Jan 2024 23:37:05 +0000 Subject: [PATCH 087/313] [compiler-rt] enable pwritev interception on freebsd/netbsd. (#76759) --- .../lib/sanitizer_common/sanitizer_platform_interceptors.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h index 289ae661c343f..0ce4e9351bc1d 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h @@ -191,7 +191,8 @@ #define SANITIZER_INTERCEPT_PREADV \ (SI_FREEBSD || SI_NETBSD || SI_LINUX_NOT_ANDROID) -#define SANITIZER_INTERCEPT_PWRITEV SI_LINUX_NOT_ANDROID +#define SANITIZER_INTERCEPT_PWRITEV \ + (SI_FREEBSD || SI_NETBSD || SI_LINUX_NOT_ANDROID) #define SANITIZER_INTERCEPT_PREADV64 SI_GLIBC #define SANITIZER_INTERCEPT_PWRITEV64 SI_GLIBC From 7122f55c639a00e719b6088249f4fca1810cf04c Mon Sep 17 00:00:00 2001 From: Malavika Samak Date: Tue, 2 Jan 2024 15:41:00 -0800 Subject: [PATCH 088/313] [-Wunsafe-buffer-usage] Warning for unsafe invocation of span::data (#75650) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …-Wunsafe-buffer-usage, there maybe accidental re-introduction of new OutOfBound accesses into the code bases. One such case is invoking span::data() method on a span variable to retrieve a pointer, which is then cast to a larger type and dereferenced. Such dereferences can introduce OutOfBound accesses. To address this, a new WarningGadget is being introduced to warn against such invocations. 
--------- Co-authored-by: MalavikaSamak --- .../Analysis/Analyses/UnsafeBufferUsage.h | 2 +- .../Analyses/UnsafeBufferUsageGadgets.def | 1 + .../clang/Basic/DiagnosticSemaKinds.td | 2 +- clang/lib/Analysis/UnsafeBufferUsage.cpp | 38 ++++- clang/lib/Sema/AnalysisBasedWarnings.cpp | 16 ++- ...e-buffer-usage-warning-data-invocation.cpp | 131 ++++++++++++++++++ 6 files changed, 182 insertions(+), 8 deletions(-) create mode 100644 clang/test/SemaCXX/warn-unsafe-buffer-usage-warning-data-invocation.cpp diff --git a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h index 8a2d56668e32f..b28f2c6b99c50 100644 --- a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h +++ b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h @@ -66,7 +66,7 @@ class UnsafeBufferUsageHandler { /// Invoked when an unsafe operation over raw pointers is found. virtual void handleUnsafeOperation(const Stmt *Operation, - bool IsRelatedToDecl) = 0; + bool IsRelatedToDecl, ASTContext &Ctx) = 0; /// Invoked when a fix is suggested against a variable. This function groups /// all variables that must be fixed together (i.e their types must be changed diff --git a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def index 757ee452ced74..c976616883651 100644 --- a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def +++ b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def @@ -30,6 +30,7 @@ WARNING_GADGET(Decrement) WARNING_GADGET(ArraySubscript) WARNING_GADGET(PointerArithmetic) WARNING_GADGET(UnsafeBufferUsageAttr) +WARNING_GADGET(DataInvocation) FIXABLE_GADGET(ULCArraySubscript) // `DRE[any]` in an Unspecified Lvalue Context FIXABLE_GADGET(DerefSimplePtrArithFixable) FIXABLE_GADGET(PointerDereference) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index aebb7d9b945c3..e54f969c19039 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12064,7 +12064,7 @@ def warn_unsafe_buffer_variable : Warning< InGroup, DefaultIgnore; def warn_unsafe_buffer_operation : Warning< "%select{unsafe pointer operation|unsafe pointer arithmetic|" - "unsafe buffer access|function introduces unsafe buffer manipulation}0">, + "unsafe buffer access|function introduces unsafe buffer manipulation|unsafe invocation of span::data}0">, InGroup, DefaultIgnore; def note_unsafe_buffer_operation : Note< "used%select{| in pointer arithmetic| in buffer access}0 here">; diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp index 70eec1cee57f8..724c4304a0724 100644 --- a/clang/lib/Analysis/UnsafeBufferUsage.cpp +++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp @@ -721,6 +721,34 @@ class UnsafeBufferUsageAttrGadget : public WarningGadget { DeclUseList getClaimedVarUseSites() const override { return {}; } }; +// Warning gadget for unsafe invocation of span::data method. +// Triggers when the pointer returned by the invocation is immediately +// cast to a larger type. 
+ +class DataInvocationGadget : public WarningGadget { + constexpr static const char *const OpTag = "data_invocation_expr"; + const ExplicitCastExpr *Op; + +public: + DataInvocationGadget(const MatchFinder::MatchResult &Result) + : WarningGadget(Kind::DataInvocation), + Op(Result.Nodes.getNodeAs(OpTag)) {} + + static bool classof(const Gadget *G) { + return G->getKind() == Kind::DataInvocation; + } + + static Matcher matcher() { + return stmt( + explicitCastExpr(has(cxxMemberCallExpr(callee(cxxMethodDecl( + hasName("data"), ofClass(hasName("std::span"))))))) + .bind(OpTag)); + } + const Stmt *getBaseStmt() const override { return Op; } + + DeclUseList getClaimedVarUseSites() const override { return {}; } +}; + // Represents expressions of the form `DRE[*]` in the Unspecified Lvalue // Context (see `isInUnspecifiedLvalueContext`). // Note here `[]` is the built-in subscript operator. @@ -2657,8 +2685,8 @@ void clang::checkUnsafeBufferUsage(const Decl *D, // every problematic operation and consider it done. No need to deal // with fixable gadgets, no need to group operations by variable. for (const auto &G : WarningGadgets) { - Handler.handleUnsafeOperation(G->getBaseStmt(), - /*IsRelatedToDecl=*/false); + Handler.handleUnsafeOperation(G->getBaseStmt(), /*IsRelatedToDecl=*/false, + D->getASTContext()); } // This return guarantees that most of the machine doesn't run when @@ -2893,7 +2921,8 @@ void clang::checkUnsafeBufferUsage(const Decl *D, Tracker, Handler, VarGrpMgr); for (const auto &G : UnsafeOps.noVar) { - Handler.handleUnsafeOperation(G->getBaseStmt(), /*IsRelatedToDecl=*/false); + Handler.handleUnsafeOperation(G->getBaseStmt(), /*IsRelatedToDecl=*/false, + D->getASTContext()); } for (const auto &[VD, WarningGadgets] : UnsafeOps.byVar) { @@ -2904,7 +2933,8 @@ void clang::checkUnsafeBufferUsage(const Decl *D, : FixItList{}, D); for (const auto &G : WarningGadgets) { - Handler.handleUnsafeOperation(G->getBaseStmt(), /*IsRelatedToDecl=*/true); + Handler.handleUnsafeOperation(G->getBaseStmt(), /*IsRelatedToDecl=*/true, + D->getASTContext()); } } } diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index 0947e8b0f526a..9eb1df5f02405 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -2226,8 +2226,8 @@ class UnsafeBufferUsageReporter : public UnsafeBufferUsageHandler { UnsafeBufferUsageReporter(Sema &S, bool SuggestSuggestions) : S(S), SuggestSuggestions(SuggestSuggestions) {} - void handleUnsafeOperation(const Stmt *Operation, - bool IsRelatedToDecl) override { + void handleUnsafeOperation(const Stmt *Operation, bool IsRelatedToDecl, + ASTContext &Ctx) override { SourceLocation Loc; SourceRange Range; unsigned MsgParam = 0; @@ -2261,6 +2261,18 @@ class UnsafeBufferUsageReporter : public UnsafeBufferUsageHandler { // note_unsafe_buffer_operation doesn't have this mode yet. 
assert(!IsRelatedToDecl && "Not implemented yet!"); MsgParam = 3; + } else if (const auto *ECE = dyn_cast(Operation)) { + QualType destType = ECE->getType(); + const uint64_t dSize = + Ctx.getTypeSize(destType.getTypePtr()->getPointeeType()); + if (const auto *CE = dyn_cast(ECE->getSubExpr())) { + QualType srcType = CE->getType(); + const uint64_t sSize = + Ctx.getTypeSize(srcType.getTypePtr()->getPointeeType()); + if (sSize >= dSize) + return; + } + MsgParam = 4; } Loc = Operation->getBeginLoc(); Range = Operation->getSourceRange(); diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-warning-data-invocation.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-warning-data-invocation.cpp new file mode 100644 index 0000000000000..79eb3bb4bacc6 --- /dev/null +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-warning-data-invocation.cpp @@ -0,0 +1,131 @@ +// RUN: %clang_cc1 -std=c++20 -Wno-all -Wunsafe-buffer-usage \ +// RUN: -fsafe-buffer-usage-suggestions \ +// RUN: -fblocks -include %s -verify %s + +// RUN: %clang -x c++ -frtti -fsyntax-only -fblocks -include %s %s 2>&1 | FileCheck --allow-empty %s +// RUN: %clang_cc1 -std=c++11 -fblocks -include %s %s 2>&1 | FileCheck --allow-empty %s +// RUN: %clang_cc1 -std=c++20 -fblocks -include %s %s 2>&1 | FileCheck --allow-empty %s +// CHECK-NOT: [-Wunsafe-buffer-usage] + +#ifndef INCLUDED +#define INCLUDED +#pragma clang system_header + +// no spanification warnings for system headers +#else + +namespace std { + class type_info; + class bad_cast; + class bad_typeid; +} +using size_t = __typeof(sizeof(int)); +void *malloc(size_t); + +void foo(int v) { +} + +void foo(int *p){} + +namespace std{ + template class span { + + T *elements; + + span(T *, unsigned){} + + public: + + constexpr span subspan(size_t offset, size_t count) const { + return span (elements+offset, count); // expected-warning{{unsafe pointer arithmetic}} + } + + constexpr T* data() const noexcept { + return elements; + } + + + constexpr T* hello() const noexcept { + return elements; + } +}; + + template class span_duplicate { + span_duplicate(T *, unsigned){} + + T array[10]; + + public: + + T* data() { + return array; + } + +}; +} + +using namespace std; + +class A { + int a, b, c; +}; + +class B { + int a, b, c; +}; + +struct Base { + virtual ~Base() = default; +}; + +struct Derived: Base { + int d; +}; + +void cast_without_data(int *ptr) { + A *a = (A*) ptr; + float *p = (float*) ptr; +} + +void warned_patterns(std::span span_ptr, std::span base_span, span span_without_qual) { + A *a1 = (A*)span_ptr.data(); // expected-warning{{unsafe invocation of span::data}} + a1 = (A*)span_ptr.data(); // expected-warning{{unsafe invocation of span::data}} + + A *a2 = (A*) span_without_qual.data(); // expected-warning{{unsafe invocation of span::data}} + + // TODO:: Should we warn when we cast from base to derived type? + Derived *b = dynamic_cast (base_span.data());// expected-warning{{unsafe invocation of span::data}} + + // TODO:: This pattern is safe. We can add special handling for it, if we decide this + // is the recommended fixit for the unsafe invocations. + A *a3 = (A*)span_ptr.subspan(0, sizeof(A)).data(); // expected-warning{{unsafe invocation of span::data}} +} + +void not_warned_patterns(std::span span_ptr, std::span base_span) { + int *p = (int*) span_ptr.data(); // Cast to a smaller type + + B *b = (B*) span_ptr.data(); // Cast to a type of same size. + + p = (int*) span_ptr.data(); + A *a = (A*) span_ptr.hello(); // Invoking other methods. 
+} + +// We do not want to warn about other types +void other_classes(std::span_duplicate span_ptr) { + int *p; + A *a = (A*)span_ptr.data(); + a = (A*)span_ptr.data(); +} + +// Potential source for false negatives + +A false_negatives(std::span span_pt, span span_A) { + int *ptr = span_pt.data(); + + A *a1 = (A*)ptr; //TODO: We want to warn here eventually. + + A *a2= span_A.data(); + return *a2; // TODO: Can cause OOB if span_pt is empty + +} +#endif From 1a8fb887197caf709710bedf88ce95ffb0605c56 Mon Sep 17 00:00:00 2001 From: Boian Petkantchin Date: Tue, 2 Jan 2024 15:50:07 -0800 Subject: [PATCH 089/313] [mlir][mesh] Add resharding spmdization on a 1D device mesh (#76179) The current implementation supports only sharding of tensor axes that have size divisible by the mesh axis size. --- mlir/include/mlir/Dialect/Mesh/IR/MeshBase.td | 11 +- mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h | 3 + mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td | 48 +- .../Transforms/ReshardingSpmdizationDoc.md | 683 ++++++++++++++++++ .../Dialect/Mesh/Transforms/Spmdization.h | 35 + mlir/lib/Dialect/Mesh/IR/MeshOps.cpp | 213 ++++-- .../Dialect/Mesh/Transforms/CMakeLists.txt | 3 + .../Dialect/Mesh/Transforms/Spmdization.cpp | 639 ++++++++++++++++ mlir/test/Dialect/Mesh/invalid.mlir | 96 +++ mlir/test/Dialect/Mesh/ops.mlir | 49 ++ .../Dialect/Mesh/resharding-spmdization.mlir | 154 ++++ mlir/test/lib/Dialect/Mesh/CMakeLists.txt | 1 + .../Mesh/TestReshardingSpmdization.cpp | 122 ++++ mlir/tools/mlir-opt/mlir-opt.cpp | 2 + 14 files changed, 2004 insertions(+), 55 deletions(-) create mode 100644 mlir/include/mlir/Dialect/Mesh/Transforms/ReshardingSpmdizationDoc.md create mode 100644 mlir/include/mlir/Dialect/Mesh/Transforms/Spmdization.h create mode 100644 mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp create mode 100644 mlir/test/Dialect/Mesh/resharding-spmdization.mlir create mode 100644 mlir/test/lib/Dialect/Mesh/TestReshardingSpmdization.cpp diff --git a/mlir/include/mlir/Dialect/Mesh/IR/MeshBase.td b/mlir/include/mlir/Dialect/Mesh/IR/MeshBase.td index 9d39b1b3329fb..a9d30dfbb9a76 100644 --- a/mlir/include/mlir/Dialect/Mesh/IR/MeshBase.td +++ b/mlir/include/mlir/Dialect/Mesh/IR/MeshBase.td @@ -33,6 +33,10 @@ def Mesh_Dialect : Dialect { let useDefaultAttributePrinterParser = 1; let hasConstantMaterializer = 1; } + +def Mesh_MeshAxis : I<16>; +def Mesh_MeshAxesAttr : DenseArrayAttrBase<"DenseI16ArrayAttr", "int16_t", "i16">; + //===----------------------------------------------------------------------===// // Mesh Enums. //===----------------------------------------------------------------------===// @@ -125,7 +129,7 @@ def MeshSharding : AttrDef { // The tensor is sharded on the first dimension along axis 0 of @mesh0 and // it is also a partial_sum along mesh axis 1. - tensor<4x8xf32, #mesh.shard<@mesh0, [[0], [], [1]]> + tensor<4x8xf32, #mesh.shard<@mesh0, [[0], []], partial = sum[1]> // The tensor is sharded on the first dimension along axis 0 of @mesh0 and // it is also a partial_max along mesh axis 1. 
@@ -158,6 +162,11 @@ def MeshSharding : AttrDef { }]> ]; + let extraClassDeclaration = [{ + bool operator==(::mlir::Attribute rhs) const; + bool operator==(::mlir::mesh::MeshShardingAttr rhs) const; + }]; + let genVerifyDecl = 1; } diff --git a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h index 9077d2eb0189b..ce7d5d045122d 100644 --- a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h +++ b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h @@ -30,6 +30,9 @@ namespace mlir { namespace mesh { +using MeshAxis = int16_t; +using MeshAxesAttr = DenseI16ArrayAttr; + bool isReductionLoop(IteratorType iType); bool areReductionAndPartialMatch(IteratorType iType, Partial partial); diff --git a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td index 784f3eb97763a..1ed54b6519e4d 100644 --- a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td +++ b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td @@ -10,6 +10,7 @@ #define MLIR_DIALECT_MESH_IR_MESHOPS_TD include "mlir/Dialect/Mesh/IR/MeshBase.td" +include "mlir/Dialect/Shape/IR/ShapeBase.td" include "mlir/Interfaces/InferTypeOpInterface.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/IR/BuiltinTypes.td" @@ -95,6 +96,28 @@ def Mesh_ClusterOp : Mesh_Op<"cluster", [Symbol]> { let hasVerifier = 1; } +def Mesh_ClusterShapeOp : Mesh_Op<"cluster_shape", [Pure, DeclareOpInterfaceMethods]> { + let summary = "Get the shape of the cluster."; + let arguments = (ins + FlatSymbolRefAttr:$mesh, + DefaultValuedAttr:$axes + ); + + let results = (outs + Variadic:$result + ); + + let assemblyFormat = [{ + $mesh (`axes` `=` $axes^)? + attr-dict `:` type($result) + }]; + + let builders = [ + OpBuilder<(ins "::mlir::mesh::ClusterOp":$mesh)>, + OpBuilder<(ins "StringRef":$mesh, "ArrayRef":$axes)> + ]; +} + def Mesh_ShardOp : Mesh_Op<"shard", [Pure, SameOperandsAndResultType]> { let summary = "Annotate on how a tensor is sharded across a mesh cluster."; let description = [{ @@ -186,6 +209,29 @@ def Mesh_ShardOp : Mesh_Op<"shard", [Pure, SameOperandsAndResultType]> { }]; } +def Mesh_ProcessIndexOp : Mesh_Op<"process_index", [Pure, DeclareOpInterfaceMethods]> { + let summary = "Get the index of current device along specified mesh axis."; + let description = [{ + It is used in the SPMD format of IR. + The `axes` mush be non-negative and less than the total number of mesh axes. + }]; + let arguments = (ins + FlatSymbolRefAttr:$mesh, + DefaultValuedAttr:$axes + ); + let results = (outs + Variadic:$result + ); + let assemblyFormat = [{ + `on` $mesh (`axes` `=` $axes^)? 
+ attr-dict `:` type($result) + }]; + let builders = [ + OpBuilder<(ins "::mlir::mesh::ClusterOp":$mesh)>, + OpBuilder<(ins "StringRef":$mesh, "ArrayRef":$axes)> + ]; +} + //===----------------------------------------------------------------------===// // collective communication ops //===----------------------------------------------------------------------===// @@ -197,7 +243,7 @@ class Mesh_CollectiveCommunicationOpBase< [DeclareOpInterfaceMethods])> { dag commonArgs = (ins FlatSymbolRefAttr:$mesh, - DefaultValuedAttr:$mesh_axes + DefaultValuedAttr:$mesh_axes ); } diff --git a/mlir/include/mlir/Dialect/Mesh/Transforms/ReshardingSpmdizationDoc.md b/mlir/include/mlir/Dialect/Mesh/Transforms/ReshardingSpmdizationDoc.md new file mode 100644 index 0000000000000..6368931cf6e07 --- /dev/null +++ b/mlir/include/mlir/Dialect/Mesh/Transforms/ReshardingSpmdizationDoc.md @@ -0,0 +1,683 @@ +# Resharding Spmdization Examples + +Reshard `2x3` tensor from sharding `[[0, 1]]` to sharding `[[0, 1]]` on a `2x3` mesh. + +unsharded `2x3` tensor +``` +11 12 13 +21 22 23 +``` + +sharded on a `2x3` mesh + +sharding = `[[0, 1]]` + +mesh contents: + +``` +mesh axis 1 +-----------> ++----+----+----+ mesh axis 0 | +| 11 | 12 | 13 | | ++----+----+----+ | +| 21 | 22 | 23 | | ++----+----+----+ ↓ +``` + +Transform into +sharding = `[[1, 0]]` +``` +mesh axis 1 +-----------> ++----+----+----+ mesh axis 0 | +| 11 | 13 | 22 | | ++----+----+----+ | +| 12 | 21 | 23 | | ++----+----+----+ ↓ +``` +Algorithm: +Swap contents on devices that have the same linear index in the 2 shardings. + +-------------------------------------------------------------- + +Reshard `2x3` tensor from sharding `[[0, 1]]` to sharding `[[1]]` on a `2x3` mesh. + +unsharded `2x3` tensor +``` +11 12 13 +21 22 23 +``` + +sharded on a `2x3` mesh + +sharding = `[[0, 1]]` + +mesh contents: +``` +mesh axis 1 +-----------> ++----+----+----+ mesh axis 0 | +| 11 | 12 | 13 | | ++----+----+----+ | +| 21 | 22 | 23 | | ++----+----+----+ ↓ +``` + +Transform into +sharding = `[[1]]` +``` +mesh axis 1 +-----------> ++----+----+----+ mesh axis 0 | +| 11 | 12 | 13 | | +| 21 | 22 | 23 | | ++----+----+----+ | +| 11 | 12 | 13 | | +| 21 | 22 | 23 | | ++----+----+----+ ↓ +``` +Algorithm: +All-gather along mesh axis 0. + +-------------------------------------------------------------- + +Reshard `4x6` tensor from sharding `[[], [0, 1]]` to sharding `[[], [0]]` on a `2x3` mesh. + +unsharded `4x6` tensor +``` +11 12 13 14 15 16 +21 22 23 24 25 26 +``` + +sharded on a `2x3` mesh + +sharding = `[[], [0, 1]]` + +mesh contents: +``` +mesh axis 1 +-----------> ++----+----+----+ mesh axis 0 | +| 11 | 12 | 13 | | +| 21 | 22 | 23 | | ++----+----+----+ | +| 14 | 15 | 16 | | +| 24 | 25 | 26 | | ++----+----+----+ ↓ +``` +Transform into +sharding = `[[], [0]]` +``` +mesh axis 1 +-----------> ++----------+----------+ mesh axis 0 | +| 11 12 13 | 11 12 13 | | +| 21 22 23 | 21 22 23 | | ++----------+----------+ | +| 14 15 16 | 14 15 16 | | +| 24 25 26 | 24 25 26 | | ++----------+----------+ ↓ +``` +Algorithm: +All-gather along mesh axis 1. + +-------------------------------------------------------------- + +Reshard `4x8` tensor from sharding `[[0], [1, 2]]` to sharding `[[0], [2]]` on a `2x2x2` mesh. 
+ +unsharded `4x8` tensor +``` +11 12 13 14 15 16 17 18 +21 22 23 24 25 26 27 28 +31 32 33 34 35 36 37 38 +41 42 43 44 45 46 47 48 +``` +sharded on a `2x2x2` mesh + +sharding = `[[0], [1, 2]]` + +mesh contents: +``` +mesh axis 2 +-----------> ++-------+-------+ mesh axis 1 | mesh axis 0 | +| 11 12 | 13 14 | | | +| 21 22 | 23 24 | | | ++-------+-------+ | | +| 15 16 | 17 18 | | | +| 25 26 | 27 28 | | | ++-------+-------+ ↓ | ++-------+-------+ | +| 31 32 | 33 34 | | +| 41 42 | 43 44 | | ++-------+-------+ | +| 35 36 | 37 38 | | +| 45 46 | 47 48 | | ++-------+-------+ ↓ +``` +Transform into +sharding = `[[0], [2]]` +``` +mesh axis 2 +-----------> ++-------------+-------------+ mesh axis 1 | mesh axis 0 | +| 11 12 13 14 | 15 16 17 18 | | | +| 21 22 23 24 | 25 26 27 28 | | | ++-------------+-------------+ | | +| 11 12 13 14 | 15 16 17 18 | | | +| 21 22 23 24 | 25 26 27 28 | | | ++-------------+-------------+ ↓ | ++-------------+-------------+ | +| 31 32 33 34 | 35 36 37 38 | | +| 41 42 43 44 | 45 46 47 48 | | ++-------------+-------------+ | +| 31 32 33 34 | 35 36 37 38 | | +| 41 42 43 44 | 45 46 47 48 | | ++-------------+-------------+ ↓ +``` +Algorithm: + +Can't be done with just an all-gather along mesh axis 1. +Can be handled by multiple resharding transformations +`[[0], [1, 2]] -> [[0], [2, 1]] -> [[0], [2]]` + +-------------------------------------------------------------- + +Reshard `6x6` tensor from sharding `[[0], [1]]` to sharding `[[1], [0]]` on a `2x3` mesh. + +unsharded `6x6` tensor +``` +11 12 13 14 15 16 +21 22 23 24 25 26 +31 32 33 34 35 36 +41 42 43 44 45 46 +51 52 53 54 55 56 +61 62 63 64 65 66 +``` +sharded on a `2x3` mesh + +sharding = `[[0], [1]]` +``` +mesh axis 1 +-----------> ++-------+-------+-------+ mesh axis 0 | +| 11 12 | 13 14 | 15 16 | | +| 21 22 | 23 24 | 25 26 | | +| 31 32 | 33 34 | 35 36 | | ++-------+-------+-------+ | +| 41 42 | 43 44 | 45 46 | | +| 51 52 | 53 54 | 55 56 | | +| 61 62 | 63 64 | 65 66 | | ++-------+-------+-------+ ↓ +``` +transform to +sharding = `[[1], [0]]` +``` +mesh axis 1 +-----------> ++----------+----------+----------+ mesh axis 0 | +| 11 12 13 | 31 32 33 | 51 52 53 | | +| 21 22 23 | 41 42 43 | 61 62 63 | | ++----------+----------+----------+ | +| 14 15 16 | 34 35 36 | 54 55 56 | | +| 24 25 26 | 44 45 46 | 64 65 66 | | ++----------+----------+----------+ ↓ + +mesh axis 0 +-----------> ++----------+----------+ mesh axis 1 | +| 11 12 13 | 14 15 16 | | +| 21 22 23 | 24 25 26 | | ++----------+----------+ | +| 31 32 33 | 34 35 36 | | +| 41 42 43 | 44 45 46 | | ++----------+----------+ | +| 51 52 53 | 54 55 56 | | +| 61 62 63 | 64 65 66 | | ++----------+----------+ ↓ +``` +Algorithm: TODO + +-------------------------------------------------------------- + +Reshard `6x6` tensor from sharding `[[0], [1]]` to sharding `[[1], [0]]` on a `2x6` mesh. 
+ +unsharded 6x6 tensor +``` +11 12 13 14 15 16 +21 22 23 24 25 26 +31 32 33 34 35 36 +41 42 43 44 45 46 +51 52 53 54 55 56 +61 62 63 64 65 66 +``` +shard on `2x6` mesh + +sharding = `[[0], [1]]` +``` +mesh axis 1 +-----------> ++----+----+----+----+----+----+ mesh axis 0 | +| 11 | 12 | 13 ‖ 14 | 15 | 16 | | +| 21 | 22 | 23 ‖ 24 | 23 | 26 | | +| 31 | 32 | 33 ‖ 34 | 35 | 36 | | ++----+----+----+----+----+----+ | +| 41 | 42 | 43 ‖ 44 | 45 | 46 | | +| 51 | 52 | 53 ‖ 54 | 55 | 56 | | +| 61 | 62 | 63 ‖ 64 | 65 | 66 | | ++----+----+----+----+----+----+ ↓ +``` +transform to +sharding = `[[1], [0]]` +``` +mesh axis 0 +-----------> ++----------+----------+ mesh axis 1 | +| 11 12 13 | 14 15 16 | | ++----------+----------+ | +| 21 22 23 | 24 25 26 | | ++----------+----------+ | +| 31 32 33 | 34 35 36 | | ++==========+==========+ | +| 41 42 43 | 44 45 46 | | ++----------+----------+ | +| 51 52 53 | 54 55 56 | | ++----------+----------+ | +| 61 62 63 | 64 65 66 | | ++----------+----------+ ↓ +``` +Algorithm: TODO + +-------------------------------------------------------------- + +Reshard KxL tensor from `[[0], [1]]` to `[[1], [0]]` on `MxN` mesh. + +`M x N` mesh. +`K x L` tensor `t`. +`d(m, n)` the tensor on device `(m, n)`. + +sharding = `[[0], [1]]` +Tensor shard s on each device has size `(K ceildiv M, L ceildiv N)`. +``` +d(m, n)[k, l] -> t[m * (K ceildiv M) + k, n * (L ceildiv N) + l] +``` +substitute +``` +i <- m * (K ceildiv M) + k +j <- n * (L ceildiv N) + l +``` +``` +m -> i floordiv (K ceildiv M) +n -> j floordiv (L ceildiv N) +k -> i % (K ceildiv M) +l -> j % (L ceildiv N) +``` +For the inverse map we get +``` +t[i, j] -> d( + i floordiv (K ceildiv M), j floordiv (L ceildiv N) +)[ + i % (K ceildiv M), j % (L ceildiv N) +] +``` +Check: +``` +i = 13, j = 17, M = 3, N = 4, K = 16, L = 23 +t[13, 17] = d( + 13 floordiv (16 ceildiv 3), + 17 floordiv (23 ceilvid 4) +)[ + 13 % (16 ceildiv 3), + 17 % (23 ceilvid 4) +] += d( + 13 floordiv 6, + 17 floordiv 6 +)[ + 13 % 6, + 17 % 6 +] += d(2, 2)[1, 5] += t[ + 2 * (16 ceildiv 3) + 1, + 2 * (23 ceildiv 4) + 5 +] += t[ + 2 * 6 + 1, + 2 * 6 + 5 +] += t[13, 17] +``` + +sharding = `[[1], [0]]` +Tensor shard s on each device has size `(K ceildiv N, L ceildiv M)`. 
+``` +d(m, n)[k, l] -> t[n * (K ceildiv N) + k, m * (L ceildiv M) + l] +``` +substitute +``` +i <- n * (K ceildiv N) + k +j <- m * (L ceildiv M) + l +``` +``` +m -> j floordiv (L ceildiv M) +n -> i floordiv (K ceildiv N) +k -> i % (K ceildiv N) +l -> j % (L ceildiv M) +``` +For the inverse map we get +``` +t[i, j] -> d( + j floordiv (L ceildiv M), i floordiv (K ceildiv N) +)[ + i % (K ceildiv N), j % (L ceildiv M) +] +``` +Check: +``` +i = 9, j = 19, M = 5, N = 2, K = 27, L = 14 +t[9, 19] = d( + 19 floordiv (14 ceildiv 5), + 9 floordiv (27 ceildiv 2) +)[ + 9 % (27 ceildiv 2), + 19 % (14 ceildiv 5) +] += d( + 19 floordiv 3, + 9 floordiv 14 +)[ + 9 % 14 + 19 % 3 +] += d(6, 0)[9, 1] += t[ + 0 * (27 ceildiv 2) + 9, + 6 * (14 ceildiv 5) + 1 +] += t[ + 0 * 14 + 9, + 6 * 3 + 1 +] += t[9, 19] +``` +sharding = `[[0], [1]]` +``` +d(m, n)[k, l] -> t[m * (K ceildiv M) + k, n * (L ceildiv N) + l] +t[i, j] -> d(i floordiv (K ceildiv M), j floordiv (L ceildiv N))[i % (K ceildiv M), j % (L ceildiv N)] +``` +sharding = `[[1], [0]]` +``` +d(m, n)[k, l] -> t[n * (K ceildiv N) + k, m * (L ceildiv M) + l] +t[i, j] -> d(j floordiv (L ceildiv M), i floordiv (K ceildiv N))[i % (K ceildiv N), j % (L ceildiv M)] +``` +sharding `[[0], [1]] -> [[1], [0]]` +`d1(m, n)` the tensor on device `(m, n)` for sharding sharding `[[0], [1]]`. +`d2(m, n)` the tensor on device `(m, n)` for sharding sharding `[[1], [0]]`. +``` +d1(m, n)[k, l] -> +t[m * (K ceildiv M) + k, n * (L ceildiv N) + l] -> +d2( + (m * (L ceildiv M) + l) floordiv (L ceildiv M), + (n * (K ceildiv N) + k) floordiv (K ceildiv N) +)[ + (n * (K ceildiv N) + k) % (K ceildiv N), + (m * (L ceildiv M) + l) % (L ceildiv M) +] += d2(p, q)[u, v] +``` +We want to copy the the data between devices in slices/tiles. +What are the source/target tile coordinates? +For a fixed `(m, n, p, q)` what is the range of `(k, l, u, v)`? +TODO + +-------------------------------------------------------------- + +Reshard `KxL` tensor from sharding `[[0], [1]]` to sharding `[[1], [0]]` on a `2x3` mesh. 
+ +Device placement on a `2x3` mesh +``` +11 12 13 <- devices +21 22 23 +``` +sharding `[[0], [1]]` +``` +tensor axis 1 +-----------> ++----+----+----+ tensor axis 0 | +| 11 | 12 | 13 | | ++----+----+----+ | +| 21 | 22 | 23 | | ++----+----+----+ ↓ +``` +transform to +sharding `[[1], [0]]` +``` +tensor axis 1 +-----------> ++----+----+ tensor axis 0 | +| 11 | 21 | | ++----+----+ | +| 12 | 22 | | ++----+----+ | +| 13 | 23 | | ++----+----+ ↓ +``` +``` ++-----------------+--------+--------+-----------------+ +| | | | ++ + + + +| 11 | 12 | 13 | ++ + + + +| | | | ++-----------------+--------+--------+-----------------+ +| | | | ++ + + + +| 21 | 22 | 23 | ++ + + + +| | | | ++-----------------+--------+--------+-----------------+ + ++-----------------+--------+--------+-----------------+ +| | | ++ 11 + 21 + +| | | ++-----------------+--------+--------+-----------------+ +| | | ++ 12 + 22 + +| | | ++-----------------+--------+--------+-----------------+ +| | | ++ 13 + 23 + +| | | ++-----------------+--------+--------+-----------------+ + ++-----------------+--------+--------+-----------------+ +| | | | | ++ 11 11 + 12 11 + 12 21 + 13 21 + +| | | | | ++-----------------+--------+--------+-----------------+ +| 11 12 | 12 12 | 12 22 | 13 22 | ++-----------------+--------+--------+-----------------+ +| 21 12 | 22 12 | 22 22 | 23 22 | ++-----------------+--------+--------+-----------------+ +| | | | | ++ 21 13 + 22 13 + 22 23 + 23 23 + +| | | | | ++-----------------+--------+--------+-----------------+ +``` +If `S` and `T` are the source and target shard sizes along some tensor axis. +Then we have a period of `(S*T)/gcd(S, T)`. Then the cut pattern repeats. +TODO + +-------------------------------------------------------------- + +Reshard `6x6` tensor from sharding `[[0], []]` to sharding `[[], [0]]` on a `3` mesh. + +unsharded `6x6` tensor +``` +11 12 13 14 15 16 +21 22 23 24 25 26 +31 32 33 34 35 36 +41 42 43 44 45 46 +51 52 53 54 55 56 +61 62 63 64 65 66 +``` +sharded on a `3` mesh + +sharding = `[[0], []]` +``` ++-------------------+ mesh axis 0 | +| 11 12 13 14 15 16 | | +| 21 22 23 24 25 26 | | ++-------------------+ | +| 31 32 33 34 35 36 | | +| 41 42 43 44 45 46 | | ++-------------------+ | +| 51 52 53 54 55 56 | | +| 61 62 63 64 65 66 | | ++-------------------+ ↓ +``` +transform to +sharding = `[[], [0]]` +``` +mesh axis 0 +-----------> ++-------+-------+-------+ +| 11 12 | 13 14 | 15 16 | +| 21 22 | 23 24 | 25 26 | +| 31 32 | 33 34 | 35 36 | +| 41 42 | 43 44 | 45 46 | +| 51 52 | 53 54 | 55 56 | +| 61 62 | 63 64 | 65 66 | ++-------+-------+-------+ +``` +Algorithm: +```mlir +%1 = all_to_all %0 on @mesh mesh_axes = [0] split_axis = 1 concat_axis = 0 : tensor<2x6xi8> -> tensor<6x2xi8> +``` +-------------------------------------------------------------- + +Reshard `4x4` tensor from sharding `[[0], [1, 2]]` to sharding `[[0, 1], [2]]` on a `2x2x2` mesh. 
+ +unsharded `4x4` tensor +``` +11 12 13 14 +21 22 23 24 +31 32 33 34 +41 42 43 44 +``` +sharded on a `2x2x2` mesh + +sharding = `[[0], [1, 2]]` +``` +mesh axis 2 +-----------> ++----+----+ mesh axis 1 | mesh axis 0 | +| 11 | 12 | | | +| 21 | 22 | | | ++----+----+ | | +| 13 | 14 | | | +| 23 | 24 | | | ++----+----+ ↓ | ++----+----+ | +| 31 | 32 | | +| 41 | 42 | | ++----+----+ | +| 33 | 34 | | +| 43 | 44 | | ++----+----+ ↓ +``` +transform to +sharding = `[[0, 1], [2]]` +``` +mesh axis 2 +-----------> ++-------+-------+ mesh axis 1 | mesh axis 0 | +| 11 12 | 13 41 | | | ++-------+-------+ | | +| 21 22 | 23 24 | | | ++-------+-------+ ↓ | ++-------+-------+ | +| 31 32 | 33 34 | | ++-------+-------+ | +| 41 42 | 43 44 | | ++-------+-------+ ↓ +``` +Algorithm: +```mlir +%1 = all_to_all %0 on @mesh mesh_axes = [2] split_axis = 1 concat_axis = 0 : tensor<2x1xi8> -> tensor<1x2xi8> +``` +is not enough. + +Can be decomposed into +``` +[[0], [1, 2]] -> [[0], [2, 1]] -> [[0, 1], [2]] +``` + +## Decomposition into basis of reshardings + +We can decompose each resharding into a sequence of basis reshardings. +It is not communication efficient in terms of minimizing the data communicated +between devices. +An efficient approach would be more complicated to implement. +Each device has to receive at most as much data as the size of its target +sharding tensor. + +-------------------------------------------------------------- + +Basis: + +* From replicate to split. + ``` + [[]] -> [[1]] + ``` + Extract slices without communication. + +* From split to replicate. + ``` + [[0]] -> [[]] + [[0, 1]] -> [[1]] + ``` + All-gather along mesh axis 0. + +* Swap mesh axes order when assigned to the same tensor axis. + ``` + [[0, 1]] -> [[1, 0]] + ``` + Swap contents on devices with the same linear index. + +* Move mesh axis to different tensor dimension. + ``` + [[0], []] -> [[], [0]] + ``` + All-to-all. + +-------------------------------------------------------------- + +Example decomposition of +``` +[[0], [1]] -> [[1], [0]] +``` +into +``` +[[0], [1]] -> all-gather along mesh axis 1 -> +[[0], []] -> all-to-all along mesh axis 0 -> +[[], [0]] -> extract slice along mesh axis 1 -> +[[1], [0]] +``` + +-------------------------------------------------------------- + +Example decomposition of +``` +[[3, 2], [], [0, 1]] -> [[0], [1, 2], []] +``` +into +``` +[[3, 2], [], [0, 1]] -> all-to-all along mesh axis 1 -> +[[3, 2], [1], [0]] -> all-to-all along mesh axis 2 -> +[[3], [1, 2], [0]] -> all-gather along mesh axis 3 -> +[[], [1, 2], [0]] -> all-to-all along mesh axis 0 -> +[[0], [1, 2], []] +``` diff --git a/mlir/include/mlir/Dialect/Mesh/Transforms/Spmdization.h b/mlir/include/mlir/Dialect/Mesh/Transforms/Spmdization.h new file mode 100644 index 0000000000000..f71bb9b262a38 --- /dev/null +++ b/mlir/include/mlir/Dialect/Mesh/Transforms/Spmdization.h @@ -0,0 +1,35 @@ +//===- Simplifications.h - Mesh Simplifications -----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_MESH_TRANSFORMS_SPMDIZATION_H +#define MLIR_DIALECT_MESH_TRANSFORMS_SPMDIZATION_H + +#include "mlir/Dialect/Mesh/IR/MeshOps.h" +#include "mlir/IR/DialectRegistry.h" +#include "mlir/Support/LogicalResult.h" + +namespace mlir { +namespace mesh { + +// Return the sharded shape `shape` acording ot sharding `sharding`. +ShapedType shardShapedType(ShapedType shape, ClusterOp mesh, + MeshShardingAttr sharding); + +// Insert resharding spmdization of the value `sourceShardValue` +// from sharding `source` to sharding `target`. +// `sourceShardValue` is the already sharded value according to `source`. +TypedValue reshard(OpBuilder &builder, ClusterOp mesh, + ShardOp source, ShardOp target, + TypedValue sourceShardValue); + +void reshardingRegisterDependentDialects(DialectRegistry ®istry); + +} // namespace mesh +} // namespace mlir + +#endif // MLIR_DIALECT_MESH_TRANSFORMS_SPMDIZATION_H diff --git a/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp b/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp index d27675564c646..de4f58d54e8ca 100644 --- a/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp +++ b/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp @@ -9,8 +9,10 @@ #include "mlir/Dialect/Mesh/IR/MeshOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" +#include "mlir/IR/Attributes.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypeInterfaces.h" +#include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/Diagnostics.h" #include "mlir/IR/DialectImplementation.h" #include "mlir/IR/Location.h" @@ -59,8 +61,6 @@ static SmallVector &canonicalizeSetAsVector(SmallVector &vec) { return vec; } -using MeshAxis = int16_t; - namespace { struct DimensionSize { @@ -114,6 +114,56 @@ Operation *MeshDialect::materializeConstant(OpBuilder &builder, Attribute value, // Mesh utilities //===----------------------------------------------------------------------===// +static FailureOr getMesh(Operation *op, FlatSymbolRefAttr meshSymbol, + SymbolTableCollection &symbolTable) { + mesh::ClusterOp mesh = + symbolTable.lookupNearestSymbolFrom(op, meshSymbol); + if (!mesh) { + return op->emitError() << "Undefined required mesh symbol \"" + << meshSymbol.getValue() << "\"."; + } + + return mesh; +} + +template +bool isUnique(It begin, It end) { + if (begin == end) { + return true; + } + It next = std::next(begin); + if (next == end) { + return true; + } + for (; next != end; ++next, ++begin) { + if (*begin == *next) { + return false; + } + } + return true; +} + +static LogicalResult verifyMeshAxes(Location loc, ArrayRef axes, + ClusterOp mesh) { + SmallVector sorted = llvm::to_vector(axes); + llvm::sort(sorted); + if (!isUnique(sorted.begin(), sorted.end())) { + return emitError(loc) << "Mesh axes contains duplicate elements."; + } + + MeshAxis rank = mesh.getRank(); + for (auto axis : axes) { + if (axis >= rank || axis < 0) { + return emitError(loc) + << "0-based mesh axis index " << axis + << " is out of bounds. 
The referenced mesh \"" << mesh.getSymName() + << "\" is of rank " << rank << "."; + } + } + + return success(); +} + bool mesh::isReductionLoop(IteratorType iType) { return iType != IteratorType::Parallel && iType != IteratorType::Invalid; } @@ -173,7 +223,45 @@ SmallVector ClusterOp::canonicalDimSizes() { } //===----------------------------------------------------------------------===// -// mesh.shard op +// mesh.cluster_shape op +//===----------------------------------------------------------------------===// + +LogicalResult +ClusterShapeOp::verifySymbolUses(SymbolTableCollection &symbolTable) { + auto mesh = ::getMesh(getOperation(), getMeshAttr(), symbolTable); + if (failed(mesh)) { + return failure(); + } + if (failed(verifyMeshAxes(getLoc(), getAxes(), mesh.value()))) { + return failure(); + } + + size_t expectedResultsCount = + getAxes().empty() ? mesh->getRank() : getAxes().size(); + if (getResult().size() != expectedResultsCount) { + return emitError() << "Unexpected number of results " << getResult().size() + << ". Expected " << expectedResultsCount << "."; + } + + return success(); +} + +void ClusterShapeOp::build(OpBuilder &odsBuilder, OperationState &odsState, + ClusterOp mesh) { + build(odsBuilder, odsState, + SmallVector(mesh.getRank(), odsBuilder.getIndexType()), + mesh.getSymName(), MeshAxesAttr()); +} + +void ClusterShapeOp::build(OpBuilder &odsBuilder, OperationState &odsState, + StringRef mesh, ArrayRef axes) { + build(odsBuilder, odsState, + SmallVector(axes.size(), odsBuilder.getIndexType()), mesh, + MeshAxesAttr::get(odsBuilder.getContext(), axes)); +} + +//===----------------------------------------------------------------------===// +// mesh.shard attr //===----------------------------------------------------------------------===// LogicalResult @@ -205,6 +293,75 @@ MeshShardingAttr::verify(function_ref emitError, return success(); } +bool MeshShardingAttr::operator==(Attribute rhs) const { + MeshShardingAttr rhsAsMeshShardingAttr = rhs.dyn_cast(); + return rhsAsMeshShardingAttr && *this == rhsAsMeshShardingAttr; +} + +bool MeshShardingAttr::operator==(MeshShardingAttr rhs) const { + if (getCluster() != rhs.getCluster() || + getPartialAxes() != rhs.getPartialAxes()) { + return false; + } + + if (!getPartialAxes().empty() && getPartialType() != rhs.getPartialType()) { + return false; + } + + auto minSize = std::min(getSplitAxes().size(), rhs.getSplitAxes().size()); + if (!llvm::equal(llvm::make_range(getSplitAxes().begin(), + getSplitAxes().begin() + minSize), + llvm::make_range(rhs.getSplitAxes().begin(), + rhs.getSplitAxes().begin() + minSize))) { + return false; + } + + return llvm::all_of(llvm::make_range(getSplitAxes().begin() + minSize, + getSplitAxes().end()), + std::mem_fn(&DenseI32ArrayAttr::empty)) && + llvm::all_of(llvm::make_range(rhs.getSplitAxes().begin() + minSize, + rhs.getSplitAxes().end()), + std::mem_fn(&DenseI32ArrayAttr::empty)); +} + +//===----------------------------------------------------------------------===// +// mesh.process_index op +//===----------------------------------------------------------------------===// + +LogicalResult +ProcessIndexOp::verifySymbolUses(SymbolTableCollection &symbolTable) { + auto mesh = ::getMesh(getOperation(), getMeshAttr(), symbolTable); + if (failed(mesh)) { + return failure(); + } + if (failed(verifyMeshAxes(getLoc(), getAxes(), mesh.value()))) { + return failure(); + } + + size_t expectedResultsCount = + getAxes().empty() ? 
mesh->getRank() : getAxes().size(); + if (getResult().size() != expectedResultsCount) { + return emitError() << "Unexpected number of results " << getResult().size() + << ". Expected " << expectedResultsCount << "."; + } + + return success(); +} + +void ProcessIndexOp::build(OpBuilder &odsBuilder, OperationState &odsState, + ClusterOp mesh) { + build(odsBuilder, odsState, + SmallVector(mesh.getRank(), odsBuilder.getIndexType()), + mesh.getSymName(), MeshAxesAttr()); +} + +void ProcessIndexOp::build(OpBuilder &odsBuilder, OperationState &odsState, + StringRef mesh, ArrayRef axes) { + build(odsBuilder, odsState, + SmallVector(axes.size(), odsBuilder.getIndexType()), mesh, + MeshAxesAttr::get(odsBuilder.getContext(), axes)); +} + //===----------------------------------------------------------------------===// // collective communication ops //===----------------------------------------------------------------------===// @@ -258,56 +415,6 @@ static LogicalResult verifyInGroupDevice(Location loc, StringRef deviceName, return success(); } -static FailureOr getMesh(Operation *op, FlatSymbolRefAttr meshSymbol, - SymbolTableCollection &symbolTable) { - mesh::ClusterOp mesh = - symbolTable.lookupNearestSymbolFrom(op, meshSymbol); - if (!mesh) { - return op->emitError() << "Undefined required mesh symbol \"" - << meshSymbol.getValue() << "\"."; - } - - return mesh; -} - -template -bool isUnique(It begin, It end) { - if (begin == end) { - return true; - } - It next = std::next(begin); - if (next == end) { - return true; - } - for (; next != end; ++next, ++begin) { - if (*begin == *next) { - return false; - } - } - return true; -} - -static LogicalResult verifyMeshAxes(Location loc, ArrayRef axes, - ClusterOp mesh) { - SmallVector sorted = llvm::to_vector(axes); - llvm::sort(sorted); - if (!isUnique(sorted.begin(), sorted.end())) { - return emitError(loc) << "Mesh axes contains duplicate elements."; - } - - MeshAxis rank = mesh.getRank(); - for (auto axis : axes) { - if (axis >= rank || axis < 0) { - return emitError(loc) - << "0-based mesh axis index " << axis - << " is out of bounds. The referenced mesh \"" << mesh.getSymName() - << "\" is of rank " << rank << "."; - } - } - - return success(); -} - template static FailureOr getMeshAndVerifyAxes(Op op, SymbolTableCollection &symbolTable) { diff --git a/mlir/lib/Dialect/Mesh/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Mesh/Transforms/CMakeLists.txt index 044b8672c8c60..7a70c047ec9dc 100644 --- a/mlir/lib/Dialect/Mesh/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Mesh/Transforms/CMakeLists.txt @@ -1,6 +1,7 @@ add_mlir_dialect_library(MLIRMeshTransforms Simplifications.cpp ShardingPropagation.cpp + Spmdization.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Mesh @@ -11,11 +12,13 @@ add_mlir_dialect_library(MLIRMeshTransforms LINK_LIBS PUBLIC MLIRArithDialect + MLIRControlFlowDialect MLIRFuncDialect MLIRIR MLIRMeshDialect MLIRPass MLIRShardingInterface MLIRSupport + MLIRTensorDialect MLIRTosaShardingInterfaceImpl ) diff --git a/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp b/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp new file mode 100644 index 0000000000000..de8b3a98df998 --- /dev/null +++ b/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp @@ -0,0 +1,639 @@ +//===- Spmdization.cpp --------------------------------------------- C++ --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Mesh/Transforms/Spmdization.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h" +#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" +#include "mlir/Dialect/Mesh/IR/MeshOps.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Utils/StaticValueUtils.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/BuiltinTypeInterfaces.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/ImplicitLocOpBuilder.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Value.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Support/MathExtras.h" +#include "llvm/ADT/ADL.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include +#include +#include +#include +#include +#include + +namespace mlir { +namespace mesh { + +int64_t shardDimension(int64_t dim, int64_t shardCount) { + if (ShapedType::isDynamic(dim) || ShapedType::isDynamic(shardCount)) + return ShapedType::kDynamic; + + assert(dim % shardCount == 0); + return ceilDiv(dim, shardCount); +} + +int64_t unshardDimension(int64_t dim, int64_t shardCount) { + if (ShapedType::isDynamic(dim) || ShapedType::isDynamic(shardCount)) + return ShapedType::kDynamic; + + return dim * shardCount; +} + +template +int64_t shardCount(const MeshShape &meshShape, const SplitAxes &splitAxes) { + int64_t res = 1; + for (auto splitAxis : splitAxes) { + int64_t meshDimSize = meshShape[splitAxis]; + if (ShapedType::isDynamic(meshDimSize)) { + return ShapedType::kDynamic; + } + res *= meshDimSize; + } + return res; +} + +// Compute the shape for the tensor on each device in the mesh. +// Example: +// On a 2x4x? mesh with split axes = [[0], [1], [2]] the shape ?x5x1 +// would result in a shape for each shard of ?x2x?. +template +static void shardShape(const InShape &inShape, const MeshShape &meshShape, + const SplitAxes &splitAxes, OutShape &outShape) { + std::copy(llvm::adl_begin(inShape), llvm::adl_end(inShape), + llvm::adl_begin(outShape)); + for (auto [tensorAxis, innerSplitAxes] : llvm::enumerate(splitAxes)) { + outShape[tensorAxis] = + shardDimension(inShape[tensorAxis], + shardCount(meshShape, innerSplitAxes.asArrayRef())); + } +} + +ShapedType shardShapedType(ShapedType shape, ClusterOp mesh, + MeshShardingAttr sharding) { + using Dim = std::decay_t; + SmallVector resShapeArr(shape.getShape().size()); + shardShape(shape.getShape(), mesh.canonicalDimSizes(), + sharding.getSplitAxes(), resShapeArr); + return shape.clone(resShapeArr); +} + +template +static bool arePartialAxesCompatible(const SourceAxes &sourceAxes, + const TargetAxes &targetAxes) { + return llvm::all_of(targetAxes, [&sourceAxes](auto &targetAxis) { + return sourceAxes.contains(targetAxis); + }); +} + +// Return the reduced value and its corresponding sharding. +// Example: +// sourceSharding = <@mesh_1d, [[0]], partial = sum[0]> +// targetSharding = <@mesh_1d, [[]]> +// Then will apply all-reduce on the source value +// and return it with the sharding <@mesh_1d, [[0]]>. 
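+// More generally, mesh axes that are partial in the source sharding but not
+// in the target sharding are reduced away with an all-reduce of the source's
+// partial type, while the split axes are left unchanged.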
+static std::tuple, MeshShardingAttr> +handlePartialAxesDuringResharding(OpBuilder &builder, + MeshShardingAttr sourceSharding, + MeshShardingAttr targetSharding, + TypedValue sourceShard) { + if (sourceSharding.getPartialAxes().empty() && + targetSharding.getPartialAxes().empty()) { + return {sourceShard, sourceSharding}; + } + assert(targetSharding.getPartialAxes().empty() || + (!sourceSharding.getPartialAxes().empty() && + sourceSharding.getPartialType() == targetSharding.getPartialType())); + using Axis = std::decay_t; + using AxisSet = llvm::SmallDenseSet; + AxisSet sourceShardingPartialAxesSet(sourceSharding.getPartialAxes().begin(), + sourceSharding.getPartialAxes().end()); + AxisSet targetShardingPartialAxesSet(targetSharding.getPartialAxes().begin(), + targetSharding.getPartialAxes().end()); + assert(arePartialAxesCompatible(sourceShardingPartialAxesSet, + targetShardingPartialAxesSet)); + llvm::SmallVector allReduceMeshAxes; + llvm::copy_if(sourceShardingPartialAxesSet, + std::back_inserter(allReduceMeshAxes), + [&targetShardingPartialAxesSet](Axis a) { + return !targetShardingPartialAxesSet.contains(a); + }); + if (allReduceMeshAxes.empty()) { + return {sourceShard, sourceSharding}; + } + + builder.setInsertionPointAfterValue(sourceShard); + TypedValue resultValue = + builder + .create(sourceShard.getLoc(), sourceShard.getType(), + sourceSharding.getCluster().getLeafReference(), + allReduceMeshAxes, sourceShard, + sourceSharding.getPartialType()) + .getResult() + .cast>(); + + llvm::SmallVector remainingPartialAxes; + llvm::copy_if(sourceShardingPartialAxesSet, + std::back_inserter(allReduceMeshAxes), + [&targetShardingPartialAxesSet](Axis a) { + return targetShardingPartialAxesSet.contains(a); + }); + MeshShardingAttr resultSharding = + MeshShardingAttr::get(builder.getContext(), sourceSharding.getCluster(), + sourceSharding.getSplitAxes(), remainingPartialAxes, + sourceSharding.getPartialType()); + return {resultValue, resultSharding}; +} + +static MeshShardingAttr +targetShardingInSplitLastAxis(MLIRContext *ctx, MeshShardingAttr sourceSharding, + int64_t splitTensorAxis, MeshAxis splitMeshAxis) { + SmallVector targetShardingSplitAxes = + llvm::to_vector(sourceSharding.getSplitAxes()); + while (static_cast(targetShardingSplitAxes.size()) <= + splitTensorAxis) { + targetShardingSplitAxes.push_back(DenseI32ArrayAttr::get(ctx, {})); + } + auto targetSplitAxes = + llvm::to_vector(targetShardingSplitAxes[splitTensorAxis].asArrayRef()); + targetSplitAxes.push_back(splitMeshAxis); + targetShardingSplitAxes[splitTensorAxis] = + DenseI32ArrayAttr::get(ctx, targetSplitAxes); + return MeshShardingAttr::get( + ctx, sourceSharding.getCluster(), targetShardingSplitAxes, + sourceSharding.getPartialAxes(), sourceSharding.getPartialType()); +} + +static ShapedType targetShapeInSplitLastAxis(ShapedType sourceShape, + int64_t splitTensorAxis, + int64_t splitCount) { + SmallVector targetShape = llvm::to_vector(sourceShape.getShape()); + targetShape[splitTensorAxis] = + shardDimension(targetShape[splitTensorAxis], splitCount); + return sourceShape.cloneWith(targetShape, sourceShape.getElementType()); +} + +// Split a replicated tensor along a mesh axis. +// e.g. [[0, 1]] -> [[0, 1, 2]]. +// Returns the spmdized target value with its sharding. +// +// The implementation is the extract the tensor slice corresponding +// to the current device. 
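+// Rough sketch of the IR produced below for one (tensor axis, mesh axis)
+// pair (SSA names are illustrative, not what the builder actually emits):
+//   %idx = mesh.process_index on @mesh axes = [m] : index
+//   %n   = mesh.cluster_shape @mesh axes = [m] : index
+//   %dim = tensor.dim %source, %axis
+//   cf.assert that %dim is exactly divisible by %n
+//   %off = arith.muli(arith.divui(%dim, %n), %idx)
+//   %res = tensor.extract_slice %source along %axis at %off of size %dim / %n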
+static std::tuple, MeshShardingAttr> +splitLastAxisInResharding(ImplicitLocOpBuilder &builder, + MeshShardingAttr sourceSharding, + TypedValue sourceShard, ClusterOp mesh, + int64_t splitTensorAxis, MeshAxis splitMeshAxis) { + MLIRContext *ctx = builder.getContext(); + builder.setInsertionPointAfterValue(sourceShard); + + Value zero = builder.create(builder.getIndexAttr(0)); + + Value processIndexAlongAxis = + builder + .create(mesh.getSymName(), + SmallVector({splitMeshAxis})) + .getResult()[0]; + + MeshShardingAttr targetSharding = targetShardingInSplitLastAxis( + ctx, sourceSharding, splitTensorAxis, splitMeshAxis); + ShapedType targetShape = + targetShapeInSplitLastAxis(sourceShard.getType(), splitTensorAxis, + mesh.canonicalDimSizes()[splitMeshAxis]); + + Value meshAxisSize = + builder + .create(mesh.getSymName(), + SmallVector({splitMeshAxis})) + .getResult()[0]; + + Value sourceAxisSize = + builder.create(sourceShard, splitTensorAxis); + Value sourceAxisSizeModMeshAxisSize = + builder.create(sourceAxisSize, meshAxisSize); + Value isTargetShapeExactlyDivisible = builder.create( + arith::CmpIPredicate::eq, sourceAxisSizeModMeshAxisSize, zero); + builder.create( + isTargetShapeExactlyDivisible, + "Sharding a tensor with axis size that is not exactly divisible by the " + "mesh axis size is not supported."); + Value targetAxisSize = + builder.create(sourceAxisSize, meshAxisSize); + Value axisOffset = + builder.create(targetAxisSize, processIndexAlongAxis); + SmallVector staticOffsets(targetShape.getRank(), 0); + staticOffsets[splitTensorAxis] = ShapedType::kDynamic; + DenseI64ArrayAttr staticOffsetsAttr = + DenseI64ArrayAttr::get(ctx, staticOffsets); + SmallVector dynamicOffsets(1, axisOffset); + + DenseI64ArrayAttr staticSizesAttr = + DenseI64ArrayAttr::get(ctx, targetShape.getShape()); + SmallVector dynamicSizes; + for (int64_t i = 0; i < targetShape.getRank(); ++i) { + if (ShapedType::isDynamic(staticSizesAttr.asArrayRef()[i])) { + if (i == splitTensorAxis) { + dynamicSizes.push_back(targetAxisSize); + } else { + Value dimSize = builder.create(sourceShard, i); + dynamicSizes.push_back(dimSize); + } + } + } + + DenseI64ArrayAttr staticStridesAttr = DenseI64ArrayAttr::get( + ctx, SmallVector(targetShape.getRank(), 1)); + TypedValue targetShard = + builder + .create( + targetShape, sourceShard, dynamicOffsets, dynamicSizes, + SmallVector({}), staticOffsetsAttr, staticSizesAttr, + staticStridesAttr) + .getResult(); + return {targetShard.cast>(), targetSharding}; +} + +// Detect if the resharding is of type e.g. +// [[0, 1]] -> [[0, 1, 2]]. +// If detected, returns the corresponding tensor axis mesh axis pair. +// Does not detect insertions like +// [[0, 1]] -> [[0, 2, 1]]. 
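+// For the [[0, 1]] -> [[0, 1, 2]] example above the result would be
+// (tensor axis 0, mesh axis 2).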
+static std::optional> +detectSplitLastAxisInResharding(MeshShardingAttr sourceSharding, + MeshShardingAttr targetSharding) { + for (size_t tensorAxis = 0; tensorAxis < targetSharding.getSplitAxes().size(); + ++tensorAxis) { + if (sourceSharding.getSplitAxes().size() > tensorAxis) { + if (sourceSharding.getSplitAxes()[tensorAxis].size() + 1 != + targetSharding.getSplitAxes()[tensorAxis].size()) { + continue; + } + if (!llvm::equal( + sourceSharding.getSplitAxes()[tensorAxis].asArrayRef(), + llvm::make_range( + targetSharding.getSplitAxes()[tensorAxis] + .asArrayRef() + .begin(), + targetSharding.getSplitAxes()[tensorAxis].asArrayRef().end() - + 1))) { + continue; + } + } else { + if (targetSharding.getSplitAxes()[tensorAxis].size() != 1) { + continue; + } + } + return std::make_tuple( + tensorAxis, + targetSharding.getSplitAxes()[tensorAxis].asArrayRef().back()); + } + return std::nullopt; +} + +static std::optional, MeshShardingAttr>> +trySplitLastAxisInResharding(ImplicitLocOpBuilder &builder, ClusterOp mesh, + MeshShardingAttr sourceSharding, + MeshShardingAttr targetSharding, + TypedValue sourceShard) { + if (auto detectRes = + detectSplitLastAxisInResharding(sourceSharding, targetSharding)) { + auto [tensorAxis, meshAxis] = detectRes.value(); + return splitLastAxisInResharding(builder, sourceSharding, sourceShard, mesh, + tensorAxis, meshAxis); + } + + return std::nullopt; +} + +// Detect if the resharding is of type e.g. +// [[0, 1, 2]] -> [[0, 1]]. +// If detected, returns the corresponding tensor axis mesh axis pair. +static std::optional> +detectUnsplitLastAxisInResharding(MeshShardingAttr sourceSharding, + MeshShardingAttr targetSharding) { + for (size_t tensorAxis = 0; tensorAxis < sourceSharding.getSplitAxes().size(); + ++tensorAxis) { + if (targetSharding.getSplitAxes().size() > tensorAxis) { + if (sourceSharding.getSplitAxes()[tensorAxis].size() != + targetSharding.getSplitAxes()[tensorAxis].size() + 1) + continue; + if (!llvm::equal( + llvm::make_range( + sourceSharding.getSplitAxes()[tensorAxis] + .asArrayRef() + .begin(), + sourceSharding.getSplitAxes()[tensorAxis].asArrayRef().end() - + 1), + targetSharding.getSplitAxes()[tensorAxis].asArrayRef())) + continue; + } else { + if (sourceSharding.getSplitAxes()[tensorAxis].size() != 1) + continue; + } + return std::make_tuple( + tensorAxis, + sourceSharding.getSplitAxes()[tensorAxis].asArrayRef().back()); + } + return std::nullopt; +} + +static MeshShardingAttr +targetShardingInUnsplitLastAxis(MLIRContext *ctx, + MeshShardingAttr sourceSharding, + int64_t splitTensorAxis) { + SmallVector targetShardingSplitAxes = + llvm::to_vector(sourceSharding.getSplitAxes()); + assert(static_cast(targetShardingSplitAxes.size()) > + splitTensorAxis); + auto targetSplitAxes = + llvm::to_vector(targetShardingSplitAxes[splitTensorAxis].asArrayRef()); + + targetSplitAxes.pop_back(); + targetShardingSplitAxes[splitTensorAxis] = + DenseI32ArrayAttr::get(ctx, targetSplitAxes); + return MeshShardingAttr::get( + ctx, sourceSharding.getCluster(), targetShardingSplitAxes, + sourceSharding.getPartialAxes(), sourceSharding.getPartialType()); +} + +static ShapedType allGatherResultShapeInUnsplitLastAxis( + ShapedType sourceShape, int64_t splitCount, int64_t splitTensorAxis) { + SmallVector targetShape = llvm::to_vector(sourceShape.getShape()); + targetShape[splitTensorAxis] = + unshardDimension(targetShape[splitTensorAxis], splitCount); + return sourceShape.cloneWith(targetShape, sourceShape.getElementType()); +} + +static std::tuple, MeshShardingAttr> 
+unsplitLastAxisInResharding(ImplicitLocOpBuilder &builder, + MeshShardingAttr sourceSharding, + ShapedType sourceUnshardedShape, + TypedValue sourceShard, ClusterOp mesh, + int64_t splitTensorAxis, MeshAxis splitMeshAxis) { + MLIRContext *ctx = builder.getContext(); + builder.setInsertionPointAfterValue(sourceShard); + + MeshShardingAttr targetSharding = + targetShardingInUnsplitLastAxis(ctx, sourceSharding, splitMeshAxis); + ShapedType allGatherResultShape = allGatherResultShapeInUnsplitLastAxis( + sourceShard.getType(), mesh.canonicalDimSizes()[splitMeshAxis], + splitTensorAxis); + Value allGatherResult = builder.create( + RankedTensorType::get(allGatherResultShape.getShape(), + allGatherResultShape.getElementType()), + mesh.getSymName(), SmallVector({splitMeshAxis}), sourceShard, + APInt(64, splitTensorAxis)); + ShapedType targetShape = + shardShapedType(sourceUnshardedShape, mesh, targetSharding); + TypedValue targetShard = + builder.create(targetShape, allGatherResult) + .getResult() + .cast>(); + return {targetShard, targetSharding}; +} + +static std::optional, MeshShardingAttr>> +tryUnsplitLastAxisInResharding(ImplicitLocOpBuilder &builder, ClusterOp mesh, + MeshShardingAttr sourceSharding, + MeshShardingAttr targetSharding, + ShapedType sourceUnshardedShape, + TypedValue sourceShard) { + if (auto detectRes = + detectUnsplitLastAxisInResharding(sourceSharding, targetSharding)) { + auto [tensorAxis, meshAxis] = detectRes.value(); + return unsplitLastAxisInResharding(builder, sourceSharding, + sourceUnshardedShape, sourceShard, mesh, + tensorAxis, meshAxis); + } + + return std::nullopt; +} + +// Detect if the resharding is of type e.g. +// [[0, 1], [2]] -> [[0], [1, 2]]. +// Only moving the last axis counts. +// If detected, returns the corresponding (source_tensor_axis, +// target_tensor_axis, mesh_axis) tuple. 
+static std::optional> +detectMoveLastSplitAxisInResharding(MeshShardingAttr sourceSharding, + MeshShardingAttr targetSharding) { + for (size_t sourceTensorAxis = 0; + sourceTensorAxis < sourceSharding.getSplitAxes().size(); + ++sourceTensorAxis) { + for (size_t targetTensorAxis = 0; + targetTensorAxis < targetSharding.getSplitAxes().size(); + ++targetTensorAxis) { + if (sourceTensorAxis == targetTensorAxis) + continue; + if (sourceSharding.getSplitAxes()[sourceTensorAxis].empty() || + targetSharding.getSplitAxes()[targetTensorAxis].empty() || + sourceSharding.getSplitAxes()[sourceTensorAxis].asArrayRef().back() != + targetSharding.getSplitAxes()[targetTensorAxis] + .asArrayRef() + .back()) + continue; + if (!llvm::equal( + llvm::make_range(sourceSharding.getSplitAxes()[sourceTensorAxis] + .asArrayRef() + .begin(), + sourceSharding.getSplitAxes()[sourceTensorAxis] + .asArrayRef() + .end() - + 1), + llvm::make_range(targetSharding.getSplitAxes()[targetTensorAxis] + .asArrayRef() + .begin(), + targetSharding.getSplitAxes()[targetTensorAxis] + .asArrayRef() + .end() - + 1))) + continue; + return std::make_tuple( + sourceTensorAxis, targetTensorAxis, + sourceSharding.getSplitAxes()[sourceTensorAxis].asArrayRef().back()); + } + } + return std::nullopt; +} + +static MeshShardingAttr +targetShardingInMoveLastAxis(MLIRContext *ctx, MeshShardingAttr sourceSharding, + int64_t sourceTensorAxis, + int64_t targetTensorAxis) { + SmallVector targetShardingSplitAxes = + llvm::to_vector(sourceSharding.getSplitAxes()); + while (static_cast(targetShardingSplitAxes.size()) <= + targetTensorAxis) { + targetShardingSplitAxes.push_back(DenseI32ArrayAttr::get(ctx, {})); + } + + auto sourceSplitAxes = + llvm::to_vector(targetShardingSplitAxes[sourceTensorAxis].asArrayRef()); + assert(!sourceSplitAxes.empty()); + auto meshAxis = sourceSplitAxes.back(); + sourceSplitAxes.pop_back(); + targetShardingSplitAxes[sourceTensorAxis] = + DenseI32ArrayAttr::get(ctx, sourceSplitAxes); + + auto targetSplitAxes = + llvm::to_vector(targetShardingSplitAxes[targetTensorAxis].asArrayRef()); + targetSplitAxes.push_back(meshAxis); + targetShardingSplitAxes[targetTensorAxis] = + DenseI32ArrayAttr::get(ctx, targetSplitAxes); + + return MeshShardingAttr::get( + ctx, sourceSharding.getCluster(), targetShardingSplitAxes, + sourceSharding.getPartialAxes(), sourceSharding.getPartialType()); +} + +static ShapedType allToAllResultShapeInMoveLastAxis(ShapedType sourceShape, + int64_t splitCount, + int64_t sourceTensorAxis, + int64_t targetTensorAxis) { + SmallVector targetShape = llvm::to_vector(sourceShape.getShape()); + targetShape[sourceTensorAxis] = + unshardDimension(targetShape[sourceTensorAxis], splitCount); + targetShape[targetTensorAxis] = + shardDimension(targetShape[targetTensorAxis], splitCount); + return sourceShape.cloneWith(targetShape, sourceShape.getElementType()); +} + +static std::tuple, MeshShardingAttr> +moveLastSplitAxisInResharding(ImplicitLocOpBuilder &builder, ClusterOp mesh, + MeshShardingAttr sourceSharding, + ShapedType sourceUnshardedShape, + TypedValue sourceShard, + int64_t sourceTensorAxis, + int64_t targetTensorAxis, MeshAxis meshAxis) { + MLIRContext *ctx = builder.getContext(); + builder.setInsertionPointAfterValue(sourceShard); + + MeshShardingAttr targetSharding = targetShardingInMoveLastAxis( + ctx, sourceSharding, sourceTensorAxis, targetTensorAxis); + ShapedType allToAllResultShape = allToAllResultShapeInMoveLastAxis( + sourceShard.getType(), mesh.canonicalDimSizes()[meshAxis], + sourceTensorAxis, 
targetTensorAxis); + Value allToAllResult = builder.create( + RankedTensorType::get(allToAllResultShape.getShape(), + allToAllResultShape.getElementType()), + mesh.getSymName(), SmallVector({meshAxis}), sourceShard, + APInt(64, targetTensorAxis), APInt(64, sourceTensorAxis)); + ShapedType targetShape = + shardShapedType(sourceUnshardedShape, mesh, targetSharding); + TypedValue targetShard = + builder.create(targetShape, allToAllResult) + .getResult() + .cast>(); + return {targetShard, targetSharding}; +} + +static std::optional, MeshShardingAttr>> +tryMoveLastSplitAxisInResharding(ImplicitLocOpBuilder &builder, ClusterOp mesh, + MeshShardingAttr sourceSharding, + MeshShardingAttr targetSharding, + ShapedType sourceUnshardedShape, + TypedValue sourceShard) { + if (auto detectRes = + detectMoveLastSplitAxisInResharding(sourceSharding, targetSharding)) { + auto [sourceTensorAxis, targetTensorAxis, meshAxis] = detectRes.value(); + return moveLastSplitAxisInResharding( + builder, mesh, sourceSharding, sourceUnshardedShape, sourceShard, + sourceTensorAxis, targetTensorAxis, meshAxis); + } + + return std::nullopt; +} + +// Handles only resharding on a 1D mesh. +// Currently the sharded tensor axes must be exactly divisible by the single +// mesh axis size. +static TypedValue +reshardOn1DMesh(ImplicitLocOpBuilder &builder, ClusterOp mesh, + MeshShardingAttr sourceSharding, + MeshShardingAttr targetSharding, + TypedValue sourceUnshardedValue, + TypedValue sourceShard) { + assert(sourceShard.getType() == + shardShapedType(sourceUnshardedValue.getType(), mesh, sourceSharding)); + ShapedType targetShardType = + shardShapedType(sourceUnshardedValue.getType(), mesh, targetSharding); + assert(sourceShard.getType().getRank() == targetShardType.getRank()); + assert(mesh.getRank() == 1 && "Only 1D meshes are currently supported."); + + auto [reducedSourceShard, reducedSourceSharding] = + handlePartialAxesDuringResharding(builder, sourceSharding, targetSharding, + sourceShard); + + if (reducedSourceSharding == targetSharding) { + return reducedSourceShard; + } + + TypedValue targetShard; + MeshShardingAttr actualTargetSharding; + if (auto tryRes = tryMoveLastSplitAxisInResharding( + builder, mesh, reducedSourceSharding, targetSharding, + sourceUnshardedValue.getType(), reducedSourceShard)) { + std::tie(targetShard, actualTargetSharding) = tryRes.value(); + } else if (auto tryRes = trySplitLastAxisInResharding( + builder, mesh, reducedSourceSharding, targetSharding, + reducedSourceShard)) { + std::tie(targetShard, actualTargetSharding) = tryRes.value(); + } else if (auto tryRes = tryUnsplitLastAxisInResharding( + builder, mesh, reducedSourceSharding, targetSharding, + sourceUnshardedValue.getType(), reducedSourceShard)) { + std::tie(targetShard, actualTargetSharding) = tryRes.value(); + } else { + assert(false && "Did not find any pattern to apply."); + } + + assert(actualTargetSharding == targetSharding); + assert(targetShard.getType() == targetShardType); + return targetShard; +} + +TypedValue reshard(ImplicitLocOpBuilder &builder, ClusterOp mesh, + MeshShardingAttr sourceSharding, + MeshShardingAttr targetSharding, + TypedValue sourceUnshardedValue, + TypedValue sourceShard) { + // Resort to handling only 1D meshes since the general case is complicated if + // it needs to be communication efficient in terms of minimizing the data + // transfered between devices. 
+ return reshardOn1DMesh(builder, mesh, sourceSharding, targetSharding, + sourceUnshardedValue, sourceShard); +} + +TypedValue reshard(OpBuilder &builder, ClusterOp mesh, + ShardOp source, ShardOp target, + TypedValue sourceShardValue) { + assert(!source.getAnnotateForUsers()); + assert(target.getAnnotateForUsers()); + assert(source.getResult() == target.getOperand()); + ImplicitLocOpBuilder implicitLocOpBuilder(target->getLoc(), builder); + return reshard( + implicitLocOpBuilder, mesh, source.getShard(), target.getShard(), + source.getSrc().cast>(), sourceShardValue); +} + +void reshardingRegisterDependentDialects(DialectRegistry ®istry) { + registry.insert(); +} + +} // namespace mesh +} // namespace mlir diff --git a/mlir/test/Dialect/Mesh/invalid.mlir b/mlir/test/Dialect/Mesh/invalid.mlir index 03994f8f011e1..3ee578a37235e 100644 --- a/mlir/test/Dialect/Mesh/invalid.mlir +++ b/mlir/test/Dialect/Mesh/invalid.mlir @@ -70,6 +70,102 @@ func.func @mesh_axis_negtive_in_partial( // ----- +mesh.cluster @mesh0(rank = 2, dim_sizes = 2x4) + +func.func @cluster_shape_mesh_axis_out_of_bounds() -> (index, index) { + // expected-error@+1 {{0-based mesh axis index 2 is out of bounds. The referenced mesh "mesh0" is of rank 2.}} + %0:2 = mesh.cluster_shape @mesh0 axes = [0, 2] : index, index + return %0#0, %0#1 : index, index +} + +// ----- + +mesh.cluster @mesh0(rank = 3, dim_sizes = 1x2x3) + +func.func @cluster_shape_duplicate_mesh_axis() -> (index, index, index) { + // expected-error@+1 {{Mesh axes contains duplicate elements.}} + %0:3 = mesh.cluster_shape @mesh0 axes = [0, 2, 0] : index, index, index + return %0#0, %0#1, %0#2 : index, index, index +} + +// ----- + +mesh.cluster @mesh0(rank = 2, dim_sizes = 2x4) + +func.func @cluster_shape_wrong_number_of_results() -> (index, index) { + // expected-error@+1 {{Unexpected number of results 2. Expected 1.}} + %0:2 = mesh.cluster_shape @mesh0 axes = [0] : index, index + return %0#0, %0#1 : index, index +} + +// ----- + +mesh.cluster @mesh0(rank = 3, dim_sizes = 1x2x3) + +func.func @cluster_shape_wrong_number_of_results_empty_mesh_axes() -> (index, index) { + // expected-error@+1 {{Unexpected number of results 2. Expected 3.}} + %0:2 = mesh.cluster_shape @mesh0 : index, index + return %0#0, %0#1 : index, index +} + +// ----- + +func.func @cluster_shape_invalid_mesh_name() -> (index) { + // expected-error@+1 {{Undefined required mesh symbol "this_mesh_symbol_does_not_exist".}} + %0 = mesh.cluster_shape @this_mesh_symbol_does_not_exist : index + return %0#0 : index +} + +// ----- + +mesh.cluster @mesh0(rank = 2, dim_sizes = 2x4) + +func.func @process_index_mesh_axis_out_of_bounds() -> (index, index) { + // expected-error@+1 {{0-based mesh axis index 2 is out of bounds. The referenced mesh "mesh0" is of rank 2.}} + %0:2 = mesh.process_index on @mesh0 axes = [0, 2] : index, index + return %0#0, %0#1 : index, index +} + +// ----- + +mesh.cluster @mesh0(rank = 3, dim_sizes = 1x2x3) + +func.func @process_index_duplicate_mesh_axis() -> (index, index, index) { + // expected-error@+1 {{Mesh axes contains duplicate elements.}} + %0:3 = mesh.process_index on @mesh0 axes = [0, 2, 0] : index, index, index + return %0#0, %0#1, %0#2 : index, index, index +} + +// ----- + +mesh.cluster @mesh0(rank = 2, dim_sizes = 2x4) + +func.func @process_index_wrong_number_of_results() -> (index, index) { + // expected-error@+1 {{Unexpected number of results 2. 
Expected 1.}} + %0:2 = mesh.process_index on @mesh0 axes = [0] : index, index + return %0#0, %0#1 : index, index +} + +// ----- + +mesh.cluster @mesh0(rank = 3, dim_sizes = 1x2x3) + +func.func @process_index_wrong_number_of_results_empty_mesh_axes() -> (index, index) { + // expected-error@+1 {{Unexpected number of results 2. Expected 3.}} + %0:2 = mesh.process_index on @mesh0 : index, index + return %0#0, %0#1 : index, index +} + +// ----- + +func.func @process_index_invalid_mesh_name() -> (index) { + // expected-error@+1 {{Undefined required mesh symbol "this_mesh_symbol_does_not_exist".}} + %0 = mesh.process_index on @this_mesh_symbol_does_not_exist : index + return %0#0 : index +} + +// ----- + func.func @all_reduce_invalid_mesh_symbol( %arg0 : tensor<4xf32>) -> tensor<4xf64> { // expected-error@+1 {{Undefined required mesh symbol "this_mesh_symbol_does_not_exist".}} diff --git a/mlir/test/Dialect/Mesh/ops.mlir b/mlir/test/Dialect/Mesh/ops.mlir index 8f8e309d18f15..a7c3b3dbab9c1 100644 --- a/mlir/test/Dialect/Mesh/ops.mlir +++ b/mlir/test/Dialect/Mesh/ops.mlir @@ -132,6 +132,55 @@ func.func @mesh_shard_op_two_users(%arg0 : tensor<4x8xf32>) -> return %1, %2 : tensor<4x8xf32>, tensor<4x8xf32> } +// CHECK-LABEL: func @cluster_shape +func.func @cluster_shape() -> (index, index) { + // CHECK: %[[RES:.*]]:2 = mesh.cluster_shape @mesh0 axes = [0, 1] : index, index + %0:2 = mesh.cluster_shape @mesh0 axes = [0, 1] : index, index + // CHECK: return %[[RES]]#0, %[[RES]]#1 : index, index + return %0#0, %0#1 : index, index +} + +// CHECK-LABEL: func @cluster_shape_default_axes +func.func @cluster_shape_default_axes() -> (index, index, index) { + // CHECK: %[[RES:.*]]:3 = mesh.cluster_shape @mesh0 : index, index, index + %0:3 = mesh.cluster_shape @mesh0 : index, index, index + // CHECK: return %[[RES]]#0, %[[RES]]#1, %[[RES]]#2 : index, index, index + return %0#0, %0#1, %0#2 : index, index, index +} + +// CHECK-LABEL: func @cluster_shape_empty_axes +func.func @cluster_shape_empty_axes() -> (index, index, index) { + // CHECK: %[[RES:.*]]:3 = mesh.cluster_shape @mesh0 : index, index, index + %0:3 = mesh.cluster_shape @mesh0 axes = [] : index, index, index + // CHECK: return %[[RES]]#0, %[[RES]]#1, %[[RES]]#2 : index, index, index + return %0#0, %0#1, %0#2 : index, index, index +} + +// CHECK-LABEL: func @process_index +func.func @process_index() -> (index, index) { + // CHECK: %[[RES:.*]]:2 = mesh.process_index on @mesh0 axes = [0, 1] : index, index + %0:2 = mesh.process_index on @mesh0 axes = [0, 1] : index, index + // CHECK: return %[[RES]]#0, %[[RES]]#1 : index, index + return %0#0, %0#1 : index, index +} + +// CHECK-LABEL: func @process_index_default_axes +func.func @process_index_default_axes() -> (index, index, index) { + // CHECK: %[[RES:.*]]:3 = mesh.process_index on @mesh0 : index, index, index + %0:3 = mesh.process_index on @mesh0 : index, index, index + // CHECK: return %[[RES]]#0, %[[RES]]#1, %[[RES]]#2 : index, index, index + return %0#0, %0#1, %0#2 : index, index, index +} + +// CHECK-LABEL: func @process_index_empty_axes +func.func @process_index_empty_axes() -> (index, index, index) { + // CHECK: %[[RES:.*]]:3 = mesh.process_index on @mesh0 : index, index, index + %0:3 = mesh.process_index on @mesh0 axes = [] : index, index, index + // CHECK: return %[[RES]]#0, %[[RES]]#1, %[[RES]]#2 : index, index, index + return %0#0, %0#1, %0#2 : index, index, index +} + + // CHECK-LABEL: func @all_reduce func.func @all_reduce( // CHECK-SAME: %[[ARG:.*]]: tensor<3x4xf32> diff --git 
a/mlir/test/Dialect/Mesh/resharding-spmdization.mlir b/mlir/test/Dialect/Mesh/resharding-spmdization.mlir new file mode 100644 index 0000000000000..0ba0d76c09a74 --- /dev/null +++ b/mlir/test/Dialect/Mesh/resharding-spmdization.mlir @@ -0,0 +1,154 @@ +// RUN: mlir-opt -test-mesh-resharding-spmdization %s | FileCheck %s + +mesh.cluster @mesh_1d(rank = 1, dim_sizes = 2) +mesh.cluster @mesh_1d_dynamic(rank = 1, dim_sizes = ?) + +// CHECK-LABEL: func @same_source_and_target_sharding +func.func @same_source_and_target_sharding( + // CHECK-SAME: %[[ARG:.*]]: tensor<2xf32> + %arg0: tensor<2xf32> +) -> tensor<2xf32> { + %0 = mesh.shard %arg0 to <@mesh_1d, [[]]> : tensor<2xf32> + %1 = mesh.shard %0 to <@mesh_1d, [[]]> annotate_for_users : tensor<2xf32> + // CHECK: return %[[ARG]] + return %1 : tensor<2xf32> +} + +// CHECK-LABEL: func @split_replicated_tensor_axis +func.func @split_replicated_tensor_axis( + // CHECK-SAME: %[[ARG:.*]]: tensor<3x14xf32> + %arg0: tensor<3x14xf32> +) -> tensor<3x14xf32> { + // CHECK: %[[ZERO:.*]] = arith.constant 0 : index + // CHECK: %[[TENSOR_SPLIT_AXIS_SIZE:.*]] = arith.constant 14 : index + // CHECK: %[[PROCESS_INDEX:.*]] = mesh.process_index on @mesh_1d axes = [0] : index + // CHECK: %[[MESH_AXIS_SIZE:.*]] = mesh.cluster_shape @mesh_1d axes = [0] : index + // CHECK: %[[TENSOR_SPLIT_AXIS_SIZE_MOD_MESH_AXIS_SIZE:.*]] = arith.remui %[[TENSOR_SPLIT_AXIS_SIZE]], %[[MESH_AXIS_SIZE]] : index + // CHECK: %[[RESULT_TENSOR_AXIS_SIZE_CHECK:.*]] = arith.cmpi eq, %[[TENSOR_SPLIT_AXIS_SIZE_MOD_MESH_AXIS_SIZE]], %[[ZERO]] : index + // CHECK: cf.assert %[[RESULT_TENSOR_AXIS_SIZE_CHECK]] + // CHECK: %[[RESULT_TENSOR_AXIS_SIZE:.*]] = arith.divui %[[TENSOR_SPLIT_AXIS_SIZE]], %[[MESH_AXIS_SIZE]] : index + // CHECK: %[[RESULT_TENSOR_AXIS_OFFSET:.*]] = arith.muli %[[RESULT_TENSOR_AXIS_SIZE]], %[[PROCESS_INDEX]] : index + // CHECK: %[[RESULT_TENSOR_SLICE:.*]] = tensor.extract_slice %[[ARG]][0, %[[RESULT_TENSOR_AXIS_OFFSET]]] [3, 7] [1, 1] : tensor<3x14xf32> to tensor<3x7xf32> + // CHECK: %[[RESULT:.*]] = builtin.unrealized_conversion_cast %[[RESULT_TENSOR_SLICE]] : tensor<3x7xf32> to tensor<3x14xf32> + %0 = mesh.shard %arg0 to <@mesh_1d, [[]]> : tensor<3x14xf32> + %1 = mesh.shard %0 to <@mesh_1d, [[], [0]]> annotate_for_users : tensor<3x14xf32> + // CHECK: return %[[RESULT]] : tensor<3x14xf32> + return %1 : tensor<3x14xf32> +} + +// CHECK-LABEL: func @split_replicated_tensor_axis_dynamic +func.func @split_replicated_tensor_axis_dynamic( + // CHECK-SAME: %[[ARG:.*]]: tensor + %arg0: tensor +) -> tensor { + // CHECK: %[[ZERO:.*]] = arith.constant 0 : index + // CHECK: %[[TWO:.*]] = arith.constant 2 : index + // CHECK: %[[PROCESS_INDEX:.*]] = mesh.process_index on @mesh_1d_dynamic axes = [0] : index + // CHECK: %[[MESH_AXIS_SIZE:.*]] = mesh.cluster_shape @mesh_1d_dynamic axes = [0] : index + // CHECK: %[[TENSOR_SPLIT_AXIS_SIZE:.*]] = tensor.dim %[[ARG]], %[[ZERO]] : tensor + // CHECK: %[[TENSOR_SPLIT_AXIS_SIZE_MOD_MESH_AXIS_SIZE:.*]] = arith.remui %[[TENSOR_SPLIT_AXIS_SIZE]], %[[MESH_AXIS_SIZE]] : index + // CHECK: %[[RESULT_TENSOR_AXIS_SIZE_CHECK:.*]] = arith.cmpi eq, %[[TENSOR_SPLIT_AXIS_SIZE_MOD_MESH_AXIS_SIZE]], %[[ZERO]] : index + // CHECK: cf.assert %[[RESULT_TENSOR_AXIS_SIZE_CHECK]] + // CHECK: %[[RESULT_TENSOR_SPLIT_AXIS_SIZE:.*]] = arith.divui %[[TENSOR_SPLIT_AXIS_SIZE]], %[[MESH_AXIS_SIZE]] : index + // CHECK: %[[RESULT_TENSOR_SPLIT_AXIS_OFFSET:.*]] = arith.muli %[[RESULT_TENSOR_SPLIT_AXIS_SIZE]], %[[PROCESS_INDEX]] : index + // CHECK: %[[TENSOR_AXIS_2_SIZE:.*]] = tensor.dim 
%[[ARG]], %[[TWO]] : tensor + // CHECK: %[[RESULT_TENSOR_SLICE:.*]] = tensor.extract_slice %[[ARG]][%[[RESULT_TENSOR_SPLIT_AXIS_OFFSET]], 0, 0] + // CHECK-SAME: [%[[RESULT_TENSOR_SPLIT_AXIS_SIZE]], 3, %[[TENSOR_AXIS_2_SIZE]]] [1, 1, 1] : tensor to tensor + %0 = mesh.shard %arg0 to <@mesh_1d_dynamic, [[], [], []]> : tensor + %1 = mesh.shard %0 to <@mesh_1d_dynamic, [[0]]> annotate_for_users : tensor + // CHECK: return %[[RESULT_TENSOR_SLICE]] : tensor + return %1 : tensor +} + +// CHECK-LABEL: func @move_split_axis +func.func @move_split_axis( + // CHECK-SAME: %[[ARG:.*]]: tensor<10x14xf32> + %arg0: tensor<10x14xf32> +) -> tensor<10x14xf32> { + // CHECK: %[[SOURCE_SHARD:.*]] = builtin.unrealized_conversion_cast %[[ARG]] : tensor<10x14xf32> to tensor<5x14xf32> + // CHECK: %[[TARGET_SHARD:.*]] = mesh.all_to_all %[[SOURCE_SHARD]] on @mesh_1d mesh_axes = [0] split_axis = 1 concat_axis = 0 : tensor<5x14xf32> -> tensor<10x7xf32> + // CHECK: %[[RES:.*]] = builtin.unrealized_conversion_cast %[[TARGET_SHARD]] : tensor<10x7xf32> to tensor<10x14xf32> + %0 = mesh.shard %arg0 to <@mesh_1d, [[0]]> : tensor<10x14xf32> + %1 = mesh.shard %0 to <@mesh_1d, [[], [0]]> annotate_for_users : tensor<10x14xf32> + // CHECK: return %[[RES]] : tensor<10x14xf32> + return %1 : tensor<10x14xf32> +} + +// CHECK-LABEL: func @move_split_axis_dynamic_mesh +func.func @move_split_axis_dynamic_mesh( + // CHECK-SAME: %[[ARG:.*]]: tensor<10x14xf32> + %arg0: tensor<10x14xf32> +) -> tensor<10x14xf32> { + // CHECK: %[[SOURCE_SHARD:.*]] = builtin.unrealized_conversion_cast %[[ARG]] : tensor<10x14xf32> to tensor + // CHECK: %[[ALL_TO_ALL:.*]] = mesh.all_to_all %[[SOURCE_SHARD]] on @mesh_1d_dynamic mesh_axes = [0] split_axis = 1 concat_axis = 0 : tensor -> tensor + // CHECK: %[[TARGET_SHARD:.*]] = tensor.cast %[[ALL_TO_ALL]] : tensor to tensor<10x?xf32> + // CHECK: %[[RES:.*]] = builtin.unrealized_conversion_cast %[[TARGET_SHARD]] : tensor<10x?xf32> to tensor<10x14xf32> + %0 = mesh.shard %arg0 to <@mesh_1d_dynamic, [[0]]> : tensor<10x14xf32> + %1 = mesh.shard %0 to <@mesh_1d_dynamic, [[], [0]]> annotate_for_users : tensor<10x14xf32> + // CHECK: return %[[RES]] : tensor<10x14xf32> + return %1 : tensor<10x14xf32> +} + +// CHECK-LABEL: func @move_split_dynamic_axis +func.func @move_split_dynamic_axis( + // CHECK-SAME: %[[ARG:.*]]: tensor + %arg0: tensor +) -> tensor { + // CHECK: %[[TARGET_SHARD:.*]] = mesh.all_to_all %[[ARG]] on @mesh_1d mesh_axes = [0] split_axis = 1 concat_axis = 0 : tensor -> tensor + // CHECK: %[[RES:.*]] = builtin.unrealized_conversion_cast %[[TARGET_SHARD]] : tensor to tensor + %0 = mesh.shard %arg0 to <@mesh_1d, [[0]]> : tensor + %1 = mesh.shard %0 to <@mesh_1d, [[], [0]]> annotate_for_users : tensor + // CHECK: return %[[RES]] : tensor + return %1 : tensor +} + +// CHECK-LABEL: func @unshard_static_axis +func.func @unshard_static_axis( + // CHECK-SAME: %[[ARG:.*]]: tensor<10x14xf32> + %arg0: tensor<10x14xf32> +) -> tensor<10x14xf32> { + // CHECK: %[[SOURCE_SHARD:.*]] = builtin.unrealized_conversion_cast %[[ARG]] : tensor<10x14xf32> to tensor<5x14xf32> + // CHECK: %[[ALL_GATHER:.*]] = mesh.all_gather %[[SOURCE_SHARD]] on @mesh_1d mesh_axes = [0] gather_axis = 0 : tensor<5x14xf32> -> tensor<10x14xf32> + %0 = mesh.shard %arg0 to <@mesh_1d, [[0]]> : tensor<10x14xf32> + %1 = mesh.shard %0 to <@mesh_1d, [[]]> annotate_for_users : tensor<10x14xf32> + // CHECK: return %[[ALL_GATHER]] : tensor<10x14xf32> + return %1 : tensor<10x14xf32> +} + +// CHECK-LABEL: func @unshard_dynamic_axis +func.func @unshard_dynamic_axis( + // 
CHECK-SAME: %[[ARG:.*]]: tensor + %arg0: tensor +) -> tensor { + // CHECK: %[[ALL_GATHER:.*]] = mesh.all_gather %[[ARG]] on @mesh_1d mesh_axes = [0] gather_axis = 0 : tensor -> tensor + %0 = mesh.shard %arg0 to <@mesh_1d, [[0]]> : tensor + %1 = mesh.shard %0 to <@mesh_1d, [[]]> annotate_for_users : tensor + // CHECK: return %[[ALL_GATHER]] : tensor + return %1 : tensor +} + +// CHECK-LABEL: func @unshard_static_axis_on_dynamic_mesh_axis +func.func @unshard_static_axis_on_dynamic_mesh_axis( +// CHECK-SAME: %[[ARG:.*]]: tensor<10x14xf32> + %arg0: tensor<10x14xf32> +) -> tensor<10x14xf32> { + // CHECK: %[[SOURCE_SHARD:.*]] = builtin.unrealized_conversion_cast %[[ARG]] : tensor<10x14xf32> to tensor + // CHECK: %[[ALL_GATHER:.*]] = mesh.all_gather %[[SOURCE_SHARD]] on @mesh_1d_dynamic mesh_axes = [0] gather_axis = 0 : tensor -> tensor + // CHECK: %[[RES:.*]] = tensor.cast %[[ALL_GATHER]] : tensor to tensor<10x14xf32> + %0 = mesh.shard %arg0 to <@mesh_1d_dynamic, [[0]]> : tensor<10x14xf32> + %1 = mesh.shard %0 to <@mesh_1d_dynamic, [[]]> annotate_for_users : tensor<10x14xf32> + // CHECK: return %[[RES]] : tensor<10x14xf32> + return %1 : tensor<10x14xf32> +} + +// CHECK-LABEL: func @partial_axis +func.func @partial_axis( +// CHECK-SAME: %[[ARG:.*]]: tensor<10x14xf32> + %arg0: tensor<10x14xf32> +) -> tensor<10x14xf32> { + // CHECK: %[[ALL_REDUCE:.*]] = mesh.all_reduce %[[ARG]] on @mesh_1d mesh_axes = [0] : tensor<10x14xf32> -> tensor<10x14xf32> + %0 = mesh.shard %arg0 to <@mesh_1d, [[]], partial = sum[0]> : tensor<10x14xf32> + %1 = mesh.shard %0 to <@mesh_1d, [[]]> annotate_for_users : tensor<10x14xf32> + // CHECK: %[[ALL_REDUCE]] : tensor<10x14xf32> + return %1 : tensor<10x14xf32> +} diff --git a/mlir/test/lib/Dialect/Mesh/CMakeLists.txt b/mlir/test/lib/Dialect/Mesh/CMakeLists.txt index 16b50bb878a07..f14d282857a1e 100644 --- a/mlir/test/lib/Dialect/Mesh/CMakeLists.txt +++ b/mlir/test/lib/Dialect/Mesh/CMakeLists.txt @@ -1,5 +1,6 @@ # Exclude tests from libMLIR.so add_mlir_library(MLIRMeshTestSimplifications + TestReshardingSpmdization.cpp TestSimplifications.cpp EXCLUDE_FROM_LIBMLIR diff --git a/mlir/test/lib/Dialect/Mesh/TestReshardingSpmdization.cpp b/mlir/test/lib/Dialect/Mesh/TestReshardingSpmdization.cpp new file mode 100644 index 0000000000000..6fecbd48f1538 --- /dev/null +++ b/mlir/test/lib/Dialect/Mesh/TestReshardingSpmdization.cpp @@ -0,0 +1,122 @@ +//===- TestSimplification.cpp - Test simplification -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/Mesh/IR/MeshOps.h" +#include "mlir/Dialect/Mesh/Transforms/Spmdization.h" +#include "mlir/Dialect/SPIRV/IR/SPIRVOps.h" +#include "mlir/IR/BuiltinDialect.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/BuiltinTypeInterfaces.h" +#include "mlir/IR/Diagnostics.h" +#include "mlir/IR/ImplicitLocOpBuilder.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/SymbolTable.h" +#include "mlir/IR/Value.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +using namespace mlir; +using namespace mlir::mesh; + +namespace { + +struct TestMeshReshardingRewritePattern : OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(ShardOp op, + PatternRewriter &rewriter) const override { + if (op.getAnnotateForUsers()) { + return failure(); + } + + SymbolTableCollection symbolTable; + mesh::ClusterOp mesh = symbolTable.lookupNearestSymbolFrom( + op, op.getShard().getCluster()); + + bool foundUser = false; + for (auto user : op->getUsers()) { + if (auto targetShardOp = llvm::dyn_cast(user)) { + if (targetShardOp.getAnnotateForUsers() && + mesh == symbolTable.lookupNearestSymbolFrom( + targetShardOp, targetShardOp.getShard().getCluster())) { + foundUser = true; + break; + } + } + } + + if (!foundUser) { + return failure(); + } + + for (auto user : op->getUsers()) { + auto targetShardOp = llvm::dyn_cast(user); + if (!targetShardOp || !targetShardOp.getAnnotateForUsers() || + symbolTable.lookupNearestSymbolFrom( + targetShardOp, targetShardOp.getShard().getCluster()) != mesh) { + continue; + } + + ImplicitLocOpBuilder builder(op->getLoc(), rewriter); + ShapedType sourceShardShape = + shardShapedType(op.getResult().getType(), mesh, op.getShard()); + TypedValue sourceShard = + builder + .create(sourceShardShape, + op.getOperand()) + ->getResult(0) + .cast>(); + TypedValue targetShard = + reshard(builder, mesh, op, targetShardOp, sourceShard); + Value newTargetUnsharded = + builder + .create( + targetShardOp.getResult().getType(), targetShard) + ->getResult(0); + rewriter.replaceAllUsesWith(targetShardOp.getResult(), + newTargetUnsharded); + } + + return success(); + } +}; + +struct TestMeshReshardingPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestMeshReshardingPass) + + void runOnOperation() override { + RewritePatternSet patterns(&getContext()); + patterns.insert(&getContext()); + if (failed(applyPatternsAndFoldGreedily(getOperation().getOperation(), + std::move(patterns)))) { + return signalPassFailure(); + } + } + void getDependentDialects(DialectRegistry ®istry) const override { + reshardingRegisterDependentDialects(registry); + registry.insert(); + } + StringRef getArgument() const final { + return "test-mesh-resharding-spmdization"; + } + StringRef getDescription() const final { + return "Test Mesh dialect resharding spmdization."; + } +}; +} // namespace + +namespace mlir { +namespace test { +void registerTestMeshReshardingSpmdizationPass() { + PassRegistration(); +} +} // namespace test +} // namespace mlir diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp index dc4121dc46bb9..f7a5b3183b50b 100644 --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -119,6 +119,7 @@ void 
registerTestMathPolynomialApproximationPass(); void registerTestMemRefDependenceCheck(); void registerTestMemRefStrideCalculation(); void registerTestMeshSimplificationsPass(); +void registerTestMeshReshardingSpmdizationPass(); void registerTestNextAccessPass(); void registerTestOneToNTypeConversionPass(); void registerTestOpaqueLoc(); @@ -237,6 +238,7 @@ void registerTestPasses() { mlir::test::registerTestMemRefDependenceCheck(); mlir::test::registerTestMemRefStrideCalculation(); mlir::test::registerTestMeshSimplificationsPass(); + mlir::test::registerTestMeshReshardingSpmdizationPass(); mlir::test::registerTestNextAccessPass(); mlir::test::registerTestOneToNTypeConversionPass(); mlir::test::registerTestOpaqueLoc(); From d933b88b71b00461815667d7cd0bb2fecd8606db Mon Sep 17 00:00:00 2001 From: Peiming Liu <36770114+PeimingLiu@users.noreply.github.com> Date: Tue, 2 Jan 2024 15:56:42 -0800 Subject: [PATCH 090/313] =?UTF-8?q?[mlir][sparse]=20use=20a=20common=20uti?= =?UTF-8?q?l=20function=20to=20query=20the=20tensor=20level=20s=E2=80=A6?= =?UTF-8?q?=20(#76764)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …et in a lattice point. --- .../Transforms/Sparsification.cpp | 180 +++++++++--------- 1 file changed, 86 insertions(+), 94 deletions(-) diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp index 934e1e559f44d..35eb4b4f6e47f 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp @@ -949,94 +949,9 @@ static void endIf(CodegenEnv &env, OpBuilder &builder, scf::IfOp ifOp, // Sparsifier synthesis methods (loop sequence). //===----------------------------------------------------------------------===// -/// Starts a loop sequence at given level. Returns true if -/// the universal loop index must be maintained at this level. -static bool startLoopSeq(CodegenEnv &env, OpBuilder &builder, ExprId exp, - LoopId curr, LatSetId lts) { - assert(!env.getLoopVar(curr)); - // Emit invariants at this loop sequence level. - genInvariants(env, builder, exp, curr, /*isStart=*/true); - // Emit access pattern expansion for sparse tensor output. - genExpand(env, builder, curr, /*isStart=*/true); - // Emit further intitialization at this loop sequence level. - const LatPointId l0 = env.set(lts)[0]; - bool needsUniv = false; - - SmallVector tidLvls; - env.merger().foreachTensorLoopId(l0, [&](TensorLoopId b, TensorId tid, - std::optional lvl, - LevelType lt, bool isIdxReduc) { - assert(env.merger().loop(b) == curr); - if (isDenseLT(lt) || isUndefLT(lt)) { - if (tid == env.merger().getSynTensorID()) { - // Needs loop emitter to set up loop bounds for synthetic tensor too if - // there is a loop condition imposed on the synthetic tensor. - tidLvls.push_back(env.makeTensorLevel(tid, env.getCurrentDepth())); - } - needsUniv = true; - } - if (isCompressedLT(lt) || isSingletonLT(lt) || isLooseCompressedLT(lt) || - is2OutOf4LT(lt) || isIdxReduc) { - // Only when this is a index reduction loop, can the lt be undefined. - assert(!isUndefLT(lt) || isIdxReduc); - // sparse/singleton levels, or a dense/sparse index reduction loop. - tidLvls.push_back(env.makeTensorLevel(tid, *lvl)); - } - }); - - env.emitter().enterNewLoopSeq(builder, env.op().getLoc(), tidLvls); - - // Maintain the universal index only if it is actually - // consumed by a subsequent lattice point. 
- if (needsUniv) { - for (const LatPointId li : env.set(lts).drop_front()) - if (!env.merger().hasAnySparse(env.lat(li).simple)) - return true; - } - return false; -} - -// Generates dense affine address for encoding. -static void genConstantDenseAddressFromLevel(CodegenEnv &env, - OpBuilder &builder, TensorId tid, - Level startLvl) { - // TODO: Handle affine expression on output tensor. - linalg::GenericOp op = env.op(); - assert(tid < op.getNumDpsInputs()); - OpOperand *input = op.getDpsInputOperands()[tid]; - const auto lvlExprs = op.getMatchingIndexingMap(input).getResults(); - const auto enc = getSparseTensorEncoding(input->get().getType()); - if (enc) { - const Location loc = op.getLoc(); - const TensorId tid = env.makeTensorId(input->getOperandNumber()); - const Level lvlRank = enc.getLvlRank(); - assert(lvlExprs.size() == static_cast(lvlRank)); - for (Level l = startLvl; l < lvlRank; l++) { - AffineExpr lvlExpr = lvlExprs[l]; - if (enc.isDenseLvl(l) && isa(lvlExpr)) - env.emitter().genDenseAffineAddress( - builder, loc, env.makeTensorLevel(tid, l), lvlExpr); - else - return; // break on first non-dense non-constant level - } - } -} - -// We can generate address for constant affine expression before any loops -// starting from the first level as they do not depend on any thing. -// E.g., [Dense, Dense, Sparse] -> (1, 2, d0), the addresses for the first two -// levels can be determined before loops. -static void genInitConstantDenseAddress(CodegenEnv &env, - RewriterBase &rewriter) { - for (TensorId tid = 0, e = env.op().getNumDpsInputs(); tid < e; tid++) - genConstantDenseAddressFromLevel(env, rewriter, tid, 0); -} - -/// Return true if the lattices bit can be iterated by a for loop. -static bool translateBitsToTidLvlPairs( +static bool getAllTidLvlsInLatPoints( CodegenEnv &env, LatPointId li, LoopId curr, - SmallVectorImpl &tidLvls, - SmallVectorImpl> &affineTidLvls) { + llvm::function_ref callback) { const BitVector &simple = env.lat(li).simple; const TensorId outTid = env.merger().getOutTensorID(); const std::optional outLvl = env.merger().getLvl(outTid, curr); @@ -1048,7 +963,7 @@ static bool translateBitsToTidLvlPairs( LevelType lt, bool isIdxReduc) { if (simple[b]) { if (isIdxReduc) { - tidLvls.push_back(env.makeTensorLevel(tid, *lvl)); + callback(env.makeTensorLevel(tid, *lvl), nullptr); numloopCond++; return; } @@ -1072,10 +987,10 @@ static bool translateBitsToTidLvlPairs( } } hasNonUnique = !isUniqueLT(lt) || hasNonUnique; - tidLvls.push_back(env.makeTensorLevel(tid, *lvl)); + callback(env.makeTensorLevel(tid, *lvl), nullptr); numloopCond++; } else if (isDenseLT(lt) || isIdxReduc) { - tidLvls.push_back(env.makeTensorLevel(tid, *lvl)); + callback(env.makeTensorLevel(tid, *lvl), nullptr); } else { assert(isUndefLT(lt)); linalg::GenericOp op = env.op(); @@ -1109,7 +1024,7 @@ static bool translateBitsToTidLvlPairs( // level. We need to generate the address according to the // affine expression. This is also the best place we can do it // to avoid putting it inside inner loops. - affineTidLvls.emplace_back(env.makeTensorLevel(tid, l), exp); + callback(env.makeTensorLevel(tid, l), exp); } } } @@ -1120,15 +1035,14 @@ static bool translateBitsToTidLvlPairs( // Note that we generate dense indices of the output tensor // unconditionally, since they may not appear in the lattice, but may be // needed for linearized env. 
- tidLvls.push_back(env.makeTensorLevel(outTid, *outLvl)); + callback(env.makeTensorLevel(outTid, *outLvl), nullptr); } if (numloopCond == 0) { // Corner cases where the loop bound is defined by a *unused* operand, in // this case, we just generate a dense "fake" loop by iterating over the // synthetic tensor. - tidLvls.push_back(env.makeTensorLevel(env.merger().getSynTensorID(), - env.getCurrentDepth())); + callback(env.makeTensorLevel(env.merger().getSynTensorID(), curr), nullptr); numloopCond++; } // If we just need to one loop conditions and the conditions is not imposed on @@ -1136,6 +1050,84 @@ static bool translateBitsToTidLvlPairs( return numloopCond == 1 && !hasNonUnique; } +/// Starts a loop sequence at given level. Returns true if +/// the universal loop index must be maintained at this level. +static bool startLoopSeq(CodegenEnv &env, OpBuilder &builder, ExprId exp, + LoopId curr, LatSetId lts) { + assert(!env.getLoopVar(curr)); + // Emit invariants at this loop sequence level. + genInvariants(env, builder, exp, curr, /*isStart=*/true); + // Emit access pattern expansion for sparse tensor output. + genExpand(env, builder, curr, /*isStart=*/true); + // Emit further initialization at this loop sequence level. + const LatPointId l0 = env.set(lts)[0]; + + SmallVector tidLvls; + getAllTidLvlsInLatPoints(env, l0, curr, [&](TensorLevel tl, AffineExpr) { + tidLvls.emplace_back(tl); + }); + + env.emitter().enterNewLoopSeq(builder, env.op().getLoc(), tidLvls); + + // Maintain the universal index only if it is actually + // consumed by a subsequent lattice point. + for (const LatPointId li : env.set(lts).drop_front()) + if (!env.merger().hasAnySparse(env.lat(li).simple)) + return true; + + return false; +} + +// Generates dense affine address for encoding. +static void genConstantDenseAddressFromLevel(CodegenEnv &env, + OpBuilder &builder, TensorId tid, + Level startLvl) { + // TODO: Handle affine expression on output tensor. + linalg::GenericOp op = env.op(); + assert(tid < op.getNumDpsInputs()); + OpOperand *input = op.getDpsInputOperands()[tid]; + const auto lvlExprs = op.getMatchingIndexingMap(input).getResults(); + const auto enc = getSparseTensorEncoding(input->get().getType()); + if (enc) { + const Location loc = op.getLoc(); + const TensorId tid = env.makeTensorId(input->getOperandNumber()); + const Level lvlRank = enc.getLvlRank(); + assert(lvlExprs.size() == static_cast(lvlRank)); + for (Level l = startLvl; l < lvlRank; l++) { + AffineExpr lvlExpr = lvlExprs[l]; + if (enc.isDenseLvl(l) && isa(lvlExpr)) + env.emitter().genDenseAffineAddress( + builder, loc, env.makeTensorLevel(tid, l), lvlExpr); + else + return; // break on first non-dense non-constant level + } + } +} + +// We can generate address for constant affine expression before any loops +// starting from the first level as they do not depend on anything. +// E.g., [Dense, Dense, Sparse] -> (1, 2, d0), the addresses for the first two +// levels can be determined before loops. +static void genInitConstantDenseAddress(CodegenEnv &env, + RewriterBase &rewriter) { + for (TensorId tid = 0, e = env.op().getNumDpsInputs(); tid < e; tid++) + genConstantDenseAddressFromLevel(env, rewriter, tid, 0); +} + +/// Returns true if the lattice bit can be iterated by a for loop. 
+static bool translateBitsToTidLvlPairs( + CodegenEnv &env, LatPointId li, LoopId curr, + SmallVectorImpl &tidLvls, + SmallVectorImpl> &affineTidLvls) { + return getAllTidLvlsInLatPoints(env, li, curr, + [&](TensorLevel tl, AffineExpr exp) { + if (exp) + affineTidLvls.emplace_back(tl, exp); + else + tidLvls.emplace_back(tl); + }); +} + /// Starts a single loop in current sequence. static std::pair startLoop(CodegenEnv &env, OpBuilder &builder, LoopId curr, From ffb1f20e0d8db5cd2c2a0fa2db9951e97f215b92 Mon Sep 17 00:00:00 2001 From: paperchalice Date: Wed, 3 Jan 2024 09:07:02 +0800 Subject: [PATCH 091/313] [CodeGen] Add flag to populate target pass names (#76328) `print-pipeline-passes` can show target pass names. --- llvm/include/llvm/Target/TargetMachine.h | 4 +++- llvm/lib/Passes/PassBuilder.cpp | 5 +++-- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 3 ++- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 3 ++- llvm/lib/Target/BPF/BPFTargetMachine.cpp | 3 ++- llvm/lib/Target/BPF/BPFTargetMachine.h | 3 ++- llvm/lib/Target/DirectX/DirectXTargetMachine.cpp | 3 ++- llvm/lib/Target/DirectX/DirectXTargetMachine.h | 3 ++- llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp | 3 ++- llvm/lib/Target/Hexagon/HexagonTargetMachine.h | 3 ++- llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 3 ++- llvm/lib/Target/NVPTX/NVPTXTargetMachine.h | 3 ++- 12 files changed, 26 insertions(+), 13 deletions(-) diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h index 4c29f25bedf41..1fe47dec70b16 100644 --- a/llvm/include/llvm/Target/TargetMachine.h +++ b/llvm/include/llvm/Target/TargetMachine.h @@ -362,7 +362,9 @@ class TargetMachine { virtual TargetTransformInfo getTargetTransformInfo(const Function &F) const; /// Allow the target to modify the pass pipeline. - virtual void registerPassBuilderCallbacks(PassBuilder &) {} + // TODO: Populate all pass names by using PassRegistry.def. + virtual void registerPassBuilderCallbacks(PassBuilder &, + bool PopulateClassToPassNames) {} /// Allow the target to register alias analyses with the AAManager for use /// with the new pass manager. Only affects the "default" AAManager. 
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index f94bd422c6b59..439f749bda8bb 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -452,9 +452,10 @@ PassBuilder::PassBuilder(TargetMachine *TM, PipelineTuningOptions PTO, std::optional PGOOpt, PassInstrumentationCallbacks *PIC) : TM(TM), PTO(PTO), PGOOpt(PGOOpt), PIC(PIC) { + bool ShouldPopulateClassToPassNames = PIC && shouldPopulateClassToPassNames(); if (TM) - TM->registerPassBuilderCallbacks(*this); - if (PIC && shouldPopulateClassToPassNames()) { + TM->registerPassBuilderCallbacks(*this, ShouldPopulateClassToPassNames); + if (ShouldPopulateClassToPassNames) { #define MODULE_PASS(NAME, CREATE_PASS) \ PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME); #define MODULE_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS) \ diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index fdc2077868cf9..0f3bb3e7b0d8d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -620,7 +620,8 @@ void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) { AAM.registerFunctionAnalysis(); } -void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { +void AMDGPUTargetMachine::registerPassBuilderCallbacks( + PassBuilder &PB, bool PopulateClassToPassNames) { PB.registerPipelineParsingCallback( [this](StringRef PassName, ModulePassManager &PM, ArrayRef) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 9051a61e65570..99c9db3e654a6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -51,7 +51,8 @@ class AMDGPUTargetMachine : public LLVMTargetMachine { return TLOF.get(); } - void registerPassBuilderCallbacks(PassBuilder &PB) override; + void registerPassBuilderCallbacks(PassBuilder &PB, + bool PopulateClassToPassNames) override; void registerDefaultAliasAnalyses(AAManager &) override; /// Get the integer value of a null pointer in the given address space. 
diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/llvm/lib/Target/BPF/BPFTargetMachine.cpp index ab0db576f7f72..897368417163d 100644 --- a/llvm/lib/Target/BPF/BPFTargetMachine.cpp +++ b/llvm/lib/Target/BPF/BPFTargetMachine.cpp @@ -108,7 +108,8 @@ TargetPassConfig *BPFTargetMachine::createPassConfig(PassManagerBase &PM) { return new BPFPassConfig(*this, PM); } -void BPFTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { +void BPFTargetMachine::registerPassBuilderCallbacks( + PassBuilder &PB, bool PopulateClassToPassNames) { PB.registerPipelineParsingCallback( [](StringRef PassName, FunctionPassManager &FPM, ArrayRef) { diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.h b/llvm/lib/Target/BPF/BPFTargetMachine.h index 4e6adc722e76a..0a28394463b26 100644 --- a/llvm/lib/Target/BPF/BPFTargetMachine.h +++ b/llvm/lib/Target/BPF/BPFTargetMachine.h @@ -42,7 +42,8 @@ class BPFTargetMachine : public LLVMTargetMachine { return TLOF.get(); } - void registerPassBuilderCallbacks(PassBuilder &PB) override; + void registerPassBuilderCallbacks(PassBuilder &PB, + bool PopulateClassToPassNames) override; }; } diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp index d5cb488f2fdef..06938f8c74f15 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp @@ -100,7 +100,8 @@ DirectXTargetMachine::DirectXTargetMachine(const Target &T, const Triple &TT, DirectXTargetMachine::~DirectXTargetMachine() {} -void DirectXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { +void DirectXTargetMachine::registerPassBuilderCallbacks( + PassBuilder &PB, bool PopulateClassToPassNames) { PB.registerPipelineParsingCallback( [](StringRef PassName, ModulePassManager &PM, ArrayRef) { diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.h b/llvm/lib/Target/DirectX/DirectXTargetMachine.h index d04c375b2736d..428beaf61cd0a 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetMachine.h +++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.h @@ -47,7 +47,8 @@ class DirectXTargetMachine : public LLVMTargetMachine { } TargetTransformInfo getTargetTransformInfo(const Function &F) const override; - void registerPassBuilderCallbacks(PassBuilder &PB) override; + void registerPassBuilderCallbacks(PassBuilder &PB, + bool PopulateClassToPassNames) override; }; } // namespace llvm diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index 590e464e1653a..e7a692d67ba01 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -274,7 +274,8 @@ HexagonTargetMachine::getSubtargetImpl(const Function &F) const { return I.get(); } -void HexagonTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { +void HexagonTargetMachine::registerPassBuilderCallbacks( + PassBuilder &PB, bool PopulateClassToPassNames) { PB.registerLateLoopOptimizationsEPCallback( [=](LoopPassManager &LPM, OptimizationLevel Level) { LPM.addPass(HexagonLoopIdiomRecognitionPass()); diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.h b/llvm/lib/Target/Hexagon/HexagonTargetMachine.h index dddd79ad1fcfc..c5fed0cd65a81 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.h @@ -34,7 +34,8 @@ class HexagonTargetMachine : public LLVMTargetMachine { ~HexagonTargetMachine() override; const HexagonSubtarget *getSubtargetImpl(const Function &F) const override; - 
void registerPassBuilderCallbacks(PassBuilder &PB) override; + void registerPassBuilderCallbacks(PassBuilder &PB, + bool PopulateClassToPassNames) override; TargetPassConfig *createPassConfig(PassManagerBase &PM) override; TargetTransformInfo getTargetTransformInfo(const Function &F) const override; diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 8d895762fbe1d..fad69f5e80a7a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -225,7 +225,8 @@ void NVPTXTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) { AAM.registerFunctionAnalysis(); } -void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { +void NVPTXTargetMachine::registerPassBuilderCallbacks( + PassBuilder &PB, bool PopulateClassToPassNames) { PB.registerPipelineParsingCallback( [](StringRef PassName, FunctionPassManager &PM, ArrayRef) { diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h index cfdd8da9b7652..9e6bf929badba 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h @@ -69,7 +69,8 @@ class NVPTXTargetMachine : public LLVMTargetMachine { void registerDefaultAliasAnalyses(AAManager &AAM) override; - void registerPassBuilderCallbacks(PassBuilder &PB) override; + void registerPassBuilderCallbacks(PassBuilder &PB, + bool PopulateClassToPassNames) override; TargetTransformInfo getTargetTransformInfo(const Function &F) const override; From 7d81e072712f4e6a150561b5538ccebda289aa13 Mon Sep 17 00:00:00 2001 From: Quentin Dian Date: Wed, 3 Jan 2024 09:22:13 +0800 Subject: [PATCH 092/313] [SimplifyCFG] When only one case value is missing, replace default with that case (#76669) When the default branch is the last case, we can transform that branch into a concrete branch with an unreachable default branch. ```llvm target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" define i64 @src(i64 %0) { %2 = urem i64 %0, 4 switch i64 %2, label %5 [ i64 1, label %3 i64 2, label %3 i64 3, label %4 ] 3: ; preds = %1, %1 br label %5 4: ; preds = %1 br label %5 5: ; preds = %1, %4, %3 %.0 = phi i64 [ 2, %4 ], [ 1, %3 ], [ 0, %1 ] ret i64 %.0 } define i64 @tgt(i64 %0) { %2 = urem i64 %0, 4 switch i64 %2, label %unreachable [ i64 0, label %5 i64 1, label %3 i64 2, label %3 i64 3, label %4 ] unreachable: ; preds = %1 unreachable 3: ; preds = %1, %1 br label %5 4: ; preds = %1 br label %5 5: ; preds = %1, %4, %3 %.0 = phi i64 [ 2, %4 ], [ 1, %3 ], [ 0, %1 ] ret i64 %.0 } ``` Alive2: https://alive2.llvm.org/ce/z/Y-PGXv After transform to a lookup table, I believe `tgt` is better code. The final instructions are as follows: ```asm src: # @src and edi, 3 lea rax, [rdi - 1] cmp rax, 2 ja .LBB0_1 mov rax, qword ptr [8*rdi + .Lswitch.table.src-8] ret .LBB0_1: xor eax, eax ret tgt: # @tgt and edi, 3 mov rax, qword ptr [8*rdi + .Lswitch.table.tgt] ret .Lswitch.table.src: .quad 1 # 0x1 .quad 1 # 0x1 .quad 2 # 0x2 .Lswitch.table.tgt: .quad 0 # 0x0 .quad 1 # 0x1 .quad 1 # 0x1 .quad 2 # 0x2 ``` Godbolt: https://llvm.godbolt.org/z/borME8znd Closes #73446. 
--- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 35 ++++++++--- .../switch-dead-default-lookup-table.ll | 61 +++++++++++++++++++ .../SimplifyCFG/switch-dead-default.ll | 56 ++++++++++++++++- 3 files changed, 142 insertions(+), 10 deletions(-) create mode 100644 llvm/test/Transforms/SimplifyCFG/switch-dead-default-lookup-table.ll diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 55e375670cc61..61d891d65346b 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -5414,11 +5414,13 @@ static bool CasesAreContiguous(SmallVectorImpl &Cases) { } static void createUnreachableSwitchDefault(SwitchInst *Switch, - DomTreeUpdater *DTU) { + DomTreeUpdater *DTU, + bool RemoveOrigDefaultBlock = true) { LLVM_DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n"); auto *BB = Switch->getParent(); auto *OrigDefaultBlock = Switch->getDefaultDest(); - OrigDefaultBlock->removePredecessor(BB); + if (RemoveOrigDefaultBlock) + OrigDefaultBlock->removePredecessor(BB); BasicBlock *NewDefaultBlock = BasicBlock::Create( BB->getContext(), BB->getName() + ".unreachabledefault", BB->getParent(), OrigDefaultBlock); @@ -5427,7 +5429,8 @@ static void createUnreachableSwitchDefault(SwitchInst *Switch, if (DTU) { SmallVector Updates; Updates.push_back({DominatorTree::Insert, BB, &*NewDefaultBlock}); - if (!is_contained(successors(BB), OrigDefaultBlock)) + if (RemoveOrigDefaultBlock && + !is_contained(successors(BB), OrigDefaultBlock)) Updates.push_back({DominatorTree::Delete, BB, &*OrigDefaultBlock}); DTU->applyUpdates(Updates); } @@ -5609,10 +5612,28 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, Known.getBitWidth() - (Known.Zero | Known.One).popcount(); assert(NumUnknownBits <= Known.getBitWidth()); if (HasDefault && DeadCases.empty() && - NumUnknownBits < 64 /* avoid overflow */ && - SI->getNumCases() == (1ULL << NumUnknownBits)) { - createUnreachableSwitchDefault(SI, DTU); - return true; + NumUnknownBits < 64 /* avoid overflow */) { + uint64_t AllNumCases = 1ULL << NumUnknownBits; + if (SI->getNumCases() == AllNumCases) { + createUnreachableSwitchDefault(SI, DTU); + return true; + } + // When only one case value is missing, replace default with that case. + // Eliminating the default branch will provide more opportunities for + // optimization, such as lookup tables. 
+ if (SI->getNumCases() == AllNumCases - 1) { + assert(NumUnknownBits > 1 && "Should be canonicalized to a branch"); + uint64_t MissingCaseVal = 0; + for (const auto &Case : SI->cases()) + MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue(); + auto *MissingCase = + cast(ConstantInt::get(Cond->getType(), MissingCaseVal)); + SwitchInstProfUpdateWrapper SIW(*SI); + SIW.addCase(MissingCase, SI->getDefaultDest(), SIW.getSuccessorWeight(0)); + createUnreachableSwitchDefault(SI, DTU, /*RemoveOrigDefaultBlock*/ false); + SIW.setSuccessorWeight(0, 0); + return true; + } } if (DeadCases.empty()) diff --git a/llvm/test/Transforms/SimplifyCFG/switch-dead-default-lookup-table.ll b/llvm/test/Transforms/SimplifyCFG/switch-dead-default-lookup-table.ll new file mode 100644 index 0000000000000..bead0dc4c567a --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/switch-dead-default-lookup-table.ll @@ -0,0 +1,61 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt %s -S -passes='simplifycfg' -simplifycfg-require-and-preserve-domtree=1 -switch-range-to-icmp | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" + +define i64 @test_1(i64 %0) { +; CHECK-LABEL: define i64 @test_1( +; CHECK-SAME: i64 [[TMP0:%.*]]) { +; CHECK-NEXT: switch.lookup: +; CHECK-NEXT: [[TMP1:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-NEXT: [[SWITCH_GEP:%.*]] = getelementptr inbounds [4 x i64], ptr @switch.table.test_1, i32 0, i64 [[TMP1]] +; CHECK-NEXT: [[SWITCH_LOAD:%.*]] = load i64, ptr [[SWITCH_GEP]], align 8 +; CHECK-NEXT: ret i64 [[SWITCH_LOAD]] +; + %2 = urem i64 %0, 4 + switch i64 %2, label %5 [ + i64 1, label %3 + i64 2, label %3 + i64 3, label %4 + ] + +3: + br label %5 + +4: + br label %5 + +5: + %.0 = phi i64 [ 2, %4 ], [ 1, %3 ], [ 0, %1 ] + ret i64 %.0 +} + + +define i64 @test_2(i64 %0) { +; CHECK-LABEL: define i64 @test_2( +; CHECK-SAME: i64 [[TMP0:%.*]]) { +; CHECK-NEXT: switch.lookup: +; CHECK-NEXT: [[TMP1:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-NEXT: ret i64 [[TMP1]] +; + %2 = urem i64 %0, 4 + switch i64 %2, label %6 [ + i64 1, label %3 + i64 2, label %4 + i64 3, label %5 + ] + +3: + br label %6 + +4: + br label %6 + +5: + br label %6 + +6: + %.0 = phi i64 [ 0, %1 ], [ 1, %3 ], [ 2, %4 ], [ 3, %5 ] + ret i64 %.0 +} + diff --git a/llvm/test/Transforms/SimplifyCFG/switch-dead-default.ll b/llvm/test/Transforms/SimplifyCFG/switch-dead-default.ll index 7c0d5e4f2b653..e30a535c52323 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch-dead-default.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch-dead-default.ll @@ -79,15 +79,15 @@ default: ret void } -; This one is a negative test - we know the value of the default, -; but that's about it +; We can replace the default branch with case 3 since it is the only case that is missing. 
define void @test3(i2 %a) { ; CHECK-LABEL: define void @test3( ; CHECK-SAME: i2 [[A:%.*]]) { -; CHECK-NEXT: switch i2 [[A]], label [[DEFAULT:%.*]] [ +; CHECK-NEXT: switch i2 [[A]], label [[DOTUNREACHABLEDEFAULT:%.*]] [ ; CHECK-NEXT: i2 0, label [[CASE0:%.*]] ; CHECK-NEXT: i2 1, label [[CASE1:%.*]] ; CHECK-NEXT: i2 -2, label [[CASE2:%.*]] +; CHECK-NEXT: i2 -1, label [[DEFAULT:%.*]] ; CHECK-NEXT: ] ; CHECK: common.ret: ; CHECK-NEXT: ret void @@ -100,6 +100,8 @@ define void @test3(i2 %a) { ; CHECK: case2: ; CHECK-NEXT: call void @foo(i32 2) ; CHECK-NEXT: br label [[COMMON_RET]] +; CHECK: .unreachabledefault: +; CHECK-NEXT: unreachable ; CHECK: default: ; CHECK-NEXT: call void @foo(i32 3) ; CHECK-NEXT: br label [[COMMON_RET]] @@ -122,6 +124,50 @@ default: ret void } +define void @test3_prof(i2 %a) { +; CHECK-LABEL: define void @test3_prof( +; CHECK-SAME: i2 [[A:%.*]]) { +; CHECK-NEXT: switch i2 [[A]], label [[DOTUNREACHABLEDEFAULT:%.*]] [ +; CHECK-NEXT: i2 0, label [[CASE0:%.*]] +; CHECK-NEXT: i2 1, label [[CASE1:%.*]] +; CHECK-NEXT: i2 -2, label [[CASE2:%.*]] +; CHECK-NEXT: i2 -1, label [[DEFAULT:%.*]] +; CHECK-NEXT: ], !prof [[PROF0:![0-9]+]] +; CHECK: common.ret: +; CHECK-NEXT: ret void +; CHECK: case0: +; CHECK-NEXT: call void @foo(i32 0) +; CHECK-NEXT: br label [[COMMON_RET:%.*]] +; CHECK: case1: +; CHECK-NEXT: call void @foo(i32 1) +; CHECK-NEXT: br label [[COMMON_RET]] +; CHECK: case2: +; CHECK-NEXT: call void @foo(i32 2) +; CHECK-NEXT: br label [[COMMON_RET]] +; CHECK: .unreachabledefault: +; CHECK-NEXT: unreachable +; CHECK: default: +; CHECK-NEXT: call void @foo(i32 3) +; CHECK-NEXT: br label [[COMMON_RET]] +; + switch i2 %a, label %default [i2 0, label %case0 + i2 1, label %case1 + i2 2, label %case2], !prof !0 + +case0: + call void @foo(i32 0) + ret void +case1: + call void @foo(i32 1) + ret void +case2: + call void @foo(i32 2) + ret void +default: + call void @foo(i32 3) + ret void +} + ; Negative test - check for possible overflow when computing ; number of possible cases. define void @test4(i128 %a) { @@ -267,3 +313,7 @@ default: declare void @llvm.assume(i1) +!0 = !{!"branch_weights", i32 8, i32 4, i32 2, i32 1} +;. +; CHECK: [[PROF0]] = !{!"branch_weights", i32 0, i32 4, i32 2, i32 1, i32 8} +;. From 0731567a31e4ade97c27801045156a88c4589704 Mon Sep 17 00:00:00 2001 From: Jinyang He Date: Wed, 3 Jan 2024 09:28:25 +0800 Subject: [PATCH 093/313] [MC][RISCV][LoongArch] Add AlignFragment size if layout is available and not need insert nops (#76552) Due to delayed decision for ADD/SUB relocations, RISCV and LoongArch may go slow fragment walk path with available layout. When RISCV (or LoongArch in the future) don't need insert nops, that means relax is disabled. With available layout and not needing insert nops, the size of AlignFragment should be a constant. So we can add it to Displacement for folding A-B. 
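As an illustration (a minimal sketch, not taken from the patch's tests): when the backend does not need to insert extra nops for an alignment directive (for RISC-V this means relaxation is disabled), the alignment fragment has a fixed size, so a label difference spanning it can now be folded to a constant at assembly time instead of being deferred to ADD/SUB relocation pairs. This mirrors the scenarios exercised by the added cfi-advance.s and fixups-expr.s checks.

```asm
.text
.L1:
  nop
  .p2align 3       # size is known once no relaxation nops are needed
  nop
.L2:

.data
.word .L2 - .L1    # now folds to a constant instead of ADD32/SUB32 reloc pairs
```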
--- llvm/lib/MC/MCExpr.cpp | 6 ++++ llvm/test/MC/LoongArch/Misc/cfi-advance.s | 27 ++++++++++++++++++ .../MC/LoongArch/Relocations/relax-addsub.s | 17 +++-------- llvm/test/MC/RISCV/cfi-advance.s | 7 +++++ llvm/test/MC/RISCV/fixups-expr.s | 28 +++++++++++++++++++ 5 files changed, 72 insertions(+), 13 deletions(-) create mode 100644 llvm/test/MC/LoongArch/Misc/cfi-advance.s diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp index a85182aa06ad5..9dae026535ccf 100644 --- a/llvm/lib/MC/MCExpr.cpp +++ b/llvm/lib/MC/MCExpr.cpp @@ -704,8 +704,14 @@ static void AttemptToFoldSymbolOffsetDifference( } int64_t Num; + unsigned Count; if (DF) { Displacement += DF->getContents().size(); + } else if (auto *AF = dyn_cast(FI); + AF && Layout && + !Asm->getBackend().shouldInsertExtraNopBytesForCodeAlign( + *AF, Count)) { + Displacement += Asm->computeFragmentSize(*Layout, *AF); } else if (auto *FF = dyn_cast(FI); FF && FF->getNumValues().evaluateAsAbsolute(Num)) { Displacement += Num * FF->getValueSize(); diff --git a/llvm/test/MC/LoongArch/Misc/cfi-advance.s b/llvm/test/MC/LoongArch/Misc/cfi-advance.s new file mode 100644 index 0000000000000..662c43e6bceaf --- /dev/null +++ b/llvm/test/MC/LoongArch/Misc/cfi-advance.s @@ -0,0 +1,27 @@ +# RUN: llvm-mc --filetype=obj --triple=loongarch64 -mattr=-relax %s -o %t.o +# RUN: llvm-readobj -r %t.o | FileCheck --check-prefix=RELOC %s +# RUN: llvm-dwarfdump --debug-frame %t.o | FileCheck --check-prefix=DWARFDUMP %s + +# RELOC: Relocations [ +# RELOC-NEXT: .rela.eh_frame { +# RELOC-NEXT: 0x1C R_LARCH_32_PCREL .text 0x0 +# RELOC-NEXT: } +# RELOC-NEXT: ] +# DWARFDUMP: DW_CFA_advance_loc: 4 +# DWARFDUMP-NEXT: DW_CFA_def_cfa_offset: +8 +# DWARFDUMP-NEXT: DW_CFA_advance_loc: 8 +# DWARFDUMP-NEXT: DW_CFA_def_cfa_offset: +8 + + .text + .globl test + .p2align 2 + .type test,@function +test: + .cfi_startproc + nop + .cfi_def_cfa_offset 8 + .p2align 3 + nop + .cfi_def_cfa_offset 8 + nop + .cfi_endproc diff --git a/llvm/test/MC/LoongArch/Relocations/relax-addsub.s b/llvm/test/MC/LoongArch/Relocations/relax-addsub.s index c4454f5bb98d1..14922657ae89e 100644 --- a/llvm/test/MC/LoongArch/Relocations/relax-addsub.s +++ b/llvm/test/MC/LoongArch/Relocations/relax-addsub.s @@ -23,14 +23,6 @@ # RELAX-NEXT: 0x14 R_LARCH_RELAX - 0x0 # RELAX-NEXT: } # RELAX-NEXT: Section ({{.*}}) .rela.data { -# RELAX-NEXT: 0xF R_LARCH_ADD8 .L3 0x0 -# RELAX-NEXT: 0xF R_LARCH_SUB8 .L2 0x0 -# RELAX-NEXT: 0x10 R_LARCH_ADD16 .L3 0x0 -# RELAX-NEXT: 0x10 R_LARCH_SUB16 .L2 0x0 -# RELAX-NEXT: 0x12 R_LARCH_ADD32 .L3 0x0 -# RELAX-NEXT: 0x12 R_LARCH_SUB32 .L2 0x0 -# RELAX-NEXT: 0x16 R_LARCH_ADD64 .L3 0x0 -# RELAX-NEXT: 0x16 R_LARCH_SUB64 .L2 0x0 # RELAX-NEXT: 0x1E R_LARCH_ADD8 .L4 0x0 # RELAX-NEXT: 0x1E R_LARCH_SUB8 .L3 0x0 # RELAX-NEXT: 0x1F R_LARCH_ADD16 .L4 0x0 @@ -43,8 +35,8 @@ # RELAX-NEXT: ] # RELAX: Hex dump of section '.data': -# RELAX-NEXT: 0x00000000 04040004 00000004 00000000 00000000 -# RELAX-NEXT: 0x00000010 00000000 00000000 00000000 00000000 +# RELAX-NEXT: 0x00000000 04040004 00000004 00000000 0000000c +# RELAX-NEXT: 0x00000010 0c000c00 00000c00 00000000 00000000 # RELAX-NEXT: 0x00000020 00000000 00000000 00000000 00 .text @@ -63,13 +55,12 @@ .short .L2 - .L1 .word .L2 - .L1 .dword .L2 - .L1 -## With relaxation, emit relocs because of the .align making the diff variable. -## TODO Handle alignment directive. Why they emit relocs now? They returns -## without folding symbols offset in AttemptToFoldSymbolOffsetDifference(). +## TODO Handle alignment directive. 
.byte .L3 - .L2 .short .L3 - .L2 .word .L3 - .L2 .dword .L3 - .L2 +## With relaxation, emit relocs because the la.pcrel makes the diff variable. .byte .L4 - .L3 .short .L4 - .L3 .word .L4 - .L3 diff --git a/llvm/test/MC/RISCV/cfi-advance.s b/llvm/test/MC/RISCV/cfi-advance.s index d9224fd2ae1c9..c4af390be757d 100644 --- a/llvm/test/MC/RISCV/cfi-advance.s +++ b/llvm/test/MC/RISCV/cfi-advance.s @@ -5,12 +5,16 @@ # CHECK: .rela.eh_frame { # CHECK-NEXT: 0x1C R_RISCV_32_PCREL 0x0 +# CHECK-NEXT: 0x35 R_RISCV_SET6 0x0 +# CHECK-NEXT: 0x35 R_RISCV_SUB6 0x0 # CHECK-NEXT: } # CHECK-DWARFDUMP: DW_CFA_advance_loc1: 104 # CHECK-DWARFDUMP-NEXT: DW_CFA_def_cfa_offset: +8 # CHECK-DWARFDUMP-NEXT: DW_CFA_advance_loc2: 259 # CHECK-DWARFDUMP-NEXT: DW_CFA_def_cfa_offset: +8 # CHECK-DWARFDUMP-NEXT: DW_CFA_advance_loc4: 65539 +# CHECK-DWARFDUMP-NEXT: DW_CFA_def_cfa_offset: +8 +# CHECK-DWARFDUMP-NEXT: DW_CFA_advance_loc: 10 # CHECK-DWARFDUMP-NEXT: DW_CFA_def_cfa_offset: +8 .text .globl test # -- Begin function test @@ -28,4 +32,7 @@ test: .zero 65535, 0x90 .cfi_def_cfa_offset 8 nop + .p2align 3 + .cfi_def_cfa_offset 8 + nop .cfi_endproc diff --git a/llvm/test/MC/RISCV/fixups-expr.s b/llvm/test/MC/RISCV/fixups-expr.s index 8a02d29de1ab5..63e7f2e9358da 100644 --- a/llvm/test/MC/RISCV/fixups-expr.s +++ b/llvm/test/MC/RISCV/fixups-expr.s @@ -16,11 +16,15 @@ .globl G1 .globl G2 +.globl G3 .L1: G1: call extern .L2: G2: + .p2align 3 +.L3: +G3: .data .dword .L2-.L1 @@ -31,6 +35,14 @@ G2: .half G2-G1 .byte .L2-.L1 .byte G2-G1 +.dword .L3-.L2 +.dword G3-G2 +.word .L3-.L2 +.word G3-G2 +.half .L3-.L2 +.half G3-G2 +.byte .L3-.L2 +.byte G3-G2 # RELAX: .rela.data { # RELAX-NEXT: 0x0 R_RISCV_ADD64 .L2 0x0 # RELAX-NEXT: 0x0 R_RISCV_SUB64 .L1 0x0 @@ -48,4 +60,20 @@ G2: # RELAX-NEXT: 0x1C R_RISCV_SUB8 .L1 0x0 # RELAX-NEXT: 0x1D R_RISCV_ADD8 G2 0x0 # RELAX-NEXT: 0x1D R_RISCV_SUB8 G1 0x0 +# RELAX-NEXT: 0x1E R_RISCV_ADD64 .L3 0x0 +# RELAX-NEXT: 0x1E R_RISCV_SUB64 .L2 0x0 +# RELAX-NEXT: 0x26 R_RISCV_ADD64 G3 0x0 +# RELAX-NEXT: 0x26 R_RISCV_SUB64 G2 0x0 +# RELAX-NEXT: 0x2E R_RISCV_ADD32 .L3 0x0 +# RELAX-NEXT: 0x2E R_RISCV_SUB32 .L2 0x0 +# RELAX-NEXT: 0x32 R_RISCV_ADD32 G3 0x0 +# RELAX-NEXT: 0x32 R_RISCV_SUB32 G2 0x0 +# RELAX-NEXT: 0x36 R_RISCV_ADD16 .L3 0x0 +# RELAX-NEXT: 0x36 R_RISCV_SUB16 .L2 0x0 +# RELAX-NEXT: 0x38 R_RISCV_ADD16 G3 0x0 +# RELAX-NEXT: 0x38 R_RISCV_SUB16 G2 0x0 +# RELAX-NEXT: 0x3A R_RISCV_ADD8 .L3 0x0 +# RELAX-NEXT: 0x3A R_RISCV_SUB8 .L2 0x0 +# RELAX-NEXT: 0x3B R_RISCV_ADD8 G3 0x0 +# RELAX-NEXT: 0x3B R_RISCV_SUB8 G2 0x0 # RELAX-NEXT: } From ab43cf26cac559187a0891c6978f07cf72ea7682 Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Wed, 3 Jan 2024 09:28:21 +0800 Subject: [PATCH 094/313] [mlir][mesh] Fix -Wunused-variable in Spmdization.cpp (NFC) llvm-project/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp:573:14: error: unused variable 'targetShardType' [-Werror,-Wunused-variable] ShapedType targetShardType = ^ 1 error generated. 
--- mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp b/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp index de8b3a98df998..8d7e89662131a 100644 --- a/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp +++ b/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp @@ -570,7 +570,7 @@ reshardOn1DMesh(ImplicitLocOpBuilder &builder, ClusterOp mesh, TypedValue sourceShard) { assert(sourceShard.getType() == shardShapedType(sourceUnshardedValue.getType(), mesh, sourceSharding)); - ShapedType targetShardType = + [[maybe_unused]] ShapedType targetShardType = shardShapedType(sourceUnshardedValue.getType(), mesh, targetSharding); assert(sourceShard.getType().getRank() == targetShardType.getRank()); assert(mesh.getRank() == 1 && "Only 1D meshes are currently supported."); From a258544754bb119f856bc1163a2af37e6c9a8446 Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht Date: Tue, 2 Jan 2024 17:57:21 -0800 Subject: [PATCH 095/313] [bazel][mesh] Add deps for 1D device mesh resharding spmdization Added in 1a8fb887197caf709710bedf88ce95ffb0605c56 --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 5 +++++ utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel | 3 +++ 2 files changed, 8 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index dabf274a6d9a0..2a56b2d6f0373 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -3197,6 +3197,7 @@ gentbl_cc_library( td_file = "include/mlir/Dialect/Mesh/IR/MeshOps.td", deps = [ ":MeshTdFiles", + ":ShapeOpsTdFiles", ], ) @@ -3275,12 +3276,16 @@ cc_library( includes = ["include"], deps = [ ":ArithDialect", + ":ControlFlowDialect", + ":DialectUtils", ":FuncDialect", ":IR", ":MeshDialect", ":MeshShardingInterface", ":MeshTransformsPassIncGen", ":Pass", + ":Support", + ":TensorDialect", ":TransformUtils", "//llvm:Support", ], diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel index 29a7ce7168fcf..656a1d66208b5 100644 --- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel @@ -789,9 +789,12 @@ cc_library( ":TestDialect", "//mlir:ArithDialect", "//mlir:FuncDialect", + "//mlir:IR", "//mlir:MeshDialect", "//mlir:MeshTransforms", "//mlir:Pass", + "//mlir:SPIRVDialect", + "//mlir:Support", "//mlir:Transforms", ], ) From 82fabd537f4b6cd24e639ca29df95643f215f0ef Mon Sep 17 00:00:00 2001 From: Shengchen Kan Date: Wed, 3 Jan 2024 10:41:15 +0800 Subject: [PATCH 096/313] [X86][NFC] Remove redundant "NAME#" in X86InstrArithmetic.td --- llvm/lib/Target/X86/X86InstrArithmetic.td | 576 +++++++++++----------- 1 file changed, 288 insertions(+), 288 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td index 6b0c1b8c28c95..08f5a8860b84a 100644 --- a/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -350,212 +350,212 @@ multiclass ArithBinOp_RF BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, let isCommutable = CommutableRR, isConvertibleToThreeAddress = ConvertibleToThreeAddressRR in { let Predicates = [NoNDD] in { - def NAME#8rr : BinOpRR_RF; - def NAME#16rr : BinOpRR_RF, OpSize16; - def NAME#32rr : BinOpRR_RF, OpSize32; - def NAME#64rr : BinOpRR_RF; + def 8rr : BinOpRR_RF; + def 16rr : BinOpRR_RF, 
OpSize16; + def 32rr : BinOpRR_RF, OpSize32; + def 64rr : BinOpRR_RF; } let Predicates = [HasNDD, In64BitMode] in { - def NAME#8rr_ND : BinOpRR_RF; - def NAME#16rr_ND : BinOpRR_RF, PD; - def NAME#32rr_ND : BinOpRR_RF; - def NAME#64rr_ND : BinOpRR_RF; - def NAME#8rr_NF_ND : BinOpRR_R, EVEX_NF; - def NAME#16rr_NF_ND : BinOpRR_R, EVEX_NF, PD; - def NAME#32rr_NF_ND : BinOpRR_R, EVEX_NF; - def NAME#64rr_NF_ND : BinOpRR_R, EVEX_NF; + def 8rr_ND : BinOpRR_RF; + def 16rr_ND : BinOpRR_RF, PD; + def 32rr_ND : BinOpRR_RF; + def 64rr_ND : BinOpRR_RF; + def 8rr_NF_ND : BinOpRR_R, EVEX_NF; + def 16rr_NF_ND : BinOpRR_R, EVEX_NF, PD; + def 32rr_NF_ND : BinOpRR_R, EVEX_NF; + def 64rr_NF_ND : BinOpRR_R, EVEX_NF; } let Predicates = [In64BitMode] in { - def NAME#8rr_NF : BinOpRR_R, NF; - def NAME#16rr_NF : BinOpRR_R, NF, PD; - def NAME#32rr_NF : BinOpRR_R, NF; - def NAME#64rr_NF : BinOpRR_R, NF; - def NAME#8rr_EVEX : BinOpRR_RF, PL; - def NAME#16rr_EVEX : BinOpRR_RF, PL, PD; - def NAME#32rr_EVEX : BinOpRR_RF, PL; - def NAME#64rr_EVEX : BinOpRR_RF, PL; + def 8rr_NF : BinOpRR_R, NF; + def 16rr_NF : BinOpRR_R, NF, PD; + def 32rr_NF : BinOpRR_R, NF; + def 64rr_NF : BinOpRR_R, NF; + def 8rr_EVEX : BinOpRR_RF, PL; + def 16rr_EVEX : BinOpRR_RF, PL, PD; + def 32rr_EVEX : BinOpRR_RF, PL; + def 64rr_EVEX : BinOpRR_RF, PL; } } - def NAME#8rr_REV : BinOpRR_RF_Rev; - def NAME#16rr_REV : BinOpRR_RF_Rev, OpSize16; - def NAME#32rr_REV : BinOpRR_RF_Rev, OpSize32; - def NAME#64rr_REV : BinOpRR_RF_Rev; + def 8rr_REV : BinOpRR_RF_Rev; + def 16rr_REV : BinOpRR_RF_Rev, OpSize16; + def 32rr_REV : BinOpRR_RF_Rev, OpSize32; + def 64rr_REV : BinOpRR_RF_Rev; let Predicates = [In64BitMode] in { - def NAME#8rr_EVEX_REV : BinOpRR_RF_Rev, PL; - def NAME#16rr_EVEX_REV : BinOpRR_RF_Rev, PL, PD; - def NAME#32rr_EVEX_REV : BinOpRR_RF_Rev, PL; - def NAME#64rr_EVEX_REV : BinOpRR_RF_Rev, PL; - def NAME#8rr_ND_REV : BinOpRR_RF_Rev; - def NAME#16rr_ND_REV : BinOpRR_RF_Rev, PD; - def NAME#32rr_ND_REV : BinOpRR_RF_Rev; - def NAME#64rr_ND_REV : BinOpRR_RF_Rev; - def NAME#8rr_NF_REV : BinOpRR_R_Rev, NF; - def NAME#16rr_NF_REV : BinOpRR_R_Rev, NF, PD; - def NAME#32rr_NF_REV : BinOpRR_R_Rev, NF; - def NAME#64rr_NF_REV : BinOpRR_R_Rev, NF; - def NAME#8rr_NF_ND_REV : BinOpRR_R_Rev, EVEX_NF; - def NAME#16rr_NF_ND_REV : BinOpRR_R_Rev, EVEX_NF, PD; - def NAME#32rr_NF_ND_REV : BinOpRR_R_Rev, EVEX_NF; - def NAME#64rr_NF_ND_REV : BinOpRR_R_Rev, EVEX_NF; + def 8rr_EVEX_REV : BinOpRR_RF_Rev, PL; + def 16rr_EVEX_REV : BinOpRR_RF_Rev, PL, PD; + def 32rr_EVEX_REV : BinOpRR_RF_Rev, PL; + def 64rr_EVEX_REV : BinOpRR_RF_Rev, PL; + def 8rr_ND_REV : BinOpRR_RF_Rev; + def 16rr_ND_REV : BinOpRR_RF_Rev, PD; + def 32rr_ND_REV : BinOpRR_RF_Rev; + def 64rr_ND_REV : BinOpRR_RF_Rev; + def 8rr_NF_REV : BinOpRR_R_Rev, NF; + def 16rr_NF_REV : BinOpRR_R_Rev, NF, PD; + def 32rr_NF_REV : BinOpRR_R_Rev, NF; + def 64rr_NF_REV : BinOpRR_R_Rev, NF; + def 8rr_NF_ND_REV : BinOpRR_R_Rev, EVEX_NF; + def 16rr_NF_ND_REV : BinOpRR_R_Rev, EVEX_NF, PD; + def 32rr_NF_ND_REV : BinOpRR_R_Rev, EVEX_NF; + def 64rr_NF_ND_REV : BinOpRR_R_Rev, EVEX_NF; } let Predicates = [NoNDD] in { - def NAME#8rm : BinOpRM_RF; - def NAME#16rm : BinOpRM_RF, OpSize16; - def NAME#32rm : BinOpRM_RF, OpSize32; - def NAME#64rm : BinOpRM_RF; + def 8rm : BinOpRM_RF; + def 16rm : BinOpRM_RF, OpSize16; + def 32rm : BinOpRM_RF, OpSize32; + def 64rm : BinOpRM_RF; } let Predicates = [HasNDD, In64BitMode] in { - def NAME#8rm_ND : BinOpRM_RF; - def NAME#16rm_ND : BinOpRM_RF, PD; - def NAME#32rm_ND : BinOpRM_RF; - def NAME#64rm_ND : 
BinOpRM_RF; - def NAME#8rm_NF_ND : BinOpRM_R, EVEX_NF; - def NAME#16rm_NF_ND : BinOpRM_R, EVEX_NF, PD; - def NAME#32rm_NF_ND : BinOpRM_R, EVEX_NF; - def NAME#64rm_NF_ND : BinOpRM_R, EVEX_NF; + def 8rm_ND : BinOpRM_RF; + def 16rm_ND : BinOpRM_RF, PD; + def 32rm_ND : BinOpRM_RF; + def 64rm_ND : BinOpRM_RF; + def 8rm_NF_ND : BinOpRM_R, EVEX_NF; + def 16rm_NF_ND : BinOpRM_R, EVEX_NF, PD; + def 32rm_NF_ND : BinOpRM_R, EVEX_NF; + def 64rm_NF_ND : BinOpRM_R, EVEX_NF; } let Predicates = [In64BitMode] in { - def NAME#8rm_NF : BinOpRM_R, NF; - def NAME#16rm_NF : BinOpRM_R, NF, PD; - def NAME#32rm_NF : BinOpRM_R, NF; - def NAME#64rm_NF : BinOpRM_R, NF; - def NAME#8rm_EVEX : BinOpRM_RF, PL; - def NAME#16rm_EVEX : BinOpRM_RF, PL, PD; - def NAME#32rm_EVEX : BinOpRM_RF, PL; - def NAME#64rm_EVEX : BinOpRM_RF, PL; + def 8rm_NF : BinOpRM_R, NF; + def 16rm_NF : BinOpRM_R, NF, PD; + def 32rm_NF : BinOpRM_R, NF; + def 64rm_NF : BinOpRM_R, NF; + def 8rm_EVEX : BinOpRM_RF, PL; + def 16rm_EVEX : BinOpRM_RF, PL, PD; + def 32rm_EVEX : BinOpRM_RF, PL; + def 64rm_EVEX : BinOpRM_RF, PL; } let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { let Predicates = [NoNDD] in { // NOTE: These are order specific, we want the ri8 forms to be listed // first so that they are slightly preferred to the ri forms. - def NAME#16ri8 : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM>, OpSize16; - def NAME#32ri8 : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM>, OpSize32; - def NAME#64ri8 : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM>; - def NAME#8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>; - def NAME#16ri : BinOpRI_RF<0x81, mnemonic, Xi16, opnodeflag, RegMRM>, OpSize16; - def NAME#32ri : BinOpRI_RF<0x81, mnemonic, Xi32, opnodeflag, RegMRM>, OpSize32; - def NAME#64ri32: BinOpRI_RF<0x81, mnemonic, Xi64, opnodeflag, RegMRM>; + def 16ri8 : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM>, OpSize16; + def 32ri8 : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM>, OpSize32; + def 64ri8 : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM>; + def 8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>; + def 16ri : BinOpRI_RF<0x81, mnemonic, Xi16, opnodeflag, RegMRM>, OpSize16; + def 32ri : BinOpRI_RF<0x81, mnemonic, Xi32, opnodeflag, RegMRM>, OpSize32; + def 64ri32: BinOpRI_RF<0x81, mnemonic, Xi64, opnodeflag, RegMRM>; } let Predicates = [HasNDD, In64BitMode] in { - def NAME#16ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM, 1>, PD; - def NAME#32ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM, 1>; - def NAME#64ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM, 1>; - def NAME#8ri_ND : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM, 1>; - def NAME#16ri_ND : BinOpRI_RF<0x81, mnemonic, Xi16, opnodeflag, RegMRM, 1>, PD; - def NAME#32ri_ND : BinOpRI_RF<0x81, mnemonic, Xi32, opnodeflag, RegMRM, 1>; - def NAME#64ri32_ND: BinOpRI_RF<0x81, mnemonic, Xi64, opnodeflag, RegMRM, 1>; - def NAME#16ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi16, RegMRM, 1>, EVEX_NF, PD; - def NAME#32ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi32, RegMRM, 1>, EVEX_NF; - def NAME#64ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi64, RegMRM, 1>, EVEX_NF; - def NAME#8ri_NF_ND : BinOpRI_R<0x80, mnemonic, Xi8, RegMRM, 1>, EVEX_NF; - def NAME#16ri_NF_ND : BinOpRI_R<0x81, mnemonic, Xi16, RegMRM, 1>, EVEX_NF, PD; - def NAME#32ri_NF_ND : BinOpRI_R<0x81, mnemonic, Xi32, RegMRM, 1>, EVEX_NF; - def NAME#64ri32_NF_ND : BinOpRI_R<0x81, mnemonic, Xi64, RegMRM, 1>, EVEX_NF; + def 16ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM, 1>, PD; + def 32ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM, 1>; + 
def 64ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM, 1>; + def 8ri_ND : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM, 1>; + def 16ri_ND : BinOpRI_RF<0x81, mnemonic, Xi16, opnodeflag, RegMRM, 1>, PD; + def 32ri_ND : BinOpRI_RF<0x81, mnemonic, Xi32, opnodeflag, RegMRM, 1>; + def 64ri32_ND: BinOpRI_RF<0x81, mnemonic, Xi64, opnodeflag, RegMRM, 1>; + def 16ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi16, RegMRM, 1>, EVEX_NF, PD; + def 32ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi32, RegMRM, 1>, EVEX_NF; + def 64ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi64, RegMRM, 1>, EVEX_NF; + def 8ri_NF_ND : BinOpRI_R<0x80, mnemonic, Xi8, RegMRM, 1>, EVEX_NF; + def 16ri_NF_ND : BinOpRI_R<0x81, mnemonic, Xi16, RegMRM, 1>, EVEX_NF, PD; + def 32ri_NF_ND : BinOpRI_R<0x81, mnemonic, Xi32, RegMRM, 1>, EVEX_NF; + def 64ri32_NF_ND : BinOpRI_R<0x81, mnemonic, Xi64, RegMRM, 1>, EVEX_NF; } let Predicates = [In64BitMode] in { - def NAME#16ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi16, RegMRM>, NF, PD; - def NAME#32ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi32, RegMRM>, NF; - def NAME#64ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi64, RegMRM>, NF; - def NAME#8ri_NF : BinOpRI_R<0x80, mnemonic, Xi8, RegMRM>, NF; - def NAME#16ri_NF : BinOpRI_R<0x81, mnemonic, Xi16, RegMRM>, NF, PD; - def NAME#32ri_NF : BinOpRI_R<0x81, mnemonic, Xi32, RegMRM>, NF; - def NAME#64ri32_NF : BinOpRI_R<0x81, mnemonic, Xi64, RegMRM>, NF; - def NAME#16ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM>, PL, PD; - def NAME#32ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM>, PL; - def NAME#64ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM>, PL; - def NAME#8ri_EVEX : BinOpRI_RF<0x80, mnemonic, Xi8 , null_frag, RegMRM>, PL; - def NAME#16ri_EVEX : BinOpRI_RF<0x81, mnemonic, Xi16, null_frag, RegMRM>, PL, PD; - def NAME#32ri_EVEX : BinOpRI_RF<0x81, mnemonic, Xi32, null_frag, RegMRM>, PL; - def NAME#64ri32_EVEX: BinOpRI_RF<0x81, mnemonic, Xi64, null_frag, RegMRM>, PL; + def 16ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi16, RegMRM>, NF, PD; + def 32ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi32, RegMRM>, NF; + def 64ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi64, RegMRM>, NF; + def 8ri_NF : BinOpRI_R<0x80, mnemonic, Xi8, RegMRM>, NF; + def 16ri_NF : BinOpRI_R<0x81, mnemonic, Xi16, RegMRM>, NF, PD; + def 32ri_NF : BinOpRI_R<0x81, mnemonic, Xi32, RegMRM>, NF; + def 64ri32_NF : BinOpRI_R<0x81, mnemonic, Xi64, RegMRM>, NF; + def 16ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM>, PL, PD; + def 32ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM>, PL; + def 64ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM>, PL; + def 8ri_EVEX : BinOpRI_RF<0x80, mnemonic, Xi8 , null_frag, RegMRM>, PL; + def 16ri_EVEX : BinOpRI_RF<0x81, mnemonic, Xi16, null_frag, RegMRM>, PL, PD; + def 32ri_EVEX : BinOpRI_RF<0x81, mnemonic, Xi32, null_frag, RegMRM>, PL; + def 64ri32_EVEX: BinOpRI_RF<0x81, mnemonic, Xi64, null_frag, RegMRM>, PL; } } - def NAME#8mr : BinOpMR_MF; - def NAME#16mr : BinOpMR_MF, OpSize16; - def NAME#32mr : BinOpMR_MF, OpSize32; - def NAME#64mr : BinOpMR_MF; + def 8mr : BinOpMR_MF; + def 16mr : BinOpMR_MF, OpSize16; + def 32mr : BinOpMR_MF, OpSize32; + def 64mr : BinOpMR_MF; let Predicates = [HasNDD, In64BitMode] in { - def NAME#8mr_ND : BinOpMR_RF; - def NAME#16mr_ND : BinOpMR_RF, PD; - def NAME#32mr_ND : BinOpMR_RF; - def NAME#64mr_ND : BinOpMR_RF; - def NAME#8mr_NF_ND : BinOpMR_R, EVEX_NF; - def NAME#16mr_NF_ND : BinOpMR_R, EVEX_NF, PD; - def NAME#32mr_NF_ND : BinOpMR_R, EVEX_NF; - def NAME#64mr_NF_ND : BinOpMR_R, EVEX_NF; + def 8mr_ND : BinOpMR_RF; + def 16mr_ND : BinOpMR_RF, 
PD; + def 32mr_ND : BinOpMR_RF; + def 64mr_ND : BinOpMR_RF; + def 8mr_NF_ND : BinOpMR_R, EVEX_NF; + def 16mr_NF_ND : BinOpMR_R, EVEX_NF, PD; + def 32mr_NF_ND : BinOpMR_R, EVEX_NF; + def 64mr_NF_ND : BinOpMR_R, EVEX_NF; } let Predicates = [In64BitMode] in { - def NAME#8mr_NF : BinOpMR_M, NF; - def NAME#16mr_NF : BinOpMR_M, NF, PD; - def NAME#32mr_NF : BinOpMR_M, NF; - def NAME#64mr_NF : BinOpMR_M, NF; - def NAME#8mr_EVEX : BinOpMR_MF, PL; - def NAME#16mr_EVEX : BinOpMR_MF, PL, PD; - def NAME#32mr_EVEX : BinOpMR_MF, PL; - def NAME#64mr_EVEX : BinOpMR_MF, PL; + def 8mr_NF : BinOpMR_M, NF; + def 16mr_NF : BinOpMR_M, NF, PD; + def 32mr_NF : BinOpMR_M, NF; + def 64mr_NF : BinOpMR_M, NF; + def 8mr_EVEX : BinOpMR_MF, PL; + def 16mr_EVEX : BinOpMR_MF, PL, PD; + def 32mr_EVEX : BinOpMR_MF, PL; + def 64mr_EVEX : BinOpMR_MF, PL; } // NOTE: These are order specific, we want the mi8 forms to be listed // first so that they are slightly preferred to the mi forms. - def NAME#16mi8 : BinOpMI8_MF, OpSize16; - def NAME#32mi8 : BinOpMI8_MF, OpSize32; + def 16mi8 : BinOpMI8_MF, OpSize16; + def 32mi8 : BinOpMI8_MF, OpSize32; let Predicates = [In64BitMode] in - def NAME#64mi8 : BinOpMI8_MF; - def NAME#8mi : BinOpMI_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>; - def NAME#16mi : BinOpMI_MF<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16; - def NAME#32mi : BinOpMI_MF<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32; + def 64mi8 : BinOpMI8_MF; + def 8mi : BinOpMI_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>; + def 16mi : BinOpMI_MF<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16; + def 32mi : BinOpMI_MF<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32; let Predicates = [In64BitMode] in - def NAME#64mi32 : BinOpMI_MF<0x81, mnemonic, Xi64, opnode, MemMRM>; + def 64mi32 : BinOpMI_MF<0x81, mnemonic, Xi64, opnode, MemMRM>; let Predicates = [HasNDD, In64BitMode] in { - def NAME#16mi8_ND : BinOpMI8_RF, PD; - def NAME#32mi8_ND : BinOpMI8_RF; - def NAME#64mi8_ND : BinOpMI8_RF; - def NAME#8mi_ND : BinOpMI_RF<0x80, mnemonic, Xi8 , opnode, MemMRM>; - def NAME#16mi_ND : BinOpMI_RF<0x81, mnemonic, Xi16, opnode, MemMRM>, PD; - def NAME#32mi_ND : BinOpMI_RF<0x81, mnemonic, Xi32, opnode, MemMRM>; - def NAME#64mi32_ND : BinOpMI_RF<0x81, mnemonic, Xi64, opnode, MemMRM>; - def NAME#16mi8_NF_ND : BinOpMI8_R, NF, PD; - def NAME#32mi8_NF_ND : BinOpMI8_R, NF; - def NAME#64mi8_NF_ND : BinOpMI8_R, NF; - def NAME#8mi_NF_ND : BinOpMI_R<0x80, mnemonic, Xi8, MemMRM>, NF; - def NAME#16mi_NF_ND : BinOpMI_R<0x81, mnemonic, Xi16, MemMRM>, NF, PD; - def NAME#32mi_NF_ND : BinOpMI_R<0x81, mnemonic, Xi32, MemMRM>, NF; - def NAME#64mi32_NF_ND : BinOpMI_R<0x81, mnemonic, Xi64, MemMRM>, NF; + def 16mi8_ND : BinOpMI8_RF, PD; + def 32mi8_ND : BinOpMI8_RF; + def 64mi8_ND : BinOpMI8_RF; + def 8mi_ND : BinOpMI_RF<0x80, mnemonic, Xi8 , opnode, MemMRM>; + def 16mi_ND : BinOpMI_RF<0x81, mnemonic, Xi16, opnode, MemMRM>, PD; + def 32mi_ND : BinOpMI_RF<0x81, mnemonic, Xi32, opnode, MemMRM>; + def 64mi32_ND : BinOpMI_RF<0x81, mnemonic, Xi64, opnode, MemMRM>; + def 16mi8_NF_ND : BinOpMI8_R, NF, PD; + def 32mi8_NF_ND : BinOpMI8_R, NF; + def 64mi8_NF_ND : BinOpMI8_R, NF; + def 8mi_NF_ND : BinOpMI_R<0x80, mnemonic, Xi8, MemMRM>, NF; + def 16mi_NF_ND : BinOpMI_R<0x81, mnemonic, Xi16, MemMRM>, NF, PD; + def 32mi_NF_ND : BinOpMI_R<0x81, mnemonic, Xi32, MemMRM>, NF; + def 64mi32_NF_ND : BinOpMI_R<0x81, mnemonic, Xi64, MemMRM>, NF; } let Predicates = [In64BitMode] in { - def NAME#16mi8_NF : BinOpMI8_M, NF, PD; - def NAME#32mi8_NF : BinOpMI8_M, NF; - def NAME#64mi8_NF : BinOpMI8_M, NF; 
- def NAME#8mi_NF : BinOpMI_M<0x80, mnemonic, Xi8, MemMRM>, NF; - def NAME#16mi_NF : BinOpMI_M<0x81, mnemonic, Xi16, MemMRM>, NF, PD; - def NAME#32mi_NF : BinOpMI_M<0x81, mnemonic, Xi32, MemMRM>, NF; - def NAME#64mi32_NF : BinOpMI_M<0x81, mnemonic, Xi64, MemMRM>, NF; - def NAME#16mi8_EVEX : BinOpMI8_MF, PL, PD; - def NAME#32mi8_EVEX : BinOpMI8_MF, PL; - def NAME#64mi8_EVEX : BinOpMI8_MF, PL; - def NAME#8mi_EVEX : BinOpMI_MF<0x80, mnemonic, Xi8 , null_frag, MemMRM>, PL; - def NAME#16mi_EVEX : BinOpMI_MF<0x81, mnemonic, Xi16, null_frag, MemMRM>, PL, PD; - def NAME#32mi_EVEX : BinOpMI_MF<0x81, mnemonic, Xi32, null_frag, MemMRM>, PL; - def NAME#64mi32_EVEX : BinOpMI_MF<0x81, mnemonic, Xi64, null_frag, MemMRM>, PL; + def 16mi8_NF : BinOpMI8_M, NF, PD; + def 32mi8_NF : BinOpMI8_M, NF; + def 64mi8_NF : BinOpMI8_M, NF; + def 8mi_NF : BinOpMI_M<0x80, mnemonic, Xi8, MemMRM>, NF; + def 16mi_NF : BinOpMI_M<0x81, mnemonic, Xi16, MemMRM>, NF, PD; + def 32mi_NF : BinOpMI_M<0x81, mnemonic, Xi32, MemMRM>, NF; + def 64mi32_NF : BinOpMI_M<0x81, mnemonic, Xi64, MemMRM>, NF; + def 16mi8_EVEX : BinOpMI8_MF, PL, PD; + def 32mi8_EVEX : BinOpMI8_MF, PL; + def 64mi8_EVEX : BinOpMI8_MF, PL; + def 8mi_EVEX : BinOpMI_MF<0x80, mnemonic, Xi8 , null_frag, MemMRM>, PL; + def 16mi_EVEX : BinOpMI_MF<0x81, mnemonic, Xi16, null_frag, MemMRM>, PL, PD; + def 32mi_EVEX : BinOpMI_MF<0x81, mnemonic, Xi32, null_frag, MemMRM>, PL; + def 64mi32_EVEX : BinOpMI_MF<0x81, mnemonic, Xi64, null_frag, MemMRM>, PL; } // These are for the disassembler since 0x82 opcode behaves like 0x80, but // not in 64-bit mode. let Predicates = [Not64BitMode] in { - def NAME#8ri8 : BinOpRI8_RF<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly; - def NAME#8mi8 : BinOpMI8_MF, DisassembleOnly; + def 8ri8 : BinOpRI8_RF<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly; + def 8mi8 : BinOpMI8_MF, DisassembleOnly; } - def NAME#8i8 : BinOpAI_AF; - def NAME#16i16 : BinOpAI_AF, OpSize16; - def NAME#32i32 : BinOpAI_AF, OpSize32; - def NAME#64i32 : BinOpAI_AF; } @@ -571,162 +571,162 @@ multiclass ArithBinOp_RFF BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, bit ConvertibleToThreeAddress> { let isCommutable = CommutableRR in { let Predicates = [NoNDD] in { - def NAME#8rr : BinOpRRF_RF; + def 8rr : BinOpRRF_RF; let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { - def NAME#16rr : BinOpRRF_RF, OpSize16; - def NAME#32rr : BinOpRRF_RF, OpSize32; - def NAME#64rr : BinOpRRF_RF; + def 16rr : BinOpRRF_RF, OpSize16; + def 32rr : BinOpRRF_RF, OpSize32; + def 64rr : BinOpRRF_RF; } } let Predicates = [HasNDD, In64BitMode] in { - def NAME#8rr_ND : BinOpRRF_RF; + def 8rr_ND : BinOpRRF_RF; let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { - def NAME#16rr_ND : BinOpRRF_RF, PD; - def NAME#32rr_ND : BinOpRRF_RF; - def NAME#64rr_ND : BinOpRRF_RF; + def 16rr_ND : BinOpRRF_RF, PD; + def 32rr_ND : BinOpRRF_RF; + def 64rr_ND : BinOpRRF_RF; } } } // isCommutable let Predicates = [In64BitMode] in { - def NAME#8rr_EVEX : BinOpRRF_RF, PL; - def NAME#16rr_EVEX : BinOpRRF_RF, PL, PD; - def NAME#32rr_EVEX : BinOpRRF_RF, PL; - def NAME#64rr_EVEX : BinOpRRF_RF, PL; + def 8rr_EVEX : BinOpRRF_RF, PL; + def 16rr_EVEX : BinOpRRF_RF, PL, PD; + def 32rr_EVEX : BinOpRRF_RF, PL; + def 64rr_EVEX : BinOpRRF_RF, PL; } - def NAME#8rr_REV : BinOpRRF_RF_Rev; - def NAME#16rr_REV : BinOpRRF_RF_Rev, OpSize16; - def NAME#32rr_REV : BinOpRRF_RF_Rev, OpSize32; - def NAME#64rr_REV : BinOpRRF_RF_Rev; + def 8rr_REV : BinOpRRF_RF_Rev; + def 16rr_REV : BinOpRRF_RF_Rev, OpSize16; + def 32rr_REV : 
BinOpRRF_RF_Rev, OpSize32; + def 64rr_REV : BinOpRRF_RF_Rev; let Predicates = [In64BitMode] in { - def NAME#8rr_ND_REV : BinOpRRF_RF_Rev; - def NAME#16rr_ND_REV : BinOpRRF_RF_Rev, PD; - def NAME#32rr_ND_REV : BinOpRRF_RF_Rev; - def NAME#64rr_ND_REV : BinOpRRF_RF_Rev; - def NAME#8rr_EVEX_REV : BinOpRRF_RF_Rev, PL; - def NAME#16rr_EVEX_REV : BinOpRRF_RF_Rev, PL, PD; - def NAME#32rr_EVEX_REV : BinOpRRF_RF_Rev, PL; - def NAME#64rr_EVEX_REV : BinOpRRF_RF_Rev, PL; + def 8rr_ND_REV : BinOpRRF_RF_Rev; + def 16rr_ND_REV : BinOpRRF_RF_Rev, PD; + def 32rr_ND_REV : BinOpRRF_RF_Rev; + def 64rr_ND_REV : BinOpRRF_RF_Rev; + def 8rr_EVEX_REV : BinOpRRF_RF_Rev, PL; + def 16rr_EVEX_REV : BinOpRRF_RF_Rev, PL, PD; + def 32rr_EVEX_REV : BinOpRRF_RF_Rev, PL; + def 64rr_EVEX_REV : BinOpRRF_RF_Rev, PL; } let Predicates = [NoNDD] in { - def NAME#8rm : BinOpRMF_RF; - def NAME#16rm : BinOpRMF_RF, OpSize16; - def NAME#32rm : BinOpRMF_RF, OpSize32; - def NAME#64rm : BinOpRMF_RF; + def 8rm : BinOpRMF_RF; + def 16rm : BinOpRMF_RF, OpSize16; + def 32rm : BinOpRMF_RF, OpSize32; + def 64rm : BinOpRMF_RF; } let Predicates = [HasNDD, In64BitMode] in { - def NAME#8rm_ND : BinOpRMF_RF; - def NAME#16rm_ND : BinOpRMF_RF, PD; - def NAME#32rm_ND : BinOpRMF_RF; - def NAME#64rm_ND : BinOpRMF_RF; + def 8rm_ND : BinOpRMF_RF; + def 16rm_ND : BinOpRMF_RF, PD; + def 32rm_ND : BinOpRMF_RF; + def 64rm_ND : BinOpRMF_RF; } let Predicates = [In64BitMode] in { - def NAME#8rm_EVEX : BinOpRMF_RF, PL; - def NAME#16rm_EVEX : BinOpRMF_RF, PL, PD; - def NAME#32rm_EVEX : BinOpRMF_RF, PL; - def NAME#64rm_EVEX : BinOpRMF_RF, PL; + def 8rm_EVEX : BinOpRMF_RF, PL; + def 16rm_EVEX : BinOpRMF_RF, PL, PD; + def 32rm_EVEX : BinOpRMF_RF, PL; + def 64rm_EVEX : BinOpRMF_RF, PL; } let Predicates = [NoNDD] in { - def NAME#8ri : BinOpRIF_RF<0x80, mnemonic, Xi8 , opnode, RegMRM>; + def 8ri : BinOpRIF_RF<0x80, mnemonic, Xi8 , opnode, RegMRM>; let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { // NOTE: These are order specific, we want the ri8 forms to be listed // first so that they are slightly preferred to the ri forms. 
- def NAME#16ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi16, RegMRM>, OpSize16; - def NAME#32ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi32, RegMRM>, OpSize32; - def NAME#64ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi64, RegMRM>; + def 16ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi16, RegMRM>, OpSize16; + def 32ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi32, RegMRM>, OpSize32; + def 64ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi64, RegMRM>; - def NAME#16ri : BinOpRIF_RF<0x81, mnemonic, Xi16, opnode, RegMRM>, OpSize16; - def NAME#32ri : BinOpRIF_RF<0x81, mnemonic, Xi32, opnode, RegMRM>, OpSize32; - def NAME#64ri32: BinOpRIF_RF<0x81, mnemonic, Xi64, opnode, RegMRM>; + def 16ri : BinOpRIF_RF<0x81, mnemonic, Xi16, opnode, RegMRM>, OpSize16; + def 32ri : BinOpRIF_RF<0x81, mnemonic, Xi32, opnode, RegMRM>, OpSize32; + def 64ri32: BinOpRIF_RF<0x81, mnemonic, Xi64, opnode, RegMRM>; } } let Predicates = [HasNDD, In64BitMode] in { - def NAME#8ri_ND : BinOpRIF_RF<0x80, mnemonic, Xi8 , opnode, RegMRM, 1>; + def 8ri_ND : BinOpRIF_RF<0x80, mnemonic, Xi8 , opnode, RegMRM, 1>; let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { - def NAME#16ri8_ND : BinOpRI8F_RF<0x83, mnemonic, Xi16, RegMRM, 1>, PD; - def NAME#32ri8_ND : BinOpRI8F_RF<0x83, mnemonic, Xi32, RegMRM, 1>; - def NAME#64ri8_ND : BinOpRI8F_RF<0x83, mnemonic, Xi64, RegMRM, 1>; - def NAME#16ri_ND : BinOpRIF_RF<0x81, mnemonic, Xi16, opnode, RegMRM, 1>, PD; - def NAME#32ri_ND : BinOpRIF_RF<0x81, mnemonic, Xi32, opnode, RegMRM, 1>; - def NAME#64ri32_ND: BinOpRIF_RF<0x81, mnemonic, Xi64, opnode, RegMRM, 1>; + def 16ri8_ND : BinOpRI8F_RF<0x83, mnemonic, Xi16, RegMRM, 1>, PD; + def 32ri8_ND : BinOpRI8F_RF<0x83, mnemonic, Xi32, RegMRM, 1>; + def 64ri8_ND : BinOpRI8F_RF<0x83, mnemonic, Xi64, RegMRM, 1>; + def 16ri_ND : BinOpRIF_RF<0x81, mnemonic, Xi16, opnode, RegMRM, 1>, PD; + def 32ri_ND : BinOpRIF_RF<0x81, mnemonic, Xi32, opnode, RegMRM, 1>; + def 64ri32_ND: BinOpRIF_RF<0x81, mnemonic, Xi64, opnode, RegMRM, 1>; } } let Predicates = [In64BitMode] in { - def NAME#8ri_EVEX : BinOpRIF_RF<0x80, mnemonic, Xi8 , opnode, RegMRM>, PL; - def NAME#16ri8_EVEX : BinOpRI8F_RF<0x83, mnemonic, Xi16, RegMRM>, PL, PD; - def NAME#32ri8_EVEX : BinOpRI8F_RF<0x83, mnemonic, Xi32, RegMRM>, PL; - def NAME#64ri8_EVEX : BinOpRI8F_RF<0x83, mnemonic, Xi64, RegMRM>, PL; - def NAME#16ri_EVEX : BinOpRIF_RF<0x81, mnemonic, Xi16, opnode, RegMRM>, PL, PD; - def NAME#32ri_EVEX : BinOpRIF_RF<0x81, mnemonic, Xi32, opnode, RegMRM>, PL; - def NAME#64ri32_EVEX: BinOpRIF_RF<0x81, mnemonic, Xi64, opnode, RegMRM>, PL; + def 8ri_EVEX : BinOpRIF_RF<0x80, mnemonic, Xi8 , opnode, RegMRM>, PL; + def 16ri8_EVEX : BinOpRI8F_RF<0x83, mnemonic, Xi16, RegMRM>, PL, PD; + def 32ri8_EVEX : BinOpRI8F_RF<0x83, mnemonic, Xi32, RegMRM>, PL; + def 64ri8_EVEX : BinOpRI8F_RF<0x83, mnemonic, Xi64, RegMRM>, PL; + def 16ri_EVEX : BinOpRIF_RF<0x81, mnemonic, Xi16, opnode, RegMRM>, PL, PD; + def 32ri_EVEX : BinOpRIF_RF<0x81, mnemonic, Xi32, opnode, RegMRM>, PL; + def 64ri32_EVEX: BinOpRIF_RF<0x81, mnemonic, Xi64, opnode, RegMRM>, PL; } - def NAME#8mr : BinOpMRF_MF; - def NAME#16mr : BinOpMRF_MF, OpSize16; - def NAME#32mr : BinOpMRF_MF, OpSize32; - def NAME#64mr : BinOpMRF_MF; + def 8mr : BinOpMRF_MF; + def 16mr : BinOpMRF_MF, OpSize16; + def 32mr : BinOpMRF_MF, OpSize32; + def 64mr : BinOpMRF_MF; let Predicates = [HasNDD, In64BitMode] in { - def NAME#8mr_ND : BinOpMRF_RF; - def NAME#16mr_ND : BinOpMRF_RF, PD; - def NAME#32mr_ND : BinOpMRF_RF; - def NAME#64mr_ND : BinOpMRF_RF; + def 8mr_ND : BinOpMRF_RF; + def 16mr_ND : BinOpMRF_RF, PD; + def 32mr_ND 
: BinOpMRF_RF; + def 64mr_ND : BinOpMRF_RF; } let Predicates = [In64BitMode] in { - def NAME#8mr_EVEX : BinOpMRF_MF, PL; - def NAME#16mr_EVEX : BinOpMRF_MF, PL, PD; - def NAME#32mr_EVEX : BinOpMRF_MF, PL; - def NAME#64mr_EVEX : BinOpMRF_MF, PL; + def 8mr_EVEX : BinOpMRF_MF, PL; + def 16mr_EVEX : BinOpMRF_MF, PL, PD; + def 32mr_EVEX : BinOpMRF_MF, PL; + def 64mr_EVEX : BinOpMRF_MF, PL; } // NOTE: These are order specific, we want the mi8 forms to be listed // first so that they are slightly preferred to the mi forms. - def NAME#8mi : BinOpMIF_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>; - def NAME#16mi8 : BinOpMI8F_MF, OpSize16; - def NAME#32mi8 : BinOpMI8F_MF, OpSize32; + def 8mi : BinOpMIF_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>; + def 16mi8 : BinOpMI8F_MF, OpSize16; + def 32mi8 : BinOpMI8F_MF, OpSize32; let Predicates = [In64BitMode] in - def NAME#64mi8 : BinOpMI8F_MF; - def NAME#16mi : BinOpMIF_MF<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16; - def NAME#32mi : BinOpMIF_MF<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32; + def 64mi8 : BinOpMI8F_MF; + def 16mi : BinOpMIF_MF<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16; + def 32mi : BinOpMIF_MF<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32; let Predicates = [In64BitMode] in - def NAME#64mi32 : BinOpMIF_MF<0x81, mnemonic, Xi64, opnode, MemMRM>; + def 64mi32 : BinOpMIF_MF<0x81, mnemonic, Xi64, opnode, MemMRM>; let Predicates = [HasNDD, In64BitMode] in { - def NAME#8mi_ND : BinOpMIF_RF<0x80, mnemonic, Xi8 , opnode, MemMRM>; - def NAME#16mi8_ND : BinOpMI8F_RF, PD; - def NAME#32mi8_ND : BinOpMI8F_RF; - def NAME#64mi8_ND : BinOpMI8F_RF; - def NAME#16mi_ND : BinOpMIF_RF<0x81, mnemonic, Xi16, opnode, MemMRM>, PD; - def NAME#32mi_ND : BinOpMIF_RF<0x81, mnemonic, Xi32, opnode, MemMRM>; - def NAME#64mi32_ND : BinOpMIF_RF<0x81, mnemonic, Xi64, opnode, MemMRM>; + def 8mi_ND : BinOpMIF_RF<0x80, mnemonic, Xi8 , opnode, MemMRM>; + def 16mi8_ND : BinOpMI8F_RF, PD; + def 32mi8_ND : BinOpMI8F_RF; + def 64mi8_ND : BinOpMI8F_RF; + def 16mi_ND : BinOpMIF_RF<0x81, mnemonic, Xi16, opnode, MemMRM>, PD; + def 32mi_ND : BinOpMIF_RF<0x81, mnemonic, Xi32, opnode, MemMRM>; + def 64mi32_ND : BinOpMIF_RF<0x81, mnemonic, Xi64, opnode, MemMRM>; } let Predicates = [In64BitMode] in { - def NAME#8mi_EVEX : BinOpMIF_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>, PL; - def NAME#16mi8_EVEX : BinOpMI8F_MF, PL, PD; - def NAME#32mi8_EVEX : BinOpMI8F_MF, PL; - def NAME#64mi8_EVEX : BinOpMI8F_MF, PL; - def NAME#16mi_EVEX : BinOpMIF_MF<0x81, mnemonic, Xi16, opnode, MemMRM>, PL, PD; - def NAME#32mi_EVEX : BinOpMIF_MF<0x81, mnemonic, Xi32, opnode, MemMRM>, PL; - def NAME#64mi32_EVEX : BinOpMIF_MF<0x81, mnemonic, Xi64, opnode, MemMRM>, PL; + def 8mi_EVEX : BinOpMIF_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>, PL; + def 16mi8_EVEX : BinOpMI8F_MF, PL, PD; + def 32mi8_EVEX : BinOpMI8F_MF, PL; + def 64mi8_EVEX : BinOpMI8F_MF, PL; + def 16mi_EVEX : BinOpMIF_MF<0x81, mnemonic, Xi16, opnode, MemMRM>, PL, PD; + def 32mi_EVEX : BinOpMIF_MF<0x81, mnemonic, Xi32, opnode, MemMRM>, PL; + def 64mi32_EVEX : BinOpMIF_MF<0x81, mnemonic, Xi64, opnode, MemMRM>, PL; } // These are for the disassembler since 0x82 opcode behaves like 0x80, but // not in 64-bit mode. 
let Predicates = [Not64BitMode] in { - def NAME#8ri8 : BinOpRI8F_RF<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly; - def NAME#8mi8 : BinOpMI8F_MF, DisassembleOnly; + def 8ri8 : BinOpRI8F_RF<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly; + def 8mi8 : BinOpMI8F_MF, DisassembleOnly; } - def NAME#8i8 : BinOpAIF_AF; - def NAME#16i16 : BinOpAIF_AF, OpSize16; - def NAME#32i32 : BinOpAIF_AF, OpSize32; - def NAME#64i32 : BinOpAIF_AF; } @@ -739,71 +739,71 @@ multiclass ArithBinOp_F BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, SDNode opnode, bit CommutableRR, bit ConvertibleToThreeAddress> { let isCommutable = CommutableRR in { - def NAME#8rr : BinOpRR_F; + def 8rr : BinOpRR_F; let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { - def NAME#16rr : BinOpRR_F, OpSize16; - def NAME#32rr : BinOpRR_F, OpSize32; - def NAME#64rr : BinOpRR_F; + def 16rr : BinOpRR_F, OpSize16; + def 32rr : BinOpRR_F, OpSize32; + def 64rr : BinOpRR_F; } // isConvertibleToThreeAddress } // isCommutable - def NAME#8rr_REV : BinOpRR_F_Rev; - def NAME#16rr_REV : BinOpRR_F_Rev, OpSize16; - def NAME#32rr_REV : BinOpRR_F_Rev, OpSize32; - def NAME#64rr_REV : BinOpRR_F_Rev; + def 8rr_REV : BinOpRR_F_Rev; + def 16rr_REV : BinOpRR_F_Rev, OpSize16; + def 32rr_REV : BinOpRR_F_Rev, OpSize32; + def 64rr_REV : BinOpRR_F_Rev; - def NAME#8rm : BinOpRM_F; - def NAME#16rm : BinOpRM_F, OpSize16; - def NAME#32rm : BinOpRM_F, OpSize32; - def NAME#64rm : BinOpRM_F; + def 8rm : BinOpRM_F; + def 16rm : BinOpRM_F, OpSize16; + def 32rm : BinOpRM_F, OpSize32; + def 64rm : BinOpRM_F; - def NAME#8ri : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>; + def 8ri : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>; let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { // NOTE: These are order specific, we want the ri8 forms to be listed // first so that they are slightly preferred to the ri forms. - def NAME#16ri8 : BinOpRI8_F<0x83, mnemonic, Xi16, RegMRM>, OpSize16; - def NAME#32ri8 : BinOpRI8_F<0x83, mnemonic, Xi32, RegMRM>, OpSize32; - def NAME#64ri8 : BinOpRI8_F<0x83, mnemonic, Xi64, RegMRM>; + def 16ri8 : BinOpRI8_F<0x83, mnemonic, Xi16, RegMRM>, OpSize16; + def 32ri8 : BinOpRI8_F<0x83, mnemonic, Xi32, RegMRM>, OpSize32; + def 64ri8 : BinOpRI8_F<0x83, mnemonic, Xi64, RegMRM>; - def NAME#16ri : BinOpRI_F<0x81, mnemonic, Xi16, opnode, RegMRM>, OpSize16; - def NAME#32ri : BinOpRI_F<0x81, mnemonic, Xi32, opnode, RegMRM>, OpSize32; - def NAME#64ri32: BinOpRI_F<0x81, mnemonic, Xi64, opnode, RegMRM>; + def 16ri : BinOpRI_F<0x81, mnemonic, Xi16, opnode, RegMRM>, OpSize16; + def 32ri : BinOpRI_F<0x81, mnemonic, Xi32, opnode, RegMRM>, OpSize32; + def 64ri32: BinOpRI_F<0x81, mnemonic, Xi64, opnode, RegMRM>; } - def NAME#8mr : BinOpMR_F; - def NAME#16mr : BinOpMR_F, OpSize16; - def NAME#32mr : BinOpMR_F, OpSize32; - def NAME#64mr : BinOpMR_F; + def 8mr : BinOpMR_F; + def 16mr : BinOpMR_F, OpSize16; + def 32mr : BinOpMR_F, OpSize32; + def 64mr : BinOpMR_F; // NOTE: These are order specific, we want the mi8 forms to be listed // first so that they are slightly preferred to the mi forms. 
- def NAME#16mi8 : BinOpMI8_F, OpSize16; - def NAME#32mi8 : BinOpMI8_F, OpSize32; + def 16mi8 : BinOpMI8_F, OpSize16; + def 32mi8 : BinOpMI8_F, OpSize32; let Predicates = [In64BitMode] in - def NAME#64mi8 : BinOpMI8_F; + def 64mi8 : BinOpMI8_F; - def NAME#8mi : BinOpMI_F<0x80, mnemonic, Xi8 , opnode, MemMRM>; - def NAME#16mi : BinOpMI_F<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16; - def NAME#32mi : BinOpMI_F<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32; + def 8mi : BinOpMI_F<0x80, mnemonic, Xi8 , opnode, MemMRM>; + def 16mi : BinOpMI_F<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16; + def 32mi : BinOpMI_F<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32; let Predicates = [In64BitMode] in - def NAME#64mi32 : BinOpMI_F<0x81, mnemonic, Xi64, opnode, MemMRM>; + def 64mi32 : BinOpMI_F<0x81, mnemonic, Xi64, opnode, MemMRM>; // These are for the disassembler since 0x82 opcode behaves like 0x80, but // not in 64-bit mode. let Predicates = [Not64BitMode] in { - def NAME#8ri8 : BinOpRI8_F<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly; + def 8ri8 : BinOpRI8_F<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly; let mayLoad = 1 in - def NAME#8mi8 : BinOpMI8_F; + def 8mi8 : BinOpMI8_F; } - def NAME#8i8 : BinOpAI_F; - def NAME#16i16 : BinOpAI_F, OpSize16; - def NAME#32i32 : BinOpAI_F, OpSize32; - def NAME#64i32 : BinOpAI_F; } From 2cc111e422a1e9a77455da3febf36449a6127bae Mon Sep 17 00:00:00 2001 From: Eli Friedman Date: Tue, 2 Jan 2024 19:00:58 -0800 Subject: [PATCH 097/313] Revert "[polly][ScheduleOptimizer] Fix long compile time(hang) reported in polly (#75141)" This reverts commit d6c4d4c9b910e8ad5ed7cd4825a143742041c1f4. Broke buildldbots with asserts disabled; -debug-only is only available in asserts builds. --- polly/lib/Transform/ScheduleOptimizer.cpp | 18 +--- .../ScheduleOptimizer/schedule_computeout.ll | 94 ------------------- 2 files changed, 1 insertion(+), 111 deletions(-) delete mode 100644 polly/test/ScheduleOptimizer/schedule_computeout.ll diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp index 5a0ea3b406754..35a0a4def0403 100644 --- a/polly/lib/Transform/ScheduleOptimizer.cpp +++ b/polly/lib/Transform/ScheduleOptimizer.cpp @@ -96,13 +96,6 @@ static cl::opt cl::desc("Maximize the band depth (yes/no)"), cl::Hidden, cl::init("yes"), cl::cat(PollyCategory)); -static cl::opt - ScheduleComputeOut("polly-schedule-computeout", - cl::desc("Bound the scheduler by maximal amount" - "of computational steps. 
"), - cl::Hidden, cl::init(300000), cl::ZeroOrMore, - cl::cat(PollyCategory)); - static cl::opt GreedyFusion("polly-loopfusion-greedy", cl::desc("Aggressively try to fuse everything"), cl::Hidden, @@ -867,16 +860,7 @@ static void runIslScheduleOptimizer( SC = SC.set_proximity(Proximity); SC = SC.set_validity(Validity); SC = SC.set_coincidence(Validity); - - { - IslMaxOperationsGuard MaxOpGuard(Ctx, ScheduleComputeOut); - Schedule = SC.compute_schedule(); - - if (MaxOpGuard.hasQuotaExceeded()) - LLVM_DEBUG( - dbgs() << "Schedule optimizer calculation exceeds ISL quota\n"); - } - + Schedule = SC.compute_schedule(); isl_options_set_on_error(Ctx, OnErrorStatus); ScopsRescheduled++; diff --git a/polly/test/ScheduleOptimizer/schedule_computeout.ll b/polly/test/ScheduleOptimizer/schedule_computeout.ll deleted file mode 100644 index db46ef5021932..0000000000000 --- a/polly/test/ScheduleOptimizer/schedule_computeout.ll +++ /dev/null @@ -1,94 +0,0 @@ -; RUN: opt -S -polly-optree -polly-delicm -polly-opt-isl -polly-schedule-computeout=100000 -debug-only="polly-opt-isl" < %s 2>&1 | FileCheck %s -; Bailout if the computations of schedule compute exceeds the max scheduling quota. -; Max compute out is initialized to 300000, Here it is set to 100000 for test purpose. - -@a = dso_local local_unnamed_addr global ptr null, align 8 -@b = dso_local local_unnamed_addr global ptr null, align 8 -@c = dso_local local_unnamed_addr global ptr null, align 8 - -define dso_local void @foo(i32 noundef %I, i32 noundef %J, i32 noundef %K1, i32 noundef %K2, i32 noundef %L1, i32 noundef %L2) local_unnamed_addr { -entry: - %j = alloca i32, align 4 - store volatile i32 0, ptr %j, align 4 - %j.0.j.0.j.0.54 = load volatile i32, ptr %j, align 4 - %cmp55 = icmp slt i32 %j.0.j.0.j.0.54, %J - br i1 %cmp55, label %for.body.lr.ph, label %for.cond.cleanup - -for.body.lr.ph: ; preds = %entry - %0 = load ptr, ptr @a, align 8 - %1 = load ptr, ptr @b, align 8 - %2 = load ptr, ptr %1, align 8 - %cmp352 = icmp slt i32 %L1, %L2 - %cmp750 = icmp slt i32 %K1, %K2 - %3 = sext i32 %K1 to i64 - %4 = sext i32 %L1 to i64 - br label %for.body - -for.cond.cleanup: ; preds = %for.cond.cleanup4, %entry - ret void - -for.body: ; preds = %for.cond.cleanup4, %for.body.lr.ph - br i1 %cmp352, label %for.cond6.preheader.preheader, label %for.cond.cleanup4 - -for.cond6.preheader.preheader: ; preds = %for.body - %wide.trip.count66 = sext i32 %L2 to i64 - br label %for.cond6.preheader - -for.cond6.preheader: ; preds = %for.cond.cleanup8, %for.cond6.preheader.preheader - %indvars.iv61 = phi i64 [ %4, %for.cond6.preheader.preheader ], [ %indvars.iv.next62, %for.cond.cleanup8 ] - br i1 %cmp750, label %for.cond10.preheader.lr.ph, label %for.cond.cleanup8 - -for.cond10.preheader.lr.ph: ; preds = %for.cond6.preheader - %5 = mul nsw i64 %indvars.iv61, 516 - %6 = mul nsw i64 %indvars.iv61, 516 - %wide.trip.count = sext i32 %K2 to i64 - br label %for.cond10.preheader - -for.cond.cleanup4: ; preds = %for.cond.cleanup8, %for.body - %j.0.j.0.j.0.45 = load volatile i32, ptr %j, align 4 - %inc34 = add nsw i32 %j.0.j.0.j.0.45, 1 - store volatile i32 %inc34, ptr %j, align 4 - %j.0.j.0.j.0. 
= load volatile i32, ptr %j, align 4 - %cmp = icmp slt i32 %j.0.j.0.j.0., %J - br i1 %cmp, label %for.body, label %for.cond.cleanup - -for.cond10.preheader: ; preds = %for.cond.cleanup12, %for.cond10.preheader.lr.ph - %indvars.iv = phi i64 [ %3, %for.cond10.preheader.lr.ph ], [ %indvars.iv.next, %for.cond.cleanup12 ] - %7 = getelementptr float, ptr %0, i64 %indvars.iv - %arrayidx18 = getelementptr float, ptr %7, i64 %5 - %8 = load float, ptr %arrayidx18, align 4 - br label %for.cond14.preheader - -for.cond.cleanup8: ; preds = %for.cond.cleanup12, %for.cond6.preheader - %indvars.iv.next62 = add nsw i64 %indvars.iv61, 1 - %exitcond67.not = icmp eq i64 %indvars.iv.next62, %wide.trip.count66 - br i1 %exitcond67.not, label %for.cond.cleanup4, label %for.cond6.preheader - -for.cond14.preheader: ; preds = %for.cond.cleanup16, %for.cond10.preheader - %m.049 = phi i32 [ -2, %for.cond10.preheader ], [ %inc21, %for.cond.cleanup16 ] - %sum.048 = phi float [ 0.000000e+00, %for.cond10.preheader ], [ %add19, %for.cond.cleanup16 ] - br label %for.body17 - -for.cond.cleanup12: ; preds = %for.cond.cleanup16 - %9 = getelementptr float, ptr %2, i64 %indvars.iv - %arrayidx26 = getelementptr float, ptr %9, i64 %6 - store float %add19, ptr %arrayidx26, align 4 - %indvars.iv.next = add nsw i64 %indvars.iv, 1 - %exitcond60.not = icmp eq i64 %indvars.iv.next, %wide.trip.count - br i1 %exitcond60.not, label %for.cond.cleanup8, label %for.cond10.preheader - -for.cond.cleanup16: ; preds = %for.body17 - %inc21 = add nsw i32 %m.049, 1 - %exitcond56.not = icmp eq i32 %inc21, 3 - br i1 %exitcond56.not, label %for.cond.cleanup12, label %for.cond14.preheader - -for.body17: ; preds = %for.body17, %for.cond14.preheader - %n.047 = phi i32 [ -2, %for.cond14.preheader ], [ %inc, %for.body17 ] - %sum.146 = phi float [ %sum.048, %for.cond14.preheader ], [ %add19, %for.body17 ] - %add19 = fadd float %sum.146, %8 - %inc = add nsw i32 %n.047, 1 - %exitcond.not = icmp eq i32 %inc, 3 - br i1 %exitcond.not, label %for.cond.cleanup16, label %for.body17 -} - -; CHECK: Schedule optimizer calculation exceeds ISL quota From 8ae73fea3a2cbb072bf3e577dc49deb25b56e760 Mon Sep 17 00:00:00 2001 From: Kai Luo Date: Wed, 3 Jan 2024 02:59:06 +0000 Subject: [PATCH 098/313] [PowerPC] Precommit test for #72845. NFC. 
--- .../CodeGen/PowerPC/expand-isel-to-branch.ll | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 llvm/test/CodeGen/PowerPC/expand-isel-to-branch.ll diff --git a/llvm/test/CodeGen/PowerPC/expand-isel-to-branch.ll b/llvm/test/CodeGen/PowerPC/expand-isel-to-branch.ll new file mode 100644 index 0000000000000..6f3e9f78b3175 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/expand-isel-to-branch.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-ibm-aix < %s | FileCheck %s + +define noundef signext i32 @ham(ptr nocapture noundef %arg) #0 { +; CHECK-LABEL: ham: +; CHECK: # %bb.0: # %bb +; CHECK-NEXT: lwz 4, 0(3) +; CHECK-NEXT: cmpwi 4, 750 +; CHECK-NEXT: addi 5, 4, 1 +; CHECK-NEXT: li 4, 1 +; CHECK-NEXT: bc 12, 0, L..BB0_1 +; CHECK-NEXT: b L..BB0_2 +; CHECK-NEXT: L..BB0_1: # %bb +; CHECK-NEXT: addi 4, 5, 0 +; CHECK-NEXT: L..BB0_2: # %bb +; CHECK-NEXT: stw 4, 0(3) +; CHECK-NEXT: li 3, 0 +; CHECK-NEXT: blr +bb: + %load = load i32, ptr %arg, align 4 + %icmp = icmp slt i32 %load, 750 + %add = add nsw i32 %load, 1 + %select = select i1 %icmp, i32 %add, i32 1 + store i32 %select, ptr %arg, align 4 + ret i32 0 +} + +attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="pwr8" "target-features"="+altivec,+bpermd,+crbits,+crypto,+direct-move,+extdiv,+htm,+isa-v206-instructions,+isa-v207-instructions,+power8-vector,+quadword-atomics,+vsx,-aix-small-local-exec-tls,-isa-v30-instructions,-isel,-power9-vector,-privileged,-rop-protect,-spe" } From 4e347b4e38b95bc455d0e620e11ac58fc0172a94 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 2 Jan 2024 19:38:49 -0800 Subject: [PATCH 099/313] Revert "[RISCV][ISel] Combine scalable vector add/sub/mul with zero/sign extension (#72340)" This reverts most of commit 5b155aea0e529b7b5c807e189fef6ea5cd5faec9. I have left the new test file, but regenerated the checks. This causes failures in our downstream testing. The input types to the extends need to be checked so we don't create RISCVISD::VZEXT_VL with illegal or unsupported input type. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 222 +++++------------- llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll | 128 +++++----- .../RISCV/rvv/vscale-vw-web-simplification.ll | 34 +-- 3 files changed, 141 insertions(+), 243 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 03a59f8a8b57c..27bb69dc9868c 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1374,8 +1374,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setPrefLoopAlignment(Subtarget.getPrefLoopAlignment()); setTargetDAGCombine({ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN, - ISD::INTRINSIC_WO_CHAIN, ISD::ADD, ISD::SUB, ISD::MUL, - ISD::AND, ISD::OR, ISD::XOR, ISD::SETCC, ISD::SELECT}); + ISD::INTRINSIC_WO_CHAIN, ISD::ADD, ISD::SUB, ISD::AND, + ISD::OR, ISD::XOR, ISD::SETCC, ISD::SELECT}); if (Subtarget.is64Bit()) setTargetDAGCombine(ISD::SRA); @@ -12850,9 +12850,9 @@ struct CombineResult; /// Helper class for folding sign/zero extensions. 
/// In particular, this class is used for the following combines: -/// add | add_vl -> vwadd(u) | vwadd(u)_w -/// sub | sub_vl -> vwsub(u) | vwsub(u)_w -/// mul | mul_vl -> vwmul(u) | vwmul_su +/// add_vl -> vwadd(u) | vwadd(u)_w +/// sub_vl -> vwsub(u) | vwsub(u)_w +/// mul_vl -> vwmul(u) | vwmul_su /// /// An object of this class represents an operand of the operation we want to /// combine. @@ -12897,8 +12897,6 @@ struct NodeExtensionHelper { /// E.g., for zext(a), this would return a. SDValue getSource() const { switch (OrigOperand.getOpcode()) { - case ISD::ZERO_EXTEND: - case ISD::SIGN_EXTEND: case RISCVISD::VSEXT_VL: case RISCVISD::VZEXT_VL: return OrigOperand.getOperand(0); @@ -12915,8 +12913,7 @@ struct NodeExtensionHelper { /// Get or create a value that can feed \p Root with the given extension \p /// SExt. If \p SExt is std::nullopt, this returns the source of this operand. /// \see ::getSource(). - SDValue getOrCreateExtendedOp(SDNode *Root, SelectionDAG &DAG, - const RISCVSubtarget &Subtarget, + SDValue getOrCreateExtendedOp(const SDNode *Root, SelectionDAG &DAG, std::optional SExt) const { if (!SExt.has_value()) return OrigOperand; @@ -12931,10 +12928,8 @@ struct NodeExtensionHelper { // If we need an extension, we should be changing the type. SDLoc DL(Root); - auto [Mask, VL] = getMaskAndVL(Root, DAG, Subtarget); + auto [Mask, VL] = getMaskAndVL(Root); switch (OrigOperand.getOpcode()) { - case ISD::ZERO_EXTEND: - case ISD::SIGN_EXTEND: case RISCVISD::VSEXT_VL: case RISCVISD::VZEXT_VL: return DAG.getNode(ExtOpc, DL, NarrowVT, Source, Mask, VL); @@ -12974,15 +12969,12 @@ struct NodeExtensionHelper { /// \pre \p Opcode represents a supported root (\see ::isSupportedRoot()). static unsigned getSameExtensionOpcode(unsigned Opcode, bool IsSExt) { switch (Opcode) { - case ISD::ADD: case RISCVISD::ADD_VL: case RISCVISD::VWADD_W_VL: case RISCVISD::VWADDU_W_VL: return IsSExt ? RISCVISD::VWADD_VL : RISCVISD::VWADDU_VL; - case ISD::MUL: case RISCVISD::MUL_VL: return IsSExt ? RISCVISD::VWMUL_VL : RISCVISD::VWMULU_VL; - case ISD::SUB: case RISCVISD::SUB_VL: case RISCVISD::VWSUB_W_VL: case RISCVISD::VWSUBU_W_VL: @@ -12995,8 +12987,7 @@ struct NodeExtensionHelper { /// Get the opcode to materialize \p Opcode(sext(a), zext(b)) -> /// newOpcode(a, b). static unsigned getSUOpcode(unsigned Opcode) { - assert((Opcode == RISCVISD::MUL_VL || Opcode == ISD::MUL) && - "SU is only supported for MUL"); + assert(Opcode == RISCVISD::MUL_VL && "SU is only supported for MUL"); return RISCVISD::VWMULSU_VL; } @@ -13004,10 +12995,8 @@ struct NodeExtensionHelper { /// newOpcode(a, b). static unsigned getWOpcode(unsigned Opcode, bool IsSExt) { switch (Opcode) { - case ISD::ADD: case RISCVISD::ADD_VL: return IsSExt ? RISCVISD::VWADD_W_VL : RISCVISD::VWADDU_W_VL; - case ISD::SUB: case RISCVISD::SUB_VL: return IsSExt ? RISCVISD::VWSUB_W_VL : RISCVISD::VWSUBU_W_VL; default: @@ -13017,33 +13006,19 @@ struct NodeExtensionHelper { using CombineToTry = std::function( SDNode * /*Root*/, const NodeExtensionHelper & /*LHS*/, - const NodeExtensionHelper & /*RHS*/, SelectionDAG &, - const RISCVSubtarget &)>; + const NodeExtensionHelper & /*RHS*/)>; /// Check if this node needs to be fully folded or extended for all users. bool needToPromoteOtherUsers() const { return EnforceOneUse; } /// Helper method to set the various fields of this struct based on the /// type of \p Root. 
- void fillUpExtensionSupport(SDNode *Root, SelectionDAG &DAG, - const RISCVSubtarget &Subtarget) { + void fillUpExtensionSupport(SDNode *Root, SelectionDAG &DAG) { SupportsZExt = false; SupportsSExt = false; EnforceOneUse = true; CheckMask = true; - unsigned Opc = OrigOperand.getOpcode(); - switch (Opc) { - case ISD::ZERO_EXTEND: - case ISD::SIGN_EXTEND: { - if (OrigOperand.getValueType().isVector()) { - SupportsZExt = Opc == ISD::ZERO_EXTEND; - SupportsSExt = Opc == ISD::SIGN_EXTEND; - SDLoc DL(Root); - MVT VT = Root->getSimpleValueType(0); - std::tie(Mask, VL) = getDefaultScalableVLOps(VT, DL, DAG, Subtarget); - } - break; - } + switch (OrigOperand.getOpcode()) { case RISCVISD::VZEXT_VL: SupportsZExt = true; Mask = OrigOperand.getOperand(1); @@ -13099,16 +13074,8 @@ struct NodeExtensionHelper { } /// Check if \p Root supports any extension folding combines. - static bool isSupportedRoot(const SDNode *Root, const SelectionDAG &DAG) { + static bool isSupportedRoot(const SDNode *Root) { switch (Root->getOpcode()) { - case ISD::ADD: - case ISD::SUB: - case ISD::MUL: { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (!TLI.isTypeLegal(Root->getValueType(0))) - return false; - return Root->getValueType(0).isScalableVector(); - } case RISCVISD::ADD_VL: case RISCVISD::MUL_VL: case RISCVISD::VWADD_W_VL: @@ -13123,10 +13090,9 @@ struct NodeExtensionHelper { } /// Build a NodeExtensionHelper for \p Root.getOperand(\p OperandIdx). - NodeExtensionHelper(SDNode *Root, unsigned OperandIdx, SelectionDAG &DAG, - const RISCVSubtarget &Subtarget) { - assert(isSupportedRoot(Root, DAG) && "Trying to build an helper with an " - "unsupported root"); + NodeExtensionHelper(SDNode *Root, unsigned OperandIdx, SelectionDAG &DAG) { + assert(isSupportedRoot(Root) && "Trying to build an helper with an " + "unsupported root"); assert(OperandIdx < 2 && "Requesting something else than LHS or RHS"); OrigOperand = Root->getOperand(OperandIdx); @@ -13142,7 +13108,7 @@ struct NodeExtensionHelper { SupportsZExt = Opc == RISCVISD::VWADDU_W_VL || Opc == RISCVISD::VWSUBU_W_VL; SupportsSExt = !SupportsZExt; - std::tie(Mask, VL) = getMaskAndVL(Root, DAG, Subtarget); + std::tie(Mask, VL) = getMaskAndVL(Root); CheckMask = true; // There's no existing extension here, so we don't have to worry about // making sure it gets removed. @@ -13151,7 +13117,7 @@ struct NodeExtensionHelper { } [[fallthrough]]; default: - fillUpExtensionSupport(Root, DAG, Subtarget); + fillUpExtensionSupport(Root, DAG); break; } } @@ -13167,27 +13133,14 @@ struct NodeExtensionHelper { } /// Helper function to get the Mask and VL from \p Root. - static std::pair - getMaskAndVL(const SDNode *Root, SelectionDAG &DAG, - const RISCVSubtarget &Subtarget) { - assert(isSupportedRoot(Root, DAG) && "Unexpected root"); - switch (Root->getOpcode()) { - case ISD::ADD: - case ISD::SUB: - case ISD::MUL: { - SDLoc DL(Root); - MVT VT = Root->getSimpleValueType(0); - return getDefaultScalableVLOps(VT, DL, DAG, Subtarget); - } - default: - return std::make_pair(Root->getOperand(3), Root->getOperand(4)); - } + static std::pair getMaskAndVL(const SDNode *Root) { + assert(isSupportedRoot(Root) && "Unexpected root"); + return std::make_pair(Root->getOperand(3), Root->getOperand(4)); } /// Check if the Mask and VL of this operand are compatible with \p Root. 
- bool areVLAndMaskCompatible(SDNode *Root, SelectionDAG &DAG, - const RISCVSubtarget &Subtarget) const { - auto [Mask, VL] = getMaskAndVL(Root, DAG, Subtarget); + bool areVLAndMaskCompatible(const SDNode *Root) const { + auto [Mask, VL] = getMaskAndVL(Root); return isMaskCompatible(Mask) && isVLCompatible(VL); } @@ -13195,14 +13148,11 @@ struct NodeExtensionHelper { /// foldings that are supported by this class. static bool isCommutative(const SDNode *N) { switch (N->getOpcode()) { - case ISD::ADD: - case ISD::MUL: case RISCVISD::ADD_VL: case RISCVISD::MUL_VL: case RISCVISD::VWADD_W_VL: case RISCVISD::VWADDU_W_VL: return true; - case ISD::SUB: case RISCVISD::SUB_VL: case RISCVISD::VWSUB_W_VL: case RISCVISD::VWSUBU_W_VL: @@ -13247,25 +13197,14 @@ struct CombineResult { /// Return a value that uses TargetOpcode and that can be used to replace /// Root. /// The actual replacement is *not* done in that method. - SDValue materialize(SelectionDAG &DAG, - const RISCVSubtarget &Subtarget) const { + SDValue materialize(SelectionDAG &DAG) const { SDValue Mask, VL, Merge; - std::tie(Mask, VL) = - NodeExtensionHelper::getMaskAndVL(Root, DAG, Subtarget); - switch (Root->getOpcode()) { - default: - Merge = Root->getOperand(2); - break; - case ISD::ADD: - case ISD::SUB: - case ISD::MUL: - Merge = DAG.getUNDEF(Root->getValueType(0)); - break; - } + std::tie(Mask, VL) = NodeExtensionHelper::getMaskAndVL(Root); + Merge = Root->getOperand(2); return DAG.getNode(TargetOpcode, SDLoc(Root), Root->getValueType(0), - LHS.getOrCreateExtendedOp(Root, DAG, Subtarget, SExtLHS), - RHS.getOrCreateExtendedOp(Root, DAG, Subtarget, SExtRHS), - Merge, Mask, VL); + LHS.getOrCreateExtendedOp(Root, DAG, SExtLHS), + RHS.getOrCreateExtendedOp(Root, DAG, SExtRHS), Merge, + Mask, VL); } }; @@ -13282,16 +13221,15 @@ struct CombineResult { static std::optional canFoldToVWWithSameExtensionImpl(SDNode *Root, const NodeExtensionHelper &LHS, const NodeExtensionHelper &RHS, bool AllowSExt, - bool AllowZExt, SelectionDAG &DAG, - const RISCVSubtarget &Subtarget) { + bool AllowZExt) { assert((AllowSExt || AllowZExt) && "Forgot to set what you want?"); - if (!LHS.areVLAndMaskCompatible(Root, DAG, Subtarget) || - !RHS.areVLAndMaskCompatible(Root, DAG, Subtarget)) + if (!LHS.areVLAndMaskCompatible(Root) || !RHS.areVLAndMaskCompatible(Root)) return std::nullopt; if (AllowZExt && LHS.SupportsZExt && RHS.SupportsZExt) return CombineResult(NodeExtensionHelper::getSameExtensionOpcode( Root->getOpcode(), /*IsSExt=*/false), - Root, LHS, /*SExtLHS=*/false, RHS, /*SExtRHS=*/false); + Root, LHS, /*SExtLHS=*/false, RHS, + /*SExtRHS=*/false); if (AllowSExt && LHS.SupportsSExt && RHS.SupportsSExt) return CombineResult(NodeExtensionHelper::getSameExtensionOpcode( Root->getOpcode(), /*IsSExt=*/true), @@ -13308,10 +13246,9 @@ canFoldToVWWithSameExtensionImpl(SDNode *Root, const NodeExtensionHelper &LHS, /// can be used to apply the pattern. static std::optional canFoldToVWWithSameExtension(SDNode *Root, const NodeExtensionHelper &LHS, - const NodeExtensionHelper &RHS, SelectionDAG &DAG, - const RISCVSubtarget &Subtarget) { + const NodeExtensionHelper &RHS) { return canFoldToVWWithSameExtensionImpl(Root, LHS, RHS, /*AllowSExt=*/true, - /*AllowZExt=*/true, DAG, Subtarget); + /*AllowZExt=*/true); } /// Check if \p Root follows a pattern Root(LHS, ext(RHS)) @@ -13320,9 +13257,8 @@ canFoldToVWWithSameExtension(SDNode *Root, const NodeExtensionHelper &LHS, /// can be used to apply the pattern. 
static std::optional canFoldToVW_W(SDNode *Root, const NodeExtensionHelper &LHS, - const NodeExtensionHelper &RHS, SelectionDAG &DAG, - const RISCVSubtarget &Subtarget) { - if (!RHS.areVLAndMaskCompatible(Root, DAG, Subtarget)) + const NodeExtensionHelper &RHS) { + if (!RHS.areVLAndMaskCompatible(Root)) return std::nullopt; // FIXME: Is it useful to form a vwadd.wx or vwsub.wx if it removes a scalar @@ -13346,10 +13282,9 @@ canFoldToVW_W(SDNode *Root, const NodeExtensionHelper &LHS, /// can be used to apply the pattern. static std::optional canFoldToVWWithSEXT(SDNode *Root, const NodeExtensionHelper &LHS, - const NodeExtensionHelper &RHS, SelectionDAG &DAG, - const RISCVSubtarget &Subtarget) { + const NodeExtensionHelper &RHS) { return canFoldToVWWithSameExtensionImpl(Root, LHS, RHS, /*AllowSExt=*/true, - /*AllowZExt=*/false, DAG, Subtarget); + /*AllowZExt=*/false); } /// Check if \p Root follows a pattern Root(zext(LHS), zext(RHS)) @@ -13358,10 +13293,9 @@ canFoldToVWWithSEXT(SDNode *Root, const NodeExtensionHelper &LHS, /// can be used to apply the pattern. static std::optional canFoldToVWWithZEXT(SDNode *Root, const NodeExtensionHelper &LHS, - const NodeExtensionHelper &RHS, SelectionDAG &DAG, - const RISCVSubtarget &Subtarget) { + const NodeExtensionHelper &RHS) { return canFoldToVWWithSameExtensionImpl(Root, LHS, RHS, /*AllowSExt=*/false, - /*AllowZExt=*/true, DAG, Subtarget); + /*AllowZExt=*/true); } /// Check if \p Root follows a pattern Root(sext(LHS), zext(RHS)) @@ -13370,13 +13304,10 @@ canFoldToVWWithZEXT(SDNode *Root, const NodeExtensionHelper &LHS, /// can be used to apply the pattern. static std::optional canFoldToVW_SU(SDNode *Root, const NodeExtensionHelper &LHS, - const NodeExtensionHelper &RHS, SelectionDAG &DAG, - const RISCVSubtarget &Subtarget) { - + const NodeExtensionHelper &RHS) { if (!LHS.SupportsSExt || !RHS.SupportsZExt) return std::nullopt; - if (!LHS.areVLAndMaskCompatible(Root, DAG, Subtarget) || - !RHS.areVLAndMaskCompatible(Root, DAG, Subtarget)) + if (!LHS.areVLAndMaskCompatible(Root) || !RHS.areVLAndMaskCompatible(Root)) return std::nullopt; return CombineResult(NodeExtensionHelper::getSUOpcode(Root->getOpcode()), Root, LHS, /*SExtLHS=*/true, RHS, /*SExtRHS=*/false); @@ -13386,8 +13317,6 @@ SmallVector NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) { SmallVector Strategies; switch (Root->getOpcode()) { - case ISD::ADD: - case ISD::SUB: case RISCVISD::ADD_VL: case RISCVISD::SUB_VL: // add|sub -> vwadd(u)|vwsub(u) @@ -13395,7 +13324,6 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) { // add|sub -> vwadd(u)_w|vwsub(u)_w Strategies.push_back(canFoldToVW_W); break; - case ISD::MUL: case RISCVISD::MUL_VL: // mul -> vwmul(u) Strategies.push_back(canFoldToVWWithSameExtension); @@ -13426,14 +13354,12 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) { /// mul_vl -> vwmul(u) | vwmul_su /// vwadd_w(u) -> vwadd(u) /// vwub_w(u) -> vwadd(u) -static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const RISCVSubtarget &Subtarget) { +static SDValue +combineBinOp_VLToVWBinOp_VL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; - if (!NodeExtensionHelper::isSupportedRoot(N, DAG)) - return SDValue(); - + assert(NodeExtensionHelper::isSupportedRoot(N) && + "Shouldn't have called this method"); SmallVector Worklist; SmallSet Inserted; Worklist.push_back(N); @@ -13442,11 +13368,11 @@ static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N, while (!Worklist.empty()) 
{ SDNode *Root = Worklist.pop_back_val(); - if (!NodeExtensionHelper::isSupportedRoot(Root, DAG)) + if (!NodeExtensionHelper::isSupportedRoot(Root)) return SDValue(); - NodeExtensionHelper LHS(N, 0, DAG, Subtarget); - NodeExtensionHelper RHS(N, 1, DAG, Subtarget); + NodeExtensionHelper LHS(N, 0, DAG); + NodeExtensionHelper RHS(N, 1, DAG); auto AppendUsersIfNeeded = [&Worklist, &Inserted](const NodeExtensionHelper &Op) { if (Op.needToPromoteOtherUsers()) { @@ -13473,8 +13399,7 @@ static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N, for (NodeExtensionHelper::CombineToTry FoldingStrategy : FoldingStrategies) { - std::optional Res = - FoldingStrategy(N, LHS, RHS, DAG, Subtarget); + std::optional Res = FoldingStrategy(N, LHS, RHS); if (Res) { Matched = true; CombinesToApply.push_back(*Res); @@ -13503,7 +13428,7 @@ static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N, SmallVector> ValuesToReplace; ValuesToReplace.reserve(CombinesToApply.size()); for (CombineResult Res : CombinesToApply) { - SDValue NewValue = Res.materialize(DAG, Subtarget); + SDValue NewValue = Res.materialize(DAG); if (!InputRootReplacement) { assert(Res.Root == N && "First element is expected to be the current node"); @@ -14775,20 +14700,13 @@ static SDValue performCONCAT_VECTORSCombine(SDNode *N, SelectionDAG &DAG, static SDValue combineToVWMACC(SDNode *N, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { - - assert(N->getOpcode() == RISCVISD::ADD_VL || N->getOpcode() == ISD::ADD); - - if (N->getValueType(0).isFixedLengthVector()) - return SDValue(); - + assert(N->getOpcode() == RISCVISD::ADD_VL); SDValue Addend = N->getOperand(0); SDValue MulOp = N->getOperand(1); + SDValue AddMergeOp = N->getOperand(2); - if (N->getOpcode() == RISCVISD::ADD_VL) { - SDValue AddMergeOp = N->getOperand(2); - if (!AddMergeOp.isUndef()) - return SDValue(); - } + if (!AddMergeOp.isUndef()) + return SDValue(); auto IsVWMulOpc = [](unsigned Opc) { switch (Opc) { @@ -14812,16 +14730,8 @@ static SDValue combineToVWMACC(SDNode *N, SelectionDAG &DAG, if (!MulMergeOp.isUndef()) return SDValue(); - auto [AddMask, AddVL] = [](SDNode *N, SelectionDAG &DAG, - const RISCVSubtarget &Subtarget) { - if (N->getOpcode() == ISD::ADD) { - SDLoc DL(N); - return getDefaultScalableVLOps(N->getSimpleValueType(0), DL, DAG, - Subtarget); - } - return std::make_pair(N->getOperand(3), N->getOperand(4)); - }(N, DAG, Subtarget); - + SDValue AddMask = N->getOperand(3); + SDValue AddVL = N->getOperand(4); SDValue MulMask = MulOp.getOperand(3); SDValue MulVL = MulOp.getOperand(4); @@ -15087,18 +14997,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, return DAG.getNode(ISD::AND, DL, VT, NewFMV, DAG.getConstant(~SignBit, DL, VT)); } - case ISD::ADD: { - if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget)) - return V; - if (SDValue V = combineToVWMACC(N, DAG, Subtarget)) - return V; + case ISD::ADD: return performADDCombine(N, DAG, Subtarget); - } - case ISD::SUB: { - if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget)) - return V; + case ISD::SUB: return performSUBCombine(N, DAG, Subtarget); - } case ISD::AND: return performANDCombine(N, DCI, Subtarget); case ISD::OR: @@ -15106,8 +15008,6 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, case ISD::XOR: return performXORCombine(N, DAG, Subtarget); case ISD::MUL: - if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget)) - return V; return performMULCombine(N, DAG); case ISD::FADD: case ISD::UMAX: @@ -15584,7 +15484,7 @@ SDValue 
RISCVTargetLowering::PerformDAGCombine(SDNode *N, break; } case RISCVISD::ADD_VL: - if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget)) + if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI)) return V; return combineToVWMACC(N, DAG, Subtarget); case RISCVISD::SUB_VL: @@ -15593,7 +15493,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, case RISCVISD::VWSUB_W_VL: case RISCVISD::VWSUBU_W_VL: case RISCVISD::MUL_VL: - return combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget); + return combineBinOp_VLToVWBinOp_VL(N, DCI); case RISCVISD::VFMADD_VL: case RISCVISD::VFNMADD_VL: case RISCVISD::VFMSUB_VL: diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll index fc94f8c2a5279..47d65c2593a4c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll @@ -1231,17 +1231,16 @@ define @ctlz_nxv1i64( %va) { ; ; CHECK-F-LABEL: ctlz_nxv1i64: ; CHECK-F: # %bb.0: -; CHECK-F-NEXT: li a0, 190 -; CHECK-F-NEXT: vsetvli a1, zero, e64, m1, ta, ma -; CHECK-F-NEXT: vmv.v.x v9, a0 -; CHECK-F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-F-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8 -; CHECK-F-NEXT: vsrl.vi v8, v10, 23 -; CHECK-F-NEXT: vwsubu.wv v9, v9, v8 -; CHECK-F-NEXT: li a1, 64 +; CHECK-F-NEXT: vfncvt.f.xu.w v9, v8 +; CHECK-F-NEXT: vsrl.vi v8, v9, 23 ; CHECK-F-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-F-NEXT: vminu.vx v8, v9, a1 +; CHECK-F-NEXT: vzext.vf2 v9, v8 +; CHECK-F-NEXT: li a1, 190 +; CHECK-F-NEXT: vrsub.vx v8, v9, a1 +; CHECK-F-NEXT: li a1, 64 +; CHECK-F-NEXT: vminu.vx v8, v8, a1 ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; @@ -1372,17 +1371,16 @@ define @ctlz_nxv2i64( %va) { ; ; CHECK-F-LABEL: ctlz_nxv2i64: ; CHECK-F: # %bb.0: -; CHECK-F-NEXT: li a0, 190 -; CHECK-F-NEXT: vsetvli a1, zero, e64, m2, ta, ma -; CHECK-F-NEXT: vmv.v.x v10, a0 -; CHECK-F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-F-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: vfncvt.f.xu.w v12, v8 -; CHECK-F-NEXT: vsrl.vi v8, v12, 23 -; CHECK-F-NEXT: vwsubu.wv v10, v10, v8 -; CHECK-F-NEXT: li a1, 64 +; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8 +; CHECK-F-NEXT: vsrl.vi v8, v10, 23 ; CHECK-F-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-F-NEXT: vminu.vx v8, v10, a1 +; CHECK-F-NEXT: vzext.vf2 v10, v8 +; CHECK-F-NEXT: li a1, 190 +; CHECK-F-NEXT: vrsub.vx v8, v10, a1 +; CHECK-F-NEXT: li a1, 64 +; CHECK-F-NEXT: vminu.vx v8, v8, a1 ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; @@ -1513,17 +1511,16 @@ define @ctlz_nxv4i64( %va) { ; ; CHECK-F-LABEL: ctlz_nxv4i64: ; CHECK-F: # %bb.0: -; CHECK-F-NEXT: li a0, 190 -; CHECK-F-NEXT: vsetvli a1, zero, e64, m4, ta, ma -; CHECK-F-NEXT: vmv.v.x v12, a0 -; CHECK-F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-F-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: vfncvt.f.xu.w v16, v8 -; CHECK-F-NEXT: vsrl.vi v8, v16, 23 -; CHECK-F-NEXT: vwsubu.wv v12, v12, v8 -; CHECK-F-NEXT: li a1, 64 +; CHECK-F-NEXT: vfncvt.f.xu.w v12, v8 +; CHECK-F-NEXT: vsrl.vi v8, v12, 23 ; CHECK-F-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-F-NEXT: vminu.vx v8, v12, a1 +; CHECK-F-NEXT: vzext.vf2 v12, v8 +; CHECK-F-NEXT: li a1, 190 +; CHECK-F-NEXT: vrsub.vx v8, v12, a1 +; CHECK-F-NEXT: li a1, 64 +; CHECK-F-NEXT: vminu.vx v8, v8, a1 ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; @@ -1654,17 +1651,16 @@ define @ctlz_nxv8i64( %va) { ; ; CHECK-F-LABEL: ctlz_nxv8i64: ; 
CHECK-F: # %bb.0: -; CHECK-F-NEXT: li a0, 190 -; CHECK-F-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; CHECK-F-NEXT: vmv.v.x v16, a0 -; CHECK-F-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-F-NEXT: vsetvli a0, zero, e32, m4, ta, ma ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: vfncvt.f.xu.w v24, v8 -; CHECK-F-NEXT: vsrl.vi v8, v24, 23 -; CHECK-F-NEXT: vwsubu.wv v16, v16, v8 -; CHECK-F-NEXT: li a1, 64 +; CHECK-F-NEXT: vfncvt.f.xu.w v16, v8 +; CHECK-F-NEXT: vsrl.vi v8, v16, 23 ; CHECK-F-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-F-NEXT: vminu.vx v8, v16, a1 +; CHECK-F-NEXT: vzext.vf2 v16, v8 +; CHECK-F-NEXT: li a1, 190 +; CHECK-F-NEXT: vrsub.vx v8, v16, a1 +; CHECK-F-NEXT: li a1, 64 +; CHECK-F-NEXT: vminu.vx v8, v8, a1 ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; @@ -2837,16 +2833,15 @@ define @ctlz_zero_undef_nxv1i64( %va) { ; ; CHECK-F-LABEL: ctlz_zero_undef_nxv1i64: ; CHECK-F: # %bb.0: -; CHECK-F-NEXT: li a0, 190 -; CHECK-F-NEXT: vsetvli a1, zero, e64, m1, ta, ma -; CHECK-F-NEXT: vmv.v.x v9, a0 -; CHECK-F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-F-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8 -; CHECK-F-NEXT: vsrl.vi v8, v10, 23 -; CHECK-F-NEXT: vwsubu.wv v9, v9, v8 +; CHECK-F-NEXT: vfncvt.f.xu.w v9, v8 +; CHECK-F-NEXT: vsrl.vi v8, v9, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-F-NEXT: vzext.vf2 v9, v8 +; CHECK-F-NEXT: li a1, 190 +; CHECK-F-NEXT: vrsub.vx v8, v9, a1 ; CHECK-F-NEXT: fsrm a0 -; CHECK-F-NEXT: vmv1r.v v8, v9 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_zero_undef_nxv1i64: @@ -2973,16 +2968,15 @@ define @ctlz_zero_undef_nxv2i64( %va) { ; ; CHECK-F-LABEL: ctlz_zero_undef_nxv2i64: ; CHECK-F: # %bb.0: -; CHECK-F-NEXT: li a0, 190 -; CHECK-F-NEXT: vsetvli a1, zero, e64, m2, ta, ma -; CHECK-F-NEXT: vmv.v.x v10, a0 -; CHECK-F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-F-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: vfncvt.f.xu.w v12, v8 -; CHECK-F-NEXT: vsrl.vi v8, v12, 23 -; CHECK-F-NEXT: vwsubu.wv v10, v10, v8 +; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8 +; CHECK-F-NEXT: vsrl.vi v8, v10, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-F-NEXT: vzext.vf2 v10, v8 +; CHECK-F-NEXT: li a1, 190 +; CHECK-F-NEXT: vrsub.vx v8, v10, a1 ; CHECK-F-NEXT: fsrm a0 -; CHECK-F-NEXT: vmv2r.v v8, v10 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_zero_undef_nxv2i64: @@ -3109,16 +3103,15 @@ define @ctlz_zero_undef_nxv4i64( %va) { ; ; CHECK-F-LABEL: ctlz_zero_undef_nxv4i64: ; CHECK-F: # %bb.0: -; CHECK-F-NEXT: li a0, 190 -; CHECK-F-NEXT: vsetvli a1, zero, e64, m4, ta, ma -; CHECK-F-NEXT: vmv.v.x v12, a0 -; CHECK-F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-F-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: vfncvt.f.xu.w v16, v8 -; CHECK-F-NEXT: vsrl.vi v8, v16, 23 -; CHECK-F-NEXT: vwsubu.wv v12, v12, v8 +; CHECK-F-NEXT: vfncvt.f.xu.w v12, v8 +; CHECK-F-NEXT: vsrl.vi v8, v12, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-F-NEXT: vzext.vf2 v12, v8 +; CHECK-F-NEXT: li a1, 190 +; CHECK-F-NEXT: vrsub.vx v8, v12, a1 ; CHECK-F-NEXT: fsrm a0 -; CHECK-F-NEXT: vmv4r.v v8, v12 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_zero_undef_nxv4i64: @@ -3245,15 +3238,14 @@ define @ctlz_zero_undef_nxv8i64( %va) { ; ; CHECK-F-LABEL: ctlz_zero_undef_nxv8i64: ; CHECK-F: # %bb.0: -; CHECK-F-NEXT: vmv8r.v v16, v8 -; CHECK-F-NEXT: li a0, 190 -; CHECK-F-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; CHECK-F-NEXT: 
vmv.v.x v8, a0 -; CHECK-F-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-F-NEXT: vsetvli a0, zero, e32, m4, ta, ma ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: vfncvt.f.xu.w v24, v16 -; CHECK-F-NEXT: vsrl.vi v16, v24, 23 -; CHECK-F-NEXT: vwsubu.wv v8, v8, v16 +; CHECK-F-NEXT: vfncvt.f.xu.w v16, v8 +; CHECK-F-NEXT: vsrl.vi v8, v16, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-F-NEXT: vzext.vf2 v16, v8 +; CHECK-F-NEXT: li a1, 190 +; CHECK-F-NEXT: vrsub.vx v8, v16, a1 ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll index fe605d5ca6f99..d99e3a7fe690a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll @@ -10,12 +10,14 @@ ; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=FOLDING ; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=FOLDING +; FIXME: We should use vwadd/vwsub/vwmul instructions. ; Check that the scalable vector add/sub/mul operations are all promoted into their ; vw counterpart when the folding of the web size is increased to 3. ; We need the web size to be at least 3 for the folding to happen, because ; %c has 3 uses. ; see https://github.com/llvm/llvm-project/pull/72340 +; FIXME: We don't currently use widening instructions. define @vwop_vscale_sext_multiple_users(ptr %x, ptr %y, ptr %z) { ; NO_FOLDING-LABEL: vwop_vscale_sext_multiple_users: ; NO_FOLDING: # %bb.0: @@ -35,16 +37,18 @@ define @vwop_vscale_sext_multiple_users(ptr %x, ptr %y, ptr % ; ; FOLDING-LABEL: vwop_vscale_sext_multiple_users: ; FOLDING: # %bb.0: -; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; FOLDING-NEXT: vsetvli a3, zero, e16, mf2, ta, ma ; FOLDING-NEXT: vle8.v v8, (a0) ; FOLDING-NEXT: vle8.v v9, (a1) ; FOLDING-NEXT: vle8.v v10, (a2) -; FOLDING-NEXT: vwmul.vv v11, v8, v9 -; FOLDING-NEXT: vwadd.vv v9, v8, v10 -; FOLDING-NEXT: vwsub.vv v12, v8, v10 -; FOLDING-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; FOLDING-NEXT: vor.vv v8, v11, v9 -; FOLDING-NEXT: vor.vv v8, v8, v12 +; FOLDING-NEXT: vsext.vf2 v11, v8 +; FOLDING-NEXT: vsext.vf2 v8, v9 +; FOLDING-NEXT: vsext.vf2 v9, v10 +; FOLDING-NEXT: vmul.vv v8, v11, v8 +; FOLDING-NEXT: vadd.vv v10, v11, v9 +; FOLDING-NEXT: vsub.vv v9, v11, v9 +; FOLDING-NEXT: vor.vv v8, v8, v10 +; FOLDING-NEXT: vor.vv v8, v8, v9 ; FOLDING-NEXT: ret %a = load , ptr %x %b = load , ptr %y @@ -81,16 +85,18 @@ define @vwop_vscale_zext_multiple_users(ptr %x, ptr %y, ptr % ; ; FOLDING-LABEL: vwop_vscale_zext_multiple_users: ; FOLDING: # %bb.0: -; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; FOLDING-NEXT: vsetvli a3, zero, e16, mf2, ta, ma ; FOLDING-NEXT: vle8.v v8, (a0) ; FOLDING-NEXT: vle8.v v9, (a1) ; FOLDING-NEXT: vle8.v v10, (a2) -; FOLDING-NEXT: vwmulu.vv v11, v8, v9 -; FOLDING-NEXT: vwaddu.vv v9, v8, v10 -; FOLDING-NEXT: vwsubu.vv v12, v8, v10 -; FOLDING-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; FOLDING-NEXT: vor.vv v8, v11, v9 -; FOLDING-NEXT: vor.vv v8, v8, v12 +; FOLDING-NEXT: vzext.vf2 v11, v8 +; FOLDING-NEXT: vzext.vf2 v8, v9 +; FOLDING-NEXT: vzext.vf2 v9, v10 +; FOLDING-NEXT: vmul.vv v8, v11, v8 +; FOLDING-NEXT: vadd.vv v10, v11, v9 +; FOLDING-NEXT: vsub.vv v9, v11, v9 +; FOLDING-NEXT: vor.vv v8, v8, v10 +; FOLDING-NEXT: vor.vv v8, v8, v9 ; FOLDING-NEXT: ret %a = load , ptr %x %b = load , ptr %y From 
16124a3946c7f26fa4b25a5497ab68cc417950b3 Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Tue, 2 Jan 2024 19:58:26 -0800 Subject: [PATCH 100/313] [libc] `__stack_chk_fail` baremetal dependencies (#76412) `__stack_chk_fail` uses `write_to_stderr` and `abort` but these currently aren't included in baremetal targets, resulting in CMake build errors: ``` CMake Error at /llvm-project/libc/cmake/modules/LLVMLibCObjectRules.cmake:693 (target_link_libraries): Target "libc.src.stdlib.abort" of type UTILITY may not be linked into another target. One may link only to INTERFACE, OBJECT, STATIC or SHARED libraries, or to executables with the ENABLE_EXPORTS property set. Call Stack (most recent call first): /llvm-project/libc/cmake/modules/LLVMLibCObjectRules.cmake:811 (create_entrypoint_object) /llvm-project/libc/cmake/modules/LLVMLibCObjectRules.cmake:891 (expand_flags_for_entrypoint_object) /llvm-project/libc/src/compiler/generic/CMakeLists.txt:1 (add_entrypoint_object) CMake Error at /llvm-project/libc/cmake/modules/LLVMLibCLibraryRules.cmake:5 (get_target_property): get_target_property() called with non-existent target "libc.src.__support.OSUtil.osutil". Call Stack (most recent call first): /llvm-project/libc/cmake/modules/LLVMLibCLibraryRules.cmake:36 (collect_object_file_deps) /llvm-project/libc/cmake/modules/LLVMLibCLibraryRules.cmake:36 (collect_object_file_deps) /llvm-project/libc/cmake/modules/LLVMLibCLibraryRules.cmake:85 (collect_object_file_deps) /llvm-project/libc/lib/CMakeLists.txt:26 (add_entrypoint_library) ``` To address these errors, we need to include `abort` in the list of baremetal entrypoints. We also need to provide a baremetal implementation of `write_to_stderr`, but this is challenging since there is no uniform way to print a message on these platforms (sometimes there may not be any way to print a message). We instead defer to `__llvm_libc_log_write`, which can be provided by the underlying platform. We use a similar approach for `quick_exit`, deferring to `__llvm_libc_quick_exit`. 
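For reference, a minimal sketch of what a vendor might supply for these hooks is shown below. Only the two `extern "C"` signatures (`__llvm_libc_log_write`, `__llvm_libc_quick_exit`) come from the headers added in this patch; the memory-mapped UART address, the byte-at-a-time write loop, and the trap in `__llvm_libc_quick_exit` are invented for illustration, and a real platform would substitute its own logging and shutdown mechanisms.
```
// Hypothetical vendor-side definitions of the baremetal OSUtil hooks.
// Everything except the two function signatures is an illustrative guess.
#include <stddef.h>
#include <stdint.h>

namespace {
// Invented MMIO transmit-register address; purely illustrative.
volatile uint32_t *const UART0_TX =
    reinterpret_cast<volatile uint32_t *>(0x09000000);
} // namespace

extern "C" void __llvm_libc_log_write(const char *msg, size_t len) {
  for (size_t i = 0; i < len; ++i)
    *UART0_TX = static_cast<uint32_t>(msg[i]); // one byte per store
}

extern "C" void __llvm_libc_quick_exit(int status) {
  (void)status;     // no host to report the status to
  __builtin_trap(); // a real platform would reset or power down here
}
```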
--- libc/config/baremetal/arm/entrypoints.txt | 1 + libc/config/baremetal/riscv/entrypoints.txt | 1 + .../__support/OSUtil/baremetal/CMakeLists.txt | 9 +++++++ libc/src/__support/OSUtil/baremetal/io.h | 25 +++++++++++++++++++ .../__support/OSUtil/baremetal/quick_exit.h | 23 +++++++++++++++++ libc/src/__support/OSUtil/io.h | 3 +++ libc/src/__support/OSUtil/quick_exit.h | 3 +++ 7 files changed, 65 insertions(+) create mode 100644 libc/src/__support/OSUtil/baremetal/CMakeLists.txt create mode 100644 libc/src/__support/OSUtil/baremetal/io.h create mode 100644 libc/src/__support/OSUtil/baremetal/quick_exit.h diff --git a/libc/config/baremetal/arm/entrypoints.txt b/libc/config/baremetal/arm/entrypoints.txt index a0779c41652ae..54f74c6a973ce 100644 --- a/libc/config/baremetal/arm/entrypoints.txt +++ b/libc/config/baremetal/arm/entrypoints.txt @@ -74,6 +74,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.stdio.vsnprintf # stdlib.h entrypoints + libc.src.stdlib.abort libc.src.stdlib.abs libc.src.stdlib.atoi libc.src.stdlib.atof diff --git a/libc/config/baremetal/riscv/entrypoints.txt b/libc/config/baremetal/riscv/entrypoints.txt index 3e15cc8901bdd..5da755170fda9 100644 --- a/libc/config/baremetal/riscv/entrypoints.txt +++ b/libc/config/baremetal/riscv/entrypoints.txt @@ -74,6 +74,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.stdio.vsnprintf # stdlib.h entrypoints + libc.src.stdlib.abort libc.src.stdlib.abs libc.src.stdlib.atoi libc.src.stdlib.atol diff --git a/libc/src/__support/OSUtil/baremetal/CMakeLists.txt b/libc/src/__support/OSUtil/baremetal/CMakeLists.txt new file mode 100644 index 0000000000000..280ff87cf1470 --- /dev/null +++ b/libc/src/__support/OSUtil/baremetal/CMakeLists.txt @@ -0,0 +1,9 @@ +add_header_library( + baremetal_util + HDRS + io.h + quick_exit.h + DEPENDS + libc.src.__support.common + libc.src.__support.CPP.string_view +) diff --git a/libc/src/__support/OSUtil/baremetal/io.h b/libc/src/__support/OSUtil/baremetal/io.h new file mode 100644 index 0000000000000..4eeef5586f74f --- /dev/null +++ b/libc/src/__support/OSUtil/baremetal/io.h @@ -0,0 +1,25 @@ +//===---------- Baremetal implementation of IO utils ------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_BAREMETAL_IO_H +#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_BAREMETAL_IO_H + +#include "src/__support/CPP/string_view.h" + +namespace LIBC_NAMESPACE { + +// This is intended to be provided by the vendor. +extern "C" void __llvm_libc_log_write(const char* msg, size_t len); + +void write_to_stderr(cpp::string_view msg) { + __llvm_libc_log_write(msg.data(), msg.size()); +} + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_BAREMETAL_IO_H diff --git a/libc/src/__support/OSUtil/baremetal/quick_exit.h b/libc/src/__support/OSUtil/baremetal/quick_exit.h new file mode 100644 index 0000000000000..cc530bc0f4e9d --- /dev/null +++ b/libc/src/__support/OSUtil/baremetal/quick_exit.h @@ -0,0 +1,23 @@ +//===----- Baremetal implementation of a quick exit function ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_BAREMETAL_QUICK_EXIT_H +#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_BAREMETAL_QUICK_EXIT_H + +namespace LIBC_NAMESPACE { + +// This is intended to be provided by the vendor. +extern "C" void __llvm_libc_quick_exit(int status); + +void quick_exit(int status) { + __llvm_libc_quick_exit(status); +} + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_BAREMETAL_QUICK_EXIT_H diff --git a/libc/src/__support/OSUtil/io.h b/libc/src/__support/OSUtil/io.h index fc9d7f3ed38a1..cb7e748fc6442 100644 --- a/libc/src/__support/OSUtil/io.h +++ b/libc/src/__support/OSUtil/io.h @@ -19,6 +19,9 @@ #include "linux/io.h" #elif defined(__Fuchsia__) #include "fuchsia/io.h" +#elif defined(__ELF__) +// TODO: Ideally we would have LIBC_TARGET_OS_IS_BAREMETAL. +#include "baremetal/io.h" #endif #endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_IO_H diff --git a/libc/src/__support/OSUtil/quick_exit.h b/libc/src/__support/OSUtil/quick_exit.h index 4329df8ecef05..6c59c1afcda25 100644 --- a/libc/src/__support/OSUtil/quick_exit.h +++ b/libc/src/__support/OSUtil/quick_exit.h @@ -17,6 +17,9 @@ #include "darwin/quick_exit.h" #elif defined(__linux__) #include "linux/quick_exit.h" +#elif defined(__ELF__) +// TODO: Ideally we would have LIBC_TARGET_OS_IS_BAREMETAL. +#include "baremetal/quick_exit.h" #endif #endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_QUICK_EXIT_H From 214b434dc8e7b414f280e583943d267c857ce19a Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Wed, 3 Jan 2024 05:10:15 +0000 Subject: [PATCH 101/313] [NFC][libc] Fix formatting for baremetal OSUtils This reformats the headers to address formatting issues. --- libc/src/__support/OSUtil/baremetal/io.h | 2 +- libc/src/__support/OSUtil/baremetal/quick_exit.h | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/libc/src/__support/OSUtil/baremetal/io.h b/libc/src/__support/OSUtil/baremetal/io.h index 4eeef5586f74f..a50c11d4aea12 100644 --- a/libc/src/__support/OSUtil/baremetal/io.h +++ b/libc/src/__support/OSUtil/baremetal/io.h @@ -14,7 +14,7 @@ namespace LIBC_NAMESPACE { // This is intended to be provided by the vendor. -extern "C" void __llvm_libc_log_write(const char* msg, size_t len); +extern "C" void __llvm_libc_log_write(const char *msg, size_t len); void write_to_stderr(cpp::string_view msg) { __llvm_libc_log_write(msg.data(), msg.size()); diff --git a/libc/src/__support/OSUtil/baremetal/quick_exit.h b/libc/src/__support/OSUtil/baremetal/quick_exit.h index cc530bc0f4e9d..74f9142e21b81 100644 --- a/libc/src/__support/OSUtil/baremetal/quick_exit.h +++ b/libc/src/__support/OSUtil/baremetal/quick_exit.h @@ -14,9 +14,7 @@ namespace LIBC_NAMESPACE { // This is intended to be provided by the vendor. extern "C" void __llvm_libc_quick_exit(int status); -void quick_exit(int status) { - __llvm_libc_quick_exit(status); -} +void quick_exit(int status) { __llvm_libc_quick_exit(status); } } // namespace LIBC_NAMESPACE From ed3e007a8759508973f9c67209958e219e515bf8 Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Tue, 2 Jan 2024 21:31:26 -0800 Subject: [PATCH 102/313] [CMake] Create generic runtimes targets as needed (#76096) We currently create a generic runtime target for all subbuilds. For example if we're building libcxx for aarch64-linux-gnu and x86_64-linux-gnu, we would create the cxx target that would depend on cxx-aarch64-linux-gnu and cxx-x86_64-linux-gnu. 
The current implementation creates the generic runtimes targets ahead of time which introduces an issue where different subbuilds enable different runtimes. We should instead create the generic targets as needed. --- llvm/runtimes/CMakeLists.txt | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt index 77254b7eb5e62..db50b6d1e8f8c 100644 --- a/llvm/runtimes/CMakeLists.txt +++ b/llvm/runtimes/CMakeLists.txt @@ -391,8 +391,17 @@ function(runtime_register_target name) add_dependencies(runtimes-test-depends runtimes-test-depends-${name}) endif() foreach(runtime_name ${runtime_names}) + if(NOT TARGET ${runtime_name}) + add_custom_target(${runtime_name}) + endif() add_dependencies(${runtime_name} ${runtime_name}-${name}) + if(NOT TARGET install-${runtime_name}) + add_custom_target(install-${runtime_name}) + endif() add_dependencies(install-${runtime_name} install-${runtime_name}-${name}) + if(NOT TARGET install-${runtime_name}-stripped) + add_custom_target(install-${runtime_name}-stripped) + endif() add_dependencies(install-${runtime_name}-stripped install-${runtime_name}-${name}-stripped) endforeach() foreach(component ${LLVM_RUNTIME_DISTRIBUTION_COMPONENTS}) @@ -459,11 +468,6 @@ if(runtimes) add_custom_target(runtimes-test-depends) set(test_targets "") endif() - foreach(runtime_name ${RUNTIME_NAMES}) - add_custom_target(${runtime_name}) - add_custom_target(install-${runtime_name}) - add_custom_target(install-${runtime_name}-stripped) - endforeach() if(LLVM_RUNTIME_DISTRIBUTION_COMPONENTS) foreach(component ${LLVM_RUNTIME_DISTRIBUTION_COMPONENTS}) add_custom_target(${component}) From 923f6ac018e3dd1c86bd4cee05e95680962e1446 Mon Sep 17 00:00:00 2001 From: brendaso1 <119626341+brendaso1@users.noreply.github.com> Date: Wed, 3 Jan 2024 13:49:05 +0800 Subject: [PATCH 103/313] [FastISel][AArch64] Compare Instruction Miscompilation Fix (#75993) When shl is folded in compare instruction, a miscompilation occurs when the CMP instruction is also sign-extended. For the following IR: %op3 = shl i8 %op2, 3 %tmp3 = icmp eq i8 %tmp2, %op3 It used to generate cmp w8, w9, sxtb #3 which means sign extend w9, shift left by 3, and then compare with the value in w8. However, the original intention of the IR would require `%op2` to first shift left before extending the operands in the comparison operation . Moreover, if sign extension is used instead of zero extension, the sample test would miscompile. This PR creates a fix for the issue, more specifically to not fold the left shift into the CMP instruction, and to create a zero-extended value rather than a sign-extended value. --- llvm/lib/Target/AArch64/AArch64FastISel.cpp | 9 --------- .../test/CodeGen/AArch64/arm64-fast-isel-icmp.ll | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp index 9b8162ce8dd4d..1ea63a5d6ec08 100644 --- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -1231,15 +1231,6 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, // Only extend the RHS within the instruction if there is a valid extend type. 
if (ExtendType != AArch64_AM::InvalidShiftExtend && RHS->hasOneUse() && isValueAvailable(RHS)) { - if (const auto *SI = dyn_cast(RHS)) - if (const auto *C = dyn_cast(SI->getOperand(1))) - if ((SI->getOpcode() == Instruction::Shl) && (C->getZExtValue() < 4)) { - Register RHSReg = getRegForValue(SI->getOperand(0)); - if (!RHSReg) - return 0; - return emitAddSub_rx(UseAdd, RetVT, LHSReg, RHSReg, ExtendType, - C->getZExtValue(), SetFlags, WantResult); - } Register RHSReg = getRegForValue(RHS); if (!RHSReg) return 0; diff --git a/llvm/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll index f853c0802cb1b..ac08825c23762 100644 --- a/llvm/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll +++ b/llvm/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --function icmp_i8_shift_and_cmp --version 4 ; RUN: llc -O0 -fast-isel -fast-isel-abort=1 -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s define i32 @icmp_eq_imm(i32 %a) nounwind ssp { @@ -257,3 +258,18 @@ entry: %conv2 = zext i1 %cmp to i32 ret i32 %conv2 } + +define i32 @icmp_i8_shift_and_cmp(i8 %a, i8 %b) { +entry: +; CHECK-LABEL: icmp_i8_shift_and_cmp: +; CHECK: ubfiz [[REG1:w[0-9]+]], w0, #3, #5 +; CHECK-NEXT: sxtb [[REG0:w[0-9]+]], w1 +; CHECK-NEXT: cmp [[REG0]], [[REG1]], sxtb +; CHECK-NEXT: cset [[REG:w[0-9]+]], eq +; CHECK-NEXT: and w0, [[REG]], #0x1 + %op = shl i8 %a, 3 + %cmp = icmp eq i8 %b, %op + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + From bbd57e18326b4c58072a113190afaadd147c679e Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 2 Jan 2024 21:58:00 -0800 Subject: [PATCH 104/313] [SelectionDAG] Add initial plumbing for the disjoint flag. (#76751) This copies the flag from IR to the SDNode in SelectionDAGBuilder, clears the flag in SimplifyDemandedBits, and adds it to canCreateUndefOrPoison. Uses of the flag will come in later patches. --- llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 13 +++++++++---- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 5 ++++- .../CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 2 ++ .../CodeGen/SelectionDAG/SelectionDAGDumper.cpp | 3 +++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 16 +++++++++++++--- 5 files changed, 31 insertions(+), 8 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index 5c44538fe6997..7f957878343a6 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -381,6 +381,7 @@ struct SDNodeFlags { bool NoUnsignedWrap : 1; bool NoSignedWrap : 1; bool Exact : 1; + bool Disjoint : 1; bool NonNeg : 1; bool NoNaNs : 1; bool NoInfs : 1; @@ -402,10 +403,11 @@ struct SDNodeFlags { public: /// Default constructor turns off all optimization flags. SDNodeFlags() - : NoUnsignedWrap(false), NoSignedWrap(false), Exact(false), NonNeg(false), - NoNaNs(false), NoInfs(false), NoSignedZeros(false), - AllowReciprocal(false), AllowContract(false), ApproximateFuncs(false), - AllowReassociation(false), NoFPExcept(false), Unpredictable(false) {} + : NoUnsignedWrap(false), NoSignedWrap(false), Exact(false), + Disjoint(false), NonNeg(false), NoNaNs(false), NoInfs(false), + NoSignedZeros(false), AllowReciprocal(false), AllowContract(false), + ApproximateFuncs(false), AllowReassociation(false), NoFPExcept(false), + Unpredictable(false) {} /// Propagate the fast-math-flags from an IR FPMathOperator. 
void copyFMF(const FPMathOperator &FPMO) { @@ -422,6 +424,7 @@ struct SDNodeFlags { void setNoUnsignedWrap(bool b) { NoUnsignedWrap = b; } void setNoSignedWrap(bool b) { NoSignedWrap = b; } void setExact(bool b) { Exact = b; } + void setDisjoint(bool b) { Disjoint = b; } void setNonNeg(bool b) { NonNeg = b; } void setNoNaNs(bool b) { NoNaNs = b; } void setNoInfs(bool b) { NoInfs = b; } @@ -437,6 +440,7 @@ struct SDNodeFlags { bool hasNoUnsignedWrap() const { return NoUnsignedWrap; } bool hasNoSignedWrap() const { return NoSignedWrap; } bool hasExact() const { return Exact; } + bool hasDisjoint() const { return Disjoint; } bool hasNonNeg() const { return NonNeg; } bool hasNoNaNs() const { return NoNaNs; } bool hasNoInfs() const { return NoInfs; } @@ -454,6 +458,7 @@ struct SDNodeFlags { NoUnsignedWrap &= Flags.NoUnsignedWrap; NoSignedWrap &= Flags.NoSignedWrap; Exact &= Flags.Exact; + Disjoint &= Flags.Disjoint; NonNeg &= Flags.NonNeg; NoNaNs &= Flags.NoNaNs; NoInfs &= Flags.NoInfs; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 0e17bba2398ed..75c10a74cdc44 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5022,7 +5022,6 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, case ISD::CONCAT_VECTORS: case ISD::INSERT_SUBVECTOR: case ISD::AND: - case ISD::OR: case ISD::XOR: case ISD::ROTL: case ISD::ROTR: @@ -5062,6 +5061,10 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, return ConsiderFlags && (Op->getFlags().hasNoSignedWrap() || Op->getFlags().hasNoUnsignedWrap()); + // Matches hasPoisonGeneratingFlags(). + case ISD::OR: + return ConsiderFlags && Op->getFlags().hasDisjoint(); + case ISD::INSERT_VECTOR_ELT:{ // Ensure that the element index is in bounds. 
EVT VecVT = Op.getOperand(0).getValueType(); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 3c4b285cb0674..192f7bc8d2aa1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -3354,6 +3354,8 @@ void SelectionDAGBuilder::visitBinary(const User &I, unsigned Opcode) { } if (auto *ExactOp = dyn_cast(&I)) Flags.setExact(ExactOp->isExact()); + if (auto *DisjointOp = dyn_cast(&I)) + Flags.setDisjoint(DisjointOp->isDisjoint()); if (auto *FPOp = dyn_cast(&I)) Flags.copyFMF(*FPOp); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 78cc60084068a..4ae30000015e1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -597,6 +597,9 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { if (getFlags().hasExact()) OS << " exact"; + if (getFlags().hasDisjoint()) + OS << " disjoint"; + if (getFlags().hasNonNeg()) OS << " nneg"; diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index c5977546828f6..4581bb19e97ec 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1468,14 +1468,24 @@ bool TargetLowering::SimplifyDemandedBits( case ISD::OR: { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); - + SDNodeFlags Flags = Op.getNode()->getFlags(); if (SimplifyDemandedBits(Op1, DemandedBits, DemandedElts, Known, TLO, - Depth + 1)) + Depth + 1)) { + if (Flags.hasDisjoint()) { + Flags.setDisjoint(false); + Op->setFlags(Flags); + } return true; + } assert(!Known.hasConflict() && "Bits known to be one AND zero?"); if (SimplifyDemandedBits(Op0, ~Known.One & DemandedBits, DemandedElts, - Known2, TLO, Depth + 1)) + Known2, TLO, Depth + 1)) { + if (Flags.hasDisjoint()) { + Flags.setDisjoint(false); + Op->setFlags(Flags); + } return true; + } assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); // If all of the demanded bits are known zero on one side, return the other. From 7e186d366d6c7def0543acc255931f617e76dff0 Mon Sep 17 00:00:00 2001 From: Weining Lu Date: Wed, 3 Jan 2024 13:59:12 +0800 Subject: [PATCH 105/313] [LoongArch] Fix the procossor series mask Refer PRID_SERIES_MASK definition in linux kernel: arch/loongarch/include/asm/cpu.h. --- llvm/lib/TargetParser/Host.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 2e08c7b12d9d5..32941c013c66e 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -1524,7 +1524,8 @@ StringRef sys::getHostCPUName() { // Use processor id to detect cpu name. uint32_t processor_id; __asm__("cpucfg %[prid], $zero\n\t" : [prid] "=r"(processor_id)); - switch (processor_id & 0xff00) { + // Refer PRID_SERIES_MASK in linux kernel: arch/loongarch/include/asm/cpu.h. + switch (processor_id & 0xf000) { case 0xc000: // Loongson 64bit, 4-issue return "la464"; // TODO: Others. 
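To make the effect of the narrower mask concrete, the small self-contained check below compares the old and new masks. The constants mirror the PRID_SERIES_MASK value referenced in the commit message and the 0xc000 series value used in the switch above; the example PRID 0xc1a0 is hypothetical (only the top nibble, the series field, is what the detection should look at).
```
// Illustrative only: with a hypothetical PRID such as 0xc1a0, the old 0xff00
// mask also compares four bits that are not part of the series field and so
// misses the LA464 series, while the 0xf000 series mask matches it.
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t PridSeriesMask = 0xf000; // PRID_SERIES_MASK from asm/cpu.h
  const uint32_t La464Series = 0xc000;    // value used in the switch above
  const uint32_t ProcessorId = 0xc1a0;    // hypothetical LA464-series part
  std::printf("old mask (0xff00) matches: %d\n",
              (ProcessorId & 0xff00) == La464Series); // prints 0
  std::printf("series mask (0xf000) matches: %d\n",
              (ProcessorId & PridSeriesMask) == La464Series); // prints 1
  return 0;
}
```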
From 5b5614c92fb2003a6b40edde4f036e2e65473561 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 3 Jan 2024 07:49:20 +0000 Subject: [PATCH 106/313] [AArch64][GlobalISel] Add legalization for vecreduce.fmul (#73309) There are no native operations that we can use for floating point mul, so lower by splitting the vector into chunks multiple times. There is still a missing fold for fmul_indexed, that could help the gisel test cases a bit. --- .../CodeGen/GlobalISel/LegalizerHelper.cpp | 1 + .../AArch64/GISel/AArch64LegalizerInfo.cpp | 13 + .../GlobalISel/legalize-reduce-fmul.mir | 31 ++ .../GlobalISel/legalizer-info-validation.mir | 4 +- llvm/test/CodeGen/AArch64/vecreduce-fmul.ll | 399 ++++++++++++++---- 5 files changed, 364 insertions(+), 84 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/legalize-reduce-fmul.mir diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 37e7153be5720..f2eb32e191430 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -2831,6 +2831,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { return Legalized; } case TargetOpcode::G_VECREDUCE_FADD: + case TargetOpcode::G_VECREDUCE_FMUL: case TargetOpcode::G_VECREDUCE_FMIN: case TargetOpcode::G_VECREDUCE_FMAX: case TargetOpcode::G_VECREDUCE_FMINIMUM: diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 1d0e8be80d078..79f716353034f 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -989,6 +989,19 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .clampMaxNumElements(1, s16, 8) .lower(); + // For fmul reductions we need to split up into individual operations. We + // clamp to 128 bit vectors then to 64bit vectors to produce a cascade of + // smaller types, followed by scalarizing what remains. 
+ getActionDefinitionsBuilder(G_VECREDUCE_FMUL) + .minScalarOrElt(0, MinFPScalar) + .clampMaxNumElements(1, s64, 2) + .clampMaxNumElements(1, s32, 4) + .clampMaxNumElements(1, s16, 8) + .clampMaxNumElements(1, s32, 2) + .clampMaxNumElements(1, s16, 4) + .scalarize(1) + .lower(); + getActionDefinitionsBuilder(G_VECREDUCE_ADD) .legalFor({{s8, v16s8}, {s8, v8s8}, diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-reduce-fmul.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-reduce-fmul.mir new file mode 100644 index 0000000000000..9d9e96d35d90d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-reduce-fmul.mir @@ -0,0 +1,31 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=aarch64 -run-pass=legalizer -global-isel %s -o - | FileCheck %s + +--- +name: mul_2H +tracksRegLiveness: true +body: | + bb.1: + liveins: $q0, $q1 + + ; CHECK-LABEL: name: mul_2H + ; CHECK: liveins: $q0, $q1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1 + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = G_FMUL [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[FMUL]](<4 x s32>) + ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(<2 x s32>) = G_FMUL [[UV]], [[UV1]] + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMUL1]](<2 x s32>) + ; CHECK-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[UV3]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[FMUL2]](s32) + ; CHECK-NEXT: $s0 = COPY [[COPY2]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $s0 + %1:_(<4 x s32>) = COPY $q0 + %2:_(<4 x s32>) = COPY $q1 + %0:_(<8 x s32>) = G_CONCAT_VECTORS %1(<4 x s32>), %2(<4 x s32>) + %5:_(s32) = nnan ninf nsz arcp contract afn reassoc G_VECREDUCE_FMUL %0(<8 x s32>) + $s0 = COPY %5(s32) + RET_ReallyLR implicit $s0 + +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index 178db852e35b7..866fe6fa8cb3f 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -736,8 +736,8 @@ # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_VECREDUCE_FMUL (opcode {{[0-9]+}}): 2 type indices, 0 imm indices -# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined -# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_VECREDUCE_FMAX (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} # DEBUG-NEXT: .. 
type index coverage check SKIPPED: user-defined predicate detected diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll index ff2c5c44d4121..67b4ebb338248 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll @@ -1,13 +1,22 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD-NOFP16 -; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD-FP16 +; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16 +; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16 +; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16 +; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16 define float @mul_HalfS(<2 x float> %bin.rdx) { -; CHECK-LABEL: mul_HalfS: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: fmul s0, s0, v0.s[1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: mul_HalfS: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: fmul s0, s0, v0.s[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: mul_HalfS: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s1 +; CHECK-GI-NEXT: ret %r = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %bin.rdx) ret float %r } @@ -40,6 +49,27 @@ define half @mul_HalfH(<4 x half> %bin.rdx) { ; CHECK-SD-FP16-NEXT: fmul h1, h1, v0.h[2] ; CHECK-SD-FP16-NEXT: fmul h0, h1, v0.h[3] ; CHECK-SD-FP16-NEXT: ret +; +; CHECK-GI-NOFP16-LABEL: mul_HalfH: +; CHECK-GI-NOFP16: // %bb.0: +; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h +; CHECK-GI-NOFP16-NEXT: mov d1, v0.d[1] +; CHECK-GI-NOFP16-NEXT: fmul v0.2s, v0.2s, v1.2s +; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] +; CHECK-GI-NOFP16-NEXT: fmul s0, s0, s1 +; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 +; CHECK-GI-NOFP16-NEXT: ret +; +; CHECK-GI-FP16-LABEL: mul_HalfH: +; CHECK-GI-FP16: // %bb.0: +; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] +; CHECK-GI-FP16-NEXT: mov h2, v0.h[2] +; CHECK-GI-FP16-NEXT: mov h3, v0.h[3] +; CHECK-GI-FP16-NEXT: fmul h0, h0, h1 +; CHECK-GI-FP16-NEXT: fmul h1, h2, h3 +; CHECK-GI-FP16-NEXT: fmul h0, h0, h1 +; CHECK-GI-FP16-NEXT: ret %r = call fast half @llvm.vector.reduce.fmul.f16.v4f16(half 1.0, <4 x half> %bin.rdx) ret half %r } @@ -93,17 +123,49 @@ define half @mul_H(<8 x half> %bin.rdx) { ; CHECK-SD-FP16-NEXT: fmul h1, h1, v0.h[2] ; CHECK-SD-FP16-NEXT: fmul h0, h1, v0.h[3] ; CHECK-SD-FP16-NEXT: ret +; +; CHECK-GI-NOFP16-LABEL: mul_H: +; CHECK-GI-NOFP16: // %bb.0: +; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h +; CHECK-GI-NOFP16-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-GI-NOFP16-NEXT: fmul v0.4s, v1.4s, v0.4s +; CHECK-GI-NOFP16-NEXT: mov d1, v0.d[1] +; CHECK-GI-NOFP16-NEXT: fmul v0.2s, v0.2s, v1.2s +; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] +; CHECK-GI-NOFP16-NEXT: fmul s0, s0, s1 +; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 +; 
CHECK-GI-NOFP16-NEXT: ret +; +; CHECK-GI-FP16-LABEL: mul_H: +; CHECK-GI-FP16: // %bb.0: +; CHECK-GI-FP16-NEXT: mov d1, v0.d[1] +; CHECK-GI-FP16-NEXT: fmul v0.4h, v0.4h, v1.4h +; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] +; CHECK-GI-FP16-NEXT: mov h2, v0.h[2] +; CHECK-GI-FP16-NEXT: mov h3, v0.h[3] +; CHECK-GI-FP16-NEXT: fmul h0, h0, h1 +; CHECK-GI-FP16-NEXT: fmul h1, h2, h3 +; CHECK-GI-FP16-NEXT: fmul h0, h0, h1 +; CHECK-GI-FP16-NEXT: ret %r = call fast half @llvm.vector.reduce.fmul.f16.v8f16(half 1.0, <8 x half> %bin.rdx) ret half %r } define float @mul_S(<4 x float> %bin.rdx) { -; CHECK-LABEL: mul_S: -; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: fmul v0.2s, v0.2s, v1.2s -; CHECK-NEXT: fmul s0, s0, v0.s[1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: mul_S: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: fmul s0, s0, v0.s[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: mul_S: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v1.2s +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s1 +; CHECK-GI-NEXT: ret %r = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %bin.rdx) ret float %r } @@ -205,18 +267,56 @@ define half @mul_2H(<16 x half> %bin.rdx) { ; CHECK-SD-FP16-NEXT: fmul h1, h1, v0.h[2] ; CHECK-SD-FP16-NEXT: fmul h0, h1, v0.h[3] ; CHECK-SD-FP16-NEXT: ret +; +; CHECK-GI-NOFP16-LABEL: mul_2H: +; CHECK-GI-NOFP16: // %bb.0: +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v0.4h +; CHECK-GI-NOFP16-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v1.4h +; CHECK-GI-NOFP16-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-GI-NOFP16-NEXT: fmul v0.4s, v2.4s, v0.4s +; CHECK-GI-NOFP16-NEXT: fmul v1.4s, v3.4s, v1.4s +; CHECK-GI-NOFP16-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-GI-NOFP16-NEXT: mov d1, v0.d[1] +; CHECK-GI-NOFP16-NEXT: fmul v0.2s, v0.2s, v1.2s +; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] +; CHECK-GI-NOFP16-NEXT: fmul s0, s0, s1 +; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 +; CHECK-GI-NOFP16-NEXT: ret +; +; CHECK-GI-FP16-LABEL: mul_2H: +; CHECK-GI-FP16: // %bb.0: +; CHECK-GI-FP16-NEXT: fmul v0.8h, v0.8h, v1.8h +; CHECK-GI-FP16-NEXT: mov d1, v0.d[1] +; CHECK-GI-FP16-NEXT: fmul v0.4h, v0.4h, v1.4h +; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] +; CHECK-GI-FP16-NEXT: mov h2, v0.h[2] +; CHECK-GI-FP16-NEXT: mov h3, v0.h[3] +; CHECK-GI-FP16-NEXT: fmul h0, h0, h1 +; CHECK-GI-FP16-NEXT: fmul h1, h2, h3 +; CHECK-GI-FP16-NEXT: fmul h0, h0, h1 +; CHECK-GI-FP16-NEXT: ret %r = call fast half @llvm.vector.reduce.fmul.f16.v16f16(half 1.0, <16 x half> %bin.rdx) ret half %r } define float @mul_2S(<8 x float> %bin.rdx) { -; CHECK-LABEL: mul_2S: -; CHECK: // %bb.0: -; CHECK-NEXT: fmul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: fmul v0.2s, v0.2s, v1.2s -; CHECK-NEXT: fmul s0, s0, v0.s[1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: mul_2S: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: fmul s0, s0, v0.s[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: mul_2S: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v1.2s +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s1 +; CHECK-GI-NEXT: ret %r = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %bin.rdx) ret float %r } @@ 
-233,15 +333,26 @@ define double @mul_2D(<4 x double> %bin.rdx) { ; added at least one test where the start value is not 1.0. define float @mul_S_init_42(<4 x float> %bin.rdx) { -; CHECK-LABEL: mul_S_init_42: -; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: mov w8, #1109917696 // =0x42280000 -; CHECK-NEXT: fmul v0.2s, v0.2s, v1.2s -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: fmul s0, s0, v0.s[1] -; CHECK-NEXT: fmul s0, s0, s1 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: mul_S_init_42: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov w8, #1109917696 // =0x42280000 +; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: fmov s1, w8 +; CHECK-SD-NEXT: fmul s0, s0, v0.s[1] +; CHECK-SD-NEXT: fmul s0, s0, s1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: mul_S_init_42: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: mov w8, #1109917696 // =0x42280000 +; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v1.2s +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s1 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: fmul s0, s0, s1 +; CHECK-GI-NEXT: ret %r = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 42.0, <4 x float> %bin.rdx) ret float %r } @@ -335,6 +446,51 @@ define half @fmul_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-SD-FP16-NEXT: fmul h1, h1, v0.h[2] ; CHECK-SD-FP16-NEXT: fmul h0, h1, v0.h[3] ; CHECK-SD-FP16-NEXT: ret +; +; CHECK-GI-NOFP16-LABEL: fmul_reduct_reassoc_v8f16: +; CHECK-GI-NOFP16: // %bb.0: +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v0.4h +; CHECK-GI-NOFP16-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v1.4h +; CHECK-GI-NOFP16-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-GI-NOFP16-NEXT: fmul v0.4s, v2.4s, v0.4s +; CHECK-GI-NOFP16-NEXT: fmul v1.4s, v3.4s, v1.4s +; CHECK-GI-NOFP16-NEXT: mov d2, v0.d[1] +; CHECK-GI-NOFP16-NEXT: mov d3, v1.d[1] +; CHECK-GI-NOFP16-NEXT: fmul v0.2s, v0.2s, v2.2s +; CHECK-GI-NOFP16-NEXT: fmul v1.2s, v1.2s, v3.2s +; CHECK-GI-NOFP16-NEXT: mov s2, v0.s[1] +; CHECK-GI-NOFP16-NEXT: mov s3, v1.s[1] +; CHECK-GI-NOFP16-NEXT: fmul s0, s0, s2 +; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s3 +; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 +; CHECK-GI-NOFP16-NEXT: fcvt h1, s1 +; CHECK-GI-NOFP16-NEXT: fcvt s0, h0 +; CHECK-GI-NOFP16-NEXT: fcvt s1, h1 +; CHECK-GI-NOFP16-NEXT: fmul s0, s0, s1 +; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 +; CHECK-GI-NOFP16-NEXT: ret +; +; CHECK-GI-FP16-LABEL: fmul_reduct_reassoc_v8f16: +; CHECK-GI-FP16: // %bb.0: +; CHECK-GI-FP16-NEXT: mov d2, v0.d[1] +; CHECK-GI-FP16-NEXT: mov d3, v1.d[1] +; CHECK-GI-FP16-NEXT: fmul v0.4h, v0.4h, v2.4h +; CHECK-GI-FP16-NEXT: fmul v1.4h, v1.4h, v3.4h +; CHECK-GI-FP16-NEXT: mov h2, v0.h[1] +; CHECK-GI-FP16-NEXT: mov h3, v0.h[2] +; CHECK-GI-FP16-NEXT: mov h4, v0.h[3] +; CHECK-GI-FP16-NEXT: mov h5, v1.h[1] +; CHECK-GI-FP16-NEXT: mov h6, v1.h[2] +; CHECK-GI-FP16-NEXT: mov h7, v1.h[3] +; CHECK-GI-FP16-NEXT: fmul h0, h0, h2 +; CHECK-GI-FP16-NEXT: fmul h2, h3, h4 +; CHECK-GI-FP16-NEXT: fmul h1, h1, h5 +; CHECK-GI-FP16-NEXT: fmul h3, h6, h7 +; CHECK-GI-FP16-NEXT: fmul h0, h0, h2 +; CHECK-GI-FP16-NEXT: fmul h1, h1, h3 +; CHECK-GI-FP16-NEXT: fmul h0, h0, h1 +; CHECK-GI-FP16-NEXT: ret %r1 = call fast half @llvm.vector.reduce.fmul.f16.v8f16(half 1.0, <8 x half> %a) %r2 = call fast half @llvm.vector.reduce.fmul.f16.v8f16(half 1.0, <8 x half> %b) %r = fmul fast half %r1, %r2 @@ -342,15 +498,30 @@ define half @fmul_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) { } define float @fmul_reduct_reassoc_v8f32(<8 x float> %a, 
<8 x float> %b) { -; CHECK-LABEL: fmul_reduct_reassoc_v8f32: -; CHECK: // %bb.0: -; CHECK-NEXT: fmul v2.4s, v2.4s, v3.4s -; CHECK-NEXT: fmul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: fmul v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: fmul v0.2s, v0.2s, v1.2s -; CHECK-NEXT: fmul s0, s0, v0.s[1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fmul_reduct_reassoc_v8f32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmul v2.4s, v2.4s, v3.4s +; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v2.4s +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: fmul s0, s0, v0.s[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fmul_reduct_reassoc_v8f32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: fmul v1.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] +; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v2.2s +; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s +; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: mov s3, v1.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s2 +; CHECK-GI-NEXT: fmul s1, s1, s3 +; CHECK-GI-NEXT: fmul s0, s0, s1 +; CHECK-GI-NEXT: ret %r1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a) %r2 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %b) %r = fmul fast float %r1, %r2 @@ -358,13 +529,26 @@ define float @fmul_reduct_reassoc_v8f32(<8 x float> %a, <8 x float> %b) { } define float @fmul_reduct_reassoc_v4f32(<4 x float> %a, <4 x float> %b) { -; CHECK-LABEL: fmul_reduct_reassoc_v4f32: -; CHECK: // %bb.0: -; CHECK-NEXT: fmul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: fmul v0.2s, v0.2s, v1.2s -; CHECK-NEXT: fmul s0, s0, v0.s[1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fmul_reduct_reassoc_v4f32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: fmul s0, s0, v0.s[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fmul_reduct_reassoc_v4f32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] +; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v2.2s +; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s +; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: mov s3, v1.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s2 +; CHECK-GI-NEXT: fmul s1, s1, s3 +; CHECK-GI-NEXT: fmul s0, s0, s1 +; CHECK-GI-NEXT: ret %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a) %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b) %r = fmul fast float %r1, %r2 @@ -372,17 +556,31 @@ define float @fmul_reduct_reassoc_v4f32(<4 x float> %a, <4 x float> %b) { } define float @fmul_reduct_reassoc_v4f32_init(float %i, <4 x float> %a, <4 x float> %b) { -; CHECK-LABEL: fmul_reduct_reassoc_v4f32_init: -; CHECK: // %bb.0: -; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: fmul v1.2s, v1.2s, v3.2s -; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8 -; CHECK-NEXT: fmul s1, s1, v1.s[1] -; CHECK-NEXT: fmul v2.2s, v2.2s, v3.2s -; CHECK-NEXT: fmul s0, s0, s1 -; CHECK-NEXT: fmul s1, s2, v2.s[1] -; CHECK-NEXT: fmul s0, s0, s1 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fmul_reduct_reassoc_v4f32_init: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-SD-NEXT: fmul v1.2s, v1.2s, v3.2s +; CHECK-SD-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; CHECK-SD-NEXT: fmul s1, s1, v1.s[1] +; CHECK-SD-NEXT: fmul 
v2.2s, v2.2s, v3.2s +; CHECK-SD-NEXT: fmul s0, s0, s1 +; CHECK-SD-NEXT: fmul s1, s2, v2.s[1] +; CHECK-SD-NEXT: fmul s0, s0, s1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fmul_reduct_reassoc_v4f32_init: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d3, v1.d[1] +; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s +; CHECK-GI-NEXT: mov d3, v2.d[1] +; CHECK-GI-NEXT: mov s4, v1.s[1] +; CHECK-GI-NEXT: fmul v2.2s, v2.2s, v3.2s +; CHECK-GI-NEXT: fmul s1, s1, s4 +; CHECK-GI-NEXT: mov s3, v2.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s1 +; CHECK-GI-NEXT: fmul s1, s2, s3 +; CHECK-GI-NEXT: fmul s0, s0, s1 +; CHECK-GI-NEXT: ret %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float %i, <4 x float> %a) %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b) %r = fmul fast float %r1, %r2 @@ -390,14 +588,28 @@ define float @fmul_reduct_reassoc_v4f32_init(float %i, <4 x float> %a, <4 x floa } define float @fmul_reduct_reassoc_v4v8f32(<4 x float> %a, <8 x float> %b) { -; CHECK-LABEL: fmul_reduct_reassoc_v4v8f32: -; CHECK: // %bb.0: -; CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s -; CHECK-NEXT: fmul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: fmul v0.2s, v0.2s, v1.2s -; CHECK-NEXT: fmul s0, s0, v0.s[1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fmul_reduct_reassoc_v4v8f32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmul v1.4s, v1.4s, v2.4s +; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: fmul s0, s0, v0.s[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fmul_reduct_reassoc_v4v8f32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmul v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] +; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v2.2s +; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s +; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: mov s3, v1.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s2 +; CHECK-GI-NEXT: fmul s1, s1, s3 +; CHECK-GI-NEXT: fmul s0, s0, s1 +; CHECK-GI-NEXT: ret %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a) %r2 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %b) %r = fmul fast float %r1, %r2 @@ -405,13 +617,22 @@ define float @fmul_reduct_reassoc_v4v8f32(<4 x float> %a, <8 x float> %b) { } define double @fmul_reduct_reassoc_v4f64(<4 x double> %a, <4 x double> %b) { -; CHECK-LABEL: fmul_reduct_reassoc_v4f64: -; CHECK: // %bb.0: -; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d -; CHECK-NEXT: fmul v0.2d, v0.2d, v1.2d -; CHECK-NEXT: fmul v0.2d, v0.2d, v2.2d -; CHECK-NEXT: fmul d0, d0, v0.d[1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fmul_reduct_reassoc_v4f64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmul v2.2d, v2.2d, v3.2d +; CHECK-SD-NEXT: fmul v0.2d, v0.2d, v1.2d +; CHECK-SD-NEXT: fmul v0.2d, v0.2d, v2.2d +; CHECK-SD-NEXT: fmul d0, d0, v0.d[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fmul_reduct_reassoc_v4f64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmul v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: fmul v1.2d, v2.2d, v3.2d +; CHECK-GI-NEXT: fmul d0, d0, v0.d[1] +; CHECK-GI-NEXT: fmul d1, d1, v1.d[1] +; CHECK-GI-NEXT: fmul d0, d0, d1 +; CHECK-GI-NEXT: ret %r1 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a) %r2 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %b) %r = fmul fast double %r1, %r2 @@ -419,17 +640,31 @@ define double @fmul_reduct_reassoc_v4f64(<4 x double> %a, <4 x double> %b) { } define float 
@fmul_reduct_reassoc_v4f32_extrause(<4 x float> %a, <4 x float> %b) { -; CHECK-LABEL: fmul_reduct_reassoc_v4f32_extrause: -; CHECK: // %bb.0: -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: fmul v0.2s, v0.2s, v2.2s -; CHECK-NEXT: fmul v1.2s, v1.2s, v3.2s -; CHECK-NEXT: fmul s0, s0, v0.s[1] -; CHECK-NEXT: fmul s1, s1, v1.s[1] -; CHECK-NEXT: fmul s1, s0, s1 -; CHECK-NEXT: fmul s0, s1, s0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fmul_reduct_reassoc_v4f32_extrause: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v2.2s +; CHECK-SD-NEXT: fmul v1.2s, v1.2s, v3.2s +; CHECK-SD-NEXT: fmul s0, s0, v0.s[1] +; CHECK-SD-NEXT: fmul s1, s1, v1.s[1] +; CHECK-SD-NEXT: fmul s1, s0, s1 +; CHECK-SD-NEXT: fmul s0, s1, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fmul_reduct_reassoc_v4f32_extrause: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] +; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v2.2s +; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s +; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: mov s3, v1.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s2 +; CHECK-GI-NEXT: fmul s1, s1, s3 +; CHECK-GI-NEXT: fmul s1, s0, s1 +; CHECK-GI-NEXT: fmul s0, s1, s0 +; CHECK-GI-NEXT: ret %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a) %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b) %r = fmul fast float %r1, %r2 From d659bd1635326cfc52460e9a43e08953906903bb Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 3 Jan 2024 07:59:36 +0000 Subject: [PATCH 107/313] [GlobalISel][AArch64] Tail call libcalls. (#74929) This tries to allow libcalls to be tail called, using a similar method to DAG where the type is checked to make sure they match, and if so the backend, through lowerCall checks that the tailcall is valid for all arguments. 
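For illustration, the frem.ll update in this patch shows the intended effect: a libcall in tail position is now emitted as a direct branch under GlobalISel, matching the SelectionDAG output, so the separate CHECK-SD/CHECK-GI prefixes collapse into a common CHECK. The IR and check lines below are copied from that updated test and are only a sketch of the expected lowering, not an additional test:

    define float @frem_f32(float %a, float %b) {
    entry:
      %c = frem float %a, %b
      ret float %c
    }
    ; CHECK-LABEL: frem_f32:
    ; CHECK:       // %bb.0: // %entry
    ; CHECK-NEXT:  b fmodf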
--- .../llvm/CodeGen/GlobalISel/LegalizerHelper.h | 15 +- .../llvm/CodeGen/GlobalISel/LegalizerInfo.h | 5 +- .../CodeGen/GlobalISel/LegalizerHelper.cpp | 113 ++++++++----- .../AArch64/GISel/AArch64LegalizerInfo.cpp | 5 +- .../AArch64/GISel/AArch64LegalizerInfo.h | 3 +- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 5 +- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 3 +- llvm/lib/Target/ARM/ARMLegalizerInfo.cpp | 10 +- llvm/lib/Target/ARM/ARMLegalizerInfo.h | 3 +- llvm/lib/Target/Mips/MipsLegalizerInfo.cpp | 5 +- llvm/lib/Target/Mips/MipsLegalizerInfo.h | 3 +- .../Target/RISCV/GISel/RISCVLegalizerInfo.cpp | 5 +- .../Target/RISCV/GISel/RISCVLegalizerInfo.h | 3 +- llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp | 5 +- llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h | 3 +- .../AArch64/GlobalISel/legalize-fpmode.mir | 5 +- llvm/test/CodeGen/AArch64/fexplog.ll | 150 ++++-------------- llvm/test/CodeGen/AArch64/fpmode.ll | 5 +- llvm/test/CodeGen/AArch64/fpow.ll | 30 +--- llvm/test/CodeGen/AArch64/frem.ll | 30 +--- llvm/test/CodeGen/AArch64/fsincos.ll | 78 ++++----- llvm/test/CodeGen/AArch64/llvm.exp10.ll | 30 +--- .../GlobalISel/LegalizerHelperTest.cpp | 4 +- 23 files changed, 205 insertions(+), 313 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h index 711ba10247c34..916e6dda39919 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h @@ -288,11 +288,14 @@ class LegalizerHelper { // Implements floating-point environment read/write via library function call. LegalizeResult createGetStateLibcall(MachineIRBuilder &MIRBuilder, - MachineInstr &MI); + MachineInstr &MI, + LostDebugLocObserver &LocObserver); LegalizeResult createSetStateLibcall(MachineIRBuilder &MIRBuilder, - MachineInstr &MI); + MachineInstr &MI, + LostDebugLocObserver &LocObserver); LegalizeResult createResetStateLibcall(MachineIRBuilder &MIRBuilder, - MachineInstr &MI); + MachineInstr &MI, + LostDebugLocObserver &LocObserver); public: /// Return the alignment to use for a stack temporary object with the given @@ -440,13 +443,15 @@ class LegalizerHelper { LegalizerHelper::LegalizeResult createLibcall(MachineIRBuilder &MIRBuilder, const char *Name, const CallLowering::ArgInfo &Result, - ArrayRef Args, CallingConv::ID CC); + ArrayRef Args, CallingConv::ID CC, + LostDebugLocObserver &LocObserver, MachineInstr *MI = nullptr); /// Helper function that creates the given libcall. LegalizerHelper::LegalizeResult createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall, const CallLowering::ArgInfo &Result, - ArrayRef Args); + ArrayRef Args, + LostDebugLocObserver &LocObserver, MachineInstr *MI = nullptr); /// Create a libcall to memcpy et al. LegalizerHelper::LegalizeResult diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h index e51a3ec940054..9880e82dd5e15 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h @@ -35,6 +35,7 @@ extern cl::opt DisableGISelLegalityCheck; class MachineFunction; class raw_ostream; class LegalizerHelper; +class LostDebugLocObserver; class MachineInstr; class MachineRegisterInfo; class MCInstrInfo; @@ -1288,8 +1289,8 @@ class LegalizerInfo { const MachineRegisterInfo &MRI) const; /// Called for instructions with the Custom LegalizationAction. 
- virtual bool legalizeCustom(LegalizerHelper &Helper, - MachineInstr &MI) const { + virtual bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const { llvm_unreachable("must implement this if custom action is used"); } diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index f2eb32e191430..7522353ab966f 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -149,7 +149,8 @@ LegalizerHelper::legalizeInstrStep(MachineInstr &MI, return moreElementsVector(MI, Step.TypeIdx, Step.NewType); case Custom: LLVM_DEBUG(dbgs() << ".. Custom legalization\n"); - return LI.legalizeCustom(*this, MI) ? Legalized : UnableToLegalize; + return LI.legalizeCustom(*this, MI, LocObserver) ? Legalized + : UnableToLegalize; default: LLVM_DEBUG(dbgs() << ".. Unable to legalize\n"); return UnableToLegalize; @@ -567,7 +568,8 @@ static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { /// True if an instruction is in tail position in its caller. Intended for /// legalizing libcalls as tail calls when possible. -static bool isLibCallInTailPosition(MachineInstr &MI, +static bool isLibCallInTailPosition(const CallLowering::ArgInfo &Result, + MachineInstr &MI, const TargetInstrInfo &TII, MachineRegisterInfo &MRI) { MachineBasicBlock &MBB = *MI.getParent(); @@ -596,17 +598,12 @@ static bool isLibCallInTailPosition(MachineInstr &MI, // RET_ReallyLR implicit $x0 auto Next = next_nodbg(MI.getIterator(), MBB.instr_end()); if (Next != MBB.instr_end() && Next->isCopy()) { - switch (MI.getOpcode()) { - default: - llvm_unreachable("unsupported opcode"); - case TargetOpcode::G_BZERO: + if (MI.getOpcode() == TargetOpcode::G_BZERO) return false; - case TargetOpcode::G_MEMCPY: - case TargetOpcode::G_MEMMOVE: - case TargetOpcode::G_MEMSET: - break; - } + // For MEMCPY/MOMMOVE/MEMSET these will be the first use (the dst), as the + // mempy/etc routines return the same parameter. For other it will be the + // returned value. Register VReg = MI.getOperand(0).getReg(); if (!VReg.isVirtual() || VReg != Next->getOperand(1).getReg()) return false; @@ -622,7 +619,7 @@ static bool isLibCallInTailPosition(MachineInstr &MI, if (Ret->getNumImplicitOperands() != 1) return false; - if (PReg != Ret->getOperand(0).getReg()) + if (!Ret->getOperand(0).isReg() || PReg != Ret->getOperand(0).getReg()) return false; // Skip over the COPY that we just validated. @@ -639,34 +636,64 @@ LegalizerHelper::LegalizeResult llvm::createLibcall(MachineIRBuilder &MIRBuilder, const char *Name, const CallLowering::ArgInfo &Result, ArrayRef Args, - const CallingConv::ID CC) { + const CallingConv::ID CC, LostDebugLocObserver &LocObserver, + MachineInstr *MI) { auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering(); CallLowering::CallLoweringInfo Info; Info.CallConv = CC; Info.Callee = MachineOperand::CreateES(Name); Info.OrigRet = Result; + if (MI) + Info.IsTailCall = + (Result.Ty->isVoidTy() || + Result.Ty == MIRBuilder.getMF().getFunction().getReturnType()) && + isLibCallInTailPosition(Result, *MI, MIRBuilder.getTII(), + *MIRBuilder.getMRI()); + std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs)); if (!CLI.lowerCall(MIRBuilder, Info)) return LegalizerHelper::UnableToLegalize; + if (MI && Info.LoweredTailCall) { + assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?"); + + // Check debug locations before removing the return. 
+ LocObserver.checkpoint(true); + + // We must have a return following the call (or debug insts) to get past + // isLibCallInTailPosition. + do { + MachineInstr *Next = MI->getNextNode(); + assert(Next && + (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) && + "Expected instr following MI to be return or debug inst?"); + // We lowered a tail call, so the call is now the return from the block. + // Delete the old return. + Next->eraseFromParent(); + } while (MI->getNextNode()); + + // We expect to lose the debug location from the return. + LocObserver.checkpoint(false); + } return LegalizerHelper::Legalized; } LegalizerHelper::LegalizeResult llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall, const CallLowering::ArgInfo &Result, - ArrayRef Args) { + ArrayRef Args, + LostDebugLocObserver &LocObserver, MachineInstr *MI) { auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering(); const char *Name = TLI.getLibcallName(Libcall); const CallingConv::ID CC = TLI.getLibcallCallingConv(Libcall); - return createLibcall(MIRBuilder, Name, Result, Args, CC); + return createLibcall(MIRBuilder, Name, Result, Args, CC, LocObserver, MI); } // Useful for libcalls where all operands have the same type. static LegalizerHelper::LegalizeResult simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size, - Type *OpType) { + Type *OpType, LostDebugLocObserver &LocObserver) { auto Libcall = getRTLibDesc(MI.getOpcode(), Size); // FIXME: What does the original arg index mean here? @@ -674,7 +701,8 @@ simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size, for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) Args.push_back({MO.getReg(), OpType, 0}); return createLibcall(MIRBuilder, Libcall, - {MI.getOperand(0).getReg(), OpType, 0}, Args); + {MI.getOperand(0).getReg(), OpType, 0}, Args, + LocObserver, &MI); } LegalizerHelper::LegalizeResult @@ -733,8 +761,9 @@ llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, Info.CallConv = TLI.getLibcallCallingConv(RTLibcall); Info.Callee = MachineOperand::CreateES(Name); Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0); - Info.IsTailCall = MI.getOperand(MI.getNumOperands() - 1).getImm() && - isLibCallInTailPosition(MI, MIRBuilder.getTII(), MRI); + Info.IsTailCall = + MI.getOperand(MI.getNumOperands() - 1).getImm() && + isLibCallInTailPosition(Info.OrigRet, MI, MIRBuilder.getTII(), MRI); std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs)); if (!CLI.lowerCall(MIRBuilder, Info)) @@ -789,11 +818,11 @@ static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType, static LegalizerHelper::LegalizeResult conversionLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, Type *ToType, - Type *FromType) { + Type *FromType, LostDebugLocObserver &LocObserver) { RTLIB::Libcall Libcall = getConvRTLibDesc(MI.getOpcode(), ToType, FromType); - return createLibcall(MIRBuilder, Libcall, - {MI.getOperand(0).getReg(), ToType, 0}, - {{MI.getOperand(1).getReg(), FromType, 0}}); + return createLibcall( + MIRBuilder, Libcall, {MI.getOperand(0).getReg(), ToType, 0}, + {{MI.getOperand(1).getReg(), FromType, 0}}, LocObserver, &MI); } static RTLIB::Libcall @@ -829,7 +858,8 @@ getStateLibraryFunctionFor(MachineInstr &MI, const TargetLowering &TLI) { // LegalizerHelper::LegalizeResult LegalizerHelper::createGetStateLibcall(MachineIRBuilder &MIRBuilder, - MachineInstr &MI) { + MachineInstr &MI, + LostDebugLocObserver &LocObserver) { const DataLayout 
&DL = MIRBuilder.getDataLayout(); auto &MF = MIRBuilder.getMF(); auto &MRI = *MIRBuilder.getMRI(); @@ -850,7 +880,8 @@ LegalizerHelper::createGetStateLibcall(MachineIRBuilder &MIRBuilder, auto Res = createLibcall(MIRBuilder, RTLibcall, CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0), - CallLowering::ArgInfo({Temp.getReg(0), StatePtrTy, 0})); + CallLowering::ArgInfo({Temp.getReg(0), StatePtrTy, 0}), + LocObserver, nullptr); if (Res != LegalizerHelper::Legalized) return Res; @@ -867,7 +898,8 @@ LegalizerHelper::createGetStateLibcall(MachineIRBuilder &MIRBuilder, // content of memory region. LegalizerHelper::LegalizeResult LegalizerHelper::createSetStateLibcall(MachineIRBuilder &MIRBuilder, - MachineInstr &MI) { + MachineInstr &MI, + LostDebugLocObserver &LocObserver) { const DataLayout &DL = MIRBuilder.getDataLayout(); auto &MF = MIRBuilder.getMF(); auto &MRI = *MIRBuilder.getMRI(); @@ -892,7 +924,8 @@ LegalizerHelper::createSetStateLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI); return createLibcall(MIRBuilder, RTLibcall, CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0), - CallLowering::ArgInfo({Temp.getReg(0), StatePtrTy, 0})); + CallLowering::ArgInfo({Temp.getReg(0), StatePtrTy, 0}), + LocObserver, nullptr); } // The function is used to legalize operations that set default environment @@ -902,7 +935,8 @@ LegalizerHelper::createSetStateLibcall(MachineIRBuilder &MIRBuilder, // it is not true, the target must provide custom lowering. LegalizerHelper::LegalizeResult LegalizerHelper::createResetStateLibcall(MachineIRBuilder &MIRBuilder, - MachineInstr &MI) { + MachineInstr &MI, + LostDebugLocObserver &LocObserver) { const DataLayout &DL = MIRBuilder.getDataLayout(); auto &MF = MIRBuilder.getMF(); auto &Ctx = MF.getFunction().getContext(); @@ -919,7 +953,8 @@ LegalizerHelper::createResetStateLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI); return createLibcall(MIRBuilder, RTLibcall, CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0), - CallLowering::ArgInfo({ Dest.getReg(), StatePtrTy, 0})); + CallLowering::ArgInfo({Dest.getReg(), StatePtrTy, 0}), + LocObserver, &MI); } LegalizerHelper::LegalizeResult @@ -938,7 +973,7 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) { LLT LLTy = MRI.getType(MI.getOperand(0).getReg()); unsigned Size = LLTy.getSizeInBits(); Type *HLTy = IntegerType::get(Ctx, Size); - auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy); + auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy, LocObserver); if (Status != Legalized) return Status; break; @@ -974,7 +1009,7 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) { LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n"); return UnableToLegalize; } - auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy); + auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy, LocObserver); if (Status != Legalized) return Status; break; @@ -985,7 +1020,8 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) { Type *ToTy = getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg())); if (!FromTy || !ToTy) return UnableToLegalize; - LegalizeResult Status = conversionLibcall(MI, MIRBuilder, ToTy, FromTy ); + LegalizeResult Status = + conversionLibcall(MI, MIRBuilder, ToTy, FromTy, LocObserver); if (Status != Legalized) return Status; break; @@ -1000,7 +1036,8 @@ LegalizerHelper::libcall(MachineInstr &MI, 
LostDebugLocObserver &LocObserver) { LegalizeResult Status = conversionLibcall( MI, MIRBuilder, ToSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx), - FromSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx)); + FromSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx), + LocObserver); if (Status != Legalized) return Status; break; @@ -1015,7 +1052,8 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) { LegalizeResult Status = conversionLibcall( MI, MIRBuilder, ToSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx), - FromSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx)); + FromSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx), + LocObserver); if (Status != Legalized) return Status; break; @@ -1032,19 +1070,20 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) { return Result; } case TargetOpcode::G_GET_FPMODE: { - LegalizeResult Result = createGetStateLibcall(MIRBuilder, MI); + LegalizeResult Result = createGetStateLibcall(MIRBuilder, MI, LocObserver); if (Result != Legalized) return Result; break; } case TargetOpcode::G_SET_FPMODE: { - LegalizeResult Result = createSetStateLibcall(MIRBuilder, MI); + LegalizeResult Result = createSetStateLibcall(MIRBuilder, MI, LocObserver); if (Result != Legalized) return Result; break; } case TargetOpcode::G_RESET_FPMODE: { - LegalizeResult Result = createResetStateLibcall(MIRBuilder, MI); + LegalizeResult Result = + createResetStateLibcall(MIRBuilder, MI, LocObserver); if (Result != Legalized) return Result; break; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 79f716353034f..bb5f1f69c79c5 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1150,8 +1150,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) verify(*ST.getInstrInfo()); } -bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper, - MachineInstr &MI) const { +bool AArch64LegalizerInfo::legalizeCustom( + LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const { MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); GISelChangeObserver &Observer = Helper.Observer; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h index 19f77baa77f89..e96ec6db3a1bc 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h @@ -28,7 +28,8 @@ class AArch64LegalizerInfo : public LegalizerInfo { public: AArch64LegalizerInfo(const AArch64Subtarget &ST); - bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override; + bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const override; bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index fbee288894518..dfbe5c7fed882 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1996,8 +1996,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, verify(*ST.getInstrInfo()); } -bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, - MachineInstr &MI) const { +bool 
AMDGPULegalizerInfo::legalizeCustom( + LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const { MachineIRBuilder &B = Helper.MIRBuilder; MachineRegisterInfo &MRI = *B.getMRI(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 855fa0ddc214f..1fa064891a2d9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -35,7 +35,8 @@ class AMDGPULegalizerInfo final : public LegalizerInfo { AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM); - bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override; + bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const override; Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, diff --git a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp index 3ffde86ce1bb2..abea0fef5cdc5 100644 --- a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -362,8 +362,8 @@ ARMLegalizerInfo::getFCmpLibcalls(CmpInst::Predicate Predicate, llvm_unreachable("Unsupported size for FCmp predicate"); } -bool ARMLegalizerInfo::legalizeCustom(LegalizerHelper &Helper, - MachineInstr &MI) const { +bool ARMLegalizerInfo::legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const { using namespace TargetOpcode; MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; @@ -392,7 +392,8 @@ bool ARMLegalizerInfo::legalizeCustom(LegalizerHelper &Helper, OriginalResult}; auto Status = createLibcall(MIRBuilder, Libcall, {RetRegs, RetTy, 0}, {{MI.getOperand(1).getReg(), ArgTy, 0}, - {MI.getOperand(2).getReg(), ArgTy, 0}}); + {MI.getOperand(2).getReg(), ArgTy, 0}}, + LocObserver, &MI); if (Status != LegalizerHelper::Legalized) return false; break; @@ -428,7 +429,8 @@ bool ARMLegalizerInfo::legalizeCustom(LegalizerHelper &Helper, auto Status = createLibcall(MIRBuilder, Libcall.LibcallID, {LibcallResult, RetTy, 0}, {{MI.getOperand(2).getReg(), ArgTy, 0}, - {MI.getOperand(3).getReg(), ArgTy, 0}}); + {MI.getOperand(3).getReg(), ArgTy, 0}}, + LocObserver, &MI); if (Status != LegalizerHelper::Legalized) return false; diff --git a/llvm/lib/Target/ARM/ARMLegalizerInfo.h b/llvm/lib/Target/ARM/ARMLegalizerInfo.h index f1c2e9c943360..3636cc6402b81 100644 --- a/llvm/lib/Target/ARM/ARMLegalizerInfo.h +++ b/llvm/lib/Target/ARM/ARMLegalizerInfo.h @@ -28,7 +28,8 @@ class ARMLegalizerInfo : public LegalizerInfo { public: ARMLegalizerInfo(const ARMSubtarget &ST); - bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override; + bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const override; private: void setFCmpLibcallsGNU(); diff --git a/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp b/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp index 14f26201e6c09..f5e94235859a0 100644 --- a/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp +++ b/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp @@ -330,8 +330,9 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) { verify(*ST.getInstrInfo()); } -bool MipsLegalizerInfo::legalizeCustom(LegalizerHelper &Helper, - MachineInstr &MI) const { +bool MipsLegalizerInfo::legalizeCustom( + LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const { using namespace TargetOpcode; MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; diff --git 
a/llvm/lib/Target/Mips/MipsLegalizerInfo.h b/llvm/lib/Target/Mips/MipsLegalizerInfo.h index 05027b718a857..63daebf264701 100644 --- a/llvm/lib/Target/Mips/MipsLegalizerInfo.h +++ b/llvm/lib/Target/Mips/MipsLegalizerInfo.h @@ -25,7 +25,8 @@ class MipsLegalizerInfo : public LegalizerInfo { public: MipsLegalizerInfo(const MipsSubtarget &ST); - bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override; + bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const override; bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override; diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp index 079906d1958c5..61bae58649258 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp @@ -411,8 +411,9 @@ bool RISCVLegalizerInfo::legalizeVAStart(MachineInstr &MI, return true; } -bool RISCVLegalizerInfo::legalizeCustom(LegalizerHelper &Helper, - MachineInstr &MI) const { +bool RISCVLegalizerInfo::legalizeCustom( + LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const { MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; GISelChangeObserver &Observer = Helper.Observer; switch (MI.getOpcode()) { diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h index 48c36976501fc..4335bd0cbbff0 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h +++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h @@ -30,7 +30,8 @@ class RISCVLegalizerInfo : public LegalizerInfo { public: RISCVLegalizerInfo(const RISCVSubtarget &ST); - bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override; + bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const override; bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override; diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp index faaf7f0e25489..061bc96742371 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp @@ -289,8 +289,9 @@ static Register convertPtrToInt(Register Reg, LLT ConvTy, SPIRVType *SpirvType, return ConvReg; } -bool SPIRVLegalizerInfo::legalizeCustom(LegalizerHelper &Helper, - MachineInstr &MI) const { +bool SPIRVLegalizerInfo::legalizeCustom( + LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const { auto Opc = MI.getOpcode(); MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); if (!isTypeFoldingSupported(Opc)) { diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h index 2541ff29edb0f..f18b15b7f1696 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h @@ -29,7 +29,8 @@ class SPIRVLegalizerInfo : public LegalizerInfo { SPIRVGlobalRegistry *GR; public: - bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override; + bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const override; SPIRVLegalizerInfo(const SPIRVSubtarget &ST); }; } // namespace llvm diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fpmode.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fpmode.mir index 14c53902287f3..2d72e5f55d983 100644 --- 
a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fpmode.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fpmode.mir @@ -79,11 +79,8 @@ body: | ; CHECK-LABEL: name: func_reset ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[C]](s64) - ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp ; CHECK-NEXT: $x0 = COPY [[INTTOPTR]](p0) - ; CHECK-NEXT: BL &fesetmode, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $x0 - ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp - ; CHECK-NEXT: RET_ReallyLR + ; CHECK-NEXT: TCRETURNdi &fesetmode, 0, csr_aarch64_aapcs, implicit $sp, implicit $x0 G_RESET_FPMODE RET_ReallyLR diff --git a/llvm/test/CodeGen/AArch64/fexplog.ll b/llvm/test/CodeGen/AArch64/fexplog.ll index 26c0b68307b32..2848a6bde3204 100644 --- a/llvm/test/CodeGen/AArch64/fexplog.ll +++ b/llvm/test/CodeGen/AArch64/fexplog.ll @@ -3,36 +3,18 @@ ; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI define double @exp_f64(double %a) { -; CHECK-SD-LABEL: exp_f64: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: b exp -; -; CHECK-GI-LABEL: exp_f64: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GI-NEXT: .cfi_offset w30, -16 -; CHECK-GI-NEXT: bl exp -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-LABEL: exp_f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: b exp entry: %c = call double @llvm.exp.f64(double %a) ret double %c } define float @exp_f32(float %a) { -; CHECK-SD-LABEL: exp_f32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: b expf -; -; CHECK-GI-LABEL: exp_f32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GI-NEXT: .cfi_offset w30, -16 -; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-LABEL: exp_f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: b expf entry: %c = call float @llvm.exp.f32(float %a) ret float %c @@ -1280,36 +1262,18 @@ entry: } define double @exp2_f64(double %a) { -; CHECK-SD-LABEL: exp2_f64: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: b exp2 -; -; CHECK-GI-LABEL: exp2_f64: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GI-NEXT: .cfi_offset w30, -16 -; CHECK-GI-NEXT: bl exp2 -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-LABEL: exp2_f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: b exp2 entry: %c = call double @llvm.exp2.f64(double %a) ret double %c } define float @exp2_f32(float %a) { -; CHECK-SD-LABEL: exp2_f32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: b exp2f -; -; CHECK-GI-LABEL: exp2_f32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GI-NEXT: .cfi_offset w30, -16 -; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-LABEL: exp2_f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: b exp2f entry: %c = call float @llvm.exp2.f32(float %a) ret float %c @@ -2557,36 +2521,18 @@ entry: } define double @log_f64(double %a) { -; CHECK-SD-LABEL: log_f64: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: b log -; -; CHECK-GI-LABEL: log_f64: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GI-NEXT: .cfi_offset w30, -16 -; CHECK-GI-NEXT: bl log -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-LABEL: log_f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: b log entry: %c = call double @llvm.log.f64(double %a) ret double %c } define float @log_f32(float %a) { -; CHECK-SD-LABEL: log_f32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: b logf -; -; CHECK-GI-LABEL: log_f32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GI-NEXT: .cfi_offset w30, -16 -; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-LABEL: log_f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: b logf entry: %c = call float @llvm.log.f32(float %a) ret float %c @@ -3834,36 +3780,18 @@ entry: } define double @log2_f64(double %a) { -; CHECK-SD-LABEL: log2_f64: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: b log2 -; -; CHECK-GI-LABEL: log2_f64: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GI-NEXT: .cfi_offset w30, -16 -; CHECK-GI-NEXT: bl log2 -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-LABEL: log2_f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: b log2 entry: %c = call double @llvm.log2.f64(double %a) ret double %c } define float @log2_f32(float %a) { -; CHECK-SD-LABEL: log2_f32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: b log2f -; -; CHECK-GI-LABEL: log2_f32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GI-NEXT: .cfi_offset w30, -16 -; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-LABEL: log2_f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: b log2f entry: %c = call float @llvm.log2.f32(float %a) ret float %c @@ -5111,36 +5039,18 @@ entry: } define double @log10_f64(double %a) { -; CHECK-SD-LABEL: log10_f64: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: b log10 -; -; CHECK-GI-LABEL: log10_f64: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GI-NEXT: .cfi_offset w30, -16 -; CHECK-GI-NEXT: bl log10 -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-LABEL: log10_f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: b log10 entry: %c = call double @llvm.log10.f64(double %a) ret double %c } define float @log10_f32(float %a) { -; CHECK-SD-LABEL: log10_f32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: b log10f -; -; CHECK-GI-LABEL: log10_f32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GI-NEXT: .cfi_offset w30, -16 -; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-LABEL: log10_f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: b log10f entry: %c = call float @llvm.log10.f32(float %a) ret float %c diff --git a/llvm/test/CodeGen/AArch64/fpmode.ll b/llvm/test/CodeGen/AArch64/fpmode.ll index a4f2c4f13cfa6..ebfb0696a95ad 100644 --- a/llvm/test/CodeGen/AArch64/fpmode.ll +++ b/llvm/test/CodeGen/AArch64/fpmode.ll @@ -63,11 +63,8 @@ define void @func_reset_fpmode_soft() #0 { ; ; GIS-LABEL: func_reset_fpmode_soft: ; GIS: // %bb.0: // %entry -; GIS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; GIS-NEXT: mov x0, #-1 // =0xffffffffffffffff -; GIS-NEXT: bl fesetmode -; GIS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; GIS-NEXT: ret +; GIS-NEXT: b fesetmode entry: call void @llvm.reset.fpmode() ret void diff --git a/llvm/test/CodeGen/AArch64/fpow.ll b/llvm/test/CodeGen/AArch64/fpow.ll index 7681ab510b309..a55c0dbffae2b 100644 --- a/llvm/test/CodeGen/AArch64/fpow.ll +++ b/llvm/test/CodeGen/AArch64/fpow.ll @@ -3,36 +3,18 @@ ; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI define double @pow_f64(double %a, double %b) { -; CHECK-SD-LABEL: pow_f64: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: b pow -; -; CHECK-GI-LABEL: pow_f64: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GI-NEXT: .cfi_offset w30, -16 -; CHECK-GI-NEXT: bl pow -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-LABEL: pow_f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: b pow entry: %c = call double @llvm.pow.f64(double %a, double %b) ret double %c } define float @pow_f32(float %a, float %b) { -; CHECK-SD-LABEL: pow_f32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: b powf -; -; CHECK-GI-LABEL: pow_f32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GI-NEXT: .cfi_offset w30, -16 -; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-LABEL: pow_f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: b powf entry: %c = call float @llvm.pow.f32(float %a, float %b) ret float %c diff --git a/llvm/test/CodeGen/AArch64/frem.ll b/llvm/test/CodeGen/AArch64/frem.ll index ed0f6c442ece1..eb26128d6469f 100644 --- a/llvm/test/CodeGen/AArch64/frem.ll +++ b/llvm/test/CodeGen/AArch64/frem.ll @@ -5,36 +5,18 @@ ; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI define double @frem_f64(double %a, double %b) { -; CHECK-SD-LABEL: frem_f64: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: b fmod -; -; CHECK-GI-LABEL: frem_f64: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GI-NEXT: .cfi_offset w30, -16 -; CHECK-GI-NEXT: bl fmod -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-LABEL: frem_f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: b fmod entry: %c = frem double %a, %b ret double %c } define float @frem_f32(float %a, float %b) { -; CHECK-SD-LABEL: frem_f32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: b fmodf -; -; CHECK-GI-LABEL: frem_f32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GI-NEXT: .cfi_offset w30, -16 -; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-LABEL: frem_f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: b fmodf entry: %c = frem float %a, %b ret float %c diff --git a/llvm/test/CodeGen/AArch64/fsincos.ll b/llvm/test/CodeGen/AArch64/fsincos.ll index 9784b9ea9b65f..aef0b2e29243e 100644 --- a/llvm/test/CodeGen/AArch64/fsincos.ll +++ b/llvm/test/CodeGen/AArch64/fsincos.ll @@ -3,36 +3,18 @@ ; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI define double @sin_f64(double %a) { -; CHECK-SD-LABEL: sin_f64: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: b sin -; -; CHECK-GI-LABEL: sin_f64: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GI-NEXT: .cfi_offset w30, -16 -; CHECK-GI-NEXT: bl sin -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sin_f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: b sin entry: %c = call double @llvm.sin.f64(double %a) ret double %c } define float @sin_f32(float %a) { -; CHECK-SD-LABEL: sin_f32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: b sinf -; -; CHECK-GI-LABEL: sin_f32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GI-NEXT: .cfi_offset w30, -16 -; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sin_f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: b sinf entry: %c = call float @llvm.sin.f32(float %a) ret float %c @@ -1280,36 +1262,18 @@ entry: } define double @cos_f64(double %a) { -; CHECK-SD-LABEL: cos_f64: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: b cos -; -; CHECK-GI-LABEL: cos_f64: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GI-NEXT: .cfi_offset w30, -16 -; CHECK-GI-NEXT: bl cos -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-LABEL: cos_f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: b cos entry: %c = call double @llvm.cos.f64(double %a) ret double %c } define float @cos_f32(float %a) { -; CHECK-SD-LABEL: cos_f32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: b cosf -; -; CHECK-GI-LABEL: cos_f32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GI-NEXT: .cfi_offset w30, -16 -; CHECK-GI-NEXT: bl cosf -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-LABEL: cos_f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: b cosf entry: %c = call float @llvm.cos.f32(float %a) ret float %c @@ -2556,6 +2520,24 @@ entry: ret <16 x half> %c } +; This is testing that we do not produce incorrect tailcall lowerings +define i64 @donttailcall(double noundef %x, double noundef %y) { +; CHECK-LABEL: donttailcall: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl sin +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %call = tail call double @llvm.sin.f64(double noundef %x) + %0 = bitcast double %call to i64 + ret i64 %0 +} + + declare <16 x half> @llvm.cos.v16f16(<16 x half>) declare <16 x half> @llvm.sin.v16f16(<16 x half>) declare <2 x double> @llvm.cos.v2f64(<2 x double>) diff --git a/llvm/test/CodeGen/AArch64/llvm.exp10.ll b/llvm/test/CodeGen/AArch64/llvm.exp10.ll index c3f14b0dcff71..8e6b15c8ba8d4 100644 --- a/llvm/test/CodeGen/AArch64/llvm.exp10.ll +++ b/llvm/test/CodeGen/AArch64/llvm.exp10.ll @@ -298,18 +298,9 @@ define <4 x half> @exp10_v4f16(<4 x half> %x) { } define float @exp10_f32(float %x) { -; SDAG-LABEL: exp10_f32: -; SDAG: // %bb.0: -; SDAG-NEXT: b exp10f -; -; GISEL-LABEL: exp10_f32: -; GISEL: // %bb.0: -; GISEL-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: .cfi_offset w30, -16 -; GISEL-NEXT: bl exp10f -; GISEL-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; GISEL-NEXT: ret +; CHECK-LABEL: exp10_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: b exp10f %r = call float @llvm.exp10.f32(float %x) ret float %r } @@ -541,18 +532,9 @@ define <4 x float> @exp10_v4f32(<4 x float> %x) { } define double @exp10_f64(double %x) { -; SDAG-LABEL: exp10_f64: -; SDAG: // %bb.0: -; SDAG-NEXT: b exp10 -; -; GISEL-LABEL: exp10_f64: -; GISEL: // %bb.0: -; GISEL-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: .cfi_offset w30, -16 -; GISEL-NEXT: bl exp10 -; GISEL-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; GISEL-NEXT: ret +; CHECK-LABEL: exp10_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: b exp10 %r = call double @llvm.exp10.f64(double %x) ret double %r } diff --git a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp index 3fb10db7f6665..d7876b7ce8749 100644 --- a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp @@ -3656,12 +3656,14 @@ TEST_F(AArch64GISelMITest, CreateLibcall) { AInfo Info(MF->getSubtarget()); DummyGISelObserver Observer; + LostDebugLocObserver DummyLocObserver(""); LLVMContext &Ctx = MF->getFunction().getContext(); auto *RetTy = Type::getVoidTy(Ctx); EXPECT_EQ(LegalizerHelper::LegalizeResult::Legalized, - createLibcall(B, "abort", {{}, RetTy, 0}, {}, CallingConv::C)); + createLibcall(B, "abort", {{}, RetTy, 0}, {}, CallingConv::C, + DummyLocObserver, nullptr)); auto CheckStr = R"( CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp From b7d5b0d0eeda9bc0c7e8c4a6ee2d4ab6b48eb736 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= Date: Wed, 3 Jan 2024 09:51:18 +0100 Subject: [PATCH 108/313] [Orc][examples] Revisit advanced LLJIT examples and tests (#76236) Some maintenance on implementation and tests: * Drop manual TargetMachineBuilder setup * Drop addDebugSupport() in favor of Orc's enableDebuggerSupport() * Check that debug support plugins append jit_code_entry * Update and reduce sample input --- .../LLJITWithRemoteDebugging/CMakeLists.txt | 1 + .../LLJITWithRemoteDebugging.cpp | 26 ++------- .../RemoteJITUtils.cpp | 25 +++------ .../LLJITWithRemoteDebugging/RemoteJITUtils.h | 2 - .../OrcV2Examples/Inputs/argc_sub1.ll | 16 ++++++ .../OrcV2Examples/Inputs/argc_sub1_elf.ll | 51 ----------------- .../lljit-with-remote-debugging.test | 16 ++++-- .../llvm-jitlink-executor.cpp | 55 ++++++++++++++++++- 8 files changed, 93 insertions(+), 99 deletions(-) create mode 100644 llvm/test/Examples/OrcV2Examples/Inputs/argc_sub1.ll delete mode 100644 llvm/test/Examples/OrcV2Examples/Inputs/argc_sub1_elf.ll diff --git a/llvm/examples/OrcV2Examples/LLJITWithRemoteDebugging/CMakeLists.txt b/llvm/examples/OrcV2Examples/LLJITWithRemoteDebugging/CMakeLists.txt index 576603c47f593..51b3925f4a9e7 100644 --- a/llvm/examples/OrcV2Examples/LLJITWithRemoteDebugging/CMakeLists.txt +++ b/llvm/examples/OrcV2Examples/LLJITWithRemoteDebugging/CMakeLists.txt @@ -3,6 +3,7 @@ set(LLVM_LINK_COMPONENTS ExecutionEngine IRReader JITLink + OrcDebugging OrcJIT OrcShared OrcTargetProcess diff --git a/llvm/examples/OrcV2Examples/LLJITWithRemoteDebugging/LLJITWithRemoteDebugging.cpp b/llvm/examples/OrcV2Examples/LLJITWithRemoteDebugging/LLJITWithRemoteDebugging.cpp index 9001125060583..1f69415649e84 100644 --- a/llvm/examples/OrcV2Examples/LLJITWithRemoteDebugging/LLJITWithRemoteDebugging.cpp +++ b/llvm/examples/OrcV2Examples/LLJITWithRemoteDebugging/LLJITWithRemoteDebugging.cpp @@ -77,6 +77,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ExecutionEngine/Orc/Debugging/DebuggerSupport.h" #include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h" #include "llvm/ExecutionEngine/Orc/LLJIT.h" #include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h" @@ -174,32 +175,13 @@ int main(int argc, char *argv[]) { 
TSMs.push_back(ExitOnErr(parseExampleModuleFromFile(Path))); } - std::string TT; - StringRef MainModuleName; - TSMs.front().withModuleDo([&MainModuleName, &TT](Module &M) { - MainModuleName = M.getName(); - TT = M.getTargetTriple(); - if (TT.empty()) - TT = sys::getProcessTriple(); - }); - - // Create a target machine that matches the input triple. - JITTargetMachineBuilder JTMB((Triple(TT))); - JTMB.setCodeModel(CodeModel::Small); - JTMB.setRelocationModel(Reloc::PIC_); - // Create LLJIT and destroy it before disconnecting the target process. outs() << "Initializing LLJIT for remote executor\n"; - auto J = ExitOnErr(LLJITBuilder() - .setExecutorProcessControl(std::move(EPC)) - .setJITTargetMachineBuilder(std::move(JTMB)) - .setObjectLinkingLayerCreator([&](auto &ES, const auto &TT) { - return std::make_unique(ES); - }) - .create()); + auto J = ExitOnErr( + LLJITBuilder().setExecutorProcessControl(std::move(EPC)).create()); // Add plugin for debug support. - ExitOnErr(addDebugSupport(J->getObjLinkingLayer())); + ExitOnErr(enableDebuggerSupport(*J)); // Load required shared libraries on the remote target and add a generator // for each of it, so the compiler can lookup their symbols. diff --git a/llvm/examples/OrcV2Examples/LLJITWithRemoteDebugging/RemoteJITUtils.cpp b/llvm/examples/OrcV2Examples/LLJITWithRemoteDebugging/RemoteJITUtils.cpp index 49f5fcdbef8df..b11d875c6f2d0 100644 --- a/llvm/examples/OrcV2Examples/LLJITWithRemoteDebugging/RemoteJITUtils.cpp +++ b/llvm/examples/OrcV2Examples/LLJITWithRemoteDebugging/RemoteJITUtils.cpp @@ -27,22 +27,6 @@ using namespace llvm; using namespace llvm::orc; -Error addDebugSupport(ObjectLayer &ObjLayer) { - ExecutionSession &ES = ObjLayer.getExecutionSession(); - auto Registrar = createJITLoaderGDBRegistrar(ES); - if (!Registrar) - return Registrar.takeError(); - - auto *ObjLinkingLayer = cast(&ObjLayer); - if (!ObjLinkingLayer) - return createStringError(inconvertibleErrorCode(), - "No debug support for given object layer type"); - - ObjLinkingLayer->addPlugin(std::make_unique( - ES, std::move(*Registrar), true, true)); - return Error::success(); -} - Expected> loadDylib(ExecutionSession &ES, StringRef RemotePath) { if (auto Handle = ES.getExecutorProcessControl().loadDylib(RemotePath.data())) @@ -111,11 +95,15 @@ launchLocalExecutor(StringRef ExecutablePath) { close(FromExecutor[ReadEnd]); // Execute the child process. 
- std::unique_ptr ExecPath, FDSpecifier; + std::unique_ptr ExecPath, FDSpecifier, TestOutputFlag; { ExecPath = std::make_unique(ExecutablePath.size() + 1); strcpy(ExecPath.get(), ExecutablePath.data()); + const char *TestOutputFlagStr = "test-jitloadergdb"; + TestOutputFlag = std::make_unique(strlen(TestOutputFlagStr) + 1); + strcpy(TestOutputFlag.get(), TestOutputFlagStr); + std::string FDSpecifierStr("filedescs="); FDSpecifierStr += utostr(ToExecutor[ReadEnd]); FDSpecifierStr += ','; @@ -124,7 +112,8 @@ launchLocalExecutor(StringRef ExecutablePath) { strcpy(FDSpecifier.get(), FDSpecifierStr.c_str()); } - char *const Args[] = {ExecPath.get(), FDSpecifier.get(), nullptr}; + char *const Args[] = {ExecPath.get(), TestOutputFlag.get(), + FDSpecifier.get(), nullptr}; int RC = execvp(ExecPath.get(), Args); if (RC != 0) return make_error( diff --git a/llvm/examples/OrcV2Examples/LLJITWithRemoteDebugging/RemoteJITUtils.h b/llvm/examples/OrcV2Examples/LLJITWithRemoteDebugging/RemoteJITUtils.h index e7cad9facdea6..3663307109bdb 100644 --- a/llvm/examples/OrcV2Examples/LLJITWithRemoteDebugging/RemoteJITUtils.h +++ b/llvm/examples/OrcV2Examples/LLJITWithRemoteDebugging/RemoteJITUtils.h @@ -36,8 +36,6 @@ launchLocalExecutor(llvm::StringRef ExecutablePath); llvm::Expected> connectTCPSocket(llvm::StringRef NetworkAddress); -llvm::Error addDebugSupport(llvm::orc::ObjectLayer &ObjLayer); - llvm::Expected> loadDylib(llvm::orc::ExecutionSession &ES, llvm::StringRef RemotePath); diff --git a/llvm/test/Examples/OrcV2Examples/Inputs/argc_sub1.ll b/llvm/test/Examples/OrcV2Examples/Inputs/argc_sub1.ll new file mode 100644 index 0000000000000..97317710a60a1 --- /dev/null +++ b/llvm/test/Examples/OrcV2Examples/Inputs/argc_sub1.ll @@ -0,0 +1,16 @@ +define i32 @sub1(i32 %0) { + %2 = add i32 %0, -1 + ret i32 %2 +} + +define i32 @main(i32 %0) { + %2 = call i32 @sub1(i32 %0) + ret i32 %2 +} + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C99, file: !2, producer: "clang 18.0.0git", emissionKind: FullDebug) +!2 = !DIFile(filename: "argc_sub1.c", directory: ".") diff --git a/llvm/test/Examples/OrcV2Examples/Inputs/argc_sub1_elf.ll b/llvm/test/Examples/OrcV2Examples/Inputs/argc_sub1_elf.ll deleted file mode 100644 index 0cdc5e7de844d..0000000000000 --- a/llvm/test/Examples/OrcV2Examples/Inputs/argc_sub1_elf.ll +++ /dev/null @@ -1,51 +0,0 @@ -; ModuleID = 'argc_sub1.c' - -define i32 @sub1(i32) !dbg !8 { - call void @llvm.dbg.value(metadata i32 %0, metadata !13, metadata !DIExpression()), !dbg !14 - %2 = add nsw i32 %0, -1, !dbg !15 - ret i32 %2, !dbg !16 -} - -define i32 @main(i32, i8** nocapture readnone) !dbg !17 { - call void @llvm.dbg.value(metadata i32 %0, metadata !24, metadata !DIExpression()), !dbg !26 - call void @llvm.dbg.value(metadata i8** %1, metadata !25, metadata !DIExpression()), !dbg !27 - %3 = tail call i32 @sub1(i32 %0), !dbg !28 - ret i32 %3, !dbg !29 -} - -declare void @llvm.dbg.value(metadata, metadata, metadata) - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5, !6} -!llvm.ident = !{!7} - -!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 7.0.1-8+deb10u2 (tags/RELEASE_701/final)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) -!1 = !DIFile(filename: "argc_sub1.c", directory: "Inputs/") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{i32 7, 
!"PIC Level", i32 2} -!7 = !{!"clang version 7.0.1-8+deb10u2 (tags/RELEASE_701/final)"} -!8 = distinct !DISubprogram(name: "sub1", scope: !1, file: !1, line: 1, type: !9, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) -!9 = !DISubroutineType(types: !10) -!10 = !{!11, !11} -!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -!12 = !{!13} -!13 = !DILocalVariable(name: "x", arg: 1, scope: !8, file: !1, line: 1, type: !11) -!14 = !DILocation(line: 1, column: 14, scope: !8) -!15 = !DILocation(line: 1, column: 28, scope: !8) -!16 = !DILocation(line: 1, column: 19, scope: !8) -!17 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 2, type: !18, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !23) -!18 = !DISubroutineType(types: !19) -!19 = !{!11, !11, !20} -!20 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !21, size: 64) -!21 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !22, size: 64) -!22 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char) -!23 = !{!24, !25} -!24 = !DILocalVariable(name: "argc", arg: 1, scope: !17, file: !1, line: 2, type: !11) -!25 = !DILocalVariable(name: "argv", arg: 2, scope: !17, file: !1, line: 2, type: !20) -!26 = !DILocation(line: 2, column: 14, scope: !17) -!27 = !DILocation(line: 2, column: 27, scope: !17) -!28 = !DILocation(line: 2, column: 42, scope: !17) -!29 = !DILocation(line: 2, column: 35, scope: !17) diff --git a/llvm/test/Examples/OrcV2Examples/lljit-with-remote-debugging.test b/llvm/test/Examples/OrcV2Examples/lljit-with-remote-debugging.test index b0b33503a1eec..83dbf6249cfa1 100644 --- a/llvm/test/Examples/OrcV2Examples/lljit-with-remote-debugging.test +++ b/llvm/test/Examples/OrcV2Examples/lljit-with-remote-debugging.test @@ -1,15 +1,21 @@ -# This test makes sure that the example builds and executes as expected. +# Check that the debug support plugin appends a jit_code_entry to the +# jit_descriptor of the child process. 
+ # Instructions for debugging can be found in LLJITWithRemoteDebugging.cpp # REQUIRES: default_triple # UNSUPPORTED: target=powerpc64{{.*}} -# RUN: LLJITWithRemoteDebugging %p/Inputs/argc_sub1_elf.ll | FileCheck --check-prefix=CHECK0 %s -# CHECK0: Parsing input IR code from: {{.*}}/Inputs/argc_sub1_elf.ll +# RUN: LLJITWithRemoteDebugging %p/Inputs/argc_sub1.ll 2>&1 | FileCheck --check-prefix=CHECK0 %s +# CHECK0: __jit_debug_descriptor.last_entry = [[BEFORE0:0x[0-9a-f]+]] +# CHECK0-NOT: __jit_debug_descriptor.last_entry = [[BEFORE0]] +# CHECK0: Parsing input IR code from: {{.*}}/Inputs/argc_sub1.ll # CHECK0: Running: main() # CHECK0: Exit code: 0 -# RUN: LLJITWithRemoteDebugging %p/Inputs/argc_sub1_elf.ll --args 2nd 3rd 4th | FileCheck --check-prefix=CHECK3 %s -# CHECK3: Parsing input IR code from: {{.*}}/Inputs/argc_sub1_elf.ll +# RUN: LLJITWithRemoteDebugging %p/Inputs/argc_sub1.ll --args 2nd 3rd 4th 2>&1 | FileCheck --check-prefix=CHECK3 %s +# CHECK3: __jit_debug_descriptor.last_entry = [[BEFORE3:0x[0-9a-f]+]] +# CHECK3-NOT: __jit_debug_descriptor.last_entry = [[BEFORE3]] +# CHECK3: Parsing input IR code from: {{.*}}/Inputs/argc_sub1.ll # CHECK3: Running: main("2nd", "3rd", "4th") # CHECK3: Exit code: 3 diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp index 71c83f2badb8f..3a05c9b5be24d 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp @@ -112,6 +112,44 @@ int openListener(std::string Host, std::string PortStr) { #endif // LLVM_ON_UNIX } +// This must be kept in sync with gdb/gdb/jit.h . +extern "C" { + +typedef enum { + JIT_NOACTION = 0, + JIT_REGISTER_FN, + JIT_UNREGISTER_FN +} jit_actions_t; + +struct jit_code_entry { + struct jit_code_entry *next_entry; + struct jit_code_entry *prev_entry; + const char *symfile_addr; + uint64_t symfile_size; +}; + +struct jit_descriptor { + uint32_t version; + // This should be jit_actions_t, but we want to be specific about the + // bit-width. + uint32_t action_flag; + struct jit_code_entry *relevant_entry; + struct jit_code_entry *first_entry; +}; + +// We put information about the JITed function in this global, which the +// debugger reads. Make sure to specify the version statically, because the +// debugger checks the version before we can set it during runtime. 
+extern struct jit_descriptor __jit_debug_descriptor; + +static void *findLastDebugDescriptorEntryPtr() { + struct jit_code_entry *Last = __jit_debug_descriptor.first_entry; + while (Last && Last->next_entry) + Last = Last->next_entry; + return Last; +} +} + int main(int argc, char *argv[]) { #if LLVM_ENABLE_THREADS @@ -121,10 +159,11 @@ int main(int argc, char *argv[]) { int InFD = 0; int OutFD = 0; + std::vector TestOutputFlags; + if (argc < 2) printErrorAndExit("insufficient arguments"); else { - StringRef ConnectArg = argv[FirstProgramArg++]; #ifndef NDEBUG if (ConnectArg == "debug") { @@ -133,6 +172,11 @@ int main(int argc, char *argv[]) { } #endif + while (ConnectArg.starts_with("test-")) { + TestOutputFlags.push_back(ConnectArg); + ConnectArg = argv[FirstProgramArg++]; + } + StringRef SpecifierType, Specifier; std::tie(SpecifierType, Specifier) = ConnectArg.split('='); if (SpecifierType == "filedescs") { @@ -156,6 +200,10 @@ int main(int argc, char *argv[]) { printErrorAndExit("invalid specifier type \"" + SpecifierType + "\""); } + if (llvm::is_contained(TestOutputFlags, "test-jitloadergdb")) + fprintf(stderr, "__jit_debug_descriptor.last_entry = 0x%016" PRIx64 "\n", + pointerToJITTargetAddress(findLastDebugDescriptorEntryPtr())); + auto Server = ExitOnErr(SimpleRemoteEPCServer::Create( [](SimpleRemoteEPCServer::Setup &S) -> Error { @@ -173,6 +221,11 @@ int main(int argc, char *argv[]) { InFD, OutFD)); ExitOnErr(Server->waitForDisconnect()); + + if (llvm::is_contained(TestOutputFlags, "test-jitloadergdb")) + fprintf(stderr, "__jit_debug_descriptor.last_entry = 0x%016" PRIx64 "\n", + pointerToJITTargetAddress(findLastDebugDescriptorEntryPtr())); + return 0; #else From c09e6905567a6b546bb2fd9e863511a8fb939b19 Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Wed, 3 Jan 2024 09:51:58 +0100 Subject: [PATCH 109/313] [libc][NFC] Remove `FloatProperties` (#76508) Access is now done through `FPBits` exclusively. This patch also renames a few internal structs and uses `T` instead of `FP` as a template parameter. 
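Call sites update mechanically. For example, in the strtofloat fuzzer touched below, the effective-precision computation becomes:

    const int full_precision = FPBits<T>::MANTISSA_PRECISION;
    const int bits_below_normal = (-exponent) - (FPBits<T>::EXP_BIAS - 1);

(The explicit <T> is written out here only for clarity; template arguments appear without angle brackets in the diff text of this dump.)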
--- libc/fuzzing/stdlib/strtofloat_fuzz.cpp | 7 +- libc/src/__support/FPUtil/FPBits.h | 131 +++++++++--------- .../__support/FPUtil/ManipulationFunctions.h | 6 +- libc/src/__support/FPUtil/dyadic_float.h | 37 +++-- libc/src/__support/FPUtil/generic/FMA.h | 7 +- .../__support/FPUtil/x86_64/LongDoubleBits.h | 5 +- libc/src/__support/float_to_string.h | 8 +- libc/src/__support/str_to_float.h | 77 +++++----- libc/src/math/generic/exp.cpp | 5 +- libc/src/math/generic/exp10.cpp | 5 +- libc/src/math/generic/exp2.cpp | 5 +- libc/src/math/generic/exp2f_impl.h | 2 +- libc/src/math/generic/explogxf.h | 10 +- libc/src/math/generic/expm1.cpp | 5 +- libc/src/math/generic/powf.cpp | 44 +++--- libc/src/math/generic/range_reduction.h | 2 +- libc/src/math/generic/tanhf.cpp | 2 +- .../stdio/printf_core/float_dec_converter.h | 9 +- libc/test/src/__support/str_to_fp_test.h | 2 +- libc/test/src/math/FrexpTest.h | 3 +- libc/test/src/math/LogbTest.h | 3 +- libc/test/src/math/SqrtTest.h | 3 +- libc/test/src/math/smoke/FrexpTest.h | 3 +- libc/test/src/math/smoke/LogbTest.h | 3 +- libc/test/src/math/smoke/SqrtTest.h | 3 +- libc/utils/MPFRWrapper/MPFRUtils.cpp | 2 +- 26 files changed, 180 insertions(+), 209 deletions(-) diff --git a/libc/fuzzing/stdlib/strtofloat_fuzz.cpp b/libc/fuzzing/stdlib/strtofloat_fuzz.cpp index 0e0d82fd3e8af..affef6fcf549e 100644 --- a/libc/fuzzing/stdlib/strtofloat_fuzz.cpp +++ b/libc/fuzzing/stdlib/strtofloat_fuzz.cpp @@ -22,18 +22,17 @@ #include "utils/MPFRWrapper/mpfr_inc.h" -using LIBC_NAMESPACE::fputil::FloatProperties; +using LIBC_NAMESPACE::fputil::FPBits; // This function calculates the effective precision for a given float type and // exponent. Subnormals have a lower effective precision since they don't // necessarily use all of the bits of the mantissa. template inline constexpr int effective_precision(int exponent) { - const int full_precision = FloatProperties::MANTISSA_PRECISION; + const int full_precision = FPBits::MANTISSA_PRECISION; // This is intended to be 0 when the exponent is the lowest normal and // increase as the exponent's magnitude increases. - const int bits_below_normal = - (-exponent) - (FloatProperties::EXP_BIAS - 1); + const int bits_below_normal = (-exponent) - (FPBits::EXP_BIAS - 1); // The precision should be the normal, full precision, minus the bits lost // by this being a subnormal, minus one for the implicit leading one. diff --git a/libc/src/__support/FPUtil/FPBits.h b/libc/src/__support/FPUtil/FPBits.h index d06625ed13852..8304b76d3d8ad 100644 --- a/libc/src/__support/FPUtil/FPBits.h +++ b/libc/src/__support/FPUtil/FPBits.h @@ -39,64 +39,66 @@ enum class FPEncoding { X86_ExtendedPrecision, }; -template struct FPBaseProperties {}; +// Defines the layout (sign, exponent, significand) of a floating point type in +// memory. It also defines its associated StorageType, i.e., the unsigned +// integer type used to manipulate its representation. 
+template struct FPLayout {}; -template <> struct FPBaseProperties { +template <> struct FPLayout { using StorageType = uint16_t; - LIBC_INLINE_VAR static constexpr int TOTAL_LEN = 16; - LIBC_INLINE_VAR static constexpr int SIG_LEN = 10; + LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1; LIBC_INLINE_VAR static constexpr int EXP_LEN = 5; + LIBC_INLINE_VAR static constexpr int SIG_LEN = 10; LIBC_INLINE_VAR static constexpr auto ENCODING = FPEncoding::IEEE754; }; -template <> struct FPBaseProperties { +template <> struct FPLayout { using StorageType = uint32_t; - LIBC_INLINE_VAR static constexpr int TOTAL_LEN = 32; - LIBC_INLINE_VAR static constexpr int SIG_LEN = 23; + LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1; LIBC_INLINE_VAR static constexpr int EXP_LEN = 8; + LIBC_INLINE_VAR static constexpr int SIG_LEN = 23; LIBC_INLINE_VAR static constexpr auto ENCODING = FPEncoding::IEEE754; }; -template <> struct FPBaseProperties { +template <> struct FPLayout { using StorageType = uint64_t; - LIBC_INLINE_VAR static constexpr int TOTAL_LEN = 64; - LIBC_INLINE_VAR static constexpr int SIG_LEN = 52; + LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1; LIBC_INLINE_VAR static constexpr int EXP_LEN = 11; + LIBC_INLINE_VAR static constexpr int SIG_LEN = 52; LIBC_INLINE_VAR static constexpr auto ENCODING = FPEncoding::IEEE754; }; -template <> struct FPBaseProperties { +template <> struct FPLayout { using StorageType = UInt128; - LIBC_INLINE_VAR static constexpr int TOTAL_LEN = 128; - LIBC_INLINE_VAR static constexpr int SIG_LEN = 112; + LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1; LIBC_INLINE_VAR static constexpr int EXP_LEN = 15; + LIBC_INLINE_VAR static constexpr int SIG_LEN = 112; LIBC_INLINE_VAR static constexpr auto ENCODING = FPEncoding::IEEE754; }; -template <> struct FPBaseProperties { +template <> struct FPLayout { using StorageType = UInt128; - LIBC_INLINE_VAR static constexpr int TOTAL_LEN = 80; - LIBC_INLINE_VAR static constexpr int SIG_LEN = 64; + LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1; LIBC_INLINE_VAR static constexpr int EXP_LEN = 15; + LIBC_INLINE_VAR static constexpr int SIG_LEN = 64; LIBC_INLINE_VAR static constexpr auto ENCODING = FPEncoding::X86_ExtendedPrecision; }; } // namespace internal +// FPBaseMasksAndShifts derives useful constants from the FPLayout. template -struct FPProperties : public internal::FPBaseProperties { +struct FPBaseMasksAndShifts : public internal::FPLayout { private: - using UP = internal::FPBaseProperties; + using UP = internal::FPLayout; public: - // The number of bits to represent sign. For documentation purpose, always 1. - LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1; - using UP::EXP_LEN; // The number of bits for the *exponent* part - using UP::SIG_LEN; // The number of bits for the *significand* part - using UP::TOTAL_LEN; // For convenience, the sum of `SIG_LEN`, `EXP_LEN`, - // and `SIGN_LEN`. - static_assert(SIGN_LEN + EXP_LEN + SIG_LEN == TOTAL_LEN); + using UP::EXP_LEN; // The number of bits for the *exponent* part + using UP::SIG_LEN; // The number of bits for the *significand* part + using UP::SIGN_LEN; // The number of bits for the *sign* part + // For convenience, the sum of `SIG_LEN`, `EXP_LEN`, and `SIGN_LEN`. + LIBC_INLINE_VAR static constexpr int TOTAL_LEN = SIGN_LEN + EXP_LEN + SIG_LEN; // An unsigned integer that is wide enough to contain all of the floating // point bits. @@ -173,45 +175,12 @@ struct FPProperties : public internal::FPBaseProperties { : bit_at(SIG_LEN - 2); // 0b0100... 
}; -//----------------------------------------------------------------------------- -template LIBC_INLINE static constexpr FPType get_fp_type() { - if constexpr (cpp::is_same_v && __FLT_MANT_DIG__ == 24) - return FPType::IEEE754_Binary32; - else if constexpr (cpp::is_same_v && __DBL_MANT_DIG__ == 53) - return FPType::IEEE754_Binary64; - else if constexpr (cpp::is_same_v) { - if constexpr (__LDBL_MANT_DIG__ == 53) - return FPType::IEEE754_Binary64; - else if constexpr (__LDBL_MANT_DIG__ == 64) - return FPType::X86_Binary80; - else if constexpr (__LDBL_MANT_DIG__ == 113) - return FPType::IEEE754_Binary128; - } -#if defined(LIBC_COMPILER_HAS_C23_FLOAT16) - else if constexpr (cpp::is_same_v) - return FPType::IEEE754_Binary16; -#endif -#if defined(LIBC_COMPILER_HAS_C23_FLOAT128) - else if constexpr (cpp::is_same_v) - return FPType::IEEE754_Binary128; -#endif -#if defined(LIBC_COMPILER_HAS_FLOAT128_EXTENSION) - else if constexpr (cpp::is_same_v) - return FPType::IEEE754_Binary128; -#endif - else - static_assert(cpp::always_false, "Unsupported type"); -} - -template -struct FloatProperties : public FPProperties()> {}; - namespace internal { // This is a temporary class to unify common methods and properties between // FPBits and FPBits. -template struct FPBitsCommon : private FPProperties { - using UP = FPProperties; +template struct FPRep : private FPBaseMasksAndShifts { + using UP = FPBaseMasksAndShifts; using typename UP::StorageType; using UP::TOTAL_LEN; @@ -227,15 +196,17 @@ template struct FPBitsCommon : private FPProperties { using UP::FP_MASK; using UP::FRACTION_LEN; using UP::FRACTION_MASK; + using UP::MANTISSA_PRECISION; using UP::SIGN_MASK; + using UP::STORAGE_LEN; // Reinterpreting bits as an integer value and interpreting the bits of an // integer value as a floating point value is used in tests. So, a convenient // type is provided for such reinterpretations. StorageType bits; - LIBC_INLINE constexpr FPBitsCommon() : bits(0) {} - LIBC_INLINE explicit constexpr FPBitsCommon(StorageType bits) : bits(bits) {} + LIBC_INLINE constexpr FPRep() : bits(0) {} + LIBC_INLINE explicit constexpr FPRep(StorageType bits) : bits(bits) {} LIBC_INLINE constexpr void set_mantissa(StorageType mantVal) { mantVal &= FRACTION_MASK; @@ -297,6 +268,37 @@ template struct FPBitsCommon : private FPProperties { } // namespace internal +// Returns the FPType corresponding to C++ type T on the host. 
+template LIBC_INLINE static constexpr FPType get_fp_type() { + using UnqualT = cpp::remove_cv_t; + if constexpr (cpp::is_same_v && __FLT_MANT_DIG__ == 24) + return FPType::IEEE754_Binary32; + else if constexpr (cpp::is_same_v && __DBL_MANT_DIG__ == 53) + return FPType::IEEE754_Binary64; + else if constexpr (cpp::is_same_v) { + if constexpr (__LDBL_MANT_DIG__ == 53) + return FPType::IEEE754_Binary64; + else if constexpr (__LDBL_MANT_DIG__ == 64) + return FPType::X86_Binary80; + else if constexpr (__LDBL_MANT_DIG__ == 113) + return FPType::IEEE754_Binary128; + } +#if defined(LIBC_COMPILER_HAS_C23_FLOAT16) + else if constexpr (cpp::is_same_v) + return FPType::IEEE754_Binary16; +#endif +#if defined(LIBC_COMPILER_HAS_C23_FLOAT128) + else if constexpr (cpp::is_same_v) + return FPType::IEEE754_Binary128; +#endif +#if defined(LIBC_COMPILER_HAS_FLOAT128_EXTENSION) + else if constexpr (cpp::is_same_v) + return FPType::IEEE754_Binary128; +#endif + else + static_assert(cpp::always_false, "Unsupported type"); +} + // A generic class to represent single precision, double precision, and quad // precision IEEE 754 floating point formats. // On most platforms, the 'float' type corresponds to single precision floating @@ -305,11 +307,10 @@ template struct FPBitsCommon : private FPProperties { // floating numbers. On x86 platforms however, the 'long double' type maps to // an x87 floating point format. This format is an IEEE 754 extension format. // It is handled as an explicit specialization of this class. -template -struct FPBits : public internal::FPBitsCommon()> { +template struct FPBits : public internal::FPRep()> { static_assert(cpp::is_floating_point_v, "FPBits instantiated with invalid type."); - using UP = internal::FPBitsCommon()>; + using UP = internal::FPRep()>; using StorageType = typename UP::StorageType; using UP::bits; diff --git a/libc/src/__support/FPUtil/ManipulationFunctions.h b/libc/src/__support/FPUtil/ManipulationFunctions.h index a2064594e63a5..42433b9b8442d 100644 --- a/libc/src/__support/FPUtil/ManipulationFunctions.h +++ b/libc/src/__support/FPUtil/ManipulationFunctions.h @@ -174,13 +174,13 @@ LIBC_INLINE T nextafter(T from, U to) { } else { int_val = FPBits::MIN_SUBNORMAL; if (to_bits.get_sign()) - int_val |= FloatProperties::SIGN_MASK; + int_val |= FPBits::SIGN_MASK; } - StorageType exponent_bits = int_val & FloatProperties::EXP_MASK; + StorageType exponent_bits = int_val & FPBits::EXP_MASK; if (exponent_bits == StorageType(0)) raise_except_if_required(FE_UNDERFLOW | FE_INEXACT); - else if (exponent_bits == FloatProperties::EXP_MASK) + else if (exponent_bits == FPBits::EXP_MASK) raise_except_if_required(FE_OVERFLOW | FE_INEXACT); return cpp::bit_cast(int_val); diff --git a/libc/src/__support/FPUtil/dyadic_float.h b/libc/src/__support/FPUtil/dyadic_float.h index 561345fd87cfd..ccd3c69bf3db2 100644 --- a/libc/src/__support/FPUtil/dyadic_float.h +++ b/libc/src/__support/FPUtil/dyadic_float.h @@ -41,10 +41,10 @@ template struct DyadicFloat { template , int> = 0> DyadicFloat(T x) { - static_assert(FloatProperties::FRACTION_LEN < Bits); + static_assert(FPBits::FRACTION_LEN < Bits); FPBits x_bits(x); sign = x_bits.get_sign(); - exponent = x_bits.get_exponent() - FloatProperties::FRACTION_LEN; + exponent = x_bits.get_exponent() - FPBits::FRACTION_LEN; mantissa = MantissaType(x_bits.get_explicit_mantissa()); normalize(); } @@ -83,21 +83,20 @@ template struct DyadicFloat { // Output is rounded correctly with respect to the current rounding mode. // TODO(lntue): Add support for underflow. 
// TODO(lntue): Test or add specialization for x86 long double. - template && - (FloatProperties::FRACTION_LEN < Bits), - void>> + template && + (FPBits::FRACTION_LEN < Bits), + void>> explicit operator T() const { // TODO(lntue): Do we need to treat signed zeros properly? if (mantissa.is_zero()) return 0.0; // Assume that it is normalized, and output is also normal. - constexpr uint32_t PRECISION = FloatProperties::MANTISSA_PRECISION; + constexpr uint32_t PRECISION = FPBits::MANTISSA_PRECISION; using output_bits_t = typename FPBits::StorageType; - int exp_hi = - exponent + static_cast((Bits - 1) + FloatProperties::EXP_BIAS); + int exp_hi = exponent + static_cast((Bits - 1) + FPBits::EXP_BIAS); bool denorm = false; uint32_t shift = Bits - PRECISION; @@ -106,7 +105,7 @@ template struct DyadicFloat { denorm = true; shift = (Bits - PRECISION) + static_cast(1 - exp_hi); - exp_hi = FloatProperties::EXP_BIAS; + exp_hi = FPBits::EXP_BIAS; } int exp_lo = exp_hi - static_cast(PRECISION) - 1; @@ -115,7 +114,7 @@ template struct DyadicFloat { T d_hi = FPBits::create_value(sign, exp_hi, static_cast(m_hi) & - FloatProperties::FRACTION_MASK) + FPBits::FRACTION_MASK) .get_val(); const MantissaType round_mask = MantissaType(1) << (shift - 1); @@ -129,15 +128,13 @@ template struct DyadicFloat { if (LIBC_UNLIKELY(exp_lo <= 0)) { // d_lo is denormal, but the output is normal. int scale_up_exponent = 2 * PRECISION; - T scale_up_factor = FPBits::create_value(sign, - FloatProperties::EXP_BIAS + - scale_up_exponent, - output_bits_t(0)) - .get_val(); + T scale_up_factor = + FPBits::create_value(sign, FPBits::EXP_BIAS + scale_up_exponent, + output_bits_t(0)) + .get_val(); T scale_down_factor = - FPBits::create_value( - sign, FloatProperties::EXP_BIAS - scale_up_exponent, - output_bits_t(0)) + FPBits::create_value(sign, FPBits::EXP_BIAS - scale_up_exponent, + output_bits_t(0)) .get_val(); d_lo = FPBits::create_value(sign, exp_lo + scale_up_exponent, @@ -156,7 +153,7 @@ template struct DyadicFloat { if (LIBC_UNLIKELY(denorm)) { // Output is denormal, simply clear the exponent field. output_bits_t clear_exp = output_bits_t(exp_hi) - << FloatProperties::FRACTION_LEN; + << FPBits::FRACTION_LEN; output_bits_t r_bits = FPBits(r).uintval() - clear_exp; return FPBits(r_bits).get_val(); } diff --git a/libc/src/__support/FPUtil/generic/FMA.h b/libc/src/__support/FPUtil/generic/FMA.h index 4ba9e1d2be39e..b88089ee679f2 100644 --- a/libc/src/__support/FPUtil/generic/FMA.h +++ b/libc/src/__support/FPUtil/generic/FMA.h @@ -94,7 +94,6 @@ LIBC_INLINE bool shift_mantissa(int shift_length, UInt128 &mant) { template <> LIBC_INLINE double fma(double x, double y, double z) { using FPBits = fputil::FPBits; - using FloatProp = fputil::FloatProperties; if (LIBC_UNLIKELY(x == 0 || y == 0 || z == 0)) { return x * y + z; @@ -267,10 +266,10 @@ template <> LIBC_INLINE double fma(double x, double y, double z) { } // Remove hidden bit and append the exponent field and sign bit. - result = (result & FloatProp::FRACTION_MASK) | - (static_cast(r_exp) << FloatProp::FRACTION_LEN); + result = (result & FPBits::FRACTION_MASK) | + (static_cast(r_exp) << FPBits::FRACTION_LEN); if (prod_sign) { - result |= FloatProp::SIGN_MASK; + result |= FPBits::SIGN_MASK; } // Rounding. 
diff --git a/libc/src/__support/FPUtil/x86_64/LongDoubleBits.h b/libc/src/__support/FPUtil/x86_64/LongDoubleBits.h index 1011e61f03fd6..4dc5d25e26982 100644 --- a/libc/src/__support/FPUtil/x86_64/LongDoubleBits.h +++ b/libc/src/__support/FPUtil/x86_64/LongDoubleBits.h @@ -27,9 +27,8 @@ namespace LIBC_NAMESPACE { namespace fputil { template <> -struct FPBits - : public internal::FPBitsCommon { - using UP = internal::FPBitsCommon; +struct FPBits : public internal::FPRep { + using UP = internal::FPRep; using StorageType = typename UP::StorageType; using UP::bits; diff --git a/libc/src/__support/float_to_string.h b/libc/src/__support/float_to_string.h index 923633e3d207f..52442608798a5 100644 --- a/libc/src/__support/float_to_string.h +++ b/libc/src/__support/float_to_string.h @@ -105,7 +105,7 @@ namespace LIBC_NAMESPACE { using BlockInt = uint32_t; constexpr uint32_t BLOCK_SIZE = 9; -using FloatProp = fputil::FloatProperties; +using FPBits = fputil::FPBits; // Larger numbers prefer a slightly larger constant than is used for the smaller // numbers. @@ -382,10 +382,10 @@ LIBC_INLINE uint32_t fast_uint_mod_1e9(const cpp::UInt &val) { (1000000000 * shifted)); } -LIBC_INLINE uint32_t mul_shift_mod_1e9(const FloatProp::StorageType mantissa, +LIBC_INLINE uint32_t mul_shift_mod_1e9(const FPBits::StorageType mantissa, const cpp::UInt &large, const int32_t shift_amount) { - cpp::UInt val(large); + cpp::UInt val(large); val = (val * mantissa) >> shift_amount; return static_cast( val.div_uint32_times_pow_2(1000000000, 0).value()); @@ -414,7 +414,7 @@ class FloatToString { fputil::FPBits float_bits; bool is_negative; int exponent; - FloatProp::StorageType mantissa; + FPBits::StorageType mantissa; static constexpr int FRACTION_LEN = fputil::FPBits::FRACTION_LEN; static constexpr int EXP_BIAS = fputil::FPBits::EXP_BIAS; diff --git a/libc/src/__support/str_to_float.h b/libc/src/__support/str_to_float.h index 36b512d6972a9..be4b55645417f 100644 --- a/libc/src/__support/str_to_float.h +++ b/libc/src/__support/str_to_float.h @@ -71,7 +71,6 @@ LIBC_INLINE cpp::optional> eisel_lemire(ExpandedFloat init_num, RoundDirection round = RoundDirection::Nearest) { using FPBits = typename fputil::FPBits; - using FloatProp = typename fputil::FloatProperties; using StorageType = typename FPBits::StorageType; StorageType mantissa = init_num.mantissa; @@ -93,7 +92,7 @@ eisel_lemire(ExpandedFloat init_num, mantissa <<= clz; int32_t exp2 = - exp10_to_exp2(exp10) + FloatProp::STORAGE_LEN + FloatProp::EXP_BIAS - clz; + exp10_to_exp2(exp10) + FPBits::STORAGE_LEN + FPBits::EXP_BIAS - clz; // Multiplication const uint64_t *power_of_ten = @@ -110,9 +109,7 @@ eisel_lemire(ExpandedFloat init_num, // accuracy, and the most significant bit is ignored.) = 9 bits. Similarly, // it's 6 bits for floats in this case. 
const uint64_t halfway_constant = - (uint64_t(1) << (FloatProp::STORAGE_LEN - - (FloatProp::FRACTION_LEN + 3))) - - 1; + (uint64_t(1) << (FPBits::STORAGE_LEN - (FPBits::FRACTION_LEN + 3))) - 1; if ((high64(first_approx) & halfway_constant) == halfway_constant && low64(first_approx) + mantissa < mantissa) { UInt128 low_bits = @@ -132,10 +129,10 @@ eisel_lemire(ExpandedFloat init_num, // Shifting to 54 bits for doubles and 25 bits for floats StorageType msb = static_cast(high64(final_approx) >> - (FloatProp::STORAGE_LEN - 1)); + (FPBits::STORAGE_LEN - 1)); StorageType final_mantissa = static_cast( high64(final_approx) >> - (msb + FloatProp::STORAGE_LEN - (FloatProp::FRACTION_LEN + 3))); + (msb + FPBits::STORAGE_LEN - (FPBits::FRACTION_LEN + 3))); exp2 -= static_cast(1 ^ msb); // same as !msb if (round == RoundDirection::Nearest) { @@ -161,14 +158,14 @@ eisel_lemire(ExpandedFloat init_num, // From 54 to 53 bits for doubles and 25 to 24 bits for floats final_mantissa >>= 1; - if ((final_mantissa >> (FloatProp::FRACTION_LEN + 1)) > 0) { + if ((final_mantissa >> (FPBits::FRACTION_LEN + 1)) > 0) { final_mantissa >>= 1; ++exp2; } // The if block is equivalent to (but has fewer branches than): // if exp2 <= 0 || exp2 >= 0x7FF { etc } - if (static_cast(exp2) - 1 >= (1 << FloatProp::EXP_LEN) - 2) { + if (static_cast(exp2) - 1 >= (1 << FPBits::EXP_LEN) - 2) { return cpp::nullopt; } @@ -184,7 +181,6 @@ LIBC_INLINE cpp::optional> eisel_lemire(ExpandedFloat init_num, RoundDirection round) { using FPBits = typename fputil::FPBits; - using FloatProp = typename fputil::FloatProperties; using StorageType = typename FPBits::StorageType; StorageType mantissa = init_num.mantissa; @@ -210,7 +206,7 @@ eisel_lemire(ExpandedFloat init_num, mantissa <<= clz; int32_t exp2 = - exp10_to_exp2(exp10) + FloatProp::STORAGE_LEN + FloatProp::EXP_BIAS - clz; + exp10_to_exp2(exp10) + FPBits::STORAGE_LEN + FPBits::EXP_BIAS - clz; // Multiplication const uint64_t *power_of_ten = @@ -247,8 +243,7 @@ eisel_lemire(ExpandedFloat init_num, // accuracy, and the most significant bit is ignored.) = 61 bits. Similarly, // it's 12 bits for 128 bit floats in this case. 
constexpr UInt128 HALFWAY_CONSTANT = - (UInt128(1) << (FloatProp::STORAGE_LEN - (FloatProp::FRACTION_LEN + 3))) - - 1; + (UInt128(1) << (FPBits::STORAGE_LEN - (FPBits::FRACTION_LEN + 3))) - 1; if ((final_approx_upper & HALFWAY_CONSTANT) == HALFWAY_CONSTANT && final_approx_lower + mantissa < mantissa) { @@ -257,10 +252,10 @@ eisel_lemire(ExpandedFloat init_num, // Shifting to 65 bits for 80 bit floats and 113 bits for 128 bit floats uint32_t msb = - static_cast(final_approx_upper >> (FloatProp::STORAGE_LEN - 1)); + static_cast(final_approx_upper >> (FPBits::STORAGE_LEN - 1)); StorageType final_mantissa = final_approx_upper >> - (msb + FloatProp::STORAGE_LEN - (FloatProp::FRACTION_LEN + 3)); + (msb + FPBits::STORAGE_LEN - (FPBits::FRACTION_LEN + 3)); exp2 -= static_cast(1 ^ msb); // same as !msb if (round == RoundDirection::Nearest) { @@ -285,14 +280,14 @@ eisel_lemire(ExpandedFloat init_num, // From 65 to 64 bits for 80 bit floats and 113 to 112 bits for 128 bit // floats final_mantissa >>= 1; - if ((final_mantissa >> (FloatProp::FRACTION_LEN + 1)) > 0) { + if ((final_mantissa >> (FPBits::FRACTION_LEN + 1)) > 0) { final_mantissa >>= 1; ++exp2; } // The if block is equivalent to (but has fewer branches than): // if exp2 <= 0 || exp2 >= MANTISSA_MAX { etc } - if (exp2 - 1 >= (1 << FloatProp::EXP_LEN) - 2) { + if (exp2 - 1 >= (1 << FPBits::EXP_LEN) - 2) { return cpp::nullopt; } @@ -321,7 +316,6 @@ LIBC_INLINE FloatConvertReturn simple_decimal_conversion(const char *__restrict numStart, RoundDirection round = RoundDirection::Nearest) { using FPBits = typename fputil::FPBits; - using FloatProp = typename fputil::FloatProperties; using StorageType = typename FPBits::StorageType; int32_t exp2 = 0; @@ -337,7 +331,7 @@ simple_decimal_conversion(const char *__restrict numStart, // If the exponent is too large and can't be represented in this size of // float, return inf. if (hpd.get_decimal_point() > 0 && - exp10_to_exp2(hpd.get_decimal_point() - 1) > FloatProp::EXP_BIAS) { + exp10_to_exp2(hpd.get_decimal_point() - 1) > FPBits::EXP_BIAS) { output.num = {0, fputil::FPBits::MAX_BIASED_EXPONENT}; output.error = ERANGE; return output; @@ -345,8 +339,7 @@ simple_decimal_conversion(const char *__restrict numStart, // If the exponent is too small even for a subnormal, return 0. if (hpd.get_decimal_point() < 0 && exp10_to_exp2(-hpd.get_decimal_point()) > - (FloatProp::EXP_BIAS + - static_cast(FloatProp::FRACTION_LEN))) { + (FPBits::EXP_BIAS + static_cast(FPBits::FRACTION_LEN))) { output.num = {0, 0}; output.error = ERANGE; return output; @@ -385,7 +378,7 @@ simple_decimal_conversion(const char *__restrict numStart, hpd.shift(1); // Get the biased exponent - exp2 += FloatProp::EXP_BIAS; + exp2 += FPBits::EXP_BIAS; // Handle the exponent being too large (and return inf). if (exp2 >= FPBits::MAX_BIASED_EXPONENT) { @@ -395,7 +388,7 @@ simple_decimal_conversion(const char *__restrict numStart, } // Shift left to fill the mantissa - hpd.shift(FloatProp::FRACTION_LEN); + hpd.shift(FPBits::FRACTION_LEN); StorageType final_mantissa = hpd.round_to_integer_type(); // Handle subnormals @@ -411,13 +404,13 @@ simple_decimal_conversion(const char *__restrict numStart, final_mantissa = hpd.round_to_integer_type(round); // Check if by shifting right we've caused this to round to a normal number. - if ((final_mantissa >> FloatProp::FRACTION_LEN) != 0) { + if ((final_mantissa >> FPBits::FRACTION_LEN) != 0) { ++exp2; } } // Check if rounding added a bit, and shift down if that's the case. 
- if (final_mantissa == StorageType(2) << FloatProp::FRACTION_LEN) { + if (final_mantissa == StorageType(2) << FPBits::FRACTION_LEN) { final_mantissa >>= 1; ++exp2; @@ -515,13 +508,12 @@ LIBC_INLINE cpp::optional> clinger_fast_path(ExpandedFloat init_num, RoundDirection round = RoundDirection::Nearest) { using FPBits = typename fputil::FPBits; - using FloatProp = typename fputil::FloatProperties; using StorageType = typename FPBits::StorageType; StorageType mantissa = init_num.mantissa; int32_t exp10 = init_num.exponent; - if ((mantissa >> FloatProp::FRACTION_LEN) > 0) { + if ((mantissa >> FPBits::FRACTION_LEN) > 0) { return cpp::nullopt; } @@ -605,7 +597,7 @@ clinger_fast_path(ExpandedFloat init_num, // log10(2^(exponent bias)). // The generic approximation uses the fact that log10(2^x) ~= x/3 template constexpr int32_t get_upper_bound() { - return fputil::FloatProperties::EXP_BIAS / 3; + return fputil::FPBits::EXP_BIAS / 3; } template <> constexpr int32_t get_upper_bound() { return 39; } @@ -621,11 +613,10 @@ template <> constexpr int32_t get_upper_bound() { return 309; } // other out, and subnormal numbers allow for the result to be at the very low // end of the final mantissa. template constexpr int32_t get_lower_bound() { - using FloatProp = typename fputil::FloatProperties; - return -( - (FloatProp::EXP_BIAS + - static_cast(FloatProp::FRACTION_LEN + FloatProp::STORAGE_LEN)) / - 3); + using FPBits = typename fputil::FPBits; + return -((FPBits::EXP_BIAS + + static_cast(FPBits::FRACTION_LEN + FPBits::STORAGE_LEN)) / + 3); } template <> constexpr int32_t get_lower_bound() { @@ -723,7 +714,6 @@ LIBC_INLINE FloatConvertReturn binary_exp_to_float(ExpandedFloat init_num, bool truncated, RoundDirection round) { using FPBits = typename fputil::FPBits; - using FloatProp = typename fputil::FloatProperties; using StorageType = typename FPBits::StorageType; StorageType mantissa = init_num.mantissa; @@ -733,7 +723,7 @@ LIBC_INLINE FloatConvertReturn binary_exp_to_float(ExpandedFloat init_num, // This is the number of leading zeroes a properly normalized float of type T // should have. - constexpr int32_t INF_EXP = (1 << FloatProp::EXP_LEN) - 1; + constexpr int32_t INF_EXP = (1 << FPBits::EXP_LEN) - 1; // Normalization step 1: Bring the leading bit to the highest bit of // StorageType. @@ -744,26 +734,25 @@ LIBC_INLINE FloatConvertReturn binary_exp_to_float(ExpandedFloat init_num, exp2 -= amount_to_shift_left; // biased_exponent represents the biased exponent of the most significant bit. - int32_t biased_exponent = - exp2 + FloatProp::STORAGE_LEN + FPBits::EXP_BIAS - 1; + int32_t biased_exponent = exp2 + FPBits::STORAGE_LEN + FPBits::EXP_BIAS - 1; // Handle numbers that're too large and get squashed to inf if (biased_exponent >= INF_EXP) { // This indicates an overflow, so we make the result INF and set errno. - output.num = {0, (1 << FloatProp::EXP_LEN) - 1}; + output.num = {0, (1 << FPBits::EXP_LEN) - 1}; output.error = ERANGE; return output; } uint32_t amount_to_shift_right = - FloatProp::STORAGE_LEN - FloatProp::FRACTION_LEN - 1; + FPBits::STORAGE_LEN - FPBits::FRACTION_LEN - 1; // Handle subnormals. if (biased_exponent <= 0) { amount_to_shift_right += 1 - biased_exponent; biased_exponent = 0; - if (amount_to_shift_right > FloatProp::STORAGE_LEN) { + if (amount_to_shift_right > FPBits::STORAGE_LEN) { // Return 0 if the exponent is too small. 
output.num = {0, 0}; output.error = ERANGE; @@ -776,10 +765,10 @@ LIBC_INLINE FloatConvertReturn binary_exp_to_float(ExpandedFloat init_num, bool round_bit = static_cast(mantissa & round_bit_mask); bool sticky_bit = static_cast(mantissa & sticky_mask) || truncated; - if (amount_to_shift_right < FloatProp::STORAGE_LEN) { + if (amount_to_shift_right < FPBits::STORAGE_LEN) { // Shift the mantissa and clear the implicit bit. mantissa >>= amount_to_shift_right; - mantissa &= FloatProp::FRACTION_MASK; + mantissa &= FPBits::FRACTION_MASK; } else { mantissa = 0; } @@ -802,7 +791,7 @@ LIBC_INLINE FloatConvertReturn binary_exp_to_float(ExpandedFloat init_num, } } - if (mantissa > FloatProp::FRACTION_MASK) { + if (mantissa > FPBits::FRACTION_MASK) { // Rounding causes the exponent to increase. ++biased_exponent; @@ -815,7 +804,7 @@ LIBC_INLINE FloatConvertReturn binary_exp_to_float(ExpandedFloat init_num, output.error = ERANGE; } - output.num = {mantissa & FloatProp::FRACTION_MASK, biased_exponent}; + output.num = {mantissa & FPBits::FRACTION_MASK, biased_exponent}; return output; } diff --git a/libc/src/math/generic/exp.cpp b/libc/src/math/generic/exp.cpp index c8e36404a7812..a7d1890ee4f9a 100644 --- a/libc/src/math/generic/exp.cpp +++ b/libc/src/math/generic/exp.cpp @@ -224,7 +224,6 @@ double set_exceptional(double x) { LLVM_LIBC_FUNCTION(double, exp, (double x)) { using FPBits = typename fputil::FPBits; - using FloatProp = typename fputil::FloatProperties; FPBits xbits(x); uint64_t x_u = xbits.uintval(); @@ -385,7 +384,7 @@ LLVM_LIBC_FUNCTION(double, exp, (double x)) { if (LIBC_LIKELY(upper == lower)) { // to multiply by 2^hi, a fast way is to simply add hi to the exponent // field. - int64_t exp_hi = static_cast(hi) << FloatProp::FRACTION_LEN; + int64_t exp_hi = static_cast(hi) << FPBits::FRACTION_LEN; double r = cpp::bit_cast(exp_hi + cpp::bit_cast(upper)); return r; } @@ -403,7 +402,7 @@ LLVM_LIBC_FUNCTION(double, exp, (double x)) { double lower_dd = r_dd.hi + (r_dd.lo - ERR_DD); if (LIBC_LIKELY(upper_dd == lower_dd)) { - int64_t exp_hi = static_cast(hi) << FloatProp::FRACTION_LEN; + int64_t exp_hi = static_cast(hi) << FPBits::FRACTION_LEN; double r = cpp::bit_cast(exp_hi + cpp::bit_cast(upper_dd)); return r; diff --git a/libc/src/math/generic/exp10.cpp b/libc/src/math/generic/exp10.cpp index 92b3a468a9cc9..b05d6ea7f4866 100644 --- a/libc/src/math/generic/exp10.cpp +++ b/libc/src/math/generic/exp10.cpp @@ -274,7 +274,6 @@ double set_exceptional(double x) { LLVM_LIBC_FUNCTION(double, exp10, (double x)) { using FPBits = typename fputil::FPBits; - using FloatProp = typename fputil::FloatProperties; FPBits xbits(x); uint64_t x_u = xbits.uintval(); @@ -398,7 +397,7 @@ LLVM_LIBC_FUNCTION(double, exp10, (double x)) { if (LIBC_LIKELY(upper == lower)) { // To multiply by 2^hi, a fast way is to simply add hi to the exponent // field. - int64_t exp_hi = static_cast(hi) << FloatProp::FRACTION_LEN; + int64_t exp_hi = static_cast(hi) << FPBits::FRACTION_LEN; double r = cpp::bit_cast(exp_hi + cpp::bit_cast(upper)); return r; } @@ -465,7 +464,7 @@ LLVM_LIBC_FUNCTION(double, exp10, (double x)) { if (LIBC_LIKELY(upper_dd == lower_dd)) { // To multiply by 2^hi, a fast way is to simply add hi to the exponent // field. 
- int64_t exp_hi = static_cast(hi) << FloatProp::FRACTION_LEN; + int64_t exp_hi = static_cast(hi) << FPBits::FRACTION_LEN; double r = cpp::bit_cast(exp_hi + cpp::bit_cast(upper_dd)); return r; } diff --git a/libc/src/math/generic/exp2.cpp b/libc/src/math/generic/exp2.cpp index 44aeb14e231bf..ac34ba35232cb 100644 --- a/libc/src/math/generic/exp2.cpp +++ b/libc/src/math/generic/exp2.cpp @@ -249,7 +249,6 @@ double set_exceptional(double x) { LLVM_LIBC_FUNCTION(double, exp2, (double x)) { using FPBits = typename fputil::FPBits; - using FloatProp = typename fputil::FloatProperties; FPBits xbits(x); uint64_t x_u = xbits.uintval(); @@ -365,7 +364,7 @@ LLVM_LIBC_FUNCTION(double, exp2, (double x)) { if (LIBC_LIKELY(upper == lower)) { // To multiply by 2^hi, a fast way is to simply add hi to the exponent // field. - int64_t exp_hi = static_cast(hi) << FloatProp::FRACTION_LEN; + int64_t exp_hi = static_cast(hi) << FPBits::FRACTION_LEN; double r = cpp::bit_cast(exp_hi + cpp::bit_cast(upper)); return r; } @@ -379,7 +378,7 @@ LLVM_LIBC_FUNCTION(double, exp2, (double x)) { if (LIBC_LIKELY(upper_dd == lower_dd)) { // To multiply by 2^hi, a fast way is to simply add hi to the exponent // field. - int64_t exp_hi = static_cast(hi) << FloatProp::FRACTION_LEN; + int64_t exp_hi = static_cast(hi) << FPBits::FRACTION_LEN; double r = cpp::bit_cast(exp_hi + cpp::bit_cast(upper_dd)); return r; } diff --git a/libc/src/math/generic/exp2f_impl.h b/libc/src/math/generic/exp2f_impl.h index 1d86e4d08770c..e6fd65264c721 100644 --- a/libc/src/math/generic/exp2f_impl.h +++ b/libc/src/math/generic/exp2f_impl.h @@ -137,7 +137,7 @@ LIBC_INLINE float exp2f(float x) { // exp_hi = shift hi to the exponent field of double precision. int64_t exp_hi = static_cast(static_cast(k >> ExpBase::MID_BITS) - << fputil::FloatProperties::FRACTION_LEN); + << fputil::FPBits::FRACTION_LEN); // mh = 2^hi * 2^mid // mh_bits = bit field of mh int64_t mh_bits = ExpBase::EXP_2_MID[k & ExpBase::MID_MASK] + exp_hi; diff --git a/libc/src/math/generic/explogxf.h b/libc/src/math/generic/explogxf.h index 705bf9f5db1cb..c5e35663acbe1 100644 --- a/libc/src/math/generic/explogxf.h +++ b/libc/src/math/generic/explogxf.h @@ -162,7 +162,7 @@ template LIBC_INLINE exp_b_reduc_t exp_b_range_reduc(float x) { // hi = floor(kd * 2^(-MID_BITS)) // exp_hi = shift hi to the exponent field of double precision. int64_t exp_hi = static_cast((k >> Base::MID_BITS)) - << fputil::FloatProperties::FRACTION_LEN; + << fputil::FPBits::FRACTION_LEN; // mh = 2^hi * 2^mid // mh_bits = bit field of mh int64_t mh_bits = Base::EXP_2_MID[k & Base::MID_MASK] + exp_hi; @@ -235,9 +235,9 @@ template LIBC_INLINE double exp_pm_eval(float x) { // hi = floor(kf * 2^(-5)) // exp_hi = shift hi to the exponent field of double precision. int64_t exp_hi_p = static_cast((k_p >> ExpBase::MID_BITS)) - << fputil::FloatProperties::FRACTION_LEN; + << fputil::FPBits::FRACTION_LEN; int64_t exp_hi_m = static_cast((k_m >> ExpBase::MID_BITS)) - << fputil::FloatProperties::FRACTION_LEN; + << fputil::FPBits::FRACTION_LEN; // mh_p = 2^(hi + mid) // mh_m = 2^(-(hi + mid)) // mh_bits_* = bit field of mh_* @@ -342,10 +342,10 @@ LIBC_INLINE static double log_eval(double x) { // double(1.0 + 2^1022 * x) - 1.0 to test how x is rounded in denormal range. 
LIBC_INLINE cpp::optional ziv_test_denorm(int hi, double mid, double lo, double err) { - using FloatProp = typename fputil::FloatProperties; + using FPBits = typename fputil::FPBits; // Scaling factor = 1/(min normal number) = 2^1022 - int64_t exp_hi = static_cast(hi + 1022) << FloatProp::FRACTION_LEN; + int64_t exp_hi = static_cast(hi + 1022) << FPBits::FRACTION_LEN; double mid_hi = cpp::bit_cast(exp_hi + cpp::bit_cast(mid)); double lo_scaled = (lo != 0.0) ? cpp::bit_cast(exp_hi + cpp::bit_cast(lo)) diff --git a/libc/src/math/generic/expm1.cpp b/libc/src/math/generic/expm1.cpp index deb3b0adc0ead..8ab341d55a226 100644 --- a/libc/src/math/generic/expm1.cpp +++ b/libc/src/math/generic/expm1.cpp @@ -275,7 +275,6 @@ double set_exceptional(double x) { LLVM_LIBC_FUNCTION(double, expm1, (double x)) { using FPBits = typename fputil::FPBits; - using FloatProp = typename fputil::FloatProperties; FPBits xbits(x); bool x_sign = xbits.get_sign(); @@ -468,7 +467,7 @@ LLVM_LIBC_FUNCTION(double, expm1, (double x)) { if (LIBC_LIKELY(upper == lower)) { // to multiply by 2^hi, a fast way is to simply add hi to the exponent // field. - int64_t exp_hi = static_cast(hi) << FloatProp::FRACTION_LEN; + int64_t exp_hi = static_cast(hi) << FPBits::FRACTION_LEN; double r = cpp::bit_cast(exp_hi + cpp::bit_cast(upper)); return r; } @@ -482,7 +481,7 @@ LLVM_LIBC_FUNCTION(double, expm1, (double x)) { double lower_dd = r_dd.hi + (r_dd.lo - err_dd); if (LIBC_LIKELY(upper_dd == lower_dd)) { - int64_t exp_hi = static_cast(hi) << FloatProp::FRACTION_LEN; + int64_t exp_hi = static_cast(hi) << FPBits::FRACTION_LEN; double r = cpp::bit_cast(exp_hi + cpp::bit_cast(upper_dd)); return r; } diff --git a/libc/src/math/generic/powf.cpp b/libc/src/math/generic/powf.cpp index 8470eb878e603..49f33b71c5600 100644 --- a/libc/src/math/generic/powf.cpp +++ b/libc/src/math/generic/powf.cpp @@ -387,24 +387,24 @@ static constexpr DoubleDouble LOG2_R2_DD[] = { }; LIBC_INLINE bool is_odd_integer(float x) { - using FloatProp = typename fputil::FloatProperties; + using FPBits = typename fputil::FPBits; uint32_t x_u = cpp::bit_cast(x); - int32_t x_e = static_cast((x_u & FloatProp::EXP_MASK) >> - FloatProp::FRACTION_LEN); - int32_t lsb = cpp::countr_zero(x_u | FloatProp::EXP_MASK); + int32_t x_e = + static_cast((x_u & FPBits::EXP_MASK) >> FPBits::FRACTION_LEN); + int32_t lsb = cpp::countr_zero(x_u | FPBits::EXP_MASK); constexpr int32_t UNIT_EXPONENT = - FloatProp::EXP_BIAS + static_cast(FloatProp::FRACTION_LEN); + FPBits::EXP_BIAS + static_cast(FPBits::FRACTION_LEN); return (x_e + lsb == UNIT_EXPONENT); } LIBC_INLINE bool is_integer(float x) { - using FloatProp = typename fputil::FloatProperties; + using FPBits = typename fputil::FPBits; uint32_t x_u = cpp::bit_cast(x); - int32_t x_e = static_cast((x_u & FloatProp::EXP_MASK) >> - FloatProp::FRACTION_LEN); - int32_t lsb = cpp::countr_zero(x_u | FloatProp::EXP_MASK); + int32_t x_e = + static_cast((x_u & FPBits::EXP_MASK) >> FPBits::FRACTION_LEN); + int32_t lsb = cpp::countr_zero(x_u | FPBits::EXP_MASK); constexpr int32_t UNIT_EXPONENT = - FloatProp::EXP_BIAS + static_cast(FloatProp::FRACTION_LEN); + FPBits::EXP_BIAS + static_cast(FPBits::FRACTION_LEN); return (x_e + lsb >= UNIT_EXPONENT); } @@ -424,7 +424,6 @@ LIBC_INLINE bool larger_exponent(double a, double b) { double powf_double_double(int idx_x, double dx, double y6, double lo6_hi, const DoubleDouble &exp2_hi_mid) { using DoubleBits = typename fputil::FPBits; - using DoubleProp = typename fputil::FloatProperties; // Perform a second range 
reduction step: // idx2 = round(2^14 * (dx + 2^-8)) = round ( dx * 2^14 + 2^6) // dx2 = (1 + dx) * r2 - 1 @@ -500,7 +499,7 @@ double powf_double_double(int idx_x, double dx, double y6, double lo6_hi, bool lo_sign = DoubleBits(r.lo).get_sign(); if (hi_sign == lo_sign) { ++r_bits; - } else if ((r_bits & DoubleProp::FRACTION_MASK) > 0) { + } else if ((r_bits & DoubleBits::FRACTION_MASK) > 0) { --r_bits; } } @@ -512,8 +511,7 @@ double powf_double_double(int idx_x, double dx, double y6, double lo6_hi, LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) { using FloatBits = typename fputil::FPBits; - using FloatProp = typename fputil::FloatProperties; - using DoubleProp = typename fputil::FloatProperties; + using DoubleBits = typename fputil::FPBits; FloatBits xbits(x), ybits(y); uint32_t x_u = xbits.uintval(); @@ -584,7 +582,7 @@ LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) { // x^y will be overflow / underflow in single precision. Set y to a // large enough exponent but not too large, so that the computations // won't be overflow in double precision. - y = cpp::bit_cast((y_u & FloatProp::SIGN_MASK) + 0x4f800000U); + y = cpp::bit_cast((y_u & FloatBits::SIGN_MASK) + 0x4f800000U); } } } @@ -607,11 +605,11 @@ LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) { return generic::exp10f(y); } - bool x_sign = x_u >= FloatProp::SIGN_MASK; + bool x_sign = x_u >= FloatBits::SIGN_MASK; switch (x_abs) { case 0x0000'0000: { // x = +-0.0f - bool x_sign = (x_u >= FloatProp::SIGN_MASK); + bool x_sign = (x_u >= FloatBits::SIGN_MASK); bool out_sign = x_sign && is_odd_integer(FloatBits(y_u).get_val()); if (y_u > 0x8000'0000U) { // pow(0, negative number) = inf @@ -623,9 +621,9 @@ LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) { return out_sign ? -0.0f : 0.0f; } case 0x7f80'0000: { // x = +-Inf - bool x_sign = (x_u >= FloatProp::SIGN_MASK); + bool x_sign = (x_u >= FloatBits::SIGN_MASK); bool out_sign = x_sign && is_odd_integer(FloatBits(y_u).get_val()); - if (y_u >= FloatProp::SIGN_MASK) { + if (y_u >= FloatBits::SIGN_MASK) { return out_sign ? -0.0f : 0.0f; } return FloatBits::inf(out_sign); @@ -669,11 +667,11 @@ LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) { x_u = FloatBits(x).uintval(); // Extract exponent field of x. - ex += (x_u >> FloatProp::FRACTION_LEN); + ex += (x_u >> FloatBits::FRACTION_LEN); double e_x = static_cast(ex); // Use the highest 7 fractional bits of m_x as the index for look up tables. - uint32_t x_mant = x_u & FloatProp::FRACTION_MASK; - int idx_x = static_cast(x_mant >> (FloatProp::FRACTION_LEN - 7)); + uint32_t x_mant = x_u & FloatBits::FRACTION_MASK; + int idx_x = static_cast(x_mant >> (FloatBits::FRACTION_LEN - 7)); // Add the hidden bit to the mantissa. 
// 1 <= m_x < 2 float m_x = cpp::bit_cast(x_mant | 0x3f800000); @@ -774,7 +772,7 @@ LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) { int idx_y = hm_i & 0x3f; // 2^hi - int64_t exp_hi_i = (hm_i >> 6) << DoubleProp::FRACTION_LEN; + int64_t exp_hi_i = (hm_i >> 6) << DoubleBits::FRACTION_LEN; // 2^mid int64_t exp_mid_i = cpp::bit_cast(EXP2_MID1[idx_y].hi); // (-1)^sign * 2^hi * 2^mid diff --git a/libc/src/math/generic/range_reduction.h b/libc/src/math/generic/range_reduction.h index 84f3cae9e3294..8a75af5877962 100644 --- a/libc/src/math/generic/range_reduction.h +++ b/libc/src/math/generic/range_reduction.h @@ -59,7 +59,7 @@ LIBC_INLINE int64_t small_range_reduction(double x, double &y) { LIBC_INLINE int64_t large_range_reduction(double x, int x_exp, double &y) { int idx = 0; y = 0; - int x_lsb_exp_m4 = x_exp - fputil::FloatProperties::FRACTION_LEN; + int x_lsb_exp_m4 = x_exp - fputil::FPBits::FRACTION_LEN; // Skipping the first parts of 32/pi such that: // LSB of x * LSB of THIRTYTWO_OVER_PI_28[i] >= 32. diff --git a/libc/src/math/generic/tanhf.cpp b/libc/src/math/generic/tanhf.cpp index 073097e1208af..48e78ec2383f2 100644 --- a/libc/src/math/generic/tanhf.cpp +++ b/libc/src/math/generic/tanhf.cpp @@ -89,7 +89,7 @@ LLVM_LIBC_FUNCTION(float, tanhf, (float x)) { // -hi = floor(-k * 2^(-MID_BITS)) // exp_mhi = shift -hi to the exponent field of double precision. int64_t exp_mhi = static_cast(mk >> ExpBase::MID_BITS) - << fputil::FloatProperties::FRACTION_LEN; + << fputil::FPBits::FRACTION_LEN; // mh = 2^(-hi - mid) int64_t mh_bits = ExpBase::EXP_2_MID[mk & ExpBase::MID_MASK] + exp_mhi; double mh = fputil::FPBits(uint64_t(mh_bits)).get_val(); diff --git a/libc/src/stdio/printf_core/float_dec_converter.h b/libc/src/stdio/printf_core/float_dec_converter.h index 78ce7af3a060a..cfa351cf017ae 100644 --- a/libc/src/stdio/printf_core/float_dec_converter.h +++ b/libc/src/stdio/printf_core/float_dec_converter.h @@ -240,8 +240,7 @@ class FloatWriter { // -exponent will never overflow because all long double types we support // have at most 15 bits of mantissa and the C standard defines an int as // being at least 16 bits. 
- static_assert(fputil::FloatProperties::EXP_LEN < - (sizeof(int) * 8)); + static_assert(fputil::FPBits::EXP_LEN < (sizeof(int) * 8)); public: LIBC_INLINE FloatWriter(Writer *init_writer, bool init_has_decimal_point, @@ -474,7 +473,7 @@ LIBC_INLINE int convert_float_decimal_typed(Writer *writer, const FormatSection &to_conv, fputil::FPBits float_bits) { // signed because later we use -FRACTION_LEN - constexpr int32_t FRACTION_LEN = fputil::FloatProperties::FRACTION_LEN; + constexpr int32_t FRACTION_LEN = fputil::FPBits::FRACTION_LEN; bool is_negative = float_bits.get_sign(); int exponent = float_bits.get_explicit_exponent(); @@ -587,7 +586,7 @@ LIBC_INLINE int convert_float_dec_exp_typed(Writer *writer, const FormatSection &to_conv, fputil::FPBits float_bits) { // signed because later we use -FRACTION_LEN - constexpr int32_t FRACTION_LEN = fputil::FloatProperties::FRACTION_LEN; + constexpr int32_t FRACTION_LEN = fputil::FPBits::FRACTION_LEN; bool is_negative = float_bits.get_sign(); int exponent = float_bits.get_explicit_exponent(); StorageType mantissa = float_bits.get_explicit_mantissa(); @@ -750,7 +749,7 @@ LIBC_INLINE int convert_float_dec_auto_typed(Writer *writer, const FormatSection &to_conv, fputil::FPBits float_bits) { // signed because later we use -FRACTION_LEN - constexpr int32_t FRACTION_LEN = fputil::FloatProperties::FRACTION_LEN; + constexpr int32_t FRACTION_LEN = fputil::FPBits::FRACTION_LEN; bool is_negative = float_bits.get_sign(); int exponent = float_bits.get_explicit_exponent(); StorageType mantissa = float_bits.get_explicit_mantissa(); diff --git a/libc/test/src/__support/str_to_fp_test.h b/libc/test/src/__support/str_to_fp_test.h index ba6d46293cd00..32a3133093929 100644 --- a/libc/test/src/__support/str_to_fp_test.h +++ b/libc/test/src/__support/str_to_fp_test.h @@ -16,7 +16,7 @@ namespace LIBC_NAMESPACE { template struct LlvmLibcStrToFloatTest : public testing::Test { - using StorageType = typename fputil::FloatProperties::StorageType; + using StorageType = typename fputil::FPBits::StorageType; void clinger_fast_path_test(const StorageType inputMantissa, const int32_t inputExp10, diff --git a/libc/test/src/math/FrexpTest.h b/libc/test/src/math/FrexpTest.h index 0e66bd5804d58..20ddce807da45 100644 --- a/libc/test/src/math/FrexpTest.h +++ b/libc/test/src/math/FrexpTest.h @@ -20,8 +20,7 @@ template class FrexpTest : public LIBC_NAMESPACE::testing::Test { DECLARE_SPECIAL_CONSTANTS(T) static constexpr StorageType HIDDEN_BIT = - StorageType(1) - << LIBC_NAMESPACE::fputil::FloatProperties::FRACTION_LEN; + StorageType(1) << LIBC_NAMESPACE::fputil::FPBits::FRACTION_LEN; public: typedef T (*FrexpFunc)(T, int *); diff --git a/libc/test/src/math/LogbTest.h b/libc/test/src/math/LogbTest.h index 2049c8ffe950e..196da5e96b076 100644 --- a/libc/test/src/math/LogbTest.h +++ b/libc/test/src/math/LogbTest.h @@ -20,8 +20,7 @@ template class LogbTest : public LIBC_NAMESPACE::testing::Test { DECLARE_SPECIAL_CONSTANTS(T) static constexpr StorageType HIDDEN_BIT = - StorageType(1) - << LIBC_NAMESPACE::fputil::FloatProperties::FRACTION_LEN; + StorageType(1) << LIBC_NAMESPACE::fputil::FPBits::FRACTION_LEN; public: typedef T (*LogbFunc)(T); diff --git a/libc/test/src/math/SqrtTest.h b/libc/test/src/math/SqrtTest.h index 2cfe401c5542a..d58c3ccfdd5a2 100644 --- a/libc/test/src/math/SqrtTest.h +++ b/libc/test/src/math/SqrtTest.h @@ -20,8 +20,7 @@ template class SqrtTest : public LIBC_NAMESPACE::testing::Test { DECLARE_SPECIAL_CONSTANTS(T) static constexpr StorageType HIDDEN_BIT = - StorageType(1) 
- << LIBC_NAMESPACE::fputil::FloatProperties::FRACTION_LEN; + StorageType(1) << LIBC_NAMESPACE::fputil::FPBits::FRACTION_LEN; public: typedef T (*SqrtFunc)(T); diff --git a/libc/test/src/math/smoke/FrexpTest.h b/libc/test/src/math/smoke/FrexpTest.h index 21e3fc057cdb9..981872aa128e1 100644 --- a/libc/test/src/math/smoke/FrexpTest.h +++ b/libc/test/src/math/smoke/FrexpTest.h @@ -17,8 +17,7 @@ template class FrexpTest : public LIBC_NAMESPACE::testing::Test { DECLARE_SPECIAL_CONSTANTS(T) static constexpr StorageType HIDDEN_BIT = - StorageType(1) - << LIBC_NAMESPACE::fputil::FloatProperties::FRACTION_LEN; + StorageType(1) << LIBC_NAMESPACE::fputil::FPBits::FRACTION_LEN; public: typedef T (*FrexpFunc)(T, int *); diff --git a/libc/test/src/math/smoke/LogbTest.h b/libc/test/src/math/smoke/LogbTest.h index a0a01a885c104..a2628273cecc9 100644 --- a/libc/test/src/math/smoke/LogbTest.h +++ b/libc/test/src/math/smoke/LogbTest.h @@ -17,8 +17,7 @@ template class LogbTest : public LIBC_NAMESPACE::testing::Test { DECLARE_SPECIAL_CONSTANTS(T) static constexpr StorageType HIDDEN_BIT = - StorageType(1) - << LIBC_NAMESPACE::fputil::FloatProperties::FRACTION_LEN; + StorageType(1) << LIBC_NAMESPACE::fputil::FPBits::FRACTION_LEN; public: typedef T (*LogbFunc)(T); diff --git a/libc/test/src/math/smoke/SqrtTest.h b/libc/test/src/math/smoke/SqrtTest.h index 5e8e099f90f54..edb6e74236e31 100644 --- a/libc/test/src/math/smoke/SqrtTest.h +++ b/libc/test/src/math/smoke/SqrtTest.h @@ -17,8 +17,7 @@ template class SqrtTest : public LIBC_NAMESPACE::testing::Test { DECLARE_SPECIAL_CONSTANTS(T) static constexpr StorageType HIDDEN_BIT = - StorageType(1) - << LIBC_NAMESPACE::fputil::FloatProperties::FRACTION_LEN; + StorageType(1) << LIBC_NAMESPACE::fputil::FPBits::FRACTION_LEN; public: typedef T (*SqrtFunc)(T); diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp index 2a079eeb3a995..fca83c4cdc52f 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.cpp +++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp @@ -49,7 +49,7 @@ template <> struct ExtraPrecision { template static inline unsigned int get_precision(double ulp_tolerance) { if (ulp_tolerance <= 0.5) { - return LIBC_NAMESPACE::fputil::FloatProperties::MANTISSA_PRECISION; + return LIBC_NAMESPACE::fputil::FPBits::MANTISSA_PRECISION; } else { return ExtraPrecision::VALUE; } From 3bcee8568a203c9c9a4c487ebdd0e95fea96f619 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Wed, 3 Jan 2024 01:02:55 -0800 Subject: [PATCH 110/313] [AMDGPU] Global ISel for llvm.prefetch (#76183) --- .../Target/GlobalISel/SelectionDAGCompat.td | 1 + .../AMDGPU/AMDGPUInstructionSelector.cpp | 8 +- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 16 +- llvm/lib/Target/AMDGPU/SMInstructions.td | 25 ++- llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll | 144 +++++++----------- 5 files changed, 90 insertions(+), 104 deletions(-) diff --git a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td index f28c1edc3d95d..5e704f0b9a758 100644 --- a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td +++ b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td @@ -244,6 +244,7 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; // Specifies the GlobalISel equivalents for SelectionDAG's ComplexPattern. // Should be used on defs that subclass GIComplexOperandMatcher<>. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 88ef4b5774242..ad8dcda93c365 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2764,7 +2764,9 @@ static bool isConstant(const MachineInstr &MI) { void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load, const MachineRegisterInfo &MRI, SmallVectorImpl &AddrInfo) const { - const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg()); + unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1; + const MachineInstr *PtrMI = + MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg()); assert(PtrMI); @@ -2817,6 +2819,10 @@ bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const { if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) return true; + if (MI.getOpcode() == AMDGPU::G_PREFETCH) + return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() == + AMDGPU::SGPRRegBankID; + const Instruction *I = dyn_cast(Ptr); return I && I->getMetadata("amdgpu.uniform"); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index fba060464a6e7..92182ec069426 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3263,17 +3263,19 @@ void AMDGPURegisterBankInfo::applyMappingImpl( MI.eraseFromParent(); return; } - unsigned PtrBank = - getRegBankID(MI.getOperand(0).getReg(), MRI, AMDGPU::SGPRRegBankID); + Register PtrReg = MI.getOperand(0).getReg(); + unsigned PtrBank = getRegBankID(PtrReg, MRI, AMDGPU::SGPRRegBankID); if (PtrBank == AMDGPU::VGPRRegBankID) { MI.eraseFromParent(); return; } - // FIXME: There is currently no support for prefetch in global isel. - // There is no node equivalence and what's worse there is no MMO produced - // for a prefetch on global isel path. - // Prefetch does not affect execution so erase it for now. 
- MI.eraseFromParent(); + unsigned AS = MRI.getType(PtrReg).getAddressSpace(); + if (!AMDGPU::isFlatGlobalAddrSpace(AS) && + AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) { + MI.eraseFromParent(); + return; + } + applyDefaultMapping(OpdMapper); return; } default: diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index 3297847b0360a..be21cf0140fc8 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -977,20 +977,35 @@ def : GCNPat < } } // let OtherPredicates = [HasShaderCyclesRegister] -multiclass SMPrefetchPat { +def i32imm_zero : TImmLeaf ; + +def i32imm_one : TImmLeaf ; + +multiclass SMPrefetchPat { def : GCNPat < - (smrd_prefetch (SMRDImm i64:$sbase, i32:$offset), timm, timm, (i32 cache_type)), + (smrd_prefetch (SMRDImm i64:$sbase, i32:$offset), timm, timm, cache_type), (!cast("S_PREFETCH_"#type) $sbase, $offset, (i32 SGPR_NULL), (i8 0)) >; def : GCNPat < - (smrd_prefetch (i64 SReg_64:$sbase), timm, timm, (i32 cache_type)), + (smrd_prefetch (i64 SReg_64:$sbase), timm, timm, cache_type), (!cast("S_PREFETCH_"#type) $sbase, 0, (i32 SGPR_NULL), (i8 0)) >; + + def : GCNPat < + (smrd_prefetch (i32 SReg_32:$sbase), timm, timm, cache_type), + (!cast("S_PREFETCH_"#type) + (i64 (REG_SEQUENCE SReg_64, $sbase, sub0, (i32 (S_MOV_B32 (i32 0))), sub1)), + 0, (i32 SGPR_NULL), (i8 0)) + >; } -defm : SMPrefetchPat<"INST", 0>; -defm : SMPrefetchPat<"DATA", 1>; +defm : SMPrefetchPat<"INST", i32imm_zero>; +defm : SMPrefetchPat<"DATA", i32imm_one>; //===----------------------------------------------------------------------===// // GFX10. diff --git a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll index c287789f8f493..dcf9b3f263ab6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll @@ -1,42 +1,34 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX12-SDAG %s +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX12,GFX12-SDAG %s ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX12-GISEL %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX12,GFX12-GISEL %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s ; Scalar data prefetch define amdgpu_ps void @prefetch_data_sgpr(ptr addrspace(4) inreg %ptr) { -; GFX12-SDAG-LABEL: prefetch_data_sgpr: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 -; GFX12-SDAG-NEXT: s_endpgm +; GFX12-LABEL: prefetch_data_sgpr: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 +; GFX12-NEXT: s_endpgm ; ; GFX11-LABEL: prefetch_data_sgpr: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: prefetch_data_sgpr: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_endpgm entry: tail call void @llvm.prefetch.p4(ptr addrspace(4) %ptr, i32 0, i32 0, i32 1) ret void } define amdgpu_ps void @prefetch_data_sgpr_offset(ptr addrspace(4) inreg %ptr) { -; GFX12-SDAG-LABEL: prefetch_data_sgpr_offset: -; GFX12-SDAG: ; %bb.0: ; %entry -; 
GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x200, null, 0 -; GFX12-SDAG-NEXT: s_endpgm +; GFX12-LABEL: prefetch_data_sgpr_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_prefetch_data s[0:1], 0x200, null, 0 +; GFX12-NEXT: s_endpgm ; ; GFX11-LABEL: prefetch_data_sgpr_offset: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: prefetch_data_sgpr_offset: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr float, ptr addrspace(4) %ptr, i32 128 tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1) @@ -46,18 +38,14 @@ entry: ; Check large offsets define amdgpu_ps void @prefetch_data_sgpr_max_offset(ptr addrspace(4) inreg %ptr) { -; GFX12-SDAG-LABEL: prefetch_data_sgpr_max_offset: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x7fffff, null, 0 -; GFX12-SDAG-NEXT: s_endpgm +; GFX12-LABEL: prefetch_data_sgpr_max_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_prefetch_data s[0:1], 0x7fffff, null, 0 +; GFX12-NEXT: s_endpgm ; ; GFX11-LABEL: prefetch_data_sgpr_max_offset: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: prefetch_data_sgpr_max_offset: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 8388607 tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1) @@ -65,18 +53,14 @@ entry: } define amdgpu_ps void @prefetch_data_sgpr_min_offset(ptr addrspace(4) inreg %ptr) { -; GFX12-SDAG-LABEL: prefetch_data_sgpr_min_offset: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], -0x800000, null, 0 -; GFX12-SDAG-NEXT: s_endpgm +; GFX12-LABEL: prefetch_data_sgpr_min_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_prefetch_data s[0:1], -0x800000, null, 0 +; GFX12-NEXT: s_endpgm ; ; GFX11-LABEL: prefetch_data_sgpr_min_offset: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: prefetch_data_sgpr_min_offset: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 -8388608 tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1) @@ -96,6 +80,9 @@ define amdgpu_ps void @prefetch_data_sgpr_too_large_offset(ptr addrspace(4) inre ; ; GFX12-GISEL-LABEL: prefetch_data_sgpr_too_large_offset: ; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800000 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 +; GFX12-GISEL-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 ; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 8388608 @@ -137,55 +124,43 @@ entry: ; Check supported address spaces define amdgpu_ps void @prefetch_data_sgpr_flat(ptr inreg %ptr) { -; GFX12-SDAG-LABEL: prefetch_data_sgpr_flat: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 -; GFX12-SDAG-NEXT: s_endpgm +; GFX12-LABEL: prefetch_data_sgpr_flat: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 +; GFX12-NEXT: s_endpgm ; ; GFX11-LABEL: prefetch_data_sgpr_flat: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: prefetch_data_sgpr_flat: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_endpgm entry: tail call void @llvm.prefetch.pf(ptr %ptr, i32 0, i32 0, i32 1) ret void } define amdgpu_ps void @prefetch_data_sgpr_global(ptr addrspace(1) inreg %ptr) { -; GFX12-SDAG-LABEL: prefetch_data_sgpr_global: -; GFX12-SDAG: ; %bb.0: ; %entry 
-; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 -; GFX12-SDAG-NEXT: s_endpgm +; GFX12-LABEL: prefetch_data_sgpr_global: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 +; GFX12-NEXT: s_endpgm ; ; GFX11-LABEL: prefetch_data_sgpr_global: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: prefetch_data_sgpr_global: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_endpgm entry: tail call void @llvm.prefetch.p1(ptr addrspace(1) %ptr, i32 0, i32 0, i32 1) ret void } define amdgpu_ps void @prefetch_data_sgpr_constant_32bit(ptr addrspace(6) inreg %ptr) { -; GFX12-SDAG-LABEL: prefetch_data_sgpr_constant_32bit: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_mov_b32 s1, 0 -; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 -; GFX12-SDAG-NEXT: s_endpgm +; GFX12-LABEL: prefetch_data_sgpr_constant_32bit: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 +; GFX12-NEXT: s_endpgm ; ; GFX11-LABEL: prefetch_data_sgpr_constant_32bit: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: prefetch_data_sgpr_constant_32bit: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_endpgm entry: tail call void @llvm.prefetch.p6(ptr addrspace(6) %ptr, i32 0, i32 0, i32 1) ret void @@ -194,36 +169,28 @@ entry: ; I$ prefetch define amdgpu_ps void @prefetch_inst_sgpr(ptr addrspace(4) inreg %ptr) { -; GFX12-SDAG-LABEL: prefetch_inst_sgpr: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0 -; GFX12-SDAG-NEXT: s_endpgm +; GFX12-LABEL: prefetch_inst_sgpr: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0 +; GFX12-NEXT: s_endpgm ; ; GFX11-LABEL: prefetch_inst_sgpr: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: prefetch_inst_sgpr: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_endpgm entry: tail call void @llvm.prefetch.p4(ptr addrspace(4) %ptr, i32 0, i32 0, i32 0) ret void } define amdgpu_ps void @prefetch_inst_sgpr_offset(ptr addrspace(4) inreg %ptr) { -; GFX12-SDAG-LABEL: prefetch_inst_sgpr_offset: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_prefetch_inst s[0:1], 0x80, null, 0 -; GFX12-SDAG-NEXT: s_endpgm +; GFX12-LABEL: prefetch_inst_sgpr_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_prefetch_inst s[0:1], 0x80, null, 0 +; GFX12-NEXT: s_endpgm ; ; GFX11-LABEL: prefetch_inst_sgpr_offset: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: prefetch_inst_sgpr_offset: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 128 tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0) @@ -233,18 +200,14 @@ entry: ; Check large offsets define amdgpu_ps void @prefetch_inst_sgpr_max_offset(ptr addrspace(4) inreg %ptr) { -; GFX12-SDAG-LABEL: prefetch_inst_sgpr_max_offset: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_prefetch_inst s[0:1], 0x7fffff, null, 0 -; GFX12-SDAG-NEXT: s_endpgm +; GFX12-LABEL: prefetch_inst_sgpr_max_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_prefetch_inst s[0:1], 0x7fffff, null, 0 +; GFX12-NEXT: s_endpgm ; ; GFX11-LABEL: prefetch_inst_sgpr_max_offset: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: prefetch_inst_sgpr_max_offset: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 8388607 tail 
call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0) @@ -252,18 +215,14 @@ entry: } define amdgpu_ps void @prefetch_inst_sgpr_min_offset(ptr addrspace(4) inreg %ptr) { -; GFX12-SDAG-LABEL: prefetch_inst_sgpr_min_offset: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_prefetch_inst s[0:1], -0x800000, null, 0 -; GFX12-SDAG-NEXT: s_endpgm +; GFX12-LABEL: prefetch_inst_sgpr_min_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_prefetch_inst s[0:1], -0x800000, null, 0 +; GFX12-NEXT: s_endpgm ; ; GFX11-LABEL: prefetch_inst_sgpr_min_offset: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: prefetch_inst_sgpr_min_offset: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 -8388608 tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0) @@ -283,6 +242,9 @@ define amdgpu_ps void @prefetch_inst_sgpr_too_large_offset(ptr addrspace(4) inre ; ; GFX12-GISEL-LABEL: prefetch_inst_sgpr_too_large_offset: ; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800000 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 +; GFX12-GISEL-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0 ; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 8388608 From 89ddd94516c880715ec78833f02a7af5e2766cb6 Mon Sep 17 00:00:00 2001 From: Shengchen Kan Date: Wed, 3 Jan 2024 17:37:16 +0800 Subject: [PATCH 111/313] [X86][NFC] Simplify definitions of BLSR/BLSMSK/BEXTR/BZHI This patch is to extract the NFC in #76709 into a separate commit. --- llvm/lib/Target/X86/X86InstrMisc.td | 112 ++++++++++++---------------- 1 file changed, 47 insertions(+), 65 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrMisc.td b/llvm/lib/Target/X86/X86InstrMisc.td index 305bd74f7bd70..772ed2a9dbe56 100644 --- a/llvm/lib/Target/X86/X86InstrMisc.td +++ b/llvm/lib/Target/X86/X86InstrMisc.td @@ -1212,36 +1212,33 @@ let Predicates = [HasBMI], Defs = [EFLAGS] in { (implicit EFLAGS)]>, TB, XS, Sched<[WriteTZCNTLd]>; } -multiclass bmi_bls { -let hasSideEffects = 0 in { - def rr#Suffix : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src), - !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>, - T8, VEX, VVVV, Sched<[sched]>; - let mayLoad = 1 in - def rm#Suffix : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src), - !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>, - T8, VEX, VVVV, Sched<[sched.Folded]>; -} +multiclass Bls { + let SchedRW = [WriteBLS] in { + def rr#Suffix : UnaryOpR<0xF3, RegMRM, m, unaryop_ndd_args, t, + (outs t.RegClass:$dst), []>, T8, VVVV; + } + + let SchedRW = [WriteBLS.Folded] in + def rm#Suffix : UnaryOpM<0xF3, MemMRM, m, unaryop_ndd_args, t, + (outs t.RegClass:$dst), []>, T8, VVVV; } -let Predicates = [HasBMI, NoEGPR], Defs = [EFLAGS] in { - defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem, WriteBLS>; - defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem, WriteBLS>, REX_W; - defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem, WriteBLS>; - defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem, WriteBLS>, REX_W; - defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem, WriteBLS>; - defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem, WriteBLS>, REX_W; +let Predicates = [HasBMI], Defs = [EFLAGS] in { + defm BLSR32 : Bls<"blsr", MRM1r, MRM1m, Xi32>, VEX; + defm BLSR64 : Bls<"blsr", MRM1r, MRM1m, Xi64>, VEX; + defm BLSMSK32 : Bls<"blsmsk", MRM2r, MRM2m, Xi32>, VEX; + defm BLSMSK64 : 
Bls<"blsmsk", MRM2r, MRM2m, Xi64>, VEX; + defm BLSI32 : Bls<"blsi", MRM3r, MRM3m, Xi32>, VEX; + defm BLSI64 : Bls<"blsi", MRM3r, MRM3m, Xi64>, VEX; } -let Predicates = [HasBMI, HasEGPR], Defs = [EFLAGS] in { - defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem, WriteBLS, "_EVEX">, EVEX; - defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem, WriteBLS, "_EVEX">, REX_W, EVEX; - defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem, WriteBLS, "_EVEX">, EVEX; - defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem, WriteBLS, "_EVEX">, REX_W, EVEX; - defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem, WriteBLS, "_EVEX">, EVEX; - defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem, WriteBLS, "_EVEX">, REX_W, EVEX; +let Predicates = [HasBMI, In64BitMode], Defs = [EFLAGS] in { + defm BLSR32 : Bls<"blsr", MRM1r, MRM1m, Xi32, "_EVEX">, EVEX; + defm BLSR64 : Bls<"blsr", MRM1r, MRM1m, Xi64, "_EVEX">, EVEX; + defm BLSMSK32 : Bls<"blsmsk", MRM2r, MRM2m, Xi32, "_EVEX">, EVEX; + defm BLSMSK64 : Bls<"blsmsk", MRM2r, MRM2m, Xi64, "_EVEX">, EVEX; + defm BLSI32 : Bls<"blsi", MRM3r, MRM3m, Xi32, "_EVEX">, EVEX; + defm BLSI64 : Bls<"blsi", MRM3r, MRM3m, Xi64, "_EVEX">, EVEX; } let Predicates = [HasBMI] in { @@ -1281,50 +1278,35 @@ let Predicates = [HasBMI] in { (BLSI64rr GR64:$src)>; } -multiclass bmi4VOp3_base opc, string mnemonic, RegisterClass RC, - X86MemOperand x86memop, SDPatternOperator OpNode, - PatFrag ld_frag, X86FoldableSchedWrite Sched, - string Suffix = ""> { - def rr#Suffix : I, - T8, VEX, Sched<[Sched]>; -let mayLoad = 1 in - def rm#Suffix : I, T8, VEX, - Sched<[Sched.Folded, - // x86memop:$src1 - ReadDefault, ReadDefault, ReadDefault, ReadDefault, - ReadDefault, - // RC:$src2 - Sched.ReadAfterFold]>; +multiclass Bmi4VOp3 o, string m, X86TypeInfo t, SDPatternOperator node, + X86FoldableSchedWrite sched, string Suffix = ""> { + let SchedRW = [sched], Form = MRMSrcReg4VOp3 in + def rr#Suffix : BinOpRR, T8; + let SchedRW = [sched.Folded, + ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, + sched.ReadAfterFold], Form = MRMSrcMem4VOp3 in + def rm#Suffix : BinOpMR, T8; } let Predicates = [HasBMI, NoEGPR], Defs = [EFLAGS] in { - defm BEXTR32 : bmi4VOp3_base<0xF7, "bextr{l}", GR32, i32mem, - X86bextr, loadi32, WriteBEXTR>; - defm BEXTR64 : bmi4VOp3_base<0xF7, "bextr{q}", GR64, i64mem, - X86bextr, loadi64, WriteBEXTR>, REX_W; + defm BEXTR32 : Bmi4VOp3<0xF7, "bextr", Xi32, X86bextr, WriteBEXTR>, VEX; + defm BEXTR64 : Bmi4VOp3<0xF7, "bextr", Xi64, X86bextr, WriteBEXTR>, VEX; } let Predicates = [HasBMI2, NoEGPR], Defs = [EFLAGS] in { - defm BZHI32 : bmi4VOp3_base<0xF5, "bzhi{l}", GR32, i32mem, - X86bzhi, loadi32, WriteBZHI>; - defm BZHI64 : bmi4VOp3_base<0xF5, "bzhi{q}", GR64, i64mem, - X86bzhi, loadi64, WriteBZHI>, REX_W; -} -let Predicates = [HasBMI, HasEGPR], Defs = [EFLAGS] in { - defm BEXTR32 : bmi4VOp3_base<0xF7, "bextr{l}", GR32, i32mem, - X86bextr, loadi32, WriteBEXTR, "_EVEX">, EVEX; - defm BEXTR64 : bmi4VOp3_base<0xF7, "bextr{q}", GR64, i64mem, - X86bextr, loadi64, WriteBEXTR, "_EVEX">, EVEX, REX_W; -} -let Predicates = [HasBMI2, HasEGPR], Defs = [EFLAGS] in { - defm BZHI32 : bmi4VOp3_base<0xF5, "bzhi{l}", GR32, i32mem, - X86bzhi, loadi32, WriteBZHI, "_EVEX">, EVEX; - defm BZHI64 : bmi4VOp3_base<0xF5, "bzhi{q}", GR64, i64mem, - X86bzhi, loadi64, WriteBZHI, "_EVEX">, EVEX, REX_W; + defm BZHI32 : Bmi4VOp3<0xF5, "bzhi", Xi32, X86bzhi, WriteBZHI>, VEX; + defm BZHI64 : Bmi4VOp3<0xF5, "bzhi", Xi64, X86bzhi, WriteBZHI>, VEX; +} +let 
Predicates = [HasBMI, HasEGPR, In64BitMode], Defs = [EFLAGS] in { + defm BEXTR32 : Bmi4VOp3<0xF7, "bextr", Xi32, X86bextr, WriteBEXTR, "_EVEX">, EVEX; + defm BEXTR64 : Bmi4VOp3<0xF7, "bextr", Xi64, X86bextr, WriteBEXTR, "_EVEX">, EVEX; +} +let Predicates = [HasBMI2, HasEGPR, In64BitMode], Defs = [EFLAGS] in { + defm BZHI32 : Bmi4VOp3<0xF5, "bzhi", Xi32, X86bzhi, WriteBZHI, "_EVEX">, EVEX; + defm BZHI64 : Bmi4VOp3<0xF5, "bzhi", Xi64, X86bzhi, WriteBZHI, "_EVEX">, EVEX; } def CountTrailingOnes : SDNodeXForm Date: Wed, 3 Jan 2024 09:49:47 +0000 Subject: [PATCH 112/313] =?UTF-8?q?[AArch64][LLVM]SME2.1=20non-widening=20?= =?UTF-8?q?BFloat=20instructions=20now=20depend=20on=20=E2=80=A6=20(#76222?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …SME2 and FEAT_SVE_B16B16 This patch applies the latest changes in: https://developer.arm.com/documentation/ddi0602/2023-09/SME-Instructions?lang=en --- .../lib/Target/AArch64/AArch64SMEInstrInfo.td | 2 +- .../{SME2p1 => SME2}/bfadd-diagnostics.s | 2 +- llvm/test/MC/AArch64/{SME2p1 => SME2}/bfadd.s | 110 +++---- .../{SME2p1 => SME2}/bfclamp-diagnostics.s | 2 +- .../MC/AArch64/{SME2p1 => SME2}/bfclamp.s | 30 +- .../{SME2p1 => SME2}/bfmax-diagnostics.s | 2 +- llvm/test/MC/AArch64/{SME2p1 => SME2}/bfmax.s | 46 +-- .../{SME2p1 => SME2}/bfmaxnm-diagnostics.s | 2 +- .../MC/AArch64/{SME2p1 => SME2}/bfmaxnm.s | 46 +-- .../{SME2p1 => SME2}/bfmin-diagnostics.s | 2 +- llvm/test/MC/AArch64/{SME2p1 => SME2}/bfmin.s | 46 +-- .../{SME2p1 => SME2}/bfminnm-diagnostics.s | 2 +- .../MC/AArch64/{SME2p1 => SME2}/bfminnm.s | 46 +-- .../{SME2p1 => SME2}/bfmla-diagnostics.s | 2 +- llvm/test/MC/AArch64/{SME2p1 => SME2}/bfmla.s | 302 +++++++++--------- .../{SME2p1 => SME2}/bfmls-diagnostics.s | 2 +- llvm/test/MC/AArch64/{SME2p1 => SME2}/bfmls.s | 302 +++++++++--------- .../{SME2p1 => SME2}/bfmopa-diagnostics.s | 2 +- .../test/MC/AArch64/{SME2p1 => SME2}/bfmopa.s | 38 +-- .../{SME2p1 => SME2}/bfmops-diagnostics.s | 2 +- .../test/MC/AArch64/{SME2p1 => SME2}/bfmops.s | 38 +-- .../{SME2p1 => SME2}/bfsub-diagnostics.s | 2 +- llvm/test/MC/AArch64/{SME2p1 => SME2}/bfsub.s | 110 +++---- 23 files changed, 569 insertions(+), 569 deletions(-) rename llvm/test/MC/AArch64/{SME2p1 => SME2}/bfadd-diagnostics.s (95%) rename llvm/test/MC/AArch64/{SME2p1 => SME2}/bfadd.s (77%) rename llvm/test/MC/AArch64/{SME2p1 => SME2}/bfclamp-diagnostics.s (93%) rename llvm/test/MC/AArch64/{SME2p1 => SME2}/bfclamp.s (66%) rename llvm/test/MC/AArch64/{SME2p1 => SME2}/bfmax-diagnostics.s (95%) rename llvm/test/MC/AArch64/{SME2p1 => SME2}/bfmax.s (74%) rename llvm/test/MC/AArch64/{SME2p1 => SME2}/bfmaxnm-diagnostics.s (95%) rename llvm/test/MC/AArch64/{SME2p1 => SME2}/bfmaxnm.s (74%) rename llvm/test/MC/AArch64/{SME2p1 => SME2}/bfmin-diagnostics.s (95%) rename llvm/test/MC/AArch64/{SME2p1 => SME2}/bfmin.s (74%) rename llvm/test/MC/AArch64/{SME2p1 => SME2}/bfminnm-diagnostics.s (95%) rename llvm/test/MC/AArch64/{SME2p1 => SME2}/bfminnm.s (74%) rename llvm/test/MC/AArch64/{SME2p1 => SME2}/bfmla-diagnostics.s (97%) rename llvm/test/MC/AArch64/{SME2p1 => SME2}/bfmla.s (81%) rename llvm/test/MC/AArch64/{SME2p1 => SME2}/bfmls-diagnostics.s (97%) rename llvm/test/MC/AArch64/{SME2p1 => SME2}/bfmls.s (81%) rename llvm/test/MC/AArch64/{SME2p1 => SME2}/bfmopa-diagnostics.s (93%) rename llvm/test/MC/AArch64/{SME2p1 => SME2}/bfmopa.s (70%) rename llvm/test/MC/AArch64/{SME2p1 => SME2}/bfmops-diagnostics.s (93%) rename llvm/test/MC/AArch64/{SME2p1 => SME2}/bfmops.s (70%) 
rename llvm/test/MC/AArch64/{SME2p1 => SME2}/bfsub-diagnostics.s (95%) rename llvm/test/MC/AArch64/{SME2p1 => SME2}/bfsub.s (77%) diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 738a52eebad2a..380f6e1fcfdae 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -810,7 +810,7 @@ defm FMOPA_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmopa", 0b0, 0b0, 0b11, ZPR16>; defm FMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmops", 0b0, 0b1, 0b11, ZPR16>; } -let Predicates = [HasSME2p1, HasB16B16] in { +let Predicates = [HasSME2, HasB16B16] in { defm BFADD_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"bfadd", 0b1100, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>; defm BFADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"bfadd", 0b1100, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>; defm BFSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"bfsub", 0b1101, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>; diff --git a/llvm/test/MC/AArch64/SME2p1/bfadd-diagnostics.s b/llvm/test/MC/AArch64/SME2/bfadd-diagnostics.s similarity index 95% rename from llvm/test/MC/AArch64/SME2p1/bfadd-diagnostics.s rename to llvm/test/MC/AArch64/SME2/bfadd-diagnostics.s index bc9a11238eca0..cadb4704f45b7 100644 --- a/llvm/test/MC/AArch64/SME2p1/bfadd-diagnostics.s +++ b/llvm/test/MC/AArch64/SME2/bfadd-diagnostics.s @@ -1,4 +1,4 @@ -// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 2>&1 < %s | FileCheck %s +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 2>&1 < %s | FileCheck %s // --------------------------------------------------------------------------// // Out of range index offset diff --git a/llvm/test/MC/AArch64/SME2p1/bfadd.s b/llvm/test/MC/AArch64/SME2/bfadd.s similarity index 77% rename from llvm/test/MC/AArch64/SME2p1/bfadd.s rename to llvm/test/MC/AArch64/SME2/bfadd.s index 0fb553d0a1ed0..0ba009afa11bf 100644 --- a/llvm/test/MC/AArch64/SME2p1/bfadd.s +++ b/llvm/test/MC/AArch64/SME2/bfadd.s @@ -1,300 +1,300 @@ -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 < %s \ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 < %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+b16b16 < %s \ -// RUN: | llvm-objdump -d --mattr=+sme2p1,+b16b16 - | FileCheck %s --check-prefix=CHECK-INST -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+b16b16 < %s \ -// RUN: | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 < %s \ +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2,+b16b16 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2,+b16b16 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2,+b16b16 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 < %s \ // RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ -// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p1,+b16b16 -disassemble -show-encoding \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2,+b16b16 -disassemble -show-encoding \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST bfadd za.h[w8, 0, vgx2], {z0.h, z1.h} // 
11000001-11100100-00011100-00000000 // CHECK-INST: bfadd za.h[w8, 0, vgx2], { z0.h, z1.h } // CHECK-ENCODING: [0x00,0x1c,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e41c00 bfadd za.h[w8, 0], {z0.h - z1.h} // 11000001-11100100-00011100-00000000 // CHECK-INST: bfadd za.h[w8, 0, vgx2], { z0.h, z1.h } // CHECK-ENCODING: [0x00,0x1c,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e41c00 bfadd za.h[w10, 5, vgx2], {z10.h, z11.h} // 11000001-11100100-01011101-01000101 // CHECK-INST: bfadd za.h[w10, 5, vgx2], { z10.h, z11.h } // CHECK-ENCODING: [0x45,0x5d,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e45d45 bfadd za.h[w10, 5], {z10.h - z11.h} // 11000001-11100100-01011101-01000101 // CHECK-INST: bfadd za.h[w10, 5, vgx2], { z10.h, z11.h } // CHECK-ENCODING: [0x45,0x5d,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e45d45 bfadd za.h[w11, 7, vgx2], {z12.h, z13.h} // 11000001-11100100-01111101-10000111 // CHECK-INST: bfadd za.h[w11, 7, vgx2], { z12.h, z13.h } // CHECK-ENCODING: [0x87,0x7d,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e47d87 bfadd za.h[w11, 7], {z12.h - z13.h} // 11000001-11100100-01111101-10000111 // CHECK-INST: bfadd za.h[w11, 7, vgx2], { z12.h, z13.h } // CHECK-ENCODING: [0x87,0x7d,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e47d87 bfadd za.h[w11, 7, vgx2], {z30.h, z31.h} // 11000001-11100100-01111111-11000111 // CHECK-INST: bfadd za.h[w11, 7, vgx2], { z30.h, z31.h } // CHECK-ENCODING: [0xc7,0x7f,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e47fc7 bfadd za.h[w11, 7], {z30.h - z31.h} // 11000001-11100100-01111111-11000111 // CHECK-INST: bfadd za.h[w11, 7, vgx2], { z30.h, z31.h } // CHECK-ENCODING: [0xc7,0x7f,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e47fc7 bfadd za.h[w8, 5, vgx2], {z16.h, z17.h} // 11000001-11100100-00011110-00000101 // CHECK-INST: bfadd za.h[w8, 5, vgx2], { z16.h, z17.h } // CHECK-ENCODING: [0x05,0x1e,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e41e05 bfadd za.h[w8, 5], {z16.h - z17.h} // 11000001-11100100-00011110-00000101 // CHECK-INST: bfadd za.h[w8, 5, vgx2], { z16.h, z17.h } // CHECK-ENCODING: [0x05,0x1e,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e41e05 bfadd za.h[w8, 1, vgx2], {z0.h, z1.h} // 11000001-11100100-00011100-00000001 // CHECK-INST: bfadd za.h[w8, 1, vgx2], { z0.h, z1.h } // CHECK-ENCODING: [0x01,0x1c,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e41c01 bfadd za.h[w8, 1], {z0.h - z1.h} // 11000001-11100100-00011100-00000001 // CHECK-INST: bfadd za.h[w8, 1, vgx2], { z0.h, z1.h } // CHECK-ENCODING: [0x01,0x1c,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// 
CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e41c01 bfadd za.h[w10, 0, vgx2], {z18.h, z19.h} // 11000001-11100100-01011110, 01000000 // CHECK-INST: bfadd za.h[w10, 0, vgx2], { z18.h, z19.h } // CHECK-ENCODING: [0x40,0x5e,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e45e40 bfadd za.h[w10, 0], {z18.h - z19.h} // 11000001-11100100-01011110-01000000 // CHECK-INST: bfadd za.h[w10, 0, vgx2], { z18.h, z19.h } // CHECK-ENCODING: [0x40,0x5e,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e45e40 bfadd za.h[w8, 0, vgx2], {z12.h, z13.h} // 11000001-11100100-00011101-10000000 // CHECK-INST: bfadd za.h[w8, 0, vgx2], { z12.h, z13.h } // CHECK-ENCODING: [0x80,0x1d,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e41d80 bfadd za.h[w8, 0], {z12.h - z13.h} // 11000001-11100100-00011101-10000000 // CHECK-INST: bfadd za.h[w8, 0, vgx2], { z12.h, z13.h } // CHECK-ENCODING: [0x80,0x1d,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e41d80 bfadd za.h[w10, 1, vgx2], {z0.h, z1.h} // 11000001-11100100-01011100-00000001 // CHECK-INST: bfadd za.h[w10, 1, vgx2], { z0.h, z1.h } // CHECK-ENCODING: [0x01,0x5c,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e45c01 bfadd za.h[w10, 1], {z0.h - z1.h} // 11000001-11100100-01011100-00000001 // CHECK-INST: bfadd za.h[w10, 1, vgx2], { z0.h, z1.h } // CHECK-ENCODING: [0x01,0x5c,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e45c01 bfadd za.h[w8, 5, vgx2], {z22.h, z23.h} // 11000001-11100100-00011110, 11000101 // CHECK-INST: bfadd za.h[w8, 5, vgx2], { z22.h, z23.h } // CHECK-ENCODING: [0xc5,0x1e,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e41ec5 bfadd za.h[w8, 5], {z22.h - z23.h} // 11000001-11100100-00011110-11000101 // CHECK-INST: bfadd za.h[w8, 5, vgx2], { z22.h, z23.h } // CHECK-ENCODING: [0xc5,0x1e,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e41ec5 bfadd za.h[w11, 2, vgx2], {z8.h, z9.h} // 11000001-11100100-01111101-00000010 // CHECK-INST: bfadd za.h[w11, 2, vgx2], { z8.h, z9.h } // CHECK-ENCODING: [0x02,0x7d,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e47d02 bfadd za.h[w11, 2], {z8.h - z9.h} // 11000001-11100100-01111101-00000010 // CHECK-INST: bfadd za.h[w11, 2, vgx2], { z8.h, z9.h } // CHECK-ENCODING: [0x02,0x7d,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e47d02 bfadd za.h[w9, 7, vgx2], {z12.h, z13.h} // 11000001-11100100-00111101-10000111 // CHECK-INST: bfadd za.h[w9, 7, vgx2], { z12.h, z13.h } // CHECK-ENCODING: [0x87,0x3d,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e43d87 bfadd za.h[w9, 7], {z12.h - z13.h} // 11000001-11100100-00111101-10000111 // CHECK-INST: bfadd za.h[w9, 7, vgx2], { 
z12.h, z13.h } // CHECK-ENCODING: [0x87,0x3d,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e43d87 bfadd za.h[w8, 0, vgx4], {z0.h - z3.h} // 11000001-11100101-00011100-00000000 // CHECK-INST: bfadd za.h[w8, 0, vgx4], { z0.h - z3.h } // CHECK-ENCODING: [0x00,0x1c,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e51c00 bfadd za.h[w8, 0], {z0.h - z3.h} // 11000001-11100101-00011100-00000000 // CHECK-INST: bfadd za.h[w8, 0, vgx4], { z0.h - z3.h } // CHECK-ENCODING: [0x00,0x1c,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e51c00 bfadd za.h[w10, 5, vgx4], {z8.h - z11.h} // 11000001-11100101-01011101-00000101 // CHECK-INST: bfadd za.h[w10, 5, vgx4], { z8.h - z11.h } // CHECK-ENCODING: [0x05,0x5d,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e55d05 bfadd za.h[w10, 5], {z8.h - z11.h} // 11000001-11100101-01011101-00000101 // CHECK-INST: bfadd za.h[w10, 5, vgx4], { z8.h - z11.h } // CHECK-ENCODING: [0x05,0x5d,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e55d05 bfadd za.h[w11, 7, vgx4], {z12.h - z15.h} // 11000001-11100101-01111101-10000111 // CHECK-INST: bfadd za.h[w11, 7, vgx4], { z12.h - z15.h } // CHECK-ENCODING: [0x87,0x7d,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e57d87 bfadd za.h[w11, 7], {z12.h - z15.h} // 11000001-11100101-01111101-10000111 // CHECK-INST: bfadd za.h[w11, 7, vgx4], { z12.h - z15.h } // CHECK-ENCODING: [0x87,0x7d,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e57d87 bfadd za.h[w11, 7, vgx4], {z28.h - z31.h} // 11000001-11100101-01111111-10000111 // CHECK-INST: bfadd za.h[w11, 7, vgx4], { z28.h - z31.h } // CHECK-ENCODING: [0x87,0x7f,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e57f87 bfadd za.h[w11, 7], {z28.h - z31.h} // 11000001-11100101-01111111-10000111 // CHECK-INST: bfadd za.h[w11, 7, vgx4], { z28.h - z31.h } // CHECK-ENCODING: [0x87,0x7f,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e57f87 bfadd za.h[w8, 5, vgx4], {z16.h - z19.h} // 11000001-11100101-00011110-00000101 // CHECK-INST: bfadd za.h[w8, 5, vgx4], { z16.h - z19.h } // CHECK-ENCODING: [0x05,0x1e,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e51e05 bfadd za.h[w8, 5], {z16.h - z19.h} // 11000001-11100101-00011110-00000101 // CHECK-INST: bfadd za.h[w8, 5, vgx4], { z16.h - z19.h } // CHECK-ENCODING: [0x05,0x1e,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e51e05 bfadd za.h[w8, 1, vgx4], {z0.h - z3.h} // 11000001-11100101-00011100-00000001 // CHECK-INST: bfadd za.h[w8, 1, vgx4], { z0.h - z3.h } // CHECK-ENCODING: [0x01,0x1c,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: 
c1e51c01 bfadd za.h[w8, 1], {z0.h - z3.h} // 11000001-11100101-00011100-00000001 // CHECK-INST: bfadd za.h[w8, 1, vgx4], { z0.h - z3.h } // CHECK-ENCODING: [0x01,0x1c,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e51c01 bfadd za.h[w10, 0, vgx4], {z16.h - z19.h} // 11000001-11100101-01011110-00000000 // CHECK-INST: bfadd za.h[w10, 0, vgx4], { z16.h - z19.h } // CHECK-ENCODING: [0x00,0x5e,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e55e00 bfadd za.h[w10, 0], {z16.h - z19.h} // 11000001-11100101-01011110-00000000 // CHECK-INST: bfadd za.h[w10, 0, vgx4], { z16.h - z19.h } // CHECK-ENCODING: [0x00,0x5e,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e55e00 bfadd za.h[w8, 0, vgx4], {z12.h - z15.h} // 11000001-11100101-00011101-10000000 // CHECK-INST: bfadd za.h[w8, 0, vgx4], { z12.h - z15.h } // CHECK-ENCODING: [0x80,0x1d,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e51d80 bfadd za.h[w8, 0], {z12.h - z15.h} // 11000001-11100101-00011101-10000000 // CHECK-INST: bfadd za.h[w8, 0, vgx4], { z12.h - z15.h } // CHECK-ENCODING: [0x80,0x1d,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e51d80 bfadd za.h[w10, 1, vgx4], {z0.h - z3.h} // 11000001-11100101-01011100-00000001 // CHECK-INST: bfadd za.h[w10, 1, vgx4], { z0.h - z3.h } // CHECK-ENCODING: [0x01,0x5c,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e55c01 bfadd za.h[w10, 1], {z0.h - z3.h} // 11000001-11100101-01011100-00000001 // CHECK-INST: bfadd za.h[w10, 1, vgx4], { z0.h - z3.h } // CHECK-ENCODING: [0x01,0x5c,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e55c01 bfadd za.h[w8, 5, vgx4], {z20.h - z23.h} // 11000001-11100101-00011110-10000101 // CHECK-INST: bfadd za.h[w8, 5, vgx4], { z20.h - z23.h } // CHECK-ENCODING: [0x85,0x1e,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e51e85 bfadd za.h[w8, 5], {z20.h - z23.h} // 11000001-11100101-00011110-10000101 // CHECK-INST: bfadd za.h[w8, 5, vgx4], { z20.h - z23.h } // CHECK-ENCODING: [0x85,0x1e,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e51e85 bfadd za.h[w11, 2, vgx4], {z8.h - z11.h} // 11000001-11100101-01111101-00000010 // CHECK-INST: bfadd za.h[w11, 2, vgx4], { z8.h - z11.h } // CHECK-ENCODING: [0x02,0x7d,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e57d02 bfadd za.h[w11, 2], {z8.h - z11.h} // 11000001-11100101-01111101-00000010 // CHECK-INST: bfadd za.h[w11, 2, vgx4], { z8.h - z11.h } // CHECK-ENCODING: [0x02,0x7d,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e57d02 bfadd za.h[w9, 7, vgx4], {z12.h - z15.h} // 11000001-11100101-00111101-10000111 // CHECK-INST: bfadd za.h[w9, 7, vgx4], { z12.h - z15.h } // CHECK-ENCODING: 
[0x87,0x3d,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e53d87 bfadd za.h[w9, 7], {z12.h - z15.h} // 11000001-11100101-00111101-10000111 // CHECK-INST: bfadd za.h[w9, 7, vgx4], { z12.h - z15.h } // CHECK-ENCODING: [0x87,0x3d,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e53d87 diff --git a/llvm/test/MC/AArch64/SME2p1/bfclamp-diagnostics.s b/llvm/test/MC/AArch64/SME2/bfclamp-diagnostics.s similarity index 93% rename from llvm/test/MC/AArch64/SME2p1/bfclamp-diagnostics.s rename to llvm/test/MC/AArch64/SME2/bfclamp-diagnostics.s index ee48a212f568b..661cfadc64f1c 100644 --- a/llvm/test/MC/AArch64/SME2p1/bfclamp-diagnostics.s +++ b/llvm/test/MC/AArch64/SME2/bfclamp-diagnostics.s @@ -1,4 +1,4 @@ -// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 2>&1 < %s | FileCheck %s +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 2>&1 < %s | FileCheck %s // --------------------------------------------------------------------------// // Invalid vector list diff --git a/llvm/test/MC/AArch64/SME2p1/bfclamp.s b/llvm/test/MC/AArch64/SME2/bfclamp.s similarity index 66% rename from llvm/test/MC/AArch64/SME2p1/bfclamp.s rename to llvm/test/MC/AArch64/SME2/bfclamp.s index ebf8500eac190..dc3caecc33b78 100644 --- a/llvm/test/MC/AArch64/SME2p1/bfclamp.s +++ b/llvm/test/MC/AArch64/SME2/bfclamp.s @@ -1,60 +1,60 @@ -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 < %s \ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 < %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+b16b16 < %s \ -// RUN: | llvm-objdump -d --mattr=+sme2p1,+b16b16 - | FileCheck %s --check-prefix=CHECK-INST -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+b16b16 < %s \ -// RUN: | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 < %s \ +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2,+b16b16 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2,+b16b16 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2,+b16b16 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 < %s \ // RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ -// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p1,+b16b16 -disassemble -show-encoding \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2,+b16b16 -disassemble -show-encoding \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST bfclamp {z0.h, z1.h}, z0.h, z0.h // 11000001-00100000-11000000-00000000 // CHECK-INST: bfclamp { z0.h, z1.h }, z0.h, z0.h // CHECK-ENCODING: [0x00,0xc0,0x20,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c120c000 bfclamp {z20.h, z21.h}, z10.h, z21.h // 11000001-00110101-11000001-01010100 // CHECK-INST: bfclamp { z20.h, z21.h }, z10.h, z21.h // CHECK-ENCODING: [0x54,0xc1,0x35,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // 
CHECK-UNKNOWN: c135c154 bfclamp {z22.h, z23.h}, z13.h, z8.h // 11000001-00101000-11000001-10110110 // CHECK-INST: bfclamp { z22.h, z23.h }, z13.h, z8.h // CHECK-ENCODING: [0xb6,0xc1,0x28,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c128c1b6 bfclamp {z30.h, z31.h}, z31.h, z31.h // 11000001-00111111-11000011-11111110 // CHECK-INST: bfclamp { z30.h, z31.h }, z31.h, z31.h // CHECK-ENCODING: [0xfe,0xc3,0x3f,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c13fc3fe bfclamp {z0.h - z3.h}, z0.h, z0.h // 11000001-00100000-11001000-00000000 // CHECK-INST: bfclamp { z0.h - z3.h }, z0.h, z0.h // CHECK-ENCODING: [0x00,0xc8,0x20,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c120c800 bfclamp {z20.h - z23.h}, z10.h, z21.h // 11000001-00110101-11001001-01010100 // CHECK-INST: bfclamp { z20.h - z23.h }, z10.h, z21.h // CHECK-ENCODING: [0x54,0xc9,0x35,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c135c954 bfclamp {z20.h - z23.h}, z13.h, z8.h // 11000001-00101000-11001001-10110100 // CHECK-INST: bfclamp { z20.h - z23.h }, z13.h, z8.h // CHECK-ENCODING: [0xb4,0xc9,0x28,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c128c9b4 bfclamp {z28.h - z31.h}, z31.h, z31.h // 11000001-00111111-11001011-11111100 // CHECK-INST: bfclamp { z28.h - z31.h }, z31.h, z31.h // CHECK-ENCODING: [0xfc,0xcb,0x3f,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c13fcbfc diff --git a/llvm/test/MC/AArch64/SME2p1/bfmax-diagnostics.s b/llvm/test/MC/AArch64/SME2/bfmax-diagnostics.s similarity index 95% rename from llvm/test/MC/AArch64/SME2p1/bfmax-diagnostics.s rename to llvm/test/MC/AArch64/SME2/bfmax-diagnostics.s index 55e5755ea6ecc..bbb619ed66639 100644 --- a/llvm/test/MC/AArch64/SME2p1/bfmax-diagnostics.s +++ b/llvm/test/MC/AArch64/SME2/bfmax-diagnostics.s @@ -1,4 +1,4 @@ -// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 2>&1 < %s | FileCheck %s +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 2>&1 < %s | FileCheck %s // --------------------------------------------------------------------------// // Invalid vector list diff --git a/llvm/test/MC/AArch64/SME2p1/bfmax.s b/llvm/test/MC/AArch64/SME2/bfmax.s similarity index 74% rename from llvm/test/MC/AArch64/SME2p1/bfmax.s rename to llvm/test/MC/AArch64/SME2/bfmax.s index d5af905c9d496..657fcbc773830 100644 --- a/llvm/test/MC/AArch64/SME2p1/bfmax.s +++ b/llvm/test/MC/AArch64/SME2/bfmax.s @@ -1,108 +1,108 @@ -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 < %s \ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 < %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+b16b16 < %s \ -// RUN: | llvm-objdump -d --mattr=-sme2p1 --mattr=+sme2p1,+b16b16 - | FileCheck %s --check-prefix=CHECK-INST -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+b16b16 < %s \ -// RUN: | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s 
--check-prefix=CHECK-UNKNOWN -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 < %s \ +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2,+b16b16 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2 --mattr=+sme2,+b16b16 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2,+b16b16 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 < %s \ // RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ -// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p1,+b16b16 -disassemble -show-encoding \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2,+b16b16 -disassemble -show-encoding \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST bfmax {z0.h, z1.h}, {z0.h, z1.h}, z0.h // 11000001-00100000-10100001-00000000 // CHECK-INST: bfmax { z0.h, z1.h }, { z0.h, z1.h }, z0.h // CHECK-ENCODING: [0x00,0xa1,0x20,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c120a100 bfmax {z20.h, z21.h}, {z20.h, z21.h}, z5.h // 11000001-00100101-10100001-00010100 // CHECK-INST: bfmax { z20.h, z21.h }, { z20.h, z21.h }, z5.h // CHECK-ENCODING: [0x14,0xa1,0x25,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c125a114 bfmax {z22.h, z23.h}, {z22.h, z23.h}, z8.h // 11000001-00101000-10100001-00010110 // CHECK-INST: bfmax { z22.h, z23.h }, { z22.h, z23.h }, z8.h // CHECK-ENCODING: [0x16,0xa1,0x28,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c128a116 bfmax {z30.h, z31.h}, {z30.h, z31.h}, z15.h // 11000001-00101111-10100001-00011110 // CHECK-INST: bfmax { z30.h, z31.h }, { z30.h, z31.h }, z15.h // CHECK-ENCODING: [0x1e,0xa1,0x2f,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c12fa11e bfmax {z0.h, z1.h}, {z0.h, z1.h}, {z0.h, z1.h} // 11000001-00100000-10110001-00000000 // CHECK-INST: bfmax { z0.h, z1.h }, { z0.h, z1.h }, { z0.h, z1.h } // CHECK-ENCODING: [0x00,0xb1,0x20,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c120b100 bfmax {z20.h, z21.h}, {z20.h, z21.h}, {z20.h, z21.h} // 11000001-00110100-10110001-00010100 // CHECK-INST: bfmax { z20.h, z21.h }, { z20.h, z21.h }, { z20.h, z21.h } // CHECK-ENCODING: [0x14,0xb1,0x34,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c134b114 bfmax {z22.h, z23.h}, {z22.h, z23.h}, {z8.h, z9.h} // 11000001-00101000-10110001-00010110 // CHECK-INST: bfmax { z22.h, z23.h }, { z22.h, z23.h }, { z8.h, z9.h } // CHECK-ENCODING: [0x16,0xb1,0x28,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c128b116 bfmax {z30.h, z31.h}, {z30.h, z31.h}, {z30.h, z31.h} // 11000001-00111110-10110001-00011110 // CHECK-INST: bfmax { z30.h, z31.h }, { z30.h, z31.h }, { z30.h, z31.h } // CHECK-ENCODING: [0x1e,0xb1,0x3e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c13eb11e bfmax {z0.h - z3.h}, {z0.h - z3.h}, z0.h // 11000001-00100000-10101001-00000000 // CHECK-INST: bfmax { z0.h - z3.h }, { z0.h - z3.h }, z0.h 
// CHECK-ENCODING: [0x00,0xa9,0x20,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c120a900 bfmax {z20.h - z23.h}, {z20.h - z23.h}, z5.h // 11000001-00100101-10101001-00010100 // CHECK-INST: bfmax { z20.h - z23.h }, { z20.h - z23.h }, z5.h // CHECK-ENCODING: [0x14,0xa9,0x25,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c125a914 bfmax {z20.h - z23.h}, {z20.h - z23.h}, z8.h // 11000001-00101000-10101001-00010100 // CHECK-INST: bfmax { z20.h - z23.h }, { z20.h - z23.h }, z8.h // CHECK-ENCODING: [0x14,0xa9,0x28,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c128a914 bfmax {z28.h - z31.h}, {z28.h - z31.h}, z15.h // 11000001-00101111-10101001-00011100 // CHECK-INST: bfmax { z28.h - z31.h }, { z28.h - z31.h }, z15.h // CHECK-ENCODING: [0x1c,0xa9,0x2f,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c12fa91c bfmax {z0.h - z3.h}, {z0.h - z3.h}, {z0.h - z3.h} // 11000001-00100000-10111001-00000000 // CHECK-INST: bfmax { z0.h - z3.h }, { z0.h - z3.h }, { z0.h - z3.h } // CHECK-ENCODING: [0x00,0xb9,0x20,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c120b900 bfmax {z20.h - z23.h}, {z20.h - z23.h}, {z20.h - z23.h} // 11000001-00110100-10111001-00010100 // CHECK-INST: bfmax { z20.h - z23.h }, { z20.h - z23.h }, { z20.h - z23.h } // CHECK-ENCODING: [0x14,0xb9,0x34,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c134b914 bfmax {z20.h - z23.h}, {z20.h - z23.h}, {z8.h - z11.h} // 11000001-00101000-10111001-00010100 // CHECK-INST: bfmax { z20.h - z23.h }, { z20.h - z23.h }, { z8.h - z11.h } // CHECK-ENCODING: [0x14,0xb9,0x28,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c128b914 bfmax {z28.h - z31.h}, {z28.h - z31.h}, {z28.h - z31.h} // 11000001-00111100-10111001-00011100 // CHECK-INST: bfmax { z28.h - z31.h }, { z28.h - z31.h }, { z28.h - z31.h } // CHECK-ENCODING: [0x1c,0xb9,0x3c,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c13cb91c diff --git a/llvm/test/MC/AArch64/SME2p1/bfmaxnm-diagnostics.s b/llvm/test/MC/AArch64/SME2/bfmaxnm-diagnostics.s similarity index 95% rename from llvm/test/MC/AArch64/SME2p1/bfmaxnm-diagnostics.s rename to llvm/test/MC/AArch64/SME2/bfmaxnm-diagnostics.s index b1f15112f8e3b..ab837b64b2651 100644 --- a/llvm/test/MC/AArch64/SME2p1/bfmaxnm-diagnostics.s +++ b/llvm/test/MC/AArch64/SME2/bfmaxnm-diagnostics.s @@ -1,4 +1,4 @@ -// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 2>&1 < %s | FileCheck %s +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 2>&1 < %s | FileCheck %s // --------------------------------------------------------------------------// // Invalid vector list diff --git a/llvm/test/MC/AArch64/SME2p1/bfmaxnm.s b/llvm/test/MC/AArch64/SME2/bfmaxnm.s similarity index 74% rename from llvm/test/MC/AArch64/SME2p1/bfmaxnm.s rename to llvm/test/MC/AArch64/SME2/bfmaxnm.s index e5b97e89e811b..f61f530548f6f 100644 --- a/llvm/test/MC/AArch64/SME2p1/bfmaxnm.s +++ b/llvm/test/MC/AArch64/SME2/bfmaxnm.s 
@@ -1,108 +1,108 @@ -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 < %s \ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 < %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+b16b16 < %s \ -// RUN: | llvm-objdump -d --mattr=-sme2p1 --mattr=+sme2p1,+b16b16 - | FileCheck %s --check-prefix=CHECK-INST -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+b16b16 < %s \ -// RUN: | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 < %s \ +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2,+b16b16 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2 --mattr=+sme2,+b16b16 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2,+b16b16 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 < %s \ // RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ -// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p1,+b16b16 -disassemble -show-encoding \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2,+b16b16 -disassemble -show-encoding \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST bfmaxnm {z0.h, z1.h}, {z0.h, z1.h}, z0.h // 11000001-00100000-10100001-00100000 // CHECK-INST: bfmaxnm { z0.h, z1.h }, { z0.h, z1.h }, z0.h // CHECK-ENCODING: [0x20,0xa1,0x20,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c120a120 bfmaxnm {z20.h, z21.h}, {z20.h, z21.h}, z5.h // 11000001-00100101-10100001-00110100 // CHECK-INST: bfmaxnm { z20.h, z21.h }, { z20.h, z21.h }, z5.h // CHECK-ENCODING: [0x34,0xa1,0x25,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c125a134 bfmaxnm {z22.h, z23.h}, {z22.h, z23.h}, z8.h // 11000001-00101000-10100001-00110110 // CHECK-INST: bfmaxnm { z22.h, z23.h }, { z22.h, z23.h }, z8.h // CHECK-ENCODING: [0x36,0xa1,0x28,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c128a136 bfmaxnm {z30.h, z31.h}, {z30.h, z31.h}, z15.h // 11000001-00101111-10100001-00111110 // CHECK-INST: bfmaxnm { z30.h, z31.h }, { z30.h, z31.h }, z15.h // CHECK-ENCODING: [0x3e,0xa1,0x2f,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c12fa13e bfmaxnm {z0.h, z1.h}, {z0.h, z1.h}, {z0.h, z1.h} // 11000001-00100000-10110001-00100000 // CHECK-INST: bfmaxnm { z0.h, z1.h }, { z0.h, z1.h }, { z0.h, z1.h } // CHECK-ENCODING: [0x20,0xb1,0x20,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c120b120 bfmaxnm {z20.h, z21.h}, {z20.h, z21.h}, {z20.h, z21.h} // 11000001-00110100-10110001-00110100 // CHECK-INST: bfmaxnm { z20.h, z21.h }, { z20.h, z21.h }, { z20.h, z21.h } // CHECK-ENCODING: [0x34,0xb1,0x34,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c134b134 bfmaxnm {z22.h, z23.h}, {z22.h, z23.h}, {z8.h, z9.h} // 11000001-00101000-10110001-00110110 // CHECK-INST: 
bfmaxnm { z22.h, z23.h }, { z22.h, z23.h }, { z8.h, z9.h } // CHECK-ENCODING: [0x36,0xb1,0x28,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c128b136 bfmaxnm {z30.h, z31.h}, {z30.h, z31.h}, {z30.h, z31.h} // 11000001-00111110-10110001-00111110 // CHECK-INST: bfmaxnm { z30.h, z31.h }, { z30.h, z31.h }, { z30.h, z31.h } // CHECK-ENCODING: [0x3e,0xb1,0x3e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c13eb13e bfmaxnm {z0.h - z3.h}, {z0.h - z3.h}, z0.h // 11000001-00100000-10101001-00100000 // CHECK-INST: bfmaxnm { z0.h - z3.h }, { z0.h - z3.h }, z0.h // CHECK-ENCODING: [0x20,0xa9,0x20,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c120a920 bfmaxnm {z20.h - z23.h}, {z20.h - z23.h}, z5.h // 11000001-00100101-10101001-00110100 // CHECK-INST: bfmaxnm { z20.h - z23.h }, { z20.h - z23.h }, z5.h // CHECK-ENCODING: [0x34,0xa9,0x25,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c125a934 bfmaxnm {z20.h - z23.h}, {z20.h - z23.h}, z8.h // 11000001-00101000-10101001-00110100 // CHECK-INST: bfmaxnm { z20.h - z23.h }, { z20.h - z23.h }, z8.h // CHECK-ENCODING: [0x34,0xa9,0x28,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c128a934 bfmaxnm {z28.h - z31.h}, {z28.h - z31.h}, z15.h // 11000001-00101111-10101001-00111100 // CHECK-INST: bfmaxnm { z28.h - z31.h }, { z28.h - z31.h }, z15.h // CHECK-ENCODING: [0x3c,0xa9,0x2f,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c12fa93c bfmaxnm {z0.h - z3.h}, {z0.h - z3.h}, {z0.h - z3.h} // 11000001-00100000-10111001-00100000 // CHECK-INST: bfmaxnm { z0.h - z3.h }, { z0.h - z3.h }, { z0.h - z3.h } // CHECK-ENCODING: [0x20,0xb9,0x20,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c120b920 bfmaxnm {z20.h - z23.h}, {z20.h - z23.h}, {z20.h - z23.h} // 11000001-00110100-10111001-00110100 // CHECK-INST: bfmaxnm { z20.h - z23.h }, { z20.h - z23.h }, { z20.h - z23.h } // CHECK-ENCODING: [0x34,0xb9,0x34,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c134b934 bfmaxnm {z20.h - z23.h}, {z20.h - z23.h}, {z8.h - z11.h} // 11000001-00101000-10111001-00110100 // CHECK-INST: bfmaxnm { z20.h - z23.h }, { z20.h - z23.h }, { z8.h - z11.h } // CHECK-ENCODING: [0x34,0xb9,0x28,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c128b934 bfmaxnm {z28.h - z31.h}, {z28.h - z31.h}, {z28.h - z31.h} // 11000001-00111100-10111001-00111100 // CHECK-INST: bfmaxnm { z28.h - z31.h }, { z28.h - z31.h }, { z28.h - z31.h } // CHECK-ENCODING: [0x3c,0xb9,0x3c,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c13cb93c diff --git a/llvm/test/MC/AArch64/SME2p1/bfmin-diagnostics.s b/llvm/test/MC/AArch64/SME2/bfmin-diagnostics.s similarity index 95% rename from llvm/test/MC/AArch64/SME2p1/bfmin-diagnostics.s rename to llvm/test/MC/AArch64/SME2/bfmin-diagnostics.s index 72ee8184cf545..41f10361135e6 100644 --- 
a/llvm/test/MC/AArch64/SME2p1/bfmin-diagnostics.s +++ b/llvm/test/MC/AArch64/SME2/bfmin-diagnostics.s @@ -1,4 +1,4 @@ -// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 2>&1 < %s | FileCheck %s +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 2>&1 < %s | FileCheck %s // --------------------------------------------------------------------------// // Invalid vector list diff --git a/llvm/test/MC/AArch64/SME2p1/bfmin.s b/llvm/test/MC/AArch64/SME2/bfmin.s similarity index 74% rename from llvm/test/MC/AArch64/SME2p1/bfmin.s rename to llvm/test/MC/AArch64/SME2/bfmin.s index 3ba2be5e73944..6612e3c187b29 100644 --- a/llvm/test/MC/AArch64/SME2p1/bfmin.s +++ b/llvm/test/MC/AArch64/SME2/bfmin.s @@ -1,108 +1,108 @@ -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 < %s \ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 < %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+b16b16 < %s \ -// RUN: | llvm-objdump -d --mattr=-sme2p1 --mattr=+sme2p1,+b16b16 - | FileCheck %s --check-prefix=CHECK-INST -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+b16b16 < %s \ -// RUN: | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 < %s \ +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2,+b16b16 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2 --mattr=+sme2,+b16b16 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2,+b16b16 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 < %s \ // RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ -// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p1,+b16b16 -disassemble -show-encoding \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2,+b16b16 -disassemble -show-encoding \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST bfmin {z0.h, z1.h}, {z0.h, z1.h}, z0.h // 11000001-00100000-10100001-00000001 // CHECK-INST: bfmin { z0.h, z1.h }, { z0.h, z1.h }, z0.h // CHECK-ENCODING: [0x01,0xa1,0x20,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c120a101 bfmin {z20.h, z21.h}, {z20.h, z21.h}, z5.h // 11000001-00100101-10100001-00010101 // CHECK-INST: bfmin { z20.h, z21.h }, { z20.h, z21.h }, z5.h // CHECK-ENCODING: [0x15,0xa1,0x25,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c125a115 bfmin {z22.h, z23.h}, {z22.h, z23.h}, z8.h // 11000001-00101000-10100001-00010111 // CHECK-INST: bfmin { z22.h, z23.h }, { z22.h, z23.h }, z8.h // CHECK-ENCODING: [0x17,0xa1,0x28,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c128a117 bfmin {z30.h, z31.h}, {z30.h, z31.h}, z15.h // 11000001-00101111-10100001-00011111 // CHECK-INST: bfmin { z30.h, z31.h }, { z30.h, z31.h }, z15.h // CHECK-ENCODING: [0x1f,0xa1,0x2f,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c12fa11f bfmin {z0.h, z1.h}, {z0.h, z1.h}, {z0.h, z1.h} // 
11000001-00100000-10110001-00000001 // CHECK-INST: bfmin { z0.h, z1.h }, { z0.h, z1.h }, { z0.h, z1.h } // CHECK-ENCODING: [0x01,0xb1,0x20,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c120b101 bfmin {z20.h, z21.h}, {z20.h, z21.h}, {z20.h, z21.h} // 11000001-00110100-10110001-00010101 // CHECK-INST: bfmin { z20.h, z21.h }, { z20.h, z21.h }, { z20.h, z21.h } // CHECK-ENCODING: [0x15,0xb1,0x34,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c134b115 bfmin {z22.h, z23.h}, {z22.h, z23.h}, {z8.h, z9.h} // 11000001-00101000-10110001-00010111 // CHECK-INST: bfmin { z22.h, z23.h }, { z22.h, z23.h }, { z8.h, z9.h } // CHECK-ENCODING: [0x17,0xb1,0x28,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c128b117 bfmin {z30.h, z31.h}, {z30.h, z31.h}, {z30.h, z31.h} // 11000001-00111110-10110001-00011111 // CHECK-INST: bfmin { z30.h, z31.h }, { z30.h, z31.h }, { z30.h, z31.h } // CHECK-ENCODING: [0x1f,0xb1,0x3e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c13eb11f bfmin {z0.h - z3.h}, {z0.h - z3.h}, z0.h // 11000001-00100000-10101001-00000001 // CHECK-INST: bfmin { z0.h - z3.h }, { z0.h - z3.h }, z0.h // CHECK-ENCODING: [0x01,0xa9,0x20,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c120a901 bfmin {z20.h - z23.h}, {z20.h - z23.h}, z5.h // 11000001-00100101-10101001-00010101 // CHECK-INST: bfmin { z20.h - z23.h }, { z20.h - z23.h }, z5.h // CHECK-ENCODING: [0x15,0xa9,0x25,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c125a915 bfmin {z20.h - z23.h}, {z20.h - z23.h}, z8.h // 11000001-00101000-10101001-00010101 // CHECK-INST: bfmin { z20.h - z23.h }, { z20.h - z23.h }, z8.h // CHECK-ENCODING: [0x15,0xa9,0x28,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c128a915 bfmin {z28.h - z31.h}, {z28.h - z31.h}, z15.h // 11000001-00101111-10101001-00011101 // CHECK-INST: bfmin { z28.h - z31.h }, { z28.h - z31.h }, z15.h // CHECK-ENCODING: [0x1d,0xa9,0x2f,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c12fa91d bfmin {z0.h - z3.h}, {z0.h - z3.h}, {z0.h - z3.h} // 11000001-00100000-10111001-00000001 // CHECK-INST: bfmin { z0.h - z3.h }, { z0.h - z3.h }, { z0.h - z3.h } // CHECK-ENCODING: [0x01,0xb9,0x20,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c120b901 bfmin {z20.h - z23.h}, {z20.h - z23.h}, {z20.h - z23.h} // 11000001-00110100-10111001-00010101 // CHECK-INST: bfmin { z20.h - z23.h }, { z20.h - z23.h }, { z20.h - z23.h } // CHECK-ENCODING: [0x15,0xb9,0x34,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c134b915 bfmin {z20.h - z23.h}, {z20.h - z23.h}, {z8.h - z11.h} // 11000001-00101000-10111001-00010101 // CHECK-INST: bfmin { z20.h - z23.h }, { z20.h - z23.h }, { z8.h - z11.h } // CHECK-ENCODING: [0x15,0xb9,0x28,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // 
CHECK-UNKNOWN: c128b915 bfmin {z28.h - z31.h}, {z28.h - z31.h}, {z28.h - z31.h} // 11000001-00111100-10111001-00011101 // CHECK-INST: bfmin { z28.h - z31.h }, { z28.h - z31.h }, { z28.h - z31.h } // CHECK-ENCODING: [0x1d,0xb9,0x3c,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c13cb91d diff --git a/llvm/test/MC/AArch64/SME2p1/bfminnm-diagnostics.s b/llvm/test/MC/AArch64/SME2/bfminnm-diagnostics.s similarity index 95% rename from llvm/test/MC/AArch64/SME2p1/bfminnm-diagnostics.s rename to llvm/test/MC/AArch64/SME2/bfminnm-diagnostics.s index 2d161bf040f80..14485e915ea66 100644 --- a/llvm/test/MC/AArch64/SME2p1/bfminnm-diagnostics.s +++ b/llvm/test/MC/AArch64/SME2/bfminnm-diagnostics.s @@ -1,4 +1,4 @@ -// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 2>&1 < %s | FileCheck %s +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 2>&1 < %s | FileCheck %s // --------------------------------------------------------------------------// // Invalid vector list diff --git a/llvm/test/MC/AArch64/SME2p1/bfminnm.s b/llvm/test/MC/AArch64/SME2/bfminnm.s similarity index 74% rename from llvm/test/MC/AArch64/SME2p1/bfminnm.s rename to llvm/test/MC/AArch64/SME2/bfminnm.s index cfaa3c1c2ad93..4a48a0d455333 100644 --- a/llvm/test/MC/AArch64/SME2p1/bfminnm.s +++ b/llvm/test/MC/AArch64/SME2/bfminnm.s @@ -1,113 +1,113 @@ -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 < %s \ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 < %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+b16b16 < %s \ -// RUN: | llvm-objdump -d --mattr=-sme2p1 --mattr=+sme2p1,+b16b16 - | FileCheck %s --check-prefix=CHECK-INST -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+b16b16 < %s \ -// RUN: | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 < %s \ +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2,+b16b16 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2 --mattr=+sme2,+b16b16 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2,+b16b16 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 < %s \ // RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ -// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p1,+b16b16 -disassemble -show-encoding \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2,+b16b16 -disassemble -show-encoding \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST bfminnm {z0.h, z1.h}, {z0.h, z1.h}, z0.h // 11000001-00100000-10100001-00100001 // CHECK-INST: bfminnm { z0.h, z1.h }, { z0.h, z1.h }, z0.h // CHECK-ENCODING: [0x21,0xa1,0x20,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c120a121 bfminnm {z20.h, z21.h}, {z20.h, z21.h}, z5.h // 11000001-00100101-10100001-00110101 // CHECK-INST: bfminnm { z20.h, z21.h }, { z20.h, z21.h }, z5.h // CHECK-ENCODING: [0x35,0xa1,0x25,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c125a135 
bfminnm {z22.h, z23.h}, {z22.h, z23.h}, z8.h // 11000001-00101000-10100001-00110111 // CHECK-INST: bfminnm { z22.h, z23.h }, { z22.h, z23.h }, z8.h // CHECK-ENCODING: [0x37,0xa1,0x28,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c128a137 bfminnm {z30.h, z31.h}, {z30.h, z31.h}, z15.h // 11000001-00101111-10100001-00111111 // CHECK-INST: bfminnm { z30.h, z31.h }, { z30.h, z31.h }, z15.h // CHECK-ENCODING: [0x3f,0xa1,0x2f,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c12fa13f bfminnm {z0.h, z1.h}, {z0.h, z1.h}, {z0.h, z1.h} // 11000001-00100000-10110001-00100001 // CHECK-INST: bfminnm { z0.h, z1.h }, { z0.h, z1.h }, { z0.h, z1.h } // CHECK-ENCODING: [0x21,0xb1,0x20,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c120b121 bfminnm {z20.h, z21.h}, {z20.h, z21.h}, {z20.h, z21.h} // 11000001-00110100-10110001-00110101 // CHECK-INST: bfminnm { z20.h, z21.h }, { z20.h, z21.h }, { z20.h, z21.h } // CHECK-ENCODING: [0x35,0xb1,0x34,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c134b135 bfminnm {z22.h, z23.h}, {z22.h, z23.h}, {z8.h, z9.h} // 11000001-00101000-10110001-00110111 // CHECK-INST: bfminnm { z22.h, z23.h }, { z22.h, z23.h }, { z8.h, z9.h } // CHECK-ENCODING: [0x37,0xb1,0x28,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c128b137 bfminnm {z30.h, z31.h}, {z30.h, z31.h}, {z30.h, z31.h} // 11000001-00111110-10110001-00111111 // CHECK-INST: bfminnm { z30.h, z31.h }, { z30.h, z31.h }, { z30.h, z31.h } // CHECK-ENCODING: [0x3f,0xb1,0x3e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c13eb13f bfminnm {z0.h - z3.h}, {z0.h - z3.h}, z0.h // 11000001-00100000-10101001-00100001 // CHECK-INST: bfminnm { z0.h - z3.h }, { z0.h - z3.h }, z0.h // CHECK-ENCODING: [0x21,0xa9,0x20,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c120a921 bfminnm {z20.h - z23.h}, {z20.h - z23.h}, z5.h // 11000001-00100101-10101001-00110101 // CHECK-INST: bfminnm { z20.h - z23.h }, { z20.h - z23.h }, z5.h // CHECK-ENCODING: [0x35,0xa9,0x25,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c125a935 bfminnm {z20.h - z23.h}, {z20.h - z23.h}, z8.h // 11000001-00101000-10101001-00110101 // CHECK-INST: bfminnm { z20.h - z23.h }, { z20.h - z23.h }, z8.h // CHECK-ENCODING: [0x35,0xa9,0x28,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c128a935 bfminnm {z28.h - z31.h}, {z28.h - z31.h}, z15.h // 11000001-00101111-10101001-00111101 // CHECK-INST: bfminnm { z28.h - z31.h }, { z28.h - z31.h }, z15.h // CHECK-ENCODING: [0x3d,0xa9,0x2f,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c12fa93d bfminnm {z0.h - z3.h}, {z0.h - z3.h}, {z0.h - z3.h} // 11000001-00100000-10111001-00100001 // CHECK-INST: bfminnm { z0.h - z3.h }, { z0.h - z3.h }, { z0.h - z3.h } // CHECK-ENCODING: [0x21,0xb9,0x20,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// 
CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c120b921 bfminnm {z20.h - z23.h}, {z20.h - z23.h}, {z20.h - z23.h} // 11000001-00110100-10111001-00110101 // CHECK-INST: bfminnm { z20.h - z23.h }, { z20.h - z23.h }, { z20.h - z23.h } // CHECK-ENCODING: [0x35,0xb9,0x34,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c134b935 bfminnm {z20.h - z23.h}, {z20.h - z23.h}, {z8.h - z11.h} // 11000001-00101000-10111001-00110101 // CHECK-INST: bfminnm { z20.h - z23.h }, { z20.h - z23.h }, { z8.h - z11.h } // CHECK-ENCODING: [0x35,0xb9,0x28,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c128b935 bfminnm {z28.h - z31.h}, {z28.h - z31.h}, {z28.h - z31.h} // 11000001-00111100-10111001-00111101 // CHECK-INST: bfminnm { z28.h - z31.h }, { z28.h - z31.h }, { z28.h - z31.h } // CHECK-ENCODING: [0x3d,0xb9,0x3c,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c13cb93d diff --git a/llvm/test/MC/AArch64/SME2p1/bfmla-diagnostics.s b/llvm/test/MC/AArch64/SME2/bfmla-diagnostics.s similarity index 97% rename from llvm/test/MC/AArch64/SME2p1/bfmla-diagnostics.s rename to llvm/test/MC/AArch64/SME2/bfmla-diagnostics.s index 42bb35da9dcfe..efd26244d91f0 100644 --- a/llvm/test/MC/AArch64/SME2p1/bfmla-diagnostics.s +++ b/llvm/test/MC/AArch64/SME2/bfmla-diagnostics.s @@ -1,4 +1,4 @@ -// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 2>&1 < %s | FileCheck %s +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 2>&1 < %s | FileCheck %s // --------------------------------------------------------------------------// // Invalid vector list diff --git a/llvm/test/MC/AArch64/SME2p1/bfmla.s b/llvm/test/MC/AArch64/SME2/bfmla.s similarity index 81% rename from llvm/test/MC/AArch64/SME2p1/bfmla.s rename to llvm/test/MC/AArch64/SME2/bfmla.s index 4c053fea0ff1e..75ccccc919843 100644 --- a/llvm/test/MC/AArch64/SME2p1/bfmla.s +++ b/llvm/test/MC/AArch64/SME2/bfmla.s @@ -1,876 +1,876 @@ -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 < %s \ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 < %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+b16b16 < %s \ -// RUN: | llvm-objdump -d --mattr=+sme2p1,+b16b16 - | FileCheck %s --check-prefix=CHECK-INST -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+b16b16 < %s \ -// RUN: | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 < %s \ +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2,+b16b16 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2,+b16b16 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2,+b16b16 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 < %s \ // RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ -// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p1,+b16b16 -disassemble -show-encoding \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2,+b16b16 -disassemble -show-encoding \ // 
RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST bfmla za.h[w8, 0, vgx2], {z0.h, z1.h}, z0.h // 11000001-01100000-00011100-00000000 // CHECK-INST: bfmla za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h // CHECK-ENCODING: [0x00,0x1c,0x60,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1601c00 bfmla za.h[w8, 0], {z0.h - z1.h}, z0.h // 11000001-01100000-00011100-00000000 // CHECK-INST: bfmla za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h // CHECK-ENCODING: [0x00,0x1c,0x60,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1601c00 bfmla za.h[w10, 5, vgx2], {z10.h, z11.h}, z5.h // 11000001-01100101-01011101-01000101 // CHECK-INST: bfmla za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h // CHECK-ENCODING: [0x45,0x5d,0x65,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1655d45 bfmla za.h[w10, 5], {z10.h - z11.h}, z5.h // 11000001-01100101-01011101-01000101 // CHECK-INST: bfmla za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h // CHECK-ENCODING: [0x45,0x5d,0x65,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1655d45 bfmla za.h[w11, 7, vgx2], {z13.h, z14.h}, z8.h // 11000001-01101000-01111101-10100111 // CHECK-INST: bfmla za.h[w11, 7, vgx2], { z13.h, z14.h }, z8.h // CHECK-ENCODING: [0xa7,0x7d,0x68,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1687da7 bfmla za.h[w11, 7], {z13.h - z14.h}, z8.h // 11000001-01101000-01111101-10100111 // CHECK-INST: bfmla za.h[w11, 7, vgx2], { z13.h, z14.h }, z8.h // CHECK-ENCODING: [0xa7,0x7d,0x68,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1687da7 bfmla za.h[w11, 7, vgx2], {z31.h, z0.h}, z15.h // 11000001-01101111-01111111-11100111 // CHECK-INST: bfmla za.h[w11, 7, vgx2], { z31.h, z0.h }, z15.h // CHECK-ENCODING: [0xe7,0x7f,0x6f,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c16f7fe7 bfmla za.h[w11, 7], {z31.h - z0.h}, z15.h // 11000001-01101111-01111111-11100111 // CHECK-INST: bfmla za.h[w11, 7, vgx2], { z31.h, z0.h }, z15.h // CHECK-ENCODING: [0xe7,0x7f,0x6f,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c16f7fe7 bfmla za.h[w8, 5, vgx2], {z17.h, z18.h}, z0.h // 11000001-01100000-00011110-00100101 // CHECK-INST: bfmla za.h[w8, 5, vgx2], { z17.h, z18.h }, z0.h // CHECK-ENCODING: [0x25,0x1e,0x60,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1601e25 bfmla za.h[w8, 5], {z17.h - z18.h}, z0.h // 11000001-01100000-00011110-00100101 // CHECK-INST: bfmla za.h[w8, 5, vgx2], { z17.h, z18.h }, z0.h // CHECK-ENCODING: [0x25,0x1e,0x60,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1601e25 bfmla za.h[w8, 1, vgx2], {z1.h, z2.h}, z14.h // 11000001-01101110-00011100-00100001 // CHECK-INST: bfmla za.h[w8, 1, vgx2], { z1.h, z2.h }, z14.h // CHECK-ENCODING: [0x21,0x1c,0x6e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: 
c16e1c21 bfmla za.h[w8, 1], {z1.h - z2.h}, z14.h // 11000001-01101110-00011100-00100001 // CHECK-INST: bfmla za.h[w8, 1, vgx2], { z1.h, z2.h }, z14.h // CHECK-ENCODING: [0x21,0x1c,0x6e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c16e1c21 bfmla za.h[w10, 0, vgx2], {z19.h, z20.h}, z4.h // 11000001-01100100-01011110-01100000 // CHECK-INST: bfmla za.h[w10, 0, vgx2], { z19.h, z20.h }, z4.h // CHECK-ENCODING: [0x60,0x5e,0x64,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1645e60 bfmla za.h[w10, 0], {z19.h - z20.h}, z4.h // 11000001-01100100-01011110-01100000 // CHECK-INST: bfmla za.h[w10, 0, vgx2], { z19.h, z20.h }, z4.h // CHECK-ENCODING: [0x60,0x5e,0x64,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1645e60 bfmla za.h[w8, 0, vgx2], {z12.h, z13.h}, z2.h // 11000001-01100010-00011101-10000000 // CHECK-INST: bfmla za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h // CHECK-ENCODING: [0x80,0x1d,0x62,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1621d80 bfmla za.h[w8, 0], {z12.h - z13.h}, z2.h // 11000001-01100010-00011101-10000000 // CHECK-INST: bfmla za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h // CHECK-ENCODING: [0x80,0x1d,0x62,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1621d80 bfmla za.h[w10, 1, vgx2], {z1.h, z2.h}, z10.h // 11000001-01101010-01011100-00100001 // CHECK-INST: bfmla za.h[w10, 1, vgx2], { z1.h, z2.h }, z10.h // CHECK-ENCODING: [0x21,0x5c,0x6a,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c16a5c21 bfmla za.h[w10, 1], {z1.h - z2.h}, z10.h // 11000001-01101010-01011100-00100001 // CHECK-INST: bfmla za.h[w10, 1, vgx2], { z1.h, z2.h }, z10.h // CHECK-ENCODING: [0x21,0x5c,0x6a,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c16a5c21 bfmla za.h[w8, 5, vgx2], {z22.h, z23.h}, z14.h // 11000001-01101110-00011110-11000101 // CHECK-INST: bfmla za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h // CHECK-ENCODING: [0xc5,0x1e,0x6e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c16e1ec5 bfmla za.h[w8, 5], {z22.h - z23.h}, z14.h // 11000001-01101110-00011110-11000101 // CHECK-INST: bfmla za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h // CHECK-ENCODING: [0xc5,0x1e,0x6e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c16e1ec5 bfmla za.h[w11, 2, vgx2], {z9.h, z10.h}, z1.h // 11000001-01100001-01111101-00100010 // CHECK-INST: bfmla za.h[w11, 2, vgx2], { z9.h, z10.h }, z1.h // CHECK-ENCODING: [0x22,0x7d,0x61,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1617d22 bfmla za.h[w11, 2], {z9.h - z10.h}, z1.h // 11000001-01100001-01111101-00100010 // CHECK-INST: bfmla za.h[w11, 2, vgx2], { z9.h, z10.h }, z1.h // CHECK-ENCODING: [0x22,0x7d,0x61,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1617d22 bfmla za.h[w9, 7, vgx2], {z12.h, z13.h}, z11.h // 
11000001-01101011-00111101-10000111 // CHECK-INST: bfmla za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h // CHECK-ENCODING: [0x87,0x3d,0x6b,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c16b3d87 bfmla za.h[w9, 7], {z12.h - z13.h}, z11.h // 11000001-01101011-00111101-10000111 // CHECK-INST: bfmla za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h // CHECK-ENCODING: [0x87,0x3d,0x6b,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c16b3d87 bfmla za.h[w8, 0, vgx2], {z0.h, z1.h}, z0.h[0] // 11000001-00010000-00010000-00100000 // CHECK-INST: bfmla za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0] // CHECK-ENCODING: [0x20,0x10,0x10,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1101020 bfmla za.h[w8, 0], {z0.h - z1.h}, z0.h[0] // 11000001-00010000-00010000-00100000 // CHECK-INST: bfmla za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0] // CHECK-ENCODING: [0x20,0x10,0x10,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1101020 bfmla za.h[w10, 5, vgx2], {z10.h, z11.h}, z5.h[2] // 11000001-00010101-01010101-01100101 // CHECK-INST: bfmla za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h[2] // CHECK-ENCODING: [0x65,0x55,0x15,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1155565 bfmla za.h[w10, 5], {z10.h - z11.h}, z5.h[2] // 11000001-00010101-01010101-01100101 // CHECK-INST: bfmla za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h[2] // CHECK-ENCODING: [0x65,0x55,0x15,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1155565 bfmla za.h[w11, 7, vgx2], {z12.h, z13.h}, z8.h[6] // 11000001-00011000-01111101-10100111 // CHECK-INST: bfmla za.h[w11, 7, vgx2], { z12.h, z13.h }, z8.h[6] // CHECK-ENCODING: [0xa7,0x7d,0x18,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1187da7 bfmla za.h[w11, 7], {z12.h - z13.h}, z8.h[6] // 11000001-00011000-01111101-10100111 // CHECK-INST: bfmla za.h[w11, 7, vgx2], { z12.h, z13.h }, z8.h[6] // CHECK-ENCODING: [0xa7,0x7d,0x18,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1187da7 bfmla za.h[w11, 7, vgx2], {z30.h, z31.h}, z15.h[7] // 11000001-00011111-01111111-11101111 // CHECK-INST: bfmla za.h[w11, 7, vgx2], { z30.h, z31.h }, z15.h[7] // CHECK-ENCODING: [0xef,0x7f,0x1f,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11f7fef bfmla za.h[w11, 7], {z30.h - z31.h}, z15.h[7] // 11000001-00011111-01111111-11101111 // CHECK-INST: bfmla za.h[w11, 7, vgx2], { z30.h, z31.h }, z15.h[7] // CHECK-ENCODING: [0xef,0x7f,0x1f,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11f7fef bfmla za.h[w8, 5, vgx2], {z16.h, z17.h}, z0.h[6] // 11000001-00010000-00011110-00100101 // CHECK-INST: bfmla za.h[w8, 5, vgx2], { z16.h, z17.h }, z0.h[6] // CHECK-ENCODING: [0x25,0x1e,0x10,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1101e25 bfmla za.h[w8, 5], {z16.h - z17.h}, z0.h[6] 
// 11000001-00010000-00011110-00100101 // CHECK-INST: bfmla za.h[w8, 5, vgx2], { z16.h, z17.h }, z0.h[6] // CHECK-ENCODING: [0x25,0x1e,0x10,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1101e25 bfmla za.h[w8, 1, vgx2], {z0.h, z1.h}, z14.h[2] // 11000001-00011110-00010100-00100001 // CHECK-INST: bfmla za.h[w8, 1, vgx2], { z0.h, z1.h }, z14.h[2] // CHECK-ENCODING: [0x21,0x14,0x1e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11e1421 bfmla za.h[w8, 1], {z0.h - z1.h}, z14.h[2] // 11000001-00011110-00010100-00100001 // CHECK-INST: bfmla za.h[w8, 1, vgx2], { z0.h, z1.h }, z14.h[2] // CHECK-ENCODING: [0x21,0x14,0x1e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11e1421 bfmla za.h[w10, 0, vgx2], {z18.h, z19.h}, z4.h[3] // 11000001-00010100-01010110-01101000 // CHECK-INST: bfmla za.h[w10, 0, vgx2], { z18.h, z19.h }, z4.h[3] // CHECK-ENCODING: [0x68,0x56,0x14,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1145668 bfmla za.h[w10, 0], {z18.h - z19.h}, z4.h[3] // 11000001-00010100-01010110-01101000 // CHECK-INST: bfmla za.h[w10, 0, vgx2], { z18.h, z19.h }, z4.h[3] // CHECK-ENCODING: [0x68,0x56,0x14,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1145668 bfmla za.h[w8, 0, vgx2], {z12.h, z13.h}, z2.h[4] // 11000001-00010010-00011001-10100000 // CHECK-INST: bfmla za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h[4] // CHECK-ENCODING: [0xa0,0x19,0x12,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11219a0 bfmla za.h[w8, 0], {z12.h - z13.h}, z2.h[4] // 11000001-00010010-00011001-10100000 // CHECK-INST: bfmla za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h[4] // CHECK-ENCODING: [0xa0,0x19,0x12,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11219a0 bfmla za.h[w10, 1, vgx2], {z0.h, z1.h}, z10.h[4] // 11000001-00011010-01011000-00100001 // CHECK-INST: bfmla za.h[w10, 1, vgx2], { z0.h, z1.h }, z10.h[4] // CHECK-ENCODING: [0x21,0x58,0x1a,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11a5821 bfmla za.h[w10, 1], {z0.h - z1.h}, z10.h[4] // 11000001-00011010-01011000-00100001 // CHECK-INST: bfmla za.h[w10, 1, vgx2], { z0.h, z1.h }, z10.h[4] // CHECK-ENCODING: [0x21,0x58,0x1a,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11a5821 bfmla za.h[w8, 5, vgx2], {z22.h, z23.h}, z14.h[5] // 11000001-00011110-00011010-11101101 // CHECK-INST: bfmla za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h[5] // CHECK-ENCODING: [0xed,0x1a,0x1e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11e1aed bfmla za.h[w8, 5], {z22.h - z23.h}, z14.h[5] // 11000001-00011110-00011010-11101101 // CHECK-INST: bfmla za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h[5] // CHECK-ENCODING: [0xed,0x1a,0x1e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11e1aed bfmla za.h[w11, 2, vgx2], {z8.h, 
z9.h}, z1.h[2] // 11000001-00010001-01110101-00100010 // CHECK-INST: bfmla za.h[w11, 2, vgx2], { z8.h, z9.h }, z1.h[2] // CHECK-ENCODING: [0x22,0x75,0x11,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1117522 bfmla za.h[w11, 2], {z8.h - z9.h}, z1.h[2] // 11000001-00010001-01110101-00100010 // CHECK-INST: bfmla za.h[w11, 2, vgx2], { z8.h, z9.h }, z1.h[2] // CHECK-ENCODING: [0x22,0x75,0x11,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1117522 bfmla za.h[w9, 7, vgx2], {z12.h, z13.h}, z11.h[4] // 11000001-00011011-00111001-10100111 // CHECK-INST: bfmla za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h[4] // CHECK-ENCODING: [0xa7,0x39,0x1b,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11b39a7 bfmla za.h[w9, 7], {z12.h - z13.h}, z11.h[4] // 11000001-00011011-00111001-10100111 // CHECK-INST: bfmla za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h[4] // CHECK-ENCODING: [0xa7,0x39,0x1b,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11b39a7 bfmla za.h[w8, 0, vgx2], {z0.h, z1.h}, {z0.h, z1.h} // 11000001, 11100000-00010000-00001000 // CHECK-INST: bfmla za.h[w8, 0, vgx2], { z0.h, z1.h }, { z0.h, z1.h } // CHECK-ENCODING: [0x08,0x10,0xe0,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e01008 bfmla za.h[w8, 0], {z0.h - z1.h}, {z0.h - z1.h} // 11000001-11100000-00010000-00001000 // CHECK-INST: bfmla za.h[w8, 0, vgx2], { z0.h, z1.h }, { z0.h, z1.h } // CHECK-ENCODING: [0x08,0x10,0xe0,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e01008 bfmla za.h[w10, 5, vgx2], {z10.h, z11.h}, {z20.h, z21.h} // 11000001, 11110100-01010001-01001101 // CHECK-INST: bfmla za.h[w10, 5, vgx2], { z10.h, z11.h }, { z20.h, z21.h } // CHECK-ENCODING: [0x4d,0x51,0xf4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1f4514d bfmla za.h[w10, 5], {z10.h - z11.h}, {z20.h - z21.h} // 11000001-11110100-01010001-01001101 // CHECK-INST: bfmla za.h[w10, 5, vgx2], { z10.h, z11.h }, { z20.h, z21.h } // CHECK-ENCODING: [0x4d,0x51,0xf4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1f4514d bfmla za.h[w11, 7, vgx2], {z12.h, z13.h}, {z8.h, z9.h} // 11000001, 11101000-01110001-10001111 // CHECK-INST: bfmla za.h[w11, 7, vgx2], { z12.h, z13.h }, { z8.h, z9.h } // CHECK-ENCODING: [0x8f,0x71,0xe8,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e8718f bfmla za.h[w11, 7], {z12.h - z13.h}, {z8.h - z9.h} // 11000001-11101000-01110001-10001111 // CHECK-INST: bfmla za.h[w11, 7, vgx2], { z12.h, z13.h }, { z8.h, z9.h } // CHECK-ENCODING: [0x8f,0x71,0xe8,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e8718f bfmla za.h[w11, 7, vgx2], {z30.h, z31.h}, {z30.h, z31.h} // 11000001, 11111110-01110011-11001111 // CHECK-INST: bfmla za.h[w11, 7, vgx2], { z30.h, z31.h }, { z30.h, z31.h } // CHECK-ENCODING: [0xcf,0x73,0xfe,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 
+// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1fe73cf bfmla za.h[w11, 7], {z30.h - z31.h}, {z30.h - z31.h} // 11000001-11111110-01110011-11001111 // CHECK-INST: bfmla za.h[w11, 7, vgx2], { z30.h, z31.h }, { z30.h, z31.h } // CHECK-ENCODING: [0xcf,0x73,0xfe,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1fe73cf bfmla za.h[w8, 5, vgx2], {z16.h, z17.h}, {z16.h, z17.h} // 11000001, 11110000-00010010-00001101 // CHECK-INST: bfmla za.h[w8, 5, vgx2], { z16.h, z17.h }, { z16.h, z17.h } // CHECK-ENCODING: [0x0d,0x12,0xf0,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1f0120d bfmla za.h[w8, 5], {z16.h - z17.h}, {z16.h - z17.h} // 11000001-11110000-00010010-00001101 // CHECK-INST: bfmla za.h[w8, 5, vgx2], { z16.h, z17.h }, { z16.h, z17.h } // CHECK-ENCODING: [0x0d,0x12,0xf0,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1f0120d bfmla za.h[w8, 1, vgx2], {z0.h, z1.h}, {z30.h, z31.h} // 11000001, 11111110-00010000-00001001 // CHECK-INST: bfmla za.h[w8, 1, vgx2], { z0.h, z1.h }, { z30.h, z31.h } // CHECK-ENCODING: [0x09,0x10,0xfe,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1fe1009 bfmla za.h[w8, 1], {z0.h - z1.h}, {z30.h - z31.h} // 11000001-11111110-00010000-00001001 // CHECK-INST: bfmla za.h[w8, 1, vgx2], { z0.h, z1.h }, { z30.h, z31.h } // CHECK-ENCODING: [0x09,0x10,0xfe,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1fe1009 bfmla za.h[w10, 0, vgx2], {z18.h, z19.h}, {z20.h, z21.h} // 11000001, 11110100-01010010-01001000 // CHECK-INST: bfmla za.h[w10, 0, vgx2], { z18.h, z19.h }, { z20.h, z21.h } // CHECK-ENCODING: [0x48,0x52,0xf4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1f45248 bfmla za.h[w10, 0], {z18.h - z19.h}, {z20.h - z21.h} // 11000001-11110100-01010010-01001000 // CHECK-INST: bfmla za.h[w10, 0, vgx2], { z18.h, z19.h }, { z20.h, z21.h } // CHECK-ENCODING: [0x48,0x52,0xf4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1f45248 bfmla za.h[w8, 0, vgx2], {z12.h, z13.h}, {z2.h, z3.h} // 11000001, 11100010-00010001-10001000 // CHECK-INST: bfmla za.h[w8, 0, vgx2], { z12.h, z13.h }, { z2.h, z3.h } // CHECK-ENCODING: [0x88,0x11,0xe2,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e21188 bfmla za.h[w8, 0], {z12.h - z13.h}, {z2.h - z3.h} // 11000001-11100010-00010001-10001000 // CHECK-INST: bfmla za.h[w8, 0, vgx2], { z12.h, z13.h }, { z2.h, z3.h } // CHECK-ENCODING: [0x88,0x11,0xe2,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e21188 bfmla za.h[w10, 1, vgx2], {z0.h, z1.h}, {z26.h, z27.h} // 11000001, 11111010-01010000-00001001 // CHECK-INST: bfmla za.h[w10, 1, vgx2], { z0.h, z1.h }, { z26.h, z27.h } // CHECK-ENCODING: [0x09,0x50,0xfa,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1fa5009 bfmla za.h[w10, 1], {z0.h - z1.h}, {z26.h - z27.h} // 
11000001-11111010-01010000-00001001 // CHECK-INST: bfmla za.h[w10, 1, vgx2], { z0.h, z1.h }, { z26.h, z27.h } // CHECK-ENCODING: [0x09,0x50,0xfa,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1fa5009 bfmla za.h[w8, 5, vgx2], {z22.h, z23.h}, {z30.h, z31.h} // 11000001, 11111110-00010010-11001101 // CHECK-INST: bfmla za.h[w8, 5, vgx2], { z22.h, z23.h }, { z30.h, z31.h } // CHECK-ENCODING: [0xcd,0x12,0xfe,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1fe12cd bfmla za.h[w8, 5], {z22.h - z23.h}, {z30.h - z31.h} // 11000001-11111110-00010010-11001101 // CHECK-INST: bfmla za.h[w8, 5, vgx2], { z22.h, z23.h }, { z30.h, z31.h } // CHECK-ENCODING: [0xcd,0x12,0xfe,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1fe12cd bfmla za.h[w11, 2, vgx2], {z8.h, z9.h}, {z0.h, z1.h} // 11000001, 11100000-01110001-00001010 // CHECK-INST: bfmla za.h[w11, 2, vgx2], { z8.h, z9.h }, { z0.h, z1.h } // CHECK-ENCODING: [0x0a,0x71,0xe0,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e0710a bfmla za.h[w11, 2], {z8.h - z9.h}, {z0.h - z1.h} // 11000001-11100000-01110001-00001010 // CHECK-INST: bfmla za.h[w11, 2, vgx2], { z8.h, z9.h }, { z0.h, z1.h } // CHECK-ENCODING: [0x0a,0x71,0xe0,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e0710a bfmla za.h[w9, 7, vgx2], {z12.h, z13.h}, {z10.h, z11.h} // 11000001, 11101010-00110001-10001111 // CHECK-INST: bfmla za.h[w9, 7, vgx2], { z12.h, z13.h }, { z10.h, z11.h } // CHECK-ENCODING: [0x8f,0x31,0xea,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1ea318f bfmla za.h[w9, 7], {z12.h - z13.h}, {z10.h - z11.h} // 11000001-11101010-00110001-10001111 // CHECK-INST: bfmla za.h[w9, 7, vgx2], { z12.h, z13.h }, { z10.h, z11.h } // CHECK-ENCODING: [0x8f,0x31,0xea,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1ea318f bfmla za.h[w8, 0, vgx4], {z0.h - z3.h}, z0.h // 11000001-01110000-00011100-00000000 // CHECK-INST: bfmla za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h // CHECK-ENCODING: [0x00,0x1c,0x70,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1701c00 bfmla za.h[w8, 0], {z0.h - z3.h}, z0.h // 11000001-01110000-00011100-00000000 // CHECK-INST: bfmla za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h // CHECK-ENCODING: [0x00,0x1c,0x70,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1701c00 bfmla za.h[w10, 5, vgx4], {z10.h - z13.h}, z5.h // 11000001-01110101-01011101-01000101 // CHECK-INST: bfmla za.h[w10, 5, vgx4], { z10.h - z13.h }, z5.h // CHECK-ENCODING: [0x45,0x5d,0x75,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1755d45 bfmla za.h[w10, 5], {z10.h - z13.h}, z5.h // 11000001-01110101-01011101-01000101 // CHECK-INST: bfmla za.h[w10, 5, vgx4], { z10.h - z13.h }, z5.h // CHECK-ENCODING: [0x45,0x5d,0x75,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 
sme2 // CHECK-UNKNOWN: c1755d45 bfmla za.h[w11, 7, vgx4], {z13.h - z16.h}, z8.h // 11000001-01111000-01111101-10100111 // CHECK-INST: bfmla za.h[w11, 7, vgx4], { z13.h - z16.h }, z8.h // CHECK-ENCODING: [0xa7,0x7d,0x78,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1787da7 bfmla za.h[w11, 7], {z13.h - z16.h}, z8.h // 11000001-01111000-01111101-10100111 // CHECK-INST: bfmla za.h[w11, 7, vgx4], { z13.h - z16.h }, z8.h // CHECK-ENCODING: [0xa7,0x7d,0x78,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1787da7 bfmla za.h[w11, 7, vgx4], {z31.h, z0.h, z1.h, z2.h}, z15.h // 11000001-01111111-01111111-11100111 // CHECK-INST: bfmla za.h[w11, 7, vgx4], { z31.h, z0.h, z1.h, z2.h }, z15.h // CHECK-ENCODING: [0xe7,0x7f,0x7f,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c17f7fe7 bfmla za.h[w11, 7], {z31.h, z0.h, z1.h, z2.h}, z15.h // 11000001-01111111-01111111-11100111 // CHECK-INST: bfmla za.h[w11, 7, vgx4], { z31.h, z0.h, z1.h, z2.h }, z15.h // CHECK-ENCODING: [0xe7,0x7f,0x7f,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c17f7fe7 bfmla za.h[w8, 5, vgx4], {z17.h - z20.h}, z0.h // 11000001-01110000-00011110-00100101 // CHECK-INST: bfmla za.h[w8, 5, vgx4], { z17.h - z20.h }, z0.h // CHECK-ENCODING: [0x25,0x1e,0x70,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1701e25 bfmla za.h[w8, 5], {z17.h - z20.h}, z0.h // 11000001-01110000-00011110-00100101 // CHECK-INST: bfmla za.h[w8, 5, vgx4], { z17.h - z20.h }, z0.h // CHECK-ENCODING: [0x25,0x1e,0x70,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1701e25 bfmla za.h[w8, 1, vgx4], {z1.h - z4.h}, z14.h // 11000001-01111110-00011100-00100001 // CHECK-INST: bfmla za.h[w8, 1, vgx4], { z1.h - z4.h }, z14.h // CHECK-ENCODING: [0x21,0x1c,0x7e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c17e1c21 bfmla za.h[w8, 1], {z1.h - z4.h}, z14.h // 11000001-01111110-00011100-00100001 // CHECK-INST: bfmla za.h[w8, 1, vgx4], { z1.h - z4.h }, z14.h // CHECK-ENCODING: [0x21,0x1c,0x7e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c17e1c21 bfmla za.h[w10, 0, vgx4], {z19.h - z22.h}, z4.h // 11000001-01110100-01011110-01100000 // CHECK-INST: bfmla za.h[w10, 0, vgx4], { z19.h - z22.h }, z4.h // CHECK-ENCODING: [0x60,0x5e,0x74,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1745e60 bfmla za.h[w10, 0], {z19.h - z22.h}, z4.h // 11000001-01110100-01011110-01100000 // CHECK-INST: bfmla za.h[w10, 0, vgx4], { z19.h - z22.h }, z4.h // CHECK-ENCODING: [0x60,0x5e,0x74,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1745e60 bfmla za.h[w8, 0, vgx4], {z12.h - z15.h}, z2.h // 11000001-01110010-00011101-10000000 // CHECK-INST: bfmla za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h // CHECK-ENCODING: [0x80,0x1d,0x72,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction 
requires: b16b16 sme2 // CHECK-UNKNOWN: c1721d80 bfmla za.h[w8, 0], {z12.h - z15.h}, z2.h // 11000001-01110010-00011101-10000000 // CHECK-INST: bfmla za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h // CHECK-ENCODING: [0x80,0x1d,0x72,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1721d80 bfmla za.h[w10, 1, vgx4], {z1.h - z4.h}, z10.h // 11000001-01111010-01011100-00100001 // CHECK-INST: bfmla za.h[w10, 1, vgx4], { z1.h - z4.h }, z10.h // CHECK-ENCODING: [0x21,0x5c,0x7a,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c17a5c21 bfmla za.h[w10, 1], {z1.h - z4.h}, z10.h // 11000001-01111010-01011100-00100001 // CHECK-INST: bfmla za.h[w10, 1, vgx4], { z1.h - z4.h }, z10.h // CHECK-ENCODING: [0x21,0x5c,0x7a,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c17a5c21 bfmla za.h[w8, 5, vgx4], {z22.h - z25.h}, z14.h // 11000001-01111110-00011110-11000101 // CHECK-INST: bfmla za.h[w8, 5, vgx4], { z22.h - z25.h }, z14.h // CHECK-ENCODING: [0xc5,0x1e,0x7e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c17e1ec5 bfmla za.h[w8, 5], {z22.h - z25.h}, z14.h // 11000001-01111110-00011110-11000101 // CHECK-INST: bfmla za.h[w8, 5, vgx4], { z22.h - z25.h }, z14.h // CHECK-ENCODING: [0xc5,0x1e,0x7e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c17e1ec5 bfmla za.h[w11, 2, vgx4], {z9.h - z12.h}, z1.h // 11000001-01110001-01111101-00100010 // CHECK-INST: bfmla za.h[w11, 2, vgx4], { z9.h - z12.h }, z1.h // CHECK-ENCODING: [0x22,0x7d,0x71,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1717d22 bfmla za.h[w11, 2], {z9.h - z12.h}, z1.h // 11000001-01110001-01111101-00100010 // CHECK-INST: bfmla za.h[w11, 2, vgx4], { z9.h - z12.h }, z1.h // CHECK-ENCODING: [0x22,0x7d,0x71,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1717d22 bfmla za.h[w9, 7, vgx4], {z12.h - z15.h}, z11.h // 11000001-01111011-00111101-10000111 // CHECK-INST: bfmla za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h // CHECK-ENCODING: [0x87,0x3d,0x7b,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c17b3d87 bfmla za.h[w9, 7], {z12.h - z15.h}, z11.h // 11000001-01111011-00111101-10000111 // CHECK-INST: bfmla za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h // CHECK-ENCODING: [0x87,0x3d,0x7b,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c17b3d87 bfmla za.h[w8, 0, vgx4], {z0.h - z3.h}, z0.h[0] // 11000001-00010000-10010000-00100000 // CHECK-INST: bfmla za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h[0] // CHECK-ENCODING: [0x20,0x90,0x10,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1109020 bfmla za.h[w8, 0], {z0.h - z3.h}, z0.h[0] // 11000001-00010000-10010000-00100000 // CHECK-INST: bfmla za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h[0] // CHECK-ENCODING: [0x20,0x90,0x10,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // 
CHECK-UNKNOWN: c1109020 bfmla za.h[w10, 5, vgx4], {z8.h - z11.h}, z5.h[2] // 11000001-00010101-11010101-00100101 // CHECK-INST: bfmla za.h[w10, 5, vgx4], { z8.h - z11.h }, z5.h[2] // CHECK-ENCODING: [0x25,0xd5,0x15,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c115d525 bfmla za.h[w10, 5], {z8.h - z11.h}, z5.h[2] // 11000001-00010101-11010101-00100101 // CHECK-INST: bfmla za.h[w10, 5, vgx4], { z8.h - z11.h }, z5.h[2] // CHECK-ENCODING: [0x25,0xd5,0x15,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c115d525 bfmla za.h[w11, 7, vgx4], {z12.h - z15.h}, z8.h[6] // 11000001-00011000-11111101-10100111 // CHECK-INST: bfmla za.h[w11, 7, vgx4], { z12.h - z15.h }, z8.h[6] // CHECK-ENCODING: [0xa7,0xfd,0x18,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c118fda7 bfmla za.h[w11, 7], {z12.h - z15.h}, z8.h[6] // 11000001-00011000-11111101-10100111 // CHECK-INST: bfmla za.h[w11, 7, vgx4], { z12.h - z15.h }, z8.h[6] // CHECK-ENCODING: [0xa7,0xfd,0x18,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c118fda7 bfmla za.h[w11, 7, vgx4], {z28.h - z31.h}, z15.h[7] // 11000001-00011111-11111111-10101111 // CHECK-INST: bfmla za.h[w11, 7, vgx4], { z28.h - z31.h }, z15.h[7] // CHECK-ENCODING: [0xaf,0xff,0x1f,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11fffaf bfmla za.h[w11, 7], {z28.h - z31.h}, z15.h[7] // 11000001-00011111-11111111-10101111 // CHECK-INST: bfmla za.h[w11, 7, vgx4], { z28.h - z31.h }, z15.h[7] // CHECK-ENCODING: [0xaf,0xff,0x1f,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11fffaf bfmla za.h[w8, 5, vgx4], {z16.h - z19.h}, z0.h[6] // 11000001-00010000-10011110-00100101 // CHECK-INST: bfmla za.h[w8, 5, vgx4], { z16.h - z19.h }, z0.h[6] // CHECK-ENCODING: [0x25,0x9e,0x10,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1109e25 bfmla za.h[w8, 5], {z16.h - z19.h}, z0.h[6] // 11000001-00010000-10011110-00100101 // CHECK-INST: bfmla za.h[w8, 5, vgx4], { z16.h - z19.h }, z0.h[6] // CHECK-ENCODING: [0x25,0x9e,0x10,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1109e25 bfmla za.h[w8, 1, vgx4], {z0.h - z3.h}, z14.h[2] // 11000001-00011110-10010100-00100001 // CHECK-INST: bfmla za.h[w8, 1, vgx4], { z0.h - z3.h }, z14.h[2] // CHECK-ENCODING: [0x21,0x94,0x1e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11e9421 bfmla za.h[w8, 1], {z0.h - z3.h}, z14.h[2] // 11000001-00011110-10010100-00100001 // CHECK-INST: bfmla za.h[w8, 1, vgx4], { z0.h - z3.h }, z14.h[2] // CHECK-ENCODING: [0x21,0x94,0x1e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11e9421 bfmla za.h[w10, 0, vgx4], {z16.h - z19.h}, z4.h[3] // 11000001-00010100-11010110-00101000 // CHECK-INST: bfmla za.h[w10, 0, vgx4], { z16.h - z19.h }, z4.h[3] // CHECK-ENCODING: [0x28,0xd6,0x14,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: 
instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c114d628 bfmla za.h[w10, 0], {z16.h - z19.h}, z4.h[3] // 11000001-00010100-11010110-00101000 // CHECK-INST: bfmla za.h[w10, 0, vgx4], { z16.h - z19.h }, z4.h[3] // CHECK-ENCODING: [0x28,0xd6,0x14,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c114d628 bfmla za.h[w8, 0, vgx4], {z12.h - z15.h}, z2.h[4] // 11000001-00010010-10011001-10100000 // CHECK-INST: bfmla za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h[4] // CHECK-ENCODING: [0xa0,0x99,0x12,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11299a0 bfmla za.h[w8, 0], {z12.h - z15.h}, z2.h[4] // 11000001-00010010-10011001-10100000 // CHECK-INST: bfmla za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h[4] // CHECK-ENCODING: [0xa0,0x99,0x12,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11299a0 bfmla za.h[w10, 1, vgx4], {z0.h - z3.h}, z10.h[4] // 11000001-00011010-11011000-00100001 // CHECK-INST: bfmla za.h[w10, 1, vgx4], { z0.h - z3.h }, z10.h[4] // CHECK-ENCODING: [0x21,0xd8,0x1a,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11ad821 bfmla za.h[w10, 1], {z0.h - z3.h}, z10.h[4] // 11000001-00011010-11011000-00100001 // CHECK-INST: bfmla za.h[w10, 1, vgx4], { z0.h - z3.h }, z10.h[4] // CHECK-ENCODING: [0x21,0xd8,0x1a,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11ad821 bfmla za.h[w8, 5, vgx4], {z20.h - z23.h}, z14.h[5] // 11000001-00011110-10011010-10101101 // CHECK-INST: bfmla za.h[w8, 5, vgx4], { z20.h - z23.h }, z14.h[5] // CHECK-ENCODING: [0xad,0x9a,0x1e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11e9aad bfmla za.h[w8, 5], {z20.h - z23.h}, z14.h[5] // 11000001-00011110-10011010-10101101 // CHECK-INST: bfmla za.h[w8, 5, vgx4], { z20.h - z23.h }, z14.h[5] // CHECK-ENCODING: [0xad,0x9a,0x1e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11e9aad bfmla za.h[w11, 2, vgx4], {z8.h - z11.h}, z1.h[2] // 11000001-00010001-11110101-00100010 // CHECK-INST: bfmla za.h[w11, 2, vgx4], { z8.h - z11.h }, z1.h[2] // CHECK-ENCODING: [0x22,0xf5,0x11,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c111f522 bfmla za.h[w11, 2], {z8.h - z11.h}, z1.h[2] // 11000001-00010001-11110101-00100010 // CHECK-INST: bfmla za.h[w11, 2, vgx4], { z8.h - z11.h }, z1.h[2] // CHECK-ENCODING: [0x22,0xf5,0x11,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c111f522 bfmla za.h[w9, 7, vgx4], {z12.h - z15.h}, z11.h[4] // 11000001-00011011-10111001-10100111 // CHECK-INST: bfmla za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h[4] // CHECK-ENCODING: [0xa7,0xb9,0x1b,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11bb9a7 bfmla za.h[w9, 7], {z12.h - z15.h}, z11.h[4] // 11000001-00011011-10111001-10100111 // CHECK-INST: bfmla za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h[4] // CHECK-ENCODING: [0xa7,0xb9,0x1b,0xc1] -// CHECK-ERROR: instruction requires: 
b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11bb9a7 bfmla za.h[w8, 0, vgx4], {z0.h - z3.h}, {z0.h - z3.h} // 11000001-11100001-00010000-00001000 // CHECK-INST: bfmla za.h[w8, 0, vgx4], { z0.h - z3.h }, { z0.h - z3.h } // CHECK-ENCODING: [0x08,0x10,0xe1,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e11008 bfmla za.h[w8, 0], {z0.h - z3.h}, {z0.h - z3.h} // 11000001-11100001-00010000-00001000 // CHECK-INST: bfmla za.h[w8, 0, vgx4], { z0.h - z3.h }, { z0.h - z3.h } // CHECK-ENCODING: [0x08,0x10,0xe1,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e11008 bfmla za.h[w10, 5, vgx4], {z8.h - z11.h}, {z20.h - z23.h} // 11000001-11110101-01010001-00001101 // CHECK-INST: bfmla za.h[w10, 5, vgx4], { z8.h - z11.h }, { z20.h - z23.h } // CHECK-ENCODING: [0x0d,0x51,0xf5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1f5510d bfmla za.h[w10, 5], {z8.h - z11.h}, {z20.h - z23.h} // 11000001-11110101-01010001-00001101 // CHECK-INST: bfmla za.h[w10, 5, vgx4], { z8.h - z11.h }, { z20.h - z23.h } // CHECK-ENCODING: [0x0d,0x51,0xf5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1f5510d bfmla za.h[w11, 7, vgx4], {z12.h - z15.h}, {z8.h - z11.h} // 11000001-11101001-01110001-10001111 // CHECK-INST: bfmla za.h[w11, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h } // CHECK-ENCODING: [0x8f,0x71,0xe9,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e9718f bfmla za.h[w11, 7], {z12.h - z15.h}, {z8.h - z11.h} // 11000001-11101001-01110001-10001111 // CHECK-INST: bfmla za.h[w11, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h } // CHECK-ENCODING: [0x8f,0x71,0xe9,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e9718f bfmla za.h[w11, 7, vgx4], {z28.h - z31.h}, {z28.h - z31.h} // 11000001-11111101-01110011-10001111 // CHECK-INST: bfmla za.h[w11, 7, vgx4], { z28.h - z31.h }, { z28.h - z31.h } // CHECK-ENCODING: [0x8f,0x73,0xfd,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1fd738f bfmla za.h[w11, 7], {z28.h - z31.h}, {z28.h - z31.h} // 11000001-11111101-01110011-10001111 // CHECK-INST: bfmla za.h[w11, 7, vgx4], { z28.h - z31.h }, { z28.h - z31.h } // CHECK-ENCODING: [0x8f,0x73,0xfd,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1fd738f bfmla za.h[w8, 5, vgx4], {z16.h - z19.h}, {z16.h - z19.h} // 11000001-11110001-00010010-00001101 // CHECK-INST: bfmla za.h[w8, 5, vgx4], { z16.h - z19.h }, { z16.h - z19.h } // CHECK-ENCODING: [0x0d,0x12,0xf1,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1f1120d bfmla za.h[w8, 5], {z16.h - z19.h}, {z16.h - z19.h} // 11000001-11110001-00010010-00001101 // CHECK-INST: bfmla za.h[w8, 5, vgx4], { z16.h - z19.h }, { z16.h - z19.h } // CHECK-ENCODING: [0x0d,0x12,0xf1,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1f1120d bfmla za.h[w8, 1, vgx4], {z0.h - z3.h}, 
{z28.h - z31.h} // 11000001-11111101-00010000-00001001 // CHECK-INST: bfmla za.h[w8, 1, vgx4], { z0.h - z3.h }, { z28.h - z31.h } // CHECK-ENCODING: [0x09,0x10,0xfd,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1fd1009 bfmla za.h[w8, 1], {z0.h - z3.h}, {z28.h - z31.h} // 11000001-11111101-00010000-00001001 // CHECK-INST: bfmla za.h[w8, 1, vgx4], { z0.h - z3.h }, { z28.h - z31.h } // CHECK-ENCODING: [0x09,0x10,0xfd,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1fd1009 bfmla za.h[w10, 0, vgx4], {z16.h - z19.h}, {z20.h - z23.h} // 11000001-11110101-01010010-00001000 // CHECK-INST: bfmla za.h[w10, 0, vgx4], { z16.h - z19.h }, { z20.h - z23.h } // CHECK-ENCODING: [0x08,0x52,0xf5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1f55208 bfmla za.h[w10, 0], {z16.h - z19.h}, {z20.h - z23.h} // 11000001-11110101-01010010-00001000 // CHECK-INST: bfmla za.h[w10, 0, vgx4], { z16.h - z19.h }, { z20.h - z23.h } // CHECK-ENCODING: [0x08,0x52,0xf5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1f55208 bfmla za.h[w8, 0, vgx4], {z12.h - z15.h}, {z0.h - z3.h} // 11000001-11100001-00010001-10001000 // CHECK-INST: bfmla za.h[w8, 0, vgx4], { z12.h - z15.h }, { z0.h - z3.h } // CHECK-ENCODING: [0x88,0x11,0xe1,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e11188 bfmla za.h[w8, 0], {z12.h - z15.h}, {z0.h - z3.h} // 11000001-11100001-00010001-10001000 // CHECK-INST: bfmla za.h[w8, 0, vgx4], { z12.h - z15.h }, { z0.h - z3.h } // CHECK-ENCODING: [0x88,0x11,0xe1,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e11188 bfmla za.h[w10, 1, vgx4], {z0.h - z3.h}, {z24.h - z27.h} // 11000001-11111001-01010000-00001001 // CHECK-INST: bfmla za.h[w10, 1, vgx4], { z0.h - z3.h }, { z24.h - z27.h } // CHECK-ENCODING: [0x09,0x50,0xf9,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1f95009 bfmla za.h[w10, 1], {z0.h - z3.h}, {z24.h - z27.h} // 11000001-11111001-01010000-00001001 // CHECK-INST: bfmla za.h[w10, 1, vgx4], { z0.h - z3.h }, { z24.h - z27.h } // CHECK-ENCODING: [0x09,0x50,0xf9,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1f95009 bfmla za.h[w8, 5, vgx4], {z20.h - z23.h}, {z28.h - z31.h} // 11000001-11111101-00010010-10001101 // CHECK-INST: bfmla za.h[w8, 5, vgx4], { z20.h - z23.h }, { z28.h - z31.h } // CHECK-ENCODING: [0x8d,0x12,0xfd,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1fd128d bfmla za.h[w8, 5], {z20.h - z23.h}, {z28.h - z31.h} // 11000001-11111101-00010010-10001101 // CHECK-INST: bfmla za.h[w8, 5, vgx4], { z20.h - z23.h }, { z28.h - z31.h } // CHECK-ENCODING: [0x8d,0x12,0xfd,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1fd128d bfmla za.h[w11, 2, vgx4], {z8.h - z11.h}, {z0.h - z3.h} // 11000001-11100001-01110001-00001010 // CHECK-INST: bfmla za.h[w11, 2, vgx4], { z8.h - z11.h }, { z0.h - z3.h } // 
CHECK-ENCODING: [0x0a,0x71,0xe1,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e1710a bfmla za.h[w11, 2], {z8.h - z11.h}, {z0.h - z3.h} // 11000001-11100001-01110001-00001010 // CHECK-INST: bfmla za.h[w11, 2, vgx4], { z8.h - z11.h }, { z0.h - z3.h } // CHECK-ENCODING: [0x0a,0x71,0xe1,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e1710a bfmla za.h[w9, 7, vgx4], {z12.h - z15.h}, {z8.h - z11.h} // 11000001-11101001-00110001-10001111 // CHECK-INST: bfmla za.h[w9, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h } // CHECK-ENCODING: [0x8f,0x31,0xe9,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e9318f bfmla za.h[w9, 7], {z12.h - z15.h}, {z8.h - z11.h} // 11000001-11101001-00110001-10001111 // CHECK-INST: bfmla za.h[w9, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h } // CHECK-ENCODING: [0x8f,0x31,0xe9,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e9318f diff --git a/llvm/test/MC/AArch64/SME2p1/bfmls-diagnostics.s b/llvm/test/MC/AArch64/SME2/bfmls-diagnostics.s similarity index 97% rename from llvm/test/MC/AArch64/SME2p1/bfmls-diagnostics.s rename to llvm/test/MC/AArch64/SME2/bfmls-diagnostics.s index 4174e244d1a41..d6e77137f9a28 100644 --- a/llvm/test/MC/AArch64/SME2p1/bfmls-diagnostics.s +++ b/llvm/test/MC/AArch64/SME2/bfmls-diagnostics.s @@ -1,4 +1,4 @@ -// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 2>&1 < %s | FileCheck %s +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 2>&1 < %s | FileCheck %s // --------------------------------------------------------------------------// // Invalid vector list diff --git a/llvm/test/MC/AArch64/SME2p1/bfmls.s b/llvm/test/MC/AArch64/SME2/bfmls.s similarity index 81% rename from llvm/test/MC/AArch64/SME2p1/bfmls.s rename to llvm/test/MC/AArch64/SME2/bfmls.s index 631da1e5058da..8d5bdc48025af 100644 --- a/llvm/test/MC/AArch64/SME2p1/bfmls.s +++ b/llvm/test/MC/AArch64/SME2/bfmls.s @@ -1,876 +1,876 @@ -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 < %s \ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 < %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+b16b16 < %s \ -// RUN: | llvm-objdump -d --mattr=+sme2p1,+b16b16 - | FileCheck %s --check-prefix=CHECK-INST -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+b16b16 < %s \ -// RUN: | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 < %s \ +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2,+b16b16 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2,+b16b16 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2,+b16b16 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 < %s \ // RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ -// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p1,+b16b16 -disassemble -show-encoding \ +// RUN: | llvm-mc 
-triple=aarch64 -mattr=+sme2,+b16b16 -disassemble -show-encoding \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST bfmls za.h[w8, 0, vgx2], {z0.h, z1.h}, z0.h // 11000001-01100000-00011100-00001000 // CHECK-INST: bfmls za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h // CHECK-ENCODING: [0x08,0x1c,0x60,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1601c08 bfmls za.h[w8, 0], {z0.h - z1.h}, z0.h // 11000001-01100000-00011100-00001000 // CHECK-INST: bfmls za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h // CHECK-ENCODING: [0x08,0x1c,0x60,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1601c08 bfmls za.h[w10, 5, vgx2], {z10.h, z11.h}, z5.h // 11000001-01100101-01011101-01001101 // CHECK-INST: bfmls za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h // CHECK-ENCODING: [0x4d,0x5d,0x65,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1655d4d bfmls za.h[w10, 5], {z10.h - z11.h}, z5.h // 11000001-01100101-01011101-01001101 // CHECK-INST: bfmls za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h // CHECK-ENCODING: [0x4d,0x5d,0x65,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1655d4d bfmls za.h[w11, 7, vgx2], {z13.h, z14.h}, z8.h // 11000001-01101000-01111101-10101111 // CHECK-INST: bfmls za.h[w11, 7, vgx2], { z13.h, z14.h }, z8.h // CHECK-ENCODING: [0xaf,0x7d,0x68,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1687daf bfmls za.h[w11, 7], {z13.h - z14.h}, z8.h // 11000001-01101000-01111101-10101111 // CHECK-INST: bfmls za.h[w11, 7, vgx2], { z13.h, z14.h }, z8.h // CHECK-ENCODING: [0xaf,0x7d,0x68,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1687daf bfmls za.h[w11, 7, vgx2], {z31.h, z0.h}, z15.h // 11000001-01101111-01111111-11101111 // CHECK-INST: bfmls za.h[w11, 7, vgx2], { z31.h, z0.h }, z15.h // CHECK-ENCODING: [0xef,0x7f,0x6f,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c16f7fef bfmls za.h[w11, 7], {z31.h - z0.h}, z15.h // 11000001-01101111-01111111-11101111 // CHECK-INST: bfmls za.h[w11, 7, vgx2], { z31.h, z0.h }, z15.h // CHECK-ENCODING: [0xef,0x7f,0x6f,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c16f7fef bfmls za.h[w8, 5, vgx2], {z17.h, z18.h}, z0.h // 11000001-01100000-00011110-00101101 // CHECK-INST: bfmls za.h[w8, 5, vgx2], { z17.h, z18.h }, z0.h // CHECK-ENCODING: [0x2d,0x1e,0x60,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1601e2d bfmls za.h[w8, 5], {z17.h - z18.h}, z0.h // 11000001-01100000-00011110-00101101 // CHECK-INST: bfmls za.h[w8, 5, vgx2], { z17.h, z18.h }, z0.h // CHECK-ENCODING: [0x2d,0x1e,0x60,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1601e2d bfmls za.h[w8, 1, vgx2], {z1.h, z2.h}, z14.h // 11000001-01101110-00011100-00101001 // CHECK-INST: bfmls za.h[w8, 1, vgx2], { z1.h, z2.h }, z14.h // CHECK-ENCODING: [0x29,0x1c,0x6e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 
+// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c16e1c29 bfmls za.h[w8, 1], {z1.h - z2.h}, z14.h // 11000001-01101110-00011100-00101001 // CHECK-INST: bfmls za.h[w8, 1, vgx2], { z1.h, z2.h }, z14.h // CHECK-ENCODING: [0x29,0x1c,0x6e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c16e1c29 bfmls za.h[w10, 0, vgx2], {z19.h, z20.h}, z4.h // 11000001-01100100-01011110-01101000 // CHECK-INST: bfmls za.h[w10, 0, vgx2], { z19.h, z20.h }, z4.h // CHECK-ENCODING: [0x68,0x5e,0x64,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1645e68 bfmls za.h[w10, 0], {z19.h - z20.h}, z4.h // 11000001-01100100-01011110-01101000 // CHECK-INST: bfmls za.h[w10, 0, vgx2], { z19.h, z20.h }, z4.h // CHECK-ENCODING: [0x68,0x5e,0x64,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1645e68 bfmls za.h[w8, 0, vgx2], {z12.h, z13.h}, z2.h // 11000001-01100010-00011101-10001000 // CHECK-INST: bfmls za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h // CHECK-ENCODING: [0x88,0x1d,0x62,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1621d88 bfmls za.h[w8, 0], {z12.h - z13.h}, z2.h // 11000001-01100010-00011101-10001000 // CHECK-INST: bfmls za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h // CHECK-ENCODING: [0x88,0x1d,0x62,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1621d88 bfmls za.h[w10, 1, vgx2], {z1.h, z2.h}, z10.h // 11000001-01101010-01011100-00101001 // CHECK-INST: bfmls za.h[w10, 1, vgx2], { z1.h, z2.h }, z10.h // CHECK-ENCODING: [0x29,0x5c,0x6a,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c16a5c29 bfmls za.h[w10, 1], {z1.h - z2.h}, z10.h // 11000001-01101010-01011100-00101001 // CHECK-INST: bfmls za.h[w10, 1, vgx2], { z1.h, z2.h }, z10.h // CHECK-ENCODING: [0x29,0x5c,0x6a,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c16a5c29 bfmls za.h[w8, 5, vgx2], {z22.h, z23.h}, z14.h // 11000001-01101110-00011110-11001101 // CHECK-INST: bfmls za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h // CHECK-ENCODING: [0xcd,0x1e,0x6e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c16e1ecd bfmls za.h[w8, 5], {z22.h - z23.h}, z14.h // 11000001-01101110-00011110-11001101 // CHECK-INST: bfmls za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h // CHECK-ENCODING: [0xcd,0x1e,0x6e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c16e1ecd bfmls za.h[w11, 2, vgx2], {z9.h, z10.h}, z1.h // 11000001-01100001-01111101-00101010 // CHECK-INST: bfmls za.h[w11, 2, vgx2], { z9.h, z10.h }, z1.h // CHECK-ENCODING: [0x2a,0x7d,0x61,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1617d2a bfmls za.h[w11, 2], {z9.h - z10.h}, z1.h // 11000001-01100001-01111101-00101010 // CHECK-INST: bfmls za.h[w11, 2, vgx2], { z9.h, z10.h }, z1.h // CHECK-ENCODING: [0x2a,0x7d,0x61,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // 
CHECK-UNKNOWN: c1617d2a bfmls za.h[w9, 7, vgx2], {z12.h, z13.h}, z11.h // 11000001-01101011-00111101-10001111 // CHECK-INST: bfmls za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h // CHECK-ENCODING: [0x8f,0x3d,0x6b,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c16b3d8f bfmls za.h[w9, 7], {z12.h - z13.h}, z11.h // 11000001-01101011-00111101-10001111 // CHECK-INST: bfmls za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h // CHECK-ENCODING: [0x8f,0x3d,0x6b,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c16b3d8f bfmls za.h[w8, 0, vgx2], {z0.h, z1.h}, z0.h[0] // 11000001-00010000-00010000-00110000 // CHECK-INST: bfmls za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0] // CHECK-ENCODING: [0x30,0x10,0x10,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1101030 bfmls za.h[w8, 0], {z0.h - z1.h}, z0.h[0] // 11000001-00010000-00010000-00110000 // CHECK-INST: bfmls za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0] // CHECK-ENCODING: [0x30,0x10,0x10,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1101030 bfmls za.h[w10, 5, vgx2], {z10.h, z11.h}, z5.h[2] // 11000001-00010101-01010101-01110101 // CHECK-INST: bfmls za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h[2] // CHECK-ENCODING: [0x75,0x55,0x15,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1155575 bfmls za.h[w10, 5], {z10.h - z11.h}, z5.h[2] // 11000001-00010101-01010101-01110101 // CHECK-INST: bfmls za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h[2] // CHECK-ENCODING: [0x75,0x55,0x15,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1155575 bfmls za.h[w11, 7, vgx2], {z12.h, z13.h}, z8.h[6] // 11000001-00011000-01111101-10110111 // CHECK-INST: bfmls za.h[w11, 7, vgx2], { z12.h, z13.h }, z8.h[6] // CHECK-ENCODING: [0xb7,0x7d,0x18,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1187db7 bfmls za.h[w11, 7], {z12.h - z13.h}, z8.h[6] // 11000001-00011000-01111101-10110111 // CHECK-INST: bfmls za.h[w11, 7, vgx2], { z12.h, z13.h }, z8.h[6] // CHECK-ENCODING: [0xb7,0x7d,0x18,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1187db7 bfmls za.h[w11, 7, vgx2], {z30.h, z31.h}, z15.h[7] // 11000001-00011111-01111111-11111111 // CHECK-INST: bfmls za.h[w11, 7, vgx2], { z30.h, z31.h }, z15.h[7] // CHECK-ENCODING: [0xff,0x7f,0x1f,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11f7fff bfmls za.h[w11, 7], {z30.h - z31.h}, z15.h[7] // 11000001-00011111-01111111-11111111 // CHECK-INST: bfmls za.h[w11, 7, vgx2], { z30.h, z31.h }, z15.h[7] // CHECK-ENCODING: [0xff,0x7f,0x1f,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11f7fff bfmls za.h[w8, 5, vgx2], {z16.h, z17.h}, z0.h[6] // 11000001-00010000-00011110-00110101 // CHECK-INST: bfmls za.h[w8, 5, vgx2], { z16.h, z17.h }, z0.h[6] // CHECK-ENCODING: [0x35,0x1e,0x10,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 
sme2 // CHECK-UNKNOWN: c1101e35 bfmls za.h[w8, 5], {z16.h - z17.h}, z0.h[6] // 11000001-00010000-00011110-00110101 // CHECK-INST: bfmls za.h[w8, 5, vgx2], { z16.h, z17.h }, z0.h[6] // CHECK-ENCODING: [0x35,0x1e,0x10,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1101e35 bfmls za.h[w8, 1, vgx2], {z0.h, z1.h}, z14.h[2] // 11000001-00011110-00010100-00110001 // CHECK-INST: bfmls za.h[w8, 1, vgx2], { z0.h, z1.h }, z14.h[2] // CHECK-ENCODING: [0x31,0x14,0x1e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11e1431 bfmls za.h[w8, 1], {z0.h - z1.h}, z14.h[2] // 11000001-00011110-00010100-00110001 // CHECK-INST: bfmls za.h[w8, 1, vgx2], { z0.h, z1.h }, z14.h[2] // CHECK-ENCODING: [0x31,0x14,0x1e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11e1431 bfmls za.h[w10, 0, vgx2], {z18.h, z19.h}, z4.h[3] // 11000001-00010100-01010110-01111000 // CHECK-INST: bfmls za.h[w10, 0, vgx2], { z18.h, z19.h }, z4.h[3] // CHECK-ENCODING: [0x78,0x56,0x14,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1145678 bfmls za.h[w10, 0], {z18.h - z19.h}, z4.h[3] // 11000001-00010100-01010110-01111000 // CHECK-INST: bfmls za.h[w10, 0, vgx2], { z18.h, z19.h }, z4.h[3] // CHECK-ENCODING: [0x78,0x56,0x14,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1145678 bfmls za.h[w8, 0, vgx2], {z12.h, z13.h}, z2.h[4] // 11000001-00010010-00011001-10110000 // CHECK-INST: bfmls za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h[4] // CHECK-ENCODING: [0xb0,0x19,0x12,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11219b0 bfmls za.h[w8, 0], {z12.h - z13.h}, z2.h[4] // 11000001-00010010-00011001-10110000 // CHECK-INST: bfmls za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h[4] // CHECK-ENCODING: [0xb0,0x19,0x12,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11219b0 bfmls za.h[w10, 1, vgx2], {z0.h, z1.h}, z10.h[4] // 11000001-00011010-01011000-00110001 // CHECK-INST: bfmls za.h[w10, 1, vgx2], { z0.h, z1.h }, z10.h[4] // CHECK-ENCODING: [0x31,0x58,0x1a,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11a5831 bfmls za.h[w10, 1], {z0.h - z1.h}, z10.h[4] // 11000001-00011010-01011000-00110001 // CHECK-INST: bfmls za.h[w10, 1, vgx2], { z0.h, z1.h }, z10.h[4] // CHECK-ENCODING: [0x31,0x58,0x1a,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11a5831 bfmls za.h[w8, 5, vgx2], {z22.h, z23.h}, z14.h[5] // 11000001-00011110-00011010-11111101 // CHECK-INST: bfmls za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h[5] // CHECK-ENCODING: [0xfd,0x1a,0x1e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11e1afd bfmls za.h[w8, 5], {z22.h - z23.h}, z14.h[5] // 11000001-00011110-00011010-11111101 // CHECK-INST: bfmls za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h[5] // CHECK-ENCODING: [0xfd,0x1a,0x1e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: 
b16b16 sme2 // CHECK-UNKNOWN: c11e1afd bfmls za.h[w11, 2, vgx2], {z8.h, z9.h}, z1.h[2] // 11000001-00010001-01110101-00110010 // CHECK-INST: bfmls za.h[w11, 2, vgx2], { z8.h, z9.h }, z1.h[2] // CHECK-ENCODING: [0x32,0x75,0x11,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1117532 bfmls za.h[w11, 2], {z8.h - z9.h}, z1.h[2] // 11000001-00010001-01110101-00110010 // CHECK-INST: bfmls za.h[w11, 2, vgx2], { z8.h, z9.h }, z1.h[2] // CHECK-ENCODING: [0x32,0x75,0x11,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1117532 bfmls za.h[w9, 7, vgx2], {z12.h, z13.h}, z11.h[4] // 11000001-00011011-00111001-10110111 // CHECK-INST: bfmls za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h[4] // CHECK-ENCODING: [0xb7,0x39,0x1b,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11b39b7 bfmls za.h[w9, 7], {z12.h - z13.h}, z11.h[4] // 11000001-00011011-00111001-10110111 // CHECK-INST: bfmls za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h[4] // CHECK-ENCODING: [0xb7,0x39,0x1b,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11b39b7 bfmls za.h[w8, 0, vgx2], {z0.h, z1.h}, {z0.h, z1.h} // 11000001, 11100000-00010000-00011000 // CHECK-INST: bfmls za.h[w8, 0, vgx2], { z0.h, z1.h }, { z0.h, z1.h } // CHECK-ENCODING: [0x18,0x10,0xe0,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e01018 bfmls za.h[w8, 0], {z0.h - z1.h}, {z0.h - z1.h} // 11000001-11100000-00010000-00011000 // CHECK-INST: bfmls za.h[w8, 0, vgx2], { z0.h, z1.h }, { z0.h, z1.h } // CHECK-ENCODING: [0x18,0x10,0xe0,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e01018 bfmls za.h[w10, 5, vgx2], {z10.h, z11.h}, {z20.h, z21.h} // 11000001, 11110100-01010001-01011101 // CHECK-INST: bfmls za.h[w10, 5, vgx2], { z10.h, z11.h }, { z20.h, z21.h } // CHECK-ENCODING: [0x5d,0x51,0xf4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1f4515d bfmls za.h[w10, 5], {z10.h - z11.h}, {z20.h - z21.h} // 11000001-11110100-01010001-01011101 // CHECK-INST: bfmls za.h[w10, 5, vgx2], { z10.h, z11.h }, { z20.h, z21.h } // CHECK-ENCODING: [0x5d,0x51,0xf4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1f4515d bfmls za.h[w11, 7, vgx2], {z12.h, z13.h}, {z8.h, z9.h} // 11000001, 11101000-01110001-10011111 // CHECK-INST: bfmls za.h[w11, 7, vgx2], { z12.h, z13.h }, { z8.h, z9.h } // CHECK-ENCODING: [0x9f,0x71,0xe8,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e8719f bfmls za.h[w11, 7], {z12.h - z13.h}, {z8.h - z9.h} // 11000001-11101000-01110001-10011111 // CHECK-INST: bfmls za.h[w11, 7, vgx2], { z12.h, z13.h }, { z8.h, z9.h } // CHECK-ENCODING: [0x9f,0x71,0xe8,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e8719f bfmls za.h[w11, 7, vgx2], {z30.h, z31.h}, {z30.h, z31.h} // 11000001, 11111110-01110011-11011111 // CHECK-INST: bfmls za.h[w11, 7, vgx2], { z30.h, z31.h }, { z30.h, z31.h } // CHECK-ENCODING: 
[0xdf,0x73,0xfe,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1fe73df bfmls za.h[w11, 7], {z30.h - z31.h}, {z30.h - z31.h} // 11000001-11111110-01110011-11011111 // CHECK-INST: bfmls za.h[w11, 7, vgx2], { z30.h, z31.h }, { z30.h, z31.h } // CHECK-ENCODING: [0xdf,0x73,0xfe,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1fe73df bfmls za.h[w8, 5, vgx2], {z16.h, z17.h}, {z16.h, z17.h} // 11000001, 11110000-00010010-00011101 // CHECK-INST: bfmls za.h[w8, 5, vgx2], { z16.h, z17.h }, { z16.h, z17.h } // CHECK-ENCODING: [0x1d,0x12,0xf0,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1f0121d bfmls za.h[w8, 5], {z16.h - z17.h}, {z16.h - z17.h} // 11000001-11110000-00010010-00011101 // CHECK-INST: bfmls za.h[w8, 5, vgx2], { z16.h, z17.h }, { z16.h, z17.h } // CHECK-ENCODING: [0x1d,0x12,0xf0,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1f0121d bfmls za.h[w8, 1, vgx2], {z0.h, z1.h}, {z30.h, z31.h} // 11000001, 11111110-00010000-00011001 // CHECK-INST: bfmls za.h[w8, 1, vgx2], { z0.h, z1.h }, { z30.h, z31.h } // CHECK-ENCODING: [0x19,0x10,0xfe,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1fe1019 bfmls za.h[w8, 1], {z0.h - z1.h}, {z30.h - z31.h} // 11000001-11111110-00010000-00011001 // CHECK-INST: bfmls za.h[w8, 1, vgx2], { z0.h, z1.h }, { z30.h, z31.h } // CHECK-ENCODING: [0x19,0x10,0xfe,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1fe1019 bfmls za.h[w10, 0, vgx2], {z18.h, z19.h}, {z20.h, z21.h} // 11000001, 11110100-01010010-01011000 // CHECK-INST: bfmls za.h[w10, 0, vgx2], { z18.h, z19.h }, { z20.h, z21.h } // CHECK-ENCODING: [0x58,0x52,0xf4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1f45258 bfmls za.h[w10, 0], {z18.h - z19.h}, {z20.h - z21.h} // 11000001-11110100-01010010-01011000 // CHECK-INST: bfmls za.h[w10, 0, vgx2], { z18.h, z19.h }, { z20.h, z21.h } // CHECK-ENCODING: [0x58,0x52,0xf4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1f45258 bfmls za.h[w8, 0, vgx2], {z12.h, z13.h}, {z2.h, z3.h} // 11000001, 11100010-00010001-10011000 // CHECK-INST: bfmls za.h[w8, 0, vgx2], { z12.h, z13.h }, { z2.h, z3.h } // CHECK-ENCODING: [0x98,0x11,0xe2,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e21198 bfmls za.h[w8, 0], {z12.h - z13.h}, {z2.h - z3.h} // 11000001-11100010-00010001-10011000 // CHECK-INST: bfmls za.h[w8, 0, vgx2], { z12.h, z13.h }, { z2.h, z3.h } // CHECK-ENCODING: [0x98,0x11,0xe2,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e21198 bfmls za.h[w10, 1, vgx2], {z0.h, z1.h}, {z26.h, z27.h} // 11000001, 11111010-01010000-00011001 // CHECK-INST: bfmls za.h[w10, 1, vgx2], { z0.h, z1.h }, { z26.h, z27.h } // CHECK-ENCODING: [0x19,0x50,0xfa,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1fa5019 bfmls 
za.h[w10, 1], {z0.h - z1.h}, {z26.h - z27.h} // 11000001-11111010-01010000-00011001 // CHECK-INST: bfmls za.h[w10, 1, vgx2], { z0.h, z1.h }, { z26.h, z27.h } // CHECK-ENCODING: [0x19,0x50,0xfa,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1fa5019 bfmls za.h[w8, 5, vgx2], {z22.h, z23.h}, {z30.h, z31.h} // 11000001, 11111110-00010010-11011101 // CHECK-INST: bfmls za.h[w8, 5, vgx2], { z22.h, z23.h }, { z30.h, z31.h } // CHECK-ENCODING: [0xdd,0x12,0xfe,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1fe12dd bfmls za.h[w8, 5], {z22.h - z23.h}, {z30.h - z31.h} // 11000001-11111110-00010010-11011101 // CHECK-INST: bfmls za.h[w8, 5, vgx2], { z22.h, z23.h }, { z30.h, z31.h } // CHECK-ENCODING: [0xdd,0x12,0xfe,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1fe12dd bfmls za.h[w11, 2, vgx2], {z8.h, z9.h}, {z0.h, z1.h} // 11000001, 11100000-01110001-00011010 // CHECK-INST: bfmls za.h[w11, 2, vgx2], { z8.h, z9.h }, { z0.h, z1.h } // CHECK-ENCODING: [0x1a,0x71,0xe0,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e0711a bfmls za.h[w11, 2], {z8.h - z9.h}, {z0.h - z1.h} // 11000001-11100000-01110001-00011010 // CHECK-INST: bfmls za.h[w11, 2, vgx2], { z8.h, z9.h }, { z0.h, z1.h } // CHECK-ENCODING: [0x1a,0x71,0xe0,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e0711a bfmls za.h[w9, 7, vgx2], {z12.h, z13.h}, {z10.h, z11.h} // 11000001, 11101010-00110001-10011111 // CHECK-INST: bfmls za.h[w9, 7, vgx2], { z12.h, z13.h }, { z10.h, z11.h } // CHECK-ENCODING: [0x9f,0x31,0xea,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1ea319f bfmls za.h[w9, 7], {z12.h - z13.h}, {z10.h - z11.h} // 11000001-11101010-00110001-10011111 // CHECK-INST: bfmls za.h[w9, 7, vgx2], { z12.h, z13.h }, { z10.h, z11.h } // CHECK-ENCODING: [0x9f,0x31,0xea,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1ea319f bfmls za.h[w8, 0, vgx4], {z0.h - z3.h}, z0.h // 11000001-01110000-00011100-00001000 // CHECK-INST: bfmls za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h // CHECK-ENCODING: [0x08,0x1c,0x70,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1701c08 bfmls za.h[w8, 0], {z0.h - z3.h}, z0.h // 11000001-01110000-00011100-00001000 // CHECK-INST: bfmls za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h // CHECK-ENCODING: [0x08,0x1c,0x70,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1701c08 bfmls za.h[w10, 5, vgx4], {z10.h - z13.h}, z5.h // 11000001-01110101-01011101-01001101 // CHECK-INST: bfmls za.h[w10, 5, vgx4], { z10.h - z13.h }, z5.h // CHECK-ENCODING: [0x4d,0x5d,0x75,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1755d4d bfmls za.h[w10, 5], {z10.h - z13.h}, z5.h // 11000001-01110101-01011101-01001101 // CHECK-INST: bfmls za.h[w10, 5, vgx4], { z10.h - z13.h }, z5.h // CHECK-ENCODING: [0x4d,0x5d,0x75,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 
+// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1755d4d bfmls za.h[w11, 7, vgx4], {z13.h - z16.h}, z8.h // 11000001-01111000-01111101-10101111 // CHECK-INST: bfmls za.h[w11, 7, vgx4], { z13.h - z16.h }, z8.h // CHECK-ENCODING: [0xaf,0x7d,0x78,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1787daf bfmls za.h[w11, 7], {z13.h - z16.h}, z8.h // 11000001-01111000-01111101-10101111 // CHECK-INST: bfmls za.h[w11, 7, vgx4], { z13.h - z16.h }, z8.h // CHECK-ENCODING: [0xaf,0x7d,0x78,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1787daf bfmls za.h[w11, 7, vgx4], {z31.h, z0.h, z1.h, z2.h}, z15.h // 11000001-01111111-01111111-11101111 // CHECK-INST: bfmls za.h[w11, 7, vgx4], { z31.h, z0.h, z1.h, z2.h }, z15.h // CHECK-ENCODING: [0xef,0x7f,0x7f,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c17f7fef bfmls za.h[w11, 7], {z31.h, z0.h, z1.h, z2.h}, z15.h // 11000001-01111111-01111111-11101111 // CHECK-INST: bfmls za.h[w11, 7, vgx4], { z31.h, z0.h, z1.h, z2.h }, z15.h // CHECK-ENCODING: [0xef,0x7f,0x7f,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c17f7fef bfmls za.h[w8, 5, vgx4], {z17.h - z20.h}, z0.h // 11000001-01110000-00011110-00101101 // CHECK-INST: bfmls za.h[w8, 5, vgx4], { z17.h - z20.h }, z0.h // CHECK-ENCODING: [0x2d,0x1e,0x70,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1701e2d bfmls za.h[w8, 5], {z17.h - z20.h}, z0.h // 11000001-01110000-00011110-00101101 // CHECK-INST: bfmls za.h[w8, 5, vgx4], { z17.h - z20.h }, z0.h // CHECK-ENCODING: [0x2d,0x1e,0x70,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1701e2d bfmls za.h[w8, 1, vgx4], {z1.h - z4.h}, z14.h // 11000001-01111110-00011100-00101001 // CHECK-INST: bfmls za.h[w8, 1, vgx4], { z1.h - z4.h }, z14.h // CHECK-ENCODING: [0x29,0x1c,0x7e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c17e1c29 bfmls za.h[w8, 1], {z1.h - z4.h}, z14.h // 11000001-01111110-00011100-00101001 // CHECK-INST: bfmls za.h[w8, 1, vgx4], { z1.h - z4.h }, z14.h // CHECK-ENCODING: [0x29,0x1c,0x7e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c17e1c29 bfmls za.h[w10, 0, vgx4], {z19.h - z22.h}, z4.h // 11000001-01110100-01011110-01101000 // CHECK-INST: bfmls za.h[w10, 0, vgx4], { z19.h - z22.h }, z4.h // CHECK-ENCODING: [0x68,0x5e,0x74,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1745e68 bfmls za.h[w10, 0], {z19.h - z22.h}, z4.h // 11000001-01110100-01011110-01101000 // CHECK-INST: bfmls za.h[w10, 0, vgx4], { z19.h - z22.h }, z4.h // CHECK-ENCODING: [0x68,0x5e,0x74,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1745e68 bfmls za.h[w8, 0, vgx4], {z12.h - z15.h}, z2.h // 11000001-01110010-00011101-10001000 // CHECK-INST: bfmls za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h // CHECK-ENCODING: [0x88,0x1d,0x72,0xc1] -// CHECK-ERROR: instruction requires: 
b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1721d88 bfmls za.h[w8, 0], {z12.h - z15.h}, z2.h // 11000001-01110010-00011101-10001000 // CHECK-INST: bfmls za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h // CHECK-ENCODING: [0x88,0x1d,0x72,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1721d88 bfmls za.h[w10, 1, vgx4], {z1.h - z4.h}, z10.h // 11000001-01111010-01011100-00101001 // CHECK-INST: bfmls za.h[w10, 1, vgx4], { z1.h - z4.h }, z10.h // CHECK-ENCODING: [0x29,0x5c,0x7a,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c17a5c29 bfmls za.h[w10, 1], {z1.h - z4.h}, z10.h // 11000001-01111010-01011100-00101001 // CHECK-INST: bfmls za.h[w10, 1, vgx4], { z1.h - z4.h }, z10.h // CHECK-ENCODING: [0x29,0x5c,0x7a,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c17a5c29 bfmls za.h[w8, 5, vgx4], {z22.h - z25.h}, z14.h // 11000001-01111110-00011110-11001101 // CHECK-INST: bfmls za.h[w8, 5, vgx4], { z22.h - z25.h }, z14.h // CHECK-ENCODING: [0xcd,0x1e,0x7e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c17e1ecd bfmls za.h[w8, 5], {z22.h - z25.h}, z14.h // 11000001-01111110-00011110-11001101 // CHECK-INST: bfmls za.h[w8, 5, vgx4], { z22.h - z25.h }, z14.h // CHECK-ENCODING: [0xcd,0x1e,0x7e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c17e1ecd bfmls za.h[w11, 2, vgx4], {z9.h - z12.h}, z1.h // 11000001-01110001-01111101-00101010 // CHECK-INST: bfmls za.h[w11, 2, vgx4], { z9.h - z12.h }, z1.h // CHECK-ENCODING: [0x2a,0x7d,0x71,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1717d2a bfmls za.h[w11, 2], {z9.h - z12.h}, z1.h // 11000001-01110001-01111101-00101010 // CHECK-INST: bfmls za.h[w11, 2, vgx4], { z9.h - z12.h }, z1.h // CHECK-ENCODING: [0x2a,0x7d,0x71,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1717d2a bfmls za.h[w9, 7, vgx4], {z12.h - z15.h}, z11.h // 11000001-01111011-00111101-10001111 // CHECK-INST: bfmls za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h // CHECK-ENCODING: [0x8f,0x3d,0x7b,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c17b3d8f bfmls za.h[w9, 7], {z12.h - z15.h}, z11.h // 11000001-01111011-00111101-10001111 // CHECK-INST: bfmls za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h // CHECK-ENCODING: [0x8f,0x3d,0x7b,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c17b3d8f bfmls za.h[w8, 0, vgx4], {z0.h - z3.h}, z0.h[0] // 11000001-00010000-10010000-00110000 // CHECK-INST: bfmls za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h[0] // CHECK-ENCODING: [0x30,0x90,0x10,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1109030 bfmls za.h[w8, 0], {z0.h - z3.h}, z0.h[0] // 11000001-00010000-10010000-00110000 // CHECK-INST: bfmls za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h[0] // CHECK-ENCODING: [0x30,0x90,0x10,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: 
instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1109030 bfmls za.h[w10, 5, vgx4], {z8.h - z11.h}, z5.h[2] // 11000001-00010101-11010101-00110101 // CHECK-INST: bfmls za.h[w10, 5, vgx4], { z8.h - z11.h }, z5.h[2] // CHECK-ENCODING: [0x35,0xd5,0x15,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c115d535 bfmls za.h[w10, 5], {z8.h - z11.h}, z5.h[2] // 11000001-00010101-11010101-00110101 // CHECK-INST: bfmls za.h[w10, 5, vgx4], { z8.h - z11.h }, z5.h[2] // CHECK-ENCODING: [0x35,0xd5,0x15,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c115d535 bfmls za.h[w11, 7, vgx4], {z12.h - z15.h}, z8.h[6] // 11000001-00011000-11111101-10110111 // CHECK-INST: bfmls za.h[w11, 7, vgx4], { z12.h - z15.h }, z8.h[6] // CHECK-ENCODING: [0xb7,0xfd,0x18,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c118fdb7 bfmls za.h[w11, 7], {z12.h - z15.h}, z8.h[6] // 11000001-00011000-11111101-10110111 // CHECK-INST: bfmls za.h[w11, 7, vgx4], { z12.h - z15.h }, z8.h[6] // CHECK-ENCODING: [0xb7,0xfd,0x18,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c118fdb7 bfmls za.h[w11, 7, vgx4], {z28.h - z31.h}, z15.h[7] // 11000001-00011111-11111111-10111111 // CHECK-INST: bfmls za.h[w11, 7, vgx4], { z28.h - z31.h }, z15.h[7] // CHECK-ENCODING: [0xbf,0xff,0x1f,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11fffbf bfmls za.h[w11, 7], {z28.h - z31.h}, z15.h[7] // 11000001-00011111-11111111-10111111 // CHECK-INST: bfmls za.h[w11, 7, vgx4], { z28.h - z31.h }, z15.h[7] // CHECK-ENCODING: [0xbf,0xff,0x1f,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11fffbf bfmls za.h[w8, 5, vgx4], {z16.h - z19.h}, z0.h[6] // 11000001-00010000-10011110-00110101 // CHECK-INST: bfmls za.h[w8, 5, vgx4], { z16.h - z19.h }, z0.h[6] // CHECK-ENCODING: [0x35,0x9e,0x10,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1109e35 bfmls za.h[w8, 5], {z16.h - z19.h}, z0.h[6] // 11000001-00010000-10011110-00110101 // CHECK-INST: bfmls za.h[w8, 5, vgx4], { z16.h - z19.h }, z0.h[6] // CHECK-ENCODING: [0x35,0x9e,0x10,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1109e35 bfmls za.h[w8, 1, vgx4], {z0.h - z3.h}, z14.h[2] // 11000001-00011110-10010100-00110001 // CHECK-INST: bfmls za.h[w8, 1, vgx4], { z0.h - z3.h }, z14.h[2] // CHECK-ENCODING: [0x31,0x94,0x1e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11e9431 bfmls za.h[w8, 1], {z0.h - z3.h}, z14.h[2] // 11000001-00011110-10010100-00110001 // CHECK-INST: bfmls za.h[w8, 1, vgx4], { z0.h - z3.h }, z14.h[2] // CHECK-ENCODING: [0x31,0x94,0x1e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11e9431 bfmls za.h[w10, 0, vgx4], {z16.h - z19.h}, z4.h[3] // 11000001-00010100-11010110-00111000 // CHECK-INST: bfmls za.h[w10, 0, vgx4], { z16.h - z19.h }, z4.h[3] // CHECK-ENCODING: [0x38,0xd6,0x14,0xc1] -// CHECK-ERROR: instruction 
requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c114d638 bfmls za.h[w10, 0], {z16.h - z19.h}, z4.h[3] // 11000001-00010100-11010110-00111000 // CHECK-INST: bfmls za.h[w10, 0, vgx4], { z16.h - z19.h }, z4.h[3] // CHECK-ENCODING: [0x38,0xd6,0x14,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c114d638 bfmls za.h[w8, 0, vgx4], {z12.h - z15.h}, z2.h[4] // 11000001-00010010-10011001-10110000 // CHECK-INST: bfmls za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h[4] // CHECK-ENCODING: [0xb0,0x99,0x12,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11299b0 bfmls za.h[w8, 0], {z12.h - z15.h}, z2.h[4] // 11000001-00010010-10011001-10110000 // CHECK-INST: bfmls za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h[4] // CHECK-ENCODING: [0xb0,0x99,0x12,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11299b0 bfmls za.h[w10, 1, vgx4], {z0.h - z3.h}, z10.h[4] // 11000001-00011010-11011000-00110001 // CHECK-INST: bfmls za.h[w10, 1, vgx4], { z0.h - z3.h }, z10.h[4] // CHECK-ENCODING: [0x31,0xd8,0x1a,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11ad831 bfmls za.h[w10, 1], {z0.h - z3.h}, z10.h[4] // 11000001-00011010-11011000-00110001 // CHECK-INST: bfmls za.h[w10, 1, vgx4], { z0.h - z3.h }, z10.h[4] // CHECK-ENCODING: [0x31,0xd8,0x1a,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11ad831 bfmls za.h[w8, 5, vgx4], {z20.h - z23.h}, z14.h[5] // 11000001-00011110-10011010-10111101 // CHECK-INST: bfmls za.h[w8, 5, vgx4], { z20.h - z23.h }, z14.h[5] // CHECK-ENCODING: [0xbd,0x9a,0x1e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11e9abd bfmls za.h[w8, 5], {z20.h - z23.h}, z14.h[5] // 11000001-00011110-10011010-10111101 // CHECK-INST: bfmls za.h[w8, 5, vgx4], { z20.h - z23.h }, z14.h[5] // CHECK-ENCODING: [0xbd,0x9a,0x1e,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11e9abd bfmls za.h[w11, 2, vgx4], {z8.h - z11.h}, z1.h[2] // 11000001-00010001-11110101-00110010 // CHECK-INST: bfmls za.h[w11, 2, vgx4], { z8.h - z11.h }, z1.h[2] // CHECK-ENCODING: [0x32,0xf5,0x11,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c111f532 bfmls za.h[w11, 2], {z8.h - z11.h}, z1.h[2] // 11000001-00010001-11110101-00110010 // CHECK-INST: bfmls za.h[w11, 2, vgx4], { z8.h - z11.h }, z1.h[2] // CHECK-ENCODING: [0x32,0xf5,0x11,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c111f532 bfmls za.h[w9, 7, vgx4], {z12.h - z15.h}, z11.h[4] // 11000001-00011011-10111001-10110111 // CHECK-INST: bfmls za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h[4] // CHECK-ENCODING: [0xb7,0xb9,0x1b,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11bb9b7 bfmls za.h[w9, 7], {z12.h - z15.h}, z11.h[4] // 11000001-00011011-10111001-10110111 // CHECK-INST: bfmls za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h[4] // CHECK-ENCODING: [0xb7,0xb9,0x1b,0xc1] 
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c11bb9b7 bfmls za.h[w8, 0, vgx4], {z0.h - z3.h}, {z0.h - z3.h} // 11000001-11100001-00010000-00011000 // CHECK-INST: bfmls za.h[w8, 0, vgx4], { z0.h - z3.h }, { z0.h - z3.h } // CHECK-ENCODING: [0x18,0x10,0xe1,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e11018 bfmls za.h[w8, 0], {z0.h - z3.h}, {z0.h - z3.h} // 11000001-11100001-00010000-00011000 // CHECK-INST: bfmls za.h[w8, 0, vgx4], { z0.h - z3.h }, { z0.h - z3.h } // CHECK-ENCODING: [0x18,0x10,0xe1,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e11018 bfmls za.h[w10, 5, vgx4], {z8.h - z11.h}, {z20.h - z23.h} // 11000001-11110101-01010001-00011101 // CHECK-INST: bfmls za.h[w10, 5, vgx4], { z8.h - z11.h }, { z20.h - z23.h } // CHECK-ENCODING: [0x1d,0x51,0xf5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1f5511d bfmls za.h[w10, 5], {z8.h - z11.h}, {z20.h - z23.h} // 11000001-11110101-01010001-00011101 // CHECK-INST: bfmls za.h[w10, 5, vgx4], { z8.h - z11.h }, { z20.h - z23.h } // CHECK-ENCODING: [0x1d,0x51,0xf5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1f5511d bfmls za.h[w11, 7, vgx4], {z12.h - z15.h}, {z8.h - z11.h} // 11000001-11101001-01110001-10011111 // CHECK-INST: bfmls za.h[w11, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h } // CHECK-ENCODING: [0x9f,0x71,0xe9,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e9719f bfmls za.h[w11, 7], {z12.h - z15.h}, {z8.h - z11.h} // 11000001-11101001-01110001-10011111 // CHECK-INST: bfmls za.h[w11, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h } // CHECK-ENCODING: [0x9f,0x71,0xe9,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e9719f bfmls za.h[w11, 7, vgx4], {z28.h - z31.h}, {z28.h - z31.h} // 11000001-11111101-01110011-10011111 // CHECK-INST: bfmls za.h[w11, 7, vgx4], { z28.h - z31.h }, { z28.h - z31.h } // CHECK-ENCODING: [0x9f,0x73,0xfd,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1fd739f bfmls za.h[w11, 7], {z28.h - z31.h}, {z28.h - z31.h} // 11000001-11111101-01110011-10011111 // CHECK-INST: bfmls za.h[w11, 7, vgx4], { z28.h - z31.h }, { z28.h - z31.h } // CHECK-ENCODING: [0x9f,0x73,0xfd,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1fd739f bfmls za.h[w8, 5, vgx4], {z16.h - z19.h}, {z16.h - z19.h} // 11000001-11110001-00010010-00011101 // CHECK-INST: bfmls za.h[w8, 5, vgx4], { z16.h - z19.h }, { z16.h - z19.h } // CHECK-ENCODING: [0x1d,0x12,0xf1,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1f1121d bfmls za.h[w8, 5], {z16.h - z19.h}, {z16.h - z19.h} // 11000001-11110001-00010010-00011101 // CHECK-INST: bfmls za.h[w8, 5, vgx4], { z16.h - z19.h }, { z16.h - z19.h } // CHECK-ENCODING: [0x1d,0x12,0xf1,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1f1121d bfmls 
za.h[w8, 1, vgx4], {z0.h - z3.h}, {z28.h - z31.h} // 11000001-11111101-00010000-00011001 // CHECK-INST: bfmls za.h[w8, 1, vgx4], { z0.h - z3.h }, { z28.h - z31.h } // CHECK-ENCODING: [0x19,0x10,0xfd,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1fd1019 bfmls za.h[w8, 1], {z0.h - z3.h}, {z28.h - z31.h} // 11000001-11111101-00010000-00011001 // CHECK-INST: bfmls za.h[w8, 1, vgx4], { z0.h - z3.h }, { z28.h - z31.h } // CHECK-ENCODING: [0x19,0x10,0xfd,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1fd1019 bfmls za.h[w10, 0, vgx4], {z16.h - z19.h}, {z20.h - z23.h} // 11000001-11110101-01010010-00011000 // CHECK-INST: bfmls za.h[w10, 0, vgx4], { z16.h - z19.h }, { z20.h - z23.h } // CHECK-ENCODING: [0x18,0x52,0xf5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1f55218 bfmls za.h[w10, 0], {z16.h - z19.h}, {z20.h - z23.h} // 11000001-11110101-01010010-00011000 // CHECK-INST: bfmls za.h[w10, 0, vgx4], { z16.h - z19.h }, { z20.h - z23.h } // CHECK-ENCODING: [0x18,0x52,0xf5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1f55218 bfmls za.h[w8, 0, vgx4], {z12.h - z15.h}, {z0.h - z3.h} // 11000001-11100001-00010001-10011000 // CHECK-INST: bfmls za.h[w8, 0, vgx4], { z12.h - z15.h }, { z0.h - z3.h } // CHECK-ENCODING: [0x98,0x11,0xe1,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e11198 bfmls za.h[w8, 0], {z12.h - z15.h}, {z0.h - z3.h} // 11000001-11100001-00010001-10011000 // CHECK-INST: bfmls za.h[w8, 0, vgx4], { z12.h - z15.h }, { z0.h - z3.h } // CHECK-ENCODING: [0x98,0x11,0xe1,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e11198 bfmls za.h[w10, 1, vgx4], {z0.h - z3.h}, {z24.h - z27.h} // 11000001-11111001-01010000-00011001 // CHECK-INST: bfmls za.h[w10, 1, vgx4], { z0.h - z3.h }, { z24.h - z27.h } // CHECK-ENCODING: [0x19,0x50,0xf9,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1f95019 bfmls za.h[w10, 1], {z0.h - z3.h}, {z24.h - z27.h} // 11000001-11111001-01010000-00011001 // CHECK-INST: bfmls za.h[w10, 1, vgx4], { z0.h - z3.h }, { z24.h - z27.h } // CHECK-ENCODING: [0x19,0x50,0xf9,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1f95019 bfmls za.h[w8, 5, vgx4], {z20.h - z23.h}, {z28.h - z31.h} // 11000001-11111101-00010010-10011101 // CHECK-INST: bfmls za.h[w8, 5, vgx4], { z20.h - z23.h }, { z28.h - z31.h } // CHECK-ENCODING: [0x9d,0x12,0xfd,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1fd129d bfmls za.h[w8, 5], {z20.h - z23.h}, {z28.h - z31.h} // 11000001-11111101-00010010-10011101 // CHECK-INST: bfmls za.h[w8, 5, vgx4], { z20.h - z23.h }, { z28.h - z31.h } // CHECK-ENCODING: [0x9d,0x12,0xfd,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1fd129d bfmls za.h[w11, 2, vgx4], {z8.h - z11.h}, {z0.h - z3.h} // 11000001-11100001-01110001-00011010 // CHECK-INST: bfmls za.h[w11, 2, vgx4], { z8.h - 
z11.h }, { z0.h - z3.h } // CHECK-ENCODING: [0x1a,0x71,0xe1,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e1711a bfmls za.h[w11, 2], {z8.h - z11.h}, {z0.h - z3.h} // 11000001-11100001-01110001-00011010 // CHECK-INST: bfmls za.h[w11, 2, vgx4], { z8.h - z11.h }, { z0.h - z3.h } // CHECK-ENCODING: [0x1a,0x71,0xe1,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e1711a bfmls za.h[w9, 7, vgx4], {z12.h - z15.h}, {z8.h - z11.h} // 11000001-11101001-00110001-10011111 // CHECK-INST: bfmls za.h[w9, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h } // CHECK-ENCODING: [0x9f,0x31,0xe9,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e9319f bfmls za.h[w9, 7], {z12.h - z15.h}, {z8.h - z11.h} // 11000001-11101001-00110001-10011111 // CHECK-INST: bfmls za.h[w9, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h } // CHECK-ENCODING: [0x9f,0x31,0xe9,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e9319f diff --git a/llvm/test/MC/AArch64/SME2p1/bfmopa-diagnostics.s b/llvm/test/MC/AArch64/SME2/bfmopa-diagnostics.s similarity index 93% rename from llvm/test/MC/AArch64/SME2p1/bfmopa-diagnostics.s rename to llvm/test/MC/AArch64/SME2/bfmopa-diagnostics.s index 8b418f4a78cf4..e6d208db9ac14 100644 --- a/llvm/test/MC/AArch64/SME2p1/bfmopa-diagnostics.s +++ b/llvm/test/MC/AArch64/SME2/bfmopa-diagnostics.s @@ -1,4 +1,4 @@ -// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 2>&1 < %s | FileCheck %s +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 2>&1 < %s | FileCheck %s // --------------------------------------------------------------------------// // Invalid predicate register diff --git a/llvm/test/MC/AArch64/SME2p1/bfmopa.s b/llvm/test/MC/AArch64/SME2/bfmopa.s similarity index 70% rename from llvm/test/MC/AArch64/SME2p1/bfmopa.s rename to llvm/test/MC/AArch64/SME2/bfmopa.s index 7a08185f18961..2a9b1e5354ac4 100644 --- a/llvm/test/MC/AArch64/SME2p1/bfmopa.s +++ b/llvm/test/MC/AArch64/SME2/bfmopa.s @@ -1,84 +1,84 @@ -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 < %s \ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 < %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+b16b16 < %s \ -// RUN: | llvm-objdump -d --mattr=+sme2p1,+b16b16 - | FileCheck %s --check-prefix=CHECK-INST -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+b16b16 < %s \ -// RUN: | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 < %s \ +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2,+b16b16 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2,+b16b16 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2,+b16b16 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 < %s \ // RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ -// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p1,+b16b16 
-disassemble -show-encoding \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2,+b16b16 -disassemble -show-encoding \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST bfmopa za0.h, p0/m, p0/m, z0.h, z0.h // 10000001-10100000-00000000-00001000 // CHECK-INST: bfmopa za0.h, p0/m, p0/m, z0.h, z0.h // CHECK-ENCODING: [0x08,0x00,0xa0,0x81] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: 81a00008 bfmopa za1.h, p5/m, p2/m, z10.h, z21.h // 10000001-10110101-01010101-01001001 // CHECK-INST: bfmopa za1.h, p5/m, p2/m, z10.h, z21.h // CHECK-ENCODING: [0x49,0x55,0xb5,0x81] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: 81b55549 bfmopa za1.h, p3/m, p7/m, z13.h, z8.h // 10000001-10101000-11101101-10101001 // CHECK-INST: bfmopa za1.h, p3/m, p7/m, z13.h, z8.h // CHECK-ENCODING: [0xa9,0xed,0xa8,0x81] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: 81a8eda9 bfmopa za1.h, p7/m, p7/m, z31.h, z31.h // 10000001-10111111-11111111-11101001 // CHECK-INST: bfmopa za1.h, p7/m, p7/m, z31.h, z31.h // CHECK-ENCODING: [0xe9,0xff,0xbf,0x81] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: 81bfffe9 bfmopa za1.h, p3/m, p0/m, z17.h, z16.h // 10000001-10110000-00001110-00101001 // CHECK-INST: bfmopa za1.h, p3/m, p0/m, z17.h, z16.h // CHECK-ENCODING: [0x29,0x0e,0xb0,0x81] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: 81b00e29 bfmopa za1.h, p1/m, p4/m, z1.h, z30.h // 10000001-10111110-10000100-00101001 // CHECK-INST: bfmopa za1.h, p1/m, p4/m, z1.h, z30.h // CHECK-ENCODING: [0x29,0x84,0xbe,0x81] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: 81be8429 bfmopa za0.h, p5/m, p2/m, z19.h, z20.h // 10000001-10110100-01010110-01101000 // CHECK-INST: bfmopa za0.h, p5/m, p2/m, z19.h, z20.h // CHECK-ENCODING: [0x68,0x56,0xb4,0x81] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: 81b45668 bfmopa za0.h, p6/m, p0/m, z12.h, z2.h // 10000001-10100010-00011001-10001000 // CHECK-INST: bfmopa za0.h, p6/m, p0/m, z12.h, z2.h // CHECK-ENCODING: [0x88,0x19,0xa2,0x81] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: 81a21988 bfmopa za1.h, p2/m, p6/m, z1.h, z26.h // 10000001-10111010-11001000-00101001 // CHECK-INST: bfmopa za1.h, p2/m, p6/m, z1.h, z26.h // CHECK-ENCODING: [0x29,0xc8,0xba,0x81] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: 81bac829 bfmopa za1.h, p2/m, p0/m, z22.h, z30.h // 10000001-10111110-00001010-11001001 // CHECK-INST: bfmopa za1.h, p2/m, p0/m, z22.h, z30.h // CHECK-ENCODING: [0xc9,0x0a,0xbe,0x81] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: 81be0ac9 bfmopa za0.h, p5/m, p7/m, z9.h, z1.h // 10000001-10100001-11110101-00101000 // CHECK-INST: bfmopa za0.h, p5/m, p7/m, z9.h, z1.h // CHECK-ENCODING: [0x28,0xf5,0xa1,0x81] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: 81a1f528 bfmopa za1.h, p2/m, p5/m, z12.h, z11.h // 
10000001-10101011-10101001-10001001 // CHECK-INST: bfmopa za1.h, p2/m, p5/m, z12.h, z11.h // CHECK-ENCODING: [0x89,0xa9,0xab,0x81] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: 81aba989 diff --git a/llvm/test/MC/AArch64/SME2p1/bfmops-diagnostics.s b/llvm/test/MC/AArch64/SME2/bfmops-diagnostics.s similarity index 93% rename from llvm/test/MC/AArch64/SME2p1/bfmops-diagnostics.s rename to llvm/test/MC/AArch64/SME2/bfmops-diagnostics.s index 84275aff7091e..14c25dea8e795 100644 --- a/llvm/test/MC/AArch64/SME2p1/bfmops-diagnostics.s +++ b/llvm/test/MC/AArch64/SME2/bfmops-diagnostics.s @@ -1,4 +1,4 @@ -// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 2>&1 < %s | FileCheck %s +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 2>&1 < %s | FileCheck %s // --------------------------------------------------------------------------// // Invalid predicate register diff --git a/llvm/test/MC/AArch64/SME2p1/bfmops.s b/llvm/test/MC/AArch64/SME2/bfmops.s similarity index 70% rename from llvm/test/MC/AArch64/SME2p1/bfmops.s rename to llvm/test/MC/AArch64/SME2/bfmops.s index 380c65f1c7ccc..2b76986d10dd9 100644 --- a/llvm/test/MC/AArch64/SME2p1/bfmops.s +++ b/llvm/test/MC/AArch64/SME2/bfmops.s @@ -1,84 +1,84 @@ -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 < %s \ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 < %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+b16b16 < %s \ -// RUN: | llvm-objdump -d --mattr=+sme2p1,+b16b16 - | FileCheck %s --check-prefix=CHECK-INST -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+b16b16 < %s \ -// RUN: | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 < %s \ +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2,+b16b16 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2,+b16b16 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2,+b16b16 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 < %s \ // RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ -// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p1,+b16b16 -disassemble -show-encoding \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2,+b16b16 -disassemble -show-encoding \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST bfmops za0.h, p0/m, p0/m, z0.h, z0.h // 10000001-10100000-00000000-00011000 // CHECK-INST: bfmops za0.h, p0/m, p0/m, z0.h, z0.h // CHECK-ENCODING: [0x18,0x00,0xa0,0x81] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: 81a00018 bfmops za1.h, p5/m, p2/m, z10.h, z21.h // 10000001-10110101-01010101-01011001 // CHECK-INST: bfmops za1.h, p5/m, p2/m, z10.h, z21.h // CHECK-ENCODING: [0x59,0x55,0xb5,0x81] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: 81b55559 bfmops za1.h, p3/m, p7/m, z13.h, z8.h // 10000001-10101000-11101101-10111001 // CHECK-INST: bfmops za1.h, p3/m, p7/m, z13.h, z8.h // CHECK-ENCODING: [0xb9,0xed,0xa8,0x81] 
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: 81a8edb9 bfmops za1.h, p7/m, p7/m, z31.h, z31.h // 10000001-10111111-11111111-11111001 // CHECK-INST: bfmops za1.h, p7/m, p7/m, z31.h, z31.h // CHECK-ENCODING: [0xf9,0xff,0xbf,0x81] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: 81bffff9 bfmops za1.h, p3/m, p0/m, z17.h, z16.h // 10000001-10110000-00001110-00111001 // CHECK-INST: bfmops za1.h, p3/m, p0/m, z17.h, z16.h // CHECK-ENCODING: [0x39,0x0e,0xb0,0x81] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: 81b00e39 bfmops za1.h, p1/m, p4/m, z1.h, z30.h // 10000001-10111110-10000100-00111001 // CHECK-INST: bfmops za1.h, p1/m, p4/m, z1.h, z30.h // CHECK-ENCODING: [0x39,0x84,0xbe,0x81] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: 81be8439 bfmops za0.h, p5/m, p2/m, z19.h, z20.h // 10000001-10110100-01010110-01111000 // CHECK-INST: bfmops za0.h, p5/m, p2/m, z19.h, z20.h // CHECK-ENCODING: [0x78,0x56,0xb4,0x81] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: 81b45678 bfmops za0.h, p6/m, p0/m, z12.h, z2.h // 10000001-10100010-00011001-10011000 // CHECK-INST: bfmops za0.h, p6/m, p0/m, z12.h, z2.h // CHECK-ENCODING: [0x98,0x19,0xa2,0x81] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: 81a21998 bfmops za1.h, p2/m, p6/m, z1.h, z26.h // 10000001-10111010-11001000-00111001 // CHECK-INST: bfmops za1.h, p2/m, p6/m, z1.h, z26.h // CHECK-ENCODING: [0x39,0xc8,0xba,0x81] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: 81bac839 bfmops za1.h, p2/m, p0/m, z22.h, z30.h // 10000001-10111110-00001010-11011001 // CHECK-INST: bfmops za1.h, p2/m, p0/m, z22.h, z30.h // CHECK-ENCODING: [0xd9,0x0a,0xbe,0x81] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: 81be0ad9 bfmops za0.h, p5/m, p7/m, z9.h, z1.h // 10000001-10100001-11110101-00111000 // CHECK-INST: bfmops za0.h, p5/m, p7/m, z9.h, z1.h // CHECK-ENCODING: [0x38,0xf5,0xa1,0x81] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: 81a1f538 bfmops za1.h, p2/m, p5/m, z12.h, z11.h // 10000001-10101011-10101001-10011001 // CHECK-INST: bfmops za1.h, p2/m, p5/m, z12.h, z11.h // CHECK-ENCODING: [0x99,0xa9,0xab,0x81] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: 81aba999 diff --git a/llvm/test/MC/AArch64/SME2p1/bfsub-diagnostics.s b/llvm/test/MC/AArch64/SME2/bfsub-diagnostics.s similarity index 95% rename from llvm/test/MC/AArch64/SME2p1/bfsub-diagnostics.s rename to llvm/test/MC/AArch64/SME2/bfsub-diagnostics.s index 9d3680ea560e6..5dade3e7ed2f3 100644 --- a/llvm/test/MC/AArch64/SME2p1/bfsub-diagnostics.s +++ b/llvm/test/MC/AArch64/SME2/bfsub-diagnostics.s @@ -1,4 +1,4 @@ -// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 2>&1 < %s | FileCheck %s +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 2>&1 < %s | FileCheck %s // --------------------------------------------------------------------------// // Out of 
range index offset diff --git a/llvm/test/MC/AArch64/SME2p1/bfsub.s b/llvm/test/MC/AArch64/SME2/bfsub.s similarity index 77% rename from llvm/test/MC/AArch64/SME2p1/bfsub.s rename to llvm/test/MC/AArch64/SME2/bfsub.s index dac5f97c96179..6cfd5073b3bca 100644 --- a/llvm/test/MC/AArch64/SME2p1/bfsub.s +++ b/llvm/test/MC/AArch64/SME2/bfsub.s @@ -1,300 +1,300 @@ -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 < %s \ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 < %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+b16b16 < %s \ -// RUN: | llvm-objdump -d --mattr=+sme2p1,+b16b16 - | FileCheck %s --check-prefix=CHECK-INST -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+b16b16 < %s \ -// RUN: | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+b16b16 < %s \ +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2,+b16b16 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2,+b16b16 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2,+b16b16 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+b16b16 < %s \ // RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ -// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p1,+b16b16 -disassemble -show-encoding \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2,+b16b16 -disassemble -show-encoding \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST bfsub za.h[w8, 0, vgx2], {z0.h, z1.h} // 11000001-11100100-00011100-00001000 // CHECK-INST: bfsub za.h[w8, 0, vgx2], { z0.h, z1.h } // CHECK-ENCODING: [0x08,0x1c,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e41c08 bfsub za.h[w8, 0], {z0.h - z1.h} // 11000001-11100100-00011100-00001000 // CHECK-INST: bfsub za.h[w8, 0, vgx2], { z0.h, z1.h } // CHECK-ENCODING: [0x08,0x1c,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e41c08 bfsub za.h[w10, 5, vgx2], {z10.h, z11.h} // 11000001-11100100-01011101-01001101 // CHECK-INST: bfsub za.h[w10, 5, vgx2], { z10.h, z11.h } // CHECK-ENCODING: [0x4d,0x5d,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e45d4d bfsub za.h[w10, 5], {z10.h - z11.h} // 11000001-11100100-01011101-01001101 // CHECK-INST: bfsub za.h[w10, 5, vgx2], { z10.h, z11.h } // CHECK-ENCODING: [0x4d,0x5d,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e45d4d bfsub za.h[w11, 7, vgx2], {z12.h, z13.h} // 11000001-11100100-01111101-10001111 // CHECK-INST: bfsub za.h[w11, 7, vgx2], { z12.h, z13.h } // CHECK-ENCODING: [0x8f,0x7d,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e47d8f bfsub za.h[w11, 7], {z12.h - z13.h} // 11000001-11100100-01111101-10001111 // CHECK-INST: bfsub za.h[w11, 7, vgx2], { z12.h, z13.h } // CHECK-ENCODING: [0x8f,0x7d,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 
sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e47d8f bfsub za.h[w11, 7, vgx2], {z30.h, z31.h} // 11000001-11100100-01111111-11001111 // CHECK-INST: bfsub za.h[w11, 7, vgx2], { z30.h, z31.h } // CHECK-ENCODING: [0xcf,0x7f,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e47fcf bfsub za.h[w11, 7], {z30.h - z31.h} // 11000001-11100100-01111111-11001111 // CHECK-INST: bfsub za.h[w11, 7, vgx2], { z30.h, z31.h } // CHECK-ENCODING: [0xcf,0x7f,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e47fcf bfsub za.h[w8, 5, vgx2], {z16.h, z17.h} // 11000001-11100100-00011110-00001101 // CHECK-INST: bfsub za.h[w8, 5, vgx2], { z16.h, z17.h } // CHECK-ENCODING: [0x0d,0x1e,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e41e0d bfsub za.h[w8, 5], {z16.h - z17.h} // 11000001-11100100-00011110-00001101 // CHECK-INST: bfsub za.h[w8, 5, vgx2], { z16.h, z17.h } // CHECK-ENCODING: [0x0d,0x1e,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e41e0d bfsub za.h[w8, 1, vgx2], {z0.h, z1.h} // 11000001-11100100-00011100-00001001 // CHECK-INST: bfsub za.h[w8, 1, vgx2], { z0.h, z1.h } // CHECK-ENCODING: [0x09,0x1c,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e41c09 bfsub za.h[w8, 1], {z0.h - z1.h} // 11000001-11100100-00011100-00001001 // CHECK-INST: bfsub za.h[w8, 1, vgx2], { z0.h, z1.h } // CHECK-ENCODING: [0x09,0x1c,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e41c09 bfsub za.h[w10, 0, vgx2], {z18.h, z19.h} // 11000001-11100100-01011110, 01001000 // CHECK-INST: bfsub za.h[w10, 0, vgx2], { z18.h, z19.h } // CHECK-ENCODING: [0x48,0x5e,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e45e48 bfsub za.h[w10, 0], {z18.h - z19.h} // 11000001-11100100-01011110-01001000 // CHECK-INST: bfsub za.h[w10, 0, vgx2], { z18.h, z19.h } // CHECK-ENCODING: [0x48,0x5e,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e45e48 bfsub za.h[w8, 0, vgx2], {z12.h, z13.h} // 11000001-11100100-00011101-10001000 // CHECK-INST: bfsub za.h[w8, 0, vgx2], { z12.h, z13.h } // CHECK-ENCODING: [0x88,0x1d,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e41d88 bfsub za.h[w8, 0], {z12.h - z13.h} // 11000001-11100100-00011101-10001000 // CHECK-INST: bfsub za.h[w8, 0, vgx2], { z12.h, z13.h } // CHECK-ENCODING: [0x88,0x1d,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e41d88 bfsub za.h[w10, 1, vgx2], {z0.h, z1.h} // 11000001-11100100-01011100-00001001 // CHECK-INST: bfsub za.h[w10, 1, vgx2], { z0.h, z1.h } // CHECK-ENCODING: [0x09,0x5c,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e45c09 bfsub za.h[w10, 1], {z0.h - z1.h} // 11000001-11100100-01011100-00001001 // CHECK-INST: bfsub za.h[w10, 1, 
vgx2], { z0.h, z1.h } // CHECK-ENCODING: [0x09,0x5c,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e45c09 bfsub za.h[w8, 5, vgx2], {z22.h, z23.h} // 11000001-11100100-00011110, 11001101 // CHECK-INST: bfsub za.h[w8, 5, vgx2], { z22.h, z23.h } // CHECK-ENCODING: [0xcd,0x1e,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e41ecd bfsub za.h[w8, 5], {z22.h - z23.h} // 11000001-11100100-00011110-11001101 // CHECK-INST: bfsub za.h[w8, 5, vgx2], { z22.h, z23.h } // CHECK-ENCODING: [0xcd,0x1e,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e41ecd bfsub za.h[w11, 2, vgx2], {z8.h, z9.h} // 11000001-11100100-01111101-00001010 // CHECK-INST: bfsub za.h[w11, 2, vgx2], { z8.h, z9.h } // CHECK-ENCODING: [0x0a,0x7d,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e47d0a bfsub za.h[w11, 2], {z8.h - z9.h} // 11000001-11100100-01111101-00001010 // CHECK-INST: bfsub za.h[w11, 2, vgx2], { z8.h, z9.h } // CHECK-ENCODING: [0x0a,0x7d,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e47d0a bfsub za.h[w9, 7, vgx2], {z12.h, z13.h} // 11000001-11100100-00111101-10001111 // CHECK-INST: bfsub za.h[w9, 7, vgx2], { z12.h, z13.h } // CHECK-ENCODING: [0x8f,0x3d,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e43d8f bfsub za.h[w9, 7], {z12.h - z13.h} // 11000001-11100100-00111101-10001111 // CHECK-INST: bfsub za.h[w9, 7, vgx2], { z12.h, z13.h } // CHECK-ENCODING: [0x8f,0x3d,0xe4,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e43d8f bfsub za.h[w8, 0, vgx4], {z0.h - z3.h} // 11000001-11100101-00011100-00001000 // CHECK-INST: bfsub za.h[w8, 0, vgx4], { z0.h - z3.h } // CHECK-ENCODING: [0x08,0x1c,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e51c08 bfsub za.h[w8, 0], {z0.h - z3.h} // 11000001-11100101-00011100-00001000 // CHECK-INST: bfsub za.h[w8, 0, vgx4], { z0.h - z3.h } // CHECK-ENCODING: [0x08,0x1c,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e51c08 bfsub za.h[w10, 5, vgx4], {z8.h - z11.h} // 11000001-11100101-01011101-00001101 // CHECK-INST: bfsub za.h[w10, 5, vgx4], { z8.h - z11.h } // CHECK-ENCODING: [0x0d,0x5d,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e55d0d bfsub za.h[w10, 5], {z8.h - z11.h} // 11000001-11100101-01011101-00001101 // CHECK-INST: bfsub za.h[w10, 5, vgx4], { z8.h - z11.h } // CHECK-ENCODING: [0x0d,0x5d,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e55d0d bfsub za.h[w11, 7, vgx4], {z12.h - z15.h} // 11000001-11100101-01111101-10001111 // CHECK-INST: bfsub za.h[w11, 7, vgx4], { z12.h - z15.h } // CHECK-ENCODING: [0x8f,0x7d,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: 
c1e57d8f bfsub za.h[w11, 7], {z12.h - z15.h} // 11000001-11100101-01111101-10001111 // CHECK-INST: bfsub za.h[w11, 7, vgx4], { z12.h - z15.h } // CHECK-ENCODING: [0x8f,0x7d,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e57d8f bfsub za.h[w11, 7, vgx4], {z28.h - z31.h} // 11000001-11100101-01111111-10001111 // CHECK-INST: bfsub za.h[w11, 7, vgx4], { z28.h - z31.h } // CHECK-ENCODING: [0x8f,0x7f,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e57f8f bfsub za.h[w11, 7], {z28.h - z31.h} // 11000001-11100101-01111111-10001111 // CHECK-INST: bfsub za.h[w11, 7, vgx4], { z28.h - z31.h } // CHECK-ENCODING: [0x8f,0x7f,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e57f8f bfsub za.h[w8, 5, vgx4], {z16.h - z19.h} // 11000001-11100101-00011110-00001101 // CHECK-INST: bfsub za.h[w8, 5, vgx4], { z16.h - z19.h } // CHECK-ENCODING: [0x0d,0x1e,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e51e0d bfsub za.h[w8, 5], {z16.h - z19.h} // 11000001-11100101-00011110-00001101 // CHECK-INST: bfsub za.h[w8, 5, vgx4], { z16.h - z19.h } // CHECK-ENCODING: [0x0d,0x1e,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e51e0d bfsub za.h[w8, 1, vgx4], {z0.h - z3.h} // 11000001-11100101-00011100-00001001 // CHECK-INST: bfsub za.h[w8, 1, vgx4], { z0.h - z3.h } // CHECK-ENCODING: [0x09,0x1c,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e51c09 bfsub za.h[w8, 1], {z0.h - z3.h} // 11000001-11100101-00011100-00001001 // CHECK-INST: bfsub za.h[w8, 1, vgx4], { z0.h - z3.h } // CHECK-ENCODING: [0x09,0x1c,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e51c09 bfsub za.h[w10, 0, vgx4], {z16.h - z19.h} // 11000001-11100101-01011110-00001000 // CHECK-INST: bfsub za.h[w10, 0, vgx4], { z16.h - z19.h } // CHECK-ENCODING: [0x08,0x5e,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e55e08 bfsub za.h[w10, 0], {z16.h - z19.h} // 11000001-11100101-01011110-00001000 // CHECK-INST: bfsub za.h[w10, 0, vgx4], { z16.h - z19.h } // CHECK-ENCODING: [0x08,0x5e,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e55e08 bfsub za.h[w8, 0, vgx4], {z12.h - z15.h} // 11000001-11100101-00011101-10001000 // CHECK-INST: bfsub za.h[w8, 0, vgx4], { z12.h - z15.h } // CHECK-ENCODING: [0x88,0x1d,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e51d88 bfsub za.h[w8, 0], {z12.h - z15.h} // 11000001-11100101-00011101-10001000 // CHECK-INST: bfsub za.h[w8, 0, vgx4], { z12.h - z15.h } // CHECK-ENCODING: [0x88,0x1d,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e51d88 bfsub za.h[w10, 1, vgx4], {z0.h - z3.h} // 11000001-11100101-01011100-00001001 // CHECK-INST: bfsub za.h[w10, 1, vgx4], { z0.h - z3.h } // CHECK-ENCODING: 
[0x09,0x5c,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e55c09 bfsub za.h[w10, 1], {z0.h - z3.h} // 11000001-11100101-01011100-00001001 // CHECK-INST: bfsub za.h[w10, 1, vgx4], { z0.h - z3.h } // CHECK-ENCODING: [0x09,0x5c,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e55c09 bfsub za.h[w8, 5, vgx4], {z20.h - z23.h} // 11000001-11100101-00011110-10001101 // CHECK-INST: bfsub za.h[w8, 5, vgx4], { z20.h - z23.h } // CHECK-ENCODING: [0x8d,0x1e,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e51e8d bfsub za.h[w8, 5], {z20.h - z23.h} // 11000001-11100101-00011110-10001101 // CHECK-INST: bfsub za.h[w8, 5, vgx4], { z20.h - z23.h } // CHECK-ENCODING: [0x8d,0x1e,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e51e8d bfsub za.h[w11, 2, vgx4], {z8.h - z11.h} // 11000001-11100101-01111101-00001010 // CHECK-INST: bfsub za.h[w11, 2, vgx4], { z8.h - z11.h } // CHECK-ENCODING: [0x0a,0x7d,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e57d0a bfsub za.h[w11, 2], {z8.h - z11.h} // 11000001-11100101-01111101-00001010 // CHECK-INST: bfsub za.h[w11, 2, vgx4], { z8.h - z11.h } // CHECK-ENCODING: [0x0a,0x7d,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e57d0a bfsub za.h[w9, 7, vgx4], {z12.h - z15.h} // 11000001-11100101-00111101-10001111 // CHECK-INST: bfsub za.h[w9, 7, vgx4], { z12.h - z15.h } // CHECK-ENCODING: [0x8f,0x3d,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e53d8f bfsub za.h[w9, 7], {z12.h - z15.h} // 11000001-11100101-00111101-10001111 // CHECK-INST: bfsub za.h[w9, 7, vgx4], { z12.h - z15.h } // CHECK-ENCODING: [0x8f,0x3d,0xe5,0xc1] -// CHECK-ERROR: instruction requires: b16b16 sme2p1 +// CHECK-ERROR: instruction requires: b16b16 sme2 // CHECK-UNKNOWN: c1e53d8f From 5fc712c4bbe84e6cbaa1f7d2a0300f613f11b0c3 Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Wed, 3 Jan 2024 09:56:34 +0000 Subject: [PATCH 113/313] [AArch64] Add PMOV aliases with implicit portion index (#76056) In the latest ISA update the syntax of the PMOV instructions was updated to allow the portion index to be optional, default to zero. 
https://developer.arm.com/documentation/ddi0602/2023-09/SVE-Instructions/PMOV--to-predicate---Move-predicate-from-vector-?lang=en https://developer.arm.com/documentation/ddi0602/2023-09/SVE-Instructions/PMOV--to-vector---Move-predicate-to-vector-?lang=en --- llvm/lib/Target/AArch64/SVEInstrFormats.td | 12 ++++++++ llvm/test/MC/AArch64/SVE2p1/pmov.s | 36 ++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index b7552541e950d..789ec817d3d8b 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -10082,6 +10082,12 @@ multiclass sve2p1_vector_to_pred(NAME # _B) PPR8:$Pd, ZPRAny:$Zn, 0), 1>; + def : InstAlias(NAME # _H) PPR16:$Pd, ZPRAny:$Zn, 0), 0>; + def : InstAlias(NAME # _S) PPR32:$Pd, ZPRAny:$Zn, 0), 0>; + def : InstAlias(NAME # _D) PPR64:$Pd, ZPRAny:$Zn, 0), 0>; // any_lane def : Pat<(nxv16i1 (Op_lane (nxv16i8 ZPRAny:$Zn), (i32 timm32_0_0:$Idx))), @@ -10143,6 +10149,12 @@ multiclass sve2p1_pred_to_vector(NAME # _B) ZPRAny:$Zd, 0, PPR8:$Pn), 1>; + def : InstAlias(NAME # _H) ZPRAny:$Zd, 0, PPR16:$Pn), 0>; + def : InstAlias(NAME # _S) ZPRAny:$Zd, 0, PPR32:$Pn), 0>; + def : InstAlias(NAME # _D) ZPRAny:$Zd, 0, PPR64:$Pn), 0>; // Merge def : Pat<(nxv8i16 (MergeOp (nxv8i16 ZPRAny:$Zd), (nxv8i1 PPR16:$Pn), (i32 timm32_1_1:$Idx))), diff --git a/llvm/test/MC/AArch64/SVE2p1/pmov.s b/llvm/test/MC/AArch64/SVE2p1/pmov.s index ca2b55b47c325..8c08cf7e6441a 100644 --- a/llvm/test/MC/AArch64/SVE2p1/pmov.s +++ b/llvm/test/MC/AArch64/SVE2p1/pmov.s @@ -20,6 +20,12 @@ pmov p0.h, z0[0] // 00000101-00101100-00111000-00000000 // CHECK-ERROR: instruction requires: sme2p1 or sve2p1 // CHECK-UNKNOWN: 052c3800 +pmov p0.h, z0 // 00000101-00101100-00111000-00000000 +// CHECK-INST: pmov p0.h, z0[0] +// CHECK-ENCODING: [0x00,0x38,0x2c,0x05] +// CHECK-ERROR: instruction requires: sme2p1 or sve2p1 +// CHECK-UNKNOWN: 052c3800 + pmov p5.h, z10[0] // 00000101-00101100-00111001-01000101 // CHECK-INST: pmov p5.h, z10[0] // CHECK-ENCODING: [0x45,0x39,0x2c,0x05] @@ -44,6 +50,12 @@ pmov p0.s, z0[0] // 00000101-01101000-00111000-00000000 // CHECK-ERROR: instruction requires: sme2p1 or sve2p1 // CHECK-UNKNOWN: 05683800 +pmov p0.s, z0 // 00000101-01101000-00111000-00000000 +// CHECK-INST: pmov p0.s, z0[0] +// CHECK-ENCODING: [0x00,0x38,0x68,0x05] +// CHECK-ERROR: instruction requires: sme2p1 or sve2p1 +// CHECK-UNKNOWN: 05683800 + pmov p5.s, z10[2] // 00000101-01101100-00111001-01000101 // CHECK-INST: pmov p5.s, z10[2] // CHECK-ENCODING: [0x45,0x39,0x6c,0x05] @@ -68,6 +80,12 @@ pmov p0.d, z0[0] // 00000101-10101000-00111000-00000000 // CHECK-ERROR: instruction requires: sme2p1 or sve2p1 // CHECK-UNKNOWN: 05a83800 +pmov p0.d, z0 // 00000101-10101000-00111000-00000000 +// CHECK-INST: pmov p0.d, z0[0] +// CHECK-ENCODING: [0x00,0x38,0xa8,0x05] +// CHECK-ERROR: instruction requires: sme2p1 or sve2p1 +// CHECK-UNKNOWN: 05a83800 + pmov p5.d, z10[6] // 00000101-11101100-00111001-01000101 // CHECK-INST: pmov p5.d, z10[6] // CHECK-ENCODING: [0x45,0x39,0xec,0x05] @@ -122,6 +140,12 @@ pmov z0[0], p0.h // 00000101-00101101-00111000-00000000 // CHECK-ERROR: instruction requires: sme2p1 or sve2p1 // CHECK-UNKNOWN: 052d3800 +pmov z0, p0.h // 00000101-00101101-00111000-00000000 +// CHECK-INST: pmov z0[0], p0.h +// CHECK-ENCODING: [0x00,0x38,0x2d,0x05] +// CHECK-ERROR: instruction requires: sme2p1 or sve2p1 +// CHECK-UNKNOWN: 052d3800 + pmov z21[0], p10.h // 00000101-00101101-00111001-01010101 
// CHECK-INST: pmov z21[0], p10.h // CHECK-ENCODING: [0x55,0x39,0x2d,0x05] @@ -147,6 +171,12 @@ pmov z0[0], p0.s // 00000101-01101001-00111000-00000000 // CHECK-ERROR: instruction requires: sme2p1 or sve2p1 // CHECK-UNKNOWN: 05693800 +pmov z0, p0.s // 00000101-01101001-00111000-00000000 +// CHECK-INST: pmov z0[0], p0.s +// CHECK-ENCODING: [0x00,0x38,0x69,0x05] +// CHECK-ERROR: instruction requires: sme2p1 or sve2p1 +// CHECK-UNKNOWN: 05693800 + pmov z21[2], p10.s // 00000101-01101101-00111001-01010101 // CHECK-INST: pmov z21[2], p10.s // CHECK-ENCODING: [0x55,0x39,0x6d,0x05] @@ -171,6 +201,12 @@ pmov z0[0], p0.d // 00000101-10101001-00111000-00000000 // CHECK-ERROR: instruction requires: sme2p1 or sve2p1 // CHECK-UNKNOWN: 05a93800 +pmov z0, p0.d // 00000101-10101001-00111000-00000000 +// CHECK-INST: pmov z0[0], p0.d +// CHECK-ENCODING: [0x00,0x38,0xa9,0x05] +// CHECK-ERROR: instruction requires: sme2p1 or sve2p1 +// CHECK-UNKNOWN: 05a93800 + pmov z21[6], p10.d // 00000101-11101101-00111001-01010101 // CHECK-INST: pmov z21[6], p10.d // CHECK-ENCODING: [0x55,0x39,0xed,0x05] From 54378a7c2fd7f0ed0a3ea7ef08bc24896700e2c5 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Wed, 3 Jan 2024 05:12:30 -0500 Subject: [PATCH 114/313] [lldb][Windows] Fix -Wmissing-field-initializers warnings after 54981bb75d374 (#76255) --- .../Windows/Common/x64/RegisterContextWindows_x64.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lldb/source/Plugins/Process/Windows/Common/x64/RegisterContextWindows_x64.cpp b/lldb/source/Plugins/Process/Windows/Common/x64/RegisterContextWindows_x64.cpp index 5c4f80b97009a..fee485b7599ca 100644 --- a/lldb/source/Plugins/Process/Windows/Common/x64/RegisterContextWindows_x64.cpp +++ b/lldb/source/Plugins/Process/Windows/Common/x64/RegisterContextWindows_x64.cpp @@ -29,7 +29,7 @@ using namespace lldb_private; #reg, alt, 8, 0, eEncodingUint, eFormatHexUppercase, \ {dwarf_##reg##_x86_64, dwarf_##reg##_x86_64, generic, \ LLDB_INVALID_REGNUM, lldb_##reg##_x86_64 }, \ - nullptr, nullptr, \ + nullptr, nullptr, nullptr, \ } #define DEFINE_GPR_BIN(reg, alt) #reg, alt, 8, 0, eEncodingUint, eFormatBinary @@ -37,14 +37,14 @@ using namespace lldb_private; #reg, NULL, 16, 0, eEncodingUint, eFormatVectorOfUInt64, \ {dwarf_##reg##_x86_64, dwarf_##reg##_x86_64, LLDB_INVALID_REGNUM, \ LLDB_INVALID_REGNUM, lldb_##reg##_x86_64}, \ - nullptr, nullptr + nullptr, nullptr, nullptr, #define DEFINE_GPR_PSEUDO_32(reg) \ { \ #reg, nullptr, 4, 0, eEncodingUint, eFormatHexUppercase, \ {LLDB_INVALID_REGNUM, LLDB_INVALID_REGNUM, LLDB_INVALID_REGNUM, \ LLDB_INVALID_REGNUM, lldb_##reg##_x86_64 }, \ - nullptr, nullptr \ + nullptr, nullptr, nullptr, \ } #define DEFINE_GPR_PSEUDO_16(reg) \ @@ -52,7 +52,7 @@ using namespace lldb_private; #reg, nullptr, 2, 0, eEncodingUint, eFormatHexUppercase, \ {LLDB_INVALID_REGNUM, LLDB_INVALID_REGNUM, LLDB_INVALID_REGNUM, \ LLDB_INVALID_REGNUM, lldb_##reg##_x86_64 }, \ - nullptr, nullptr \ + nullptr, nullptr, nullptr, \ } #define DEFINE_GPR_PSEUDO_8(reg) \ @@ -60,7 +60,7 @@ using namespace lldb_private; #reg, nullptr, 1, 0, eEncodingUint, eFormatHexUppercase, \ {LLDB_INVALID_REGNUM, LLDB_INVALID_REGNUM, LLDB_INVALID_REGNUM, \ LLDB_INVALID_REGNUM, lldb_##reg##_x86_64 }, \ - nullptr, nullptr \ + nullptr, nullptr, nullptr, \ } namespace { From 40236257ea9d1e451daee5c0938bd23abd5d1450 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= Date: Wed, 3 Jan 2024 11:21:22 +0100 Subject: [PATCH 115/313] [Orc] Deduplicate GDB JIT Interface declarations 
(NFC) (#76373) https://github.com/llvm/llvm-project/pull/76236 introduced the forth copy and it was time to deduplicate. This patch brings it back to 2, one in OrcTargetProcess and one in legacy ExecutionEngine. --- .../Orc/TargetProcess/JITLoaderGDB.h | 26 +++++++++++++++ .../Orc/TargetProcess/JITLoaderGDB.cpp | 23 ------------- llvm/tools/lli/ExecutionUtils.cpp | 32 +++--------------- .../llvm-jitlink-executor.cpp | 33 ++----------------- .../ExecutionEngine/Orc/OrcCAPITest.cpp | 33 ++----------------- 5 files changed, 36 insertions(+), 111 deletions(-) diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.h index 99175d7969747..9f91a64e95ce9 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.h @@ -16,6 +16,32 @@ #include "llvm/ExecutionEngine/Orc/Shared/WrapperFunctionUtils.h" #include +// Keep in sync with gdb/gdb/jit.h +extern "C" { + +typedef enum { + JIT_NOACTION = 0, + JIT_REGISTER_FN, + JIT_UNREGISTER_FN +} jit_actions_t; + +struct jit_code_entry { + struct jit_code_entry *next_entry; + struct jit_code_entry *prev_entry; + const char *symfile_addr; + uint64_t symfile_size; +}; + +struct jit_descriptor { + uint32_t version; + // This should be jit_actions_t, but we want to be specific about the + // bit-width. + uint32_t action_flag; + struct jit_code_entry *relevant_entry; + struct jit_code_entry *first_entry; +}; +} + extern "C" llvm::orc::shared::CWrapperFunctionResult llvm_orc_registerJITLoaderGDBWrapper(const char *Data, uint64_t Size); diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp index 8eca874c48b87..8a4145a6b02a2 100644 --- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp @@ -21,31 +21,8 @@ // First version as landed in August 2009 static constexpr uint32_t JitDescriptorVersion = 1; -// Keep in sync with gdb/gdb/jit.h extern "C" { -typedef enum { - JIT_NOACTION = 0, - JIT_REGISTER_FN, - JIT_UNREGISTER_FN -} jit_actions_t; - -struct jit_code_entry { - struct jit_code_entry *next_entry; - struct jit_code_entry *prev_entry; - const char *symfile_addr; - uint64_t symfile_size; -}; - -struct jit_descriptor { - uint32_t version; - // This should be jit_actions_t, but we want to be specific about the - // bit-width. - uint32_t action_flag; - struct jit_code_entry *relevant_entry; - struct jit_code_entry *first_entry; -}; - // We put information about the JITed function in this global, which the // debugger reads. Make sure to specify the version statically, because the // debugger checks the version before we can set it during runtime. diff --git a/llvm/tools/lli/ExecutionUtils.cpp b/llvm/tools/lli/ExecutionUtils.cpp index 55370ed40f2b6..b6cc3bb174d3c 100644 --- a/llvm/tools/lli/ExecutionUtils.cpp +++ b/llvm/tools/lli/ExecutionUtils.cpp @@ -8,6 +8,7 @@ #include "ExecutionUtils.h" +#include "llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/raw_ostream.h" @@ -15,34 +16,6 @@ #include #include -// Declarations follow the GDB JIT interface (version 1, 2009) and must match -// those of the DYLD used for testing. 
See: -// -// llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp -// llvm/lib/ExecutionEngine/GDBRegistrationListener.cpp -// -typedef enum { - JIT_NOACTION = 0, - JIT_REGISTER_FN, - JIT_UNREGISTER_FN -} jit_actions_t; - -struct jit_code_entry { - struct jit_code_entry *next_entry; - struct jit_code_entry *prev_entry; - const char *symfile_addr; - uint64_t symfile_size; -}; - -struct jit_descriptor { - uint32_t version; - // This should be jit_actions_t, but we want to be specific about the - // bit-width. - uint32_t action_flag; - struct jit_code_entry *relevant_entry; - struct jit_code_entry *first_entry; -}; - namespace llvm { template static void outsv(const char *Fmt, Ts &&...Vals) { @@ -61,6 +34,9 @@ static const char *actionFlagToStr(uint32_t ActionFlag) { return ""; } +// Declarations follow the GDB JIT interface (version 1, 2009) and must match +// those of the DYLD used for testing. +// // Sample output: // // Reading __jit_debug_descriptor at 0x0000000000404048 diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp index 3a05c9b5be24d..034c43f9352a5 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp @@ -112,35 +112,9 @@ int openListener(std::string Host, std::string PortStr) { #endif // LLVM_ON_UNIX } -// This must be kept in sync with gdb/gdb/jit.h . -extern "C" { - -typedef enum { - JIT_NOACTION = 0, - JIT_REGISTER_FN, - JIT_UNREGISTER_FN -} jit_actions_t; - -struct jit_code_entry { - struct jit_code_entry *next_entry; - struct jit_code_entry *prev_entry; - const char *symfile_addr; - uint64_t symfile_size; -}; - -struct jit_descriptor { - uint32_t version; - // This should be jit_actions_t, but we want to be specific about the - // bit-width. - uint32_t action_flag; - struct jit_code_entry *relevant_entry; - struct jit_code_entry *first_entry; -}; - -// We put information about the JITed function in this global, which the -// debugger reads. Make sure to specify the version statically, because the -// debugger checks the version before we can set it during runtime. -extern struct jit_descriptor __jit_debug_descriptor; +// JITLink debug support plugins put information about JITed code in this GDB +// JIT Interface global from OrcTargetProcess. +extern "C" struct jit_descriptor __jit_debug_descriptor; static void *findLastDebugDescriptorEntryPtr() { struct jit_code_entry *Last = __jit_debug_descriptor.first_entry; @@ -148,7 +122,6 @@ static void *findLastDebugDescriptorEntryPtr() { Last = Last->next_entry; return Last; } -} int main(int argc, char *argv[]) { #if LLVM_ENABLE_THREADS diff --git a/llvm/unittests/ExecutionEngine/Orc/OrcCAPITest.cpp b/llvm/unittests/ExecutionEngine/Orc/OrcCAPITest.cpp index 12f42633d0372..061db56850ef5 100644 --- a/llvm/unittests/ExecutionEngine/Orc/OrcCAPITest.cpp +++ b/llvm/unittests/ExecutionEngine/Orc/OrcCAPITest.cpp @@ -512,35 +512,9 @@ TEST_F(OrcCAPITestBase, AddObjectBuffer) { ASSERT_TRUE(!!SumAddr); } -// This must be kept in sync with gdb/gdb/jit.h . 
-extern "C" { - -typedef enum { - JIT_NOACTION = 0, - JIT_REGISTER_FN, - JIT_UNREGISTER_FN -} jit_actions_t; - -struct jit_code_entry { - struct jit_code_entry *next_entry; - struct jit_code_entry *prev_entry; - const char *symfile_addr; - uint64_t symfile_size; -}; - -struct jit_descriptor { - uint32_t version; - // This should be jit_actions_t, but we want to be specific about the - // bit-width. - uint32_t action_flag; - struct jit_code_entry *relevant_entry; - struct jit_code_entry *first_entry; -}; - -// We put information about the JITed function in this global, which the -// debugger reads. Make sure to specify the version statically, because the -// debugger checks the version before we can set it during runtime. -extern struct jit_descriptor __jit_debug_descriptor; +// JITLink debug support plugins put information about JITed code in this GDB +// JIT Interface global from OrcTargetProcess. +extern "C" struct jit_descriptor __jit_debug_descriptor; static void *findLastDebugDescriptorEntryPtr() { struct jit_code_entry *Last = __jit_debug_descriptor.first_entry; @@ -548,7 +522,6 @@ static void *findLastDebugDescriptorEntryPtr() { Last = Last->next_entry; return Last; } -} #if defined(_AIX) or not(defined(__ELF__) or defined(__MACH__)) TEST_F(OrcCAPITestBase, DISABLED_EnableDebugSupport) { From 665d1a0eb4e50cdc1b6f0f678a4c2b1e1875dc66 Mon Sep 17 00:00:00 2001 From: Mitch Phillips Date: Wed, 3 Jan 2024 11:08:03 +0100 Subject: [PATCH 116/313] Revert "[WebAssembly][Object]Use file offset as function symbol address for linked files (#76198)" This reverts commit fc5f51cf5af4364b38bf22e491d46e1e892ade0c. Reason: Broke the sanitizer buildbot - https://lab.llvm.org/buildbot/#/builders/5/builds/39751/steps/12/logs/stdio --- llvm/lib/Object/WasmObjectFile.cpp | 16 +--- llvm/test/tools/llvm-nm/wasm/linked.yaml | 74 ------------------ .../wasm/linked-symbol-table.yaml | 75 ------------------- 3 files changed, 4 insertions(+), 161 deletions(-) delete mode 100644 llvm/test/tools/llvm-nm/wasm/linked.yaml delete mode 100644 llvm/test/tools/llvm-objdump/wasm/linked-symbol-table.yaml diff --git a/llvm/lib/Object/WasmObjectFile.cpp b/llvm/lib/Object/WasmObjectFile.cpp index ccc29d0cb73d6..40665d686cf93 100644 --- a/llvm/lib/Object/WasmObjectFile.cpp +++ b/llvm/lib/Object/WasmObjectFile.cpp @@ -1667,18 +1667,10 @@ Expected WasmObjectFile::getSymbolName(DataRefImpl Symb) const { Expected WasmObjectFile::getSymbolAddress(DataRefImpl Symb) const { auto &Sym = getWasmSymbol(Symb); if (Sym.Info.Kind == wasm::WASM_SYMBOL_TYPE_FUNCTION && - isDefinedFunctionIndex(Sym.Info.ElementIndex)) { - // For object files, use the section offset. The linker relies on this. - // For linked files, use the file offset. This behavior matches the way - // browsers print stack traces and is useful for binary size analysis. - // (see https://webassembly.github.io/spec/web-api/index.html#conventions) - uint32_t Adjustment = isRelocatableObject() || isSharedObject() - ? 
0 - : Sections[CodeSection].Offset; - return getDefinedFunction(Sym.Info.ElementIndex).CodeSectionOffset + - Adjustment; - } - return getSymbolValue(Symb); + isDefinedFunctionIndex(Sym.Info.ElementIndex)) + return getDefinedFunction(Sym.Info.ElementIndex).CodeSectionOffset; + else + return getSymbolValue(Symb); } uint64_t WasmObjectFile::getWasmSymbolValue(const WasmSymbol &Sym) const { diff --git a/llvm/test/tools/llvm-nm/wasm/linked.yaml b/llvm/test/tools/llvm-nm/wasm/linked.yaml deleted file mode 100644 index 992c1811743b7..0000000000000 --- a/llvm/test/tools/llvm-nm/wasm/linked.yaml +++ /dev/null @@ -1,74 +0,0 @@ -# RUN: yaml2obj %s -o %t.wasm -# RUN: llvm-nm %t.wasm | FileCheck %s - -# CHECK: 0000009f T my_func_export -# CHECK-NEXT: 0000002a D my_global_export -# CHECK-NEXT: 00000000 D my_table_export - ---- !WASM -FileHeader: - Version: 0x1 -Sections: - - Type: TYPE - Signatures: - - Index: 0 - ParamTypes: [] - ReturnTypes: [] - - Type: IMPORT - Imports: - - Module: env - Field: foo - Kind: FUNCTION - SigIndex: 0 - - Module: env - Field: bar - Kind: GLOBAL - GlobalType: I32 - GlobalMutable: true - - Module: env - Field: memory - Kind: MEMORY - Memory: - Minimum: 0x1 - - Type: FUNCTION - FunctionTypes: [ 0 ] - - Type: TABLE - Tables: - - Index: 0 - ElemType: FUNCREF - Limits: - Flags: [ HAS_MAX ] - Minimum: 0x1 - Maximum: 0x1 - - Type: GLOBAL - Globals: - - Index: 1 - Mutable: false - Type: I32 - InitExpr: - Opcode: I32_CONST - Value: 42 - - Type: EXPORT - Exports: - - Name: my_func_export - Kind: FUNCTION - Index: 1 - - Name: my_global_export - Kind: GLOBAL - Index: 1 - - Name: my_table_export - Kind: TABLE - Index: 0 - - Type: CODE - Functions: - - Index: 1 - Locals: - Body: 00 - - Type: DATA - Segments: - - SectionOffset: 0 - InitFlags: 0 - Offset: - Opcode: I32_CONST - Value: 0 - Content: '' diff --git a/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table.yaml b/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table.yaml deleted file mode 100644 index 6dd949a441496..0000000000000 --- a/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table.yaml +++ /dev/null @@ -1,75 +0,0 @@ -# RUN: yaml2obj %s -o %t.wasm -# RUN: llvm-objdump -t %t.wasm | FileCheck %s -# -# CHECK: SYMBOL TABLE: -# CHECK-NEXT: 0000009f g F CODE my_func_export -# CHECK-NEXT: 0000002a g O DATA my_global_export -# CHECK-NEXT: 00000000 g TABLE my_table_export - ---- !WASM -FileHeader: - Version: 0x1 -Sections: - - Type: TYPE - Signatures: - - Index: 0 - ParamTypes: [] - ReturnTypes: [] - - Type: IMPORT - Imports: - - Module: env - Field: foo - Kind: FUNCTION - SigIndex: 0 - - Module: env - Field: bar - Kind: GLOBAL - GlobalType: I32 - GlobalMutable: true - - Module: env - Field: memory - Kind: MEMORY - Memory: - Minimum: 0x1 - - Type: FUNCTION - FunctionTypes: [ 0 ] - - Type: TABLE - Tables: - - Index: 0 - ElemType: FUNCREF - Limits: - Flags: [ HAS_MAX ] - Minimum: 0x1 - Maximum: 0x1 - - Type: GLOBAL - Globals: - - Index: 1 - Mutable: false - Type: I32 - InitExpr: - Opcode: I32_CONST - Value: 42 - - Type: EXPORT - Exports: - - Name: my_func_export - Kind: FUNCTION - Index: 1 - - Name: my_global_export - Kind: GLOBAL - Index: 1 - - Name: my_table_export - Kind: TABLE - Index: 0 - - Type: CODE - Functions: - - Index: 1 - Locals: - Body: 00 - - Type: DATA - Segments: - - SectionOffset: 0 - InitFlags: 0 - Offset: - Opcode: I32_CONST - Value: 0 - Content: '' From 3db749afcb5079dac332942a3e0b258cdb629a02 Mon Sep 17 00:00:00 2001 From: Ben Shi <2283975856@qq.com> Date: Wed, 3 Jan 2024 18:23:45 +0800 Subject: [PATCH 117/313] 
[clang][analyzer] Improve 'errno' modeling of 'mkdtemp' (#76671) --- .../Checkers/StdLibraryFunctionsChecker.cpp | 7 +++-- .../test/Analysis/errno-stdlibraryfunctions.c | 30 +++++++++++++++---- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp index 6560fd239ce66..20068653d530a 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp @@ -2507,10 +2507,13 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( .ArgConstraint(NotNull(ArgNo(0)))); // char *mkdtemp(char *template); - // FIXME: Improve for errno modeling. addToFunctionSummaryMap( "mkdtemp", Signature(ArgTypes{CharPtrTy}, RetType{CharPtrTy}), - Summary(NoEvalCall).ArgConstraint(NotNull(ArgNo(0)))); + Summary(NoEvalCall) + .Case({ReturnValueCondition(BO_EQ, ArgNo(0))}, + ErrnoMustNotBeChecked, GenericSuccessMsg) + .Case({IsNull(Ret)}, ErrnoNEZeroIrrelevant, GenericFailureMsg) + .ArgConstraint(NotNull(ArgNo(0)))); // char *getcwd(char *buf, size_t size); // FIXME: Improve for errno modeling. diff --git a/clang/test/Analysis/errno-stdlibraryfunctions.c b/clang/test/Analysis/errno-stdlibraryfunctions.c index dafda764af9f3..80e14c4e2923c 100644 --- a/clang/test/Analysis/errno-stdlibraryfunctions.c +++ b/clang/test/Analysis/errno-stdlibraryfunctions.c @@ -7,12 +7,9 @@ // RUN: -analyzer-config unix.StdCLibraryFunctions:ModelPOSIX=true #include "Inputs/errno_var.h" +#include "Inputs/std-c-library-functions-POSIX.h" -typedef typeof(sizeof(int)) size_t; -typedef __typeof(sizeof(int)) off_t; -typedef size_t ssize_t; -ssize_t send(int sockfd, const void *buf, size_t len, int flags); -off_t lseek(int fildes, off_t offset, int whence); +#define NULL ((void *) 0) void clang_analyzer_warnIfReached(); void clang_analyzer_eval(int); @@ -54,3 +51,26 @@ int errno_lseek(int fildes, off_t offset) { } return 0; } + +void errno_mkstemp(char *template) { + int FD = mkstemp(template); + if (FD >= 0) { + if (errno) {} // expected-warning{{An undefined value may be read from 'errno'}} + close(FD); + } else { + clang_analyzer_eval(FD == -1); // expected-warning{{TRUE}} + clang_analyzer_eval(errno != 0); // expected-warning{{TRUE}} + if (errno) {} // no warning + } +} + +void errno_mkdtemp(char *template) { + char *Dir = mkdtemp(template); + if (Dir == NULL) { + clang_analyzer_eval(errno != 0); // expected-warning{{TRUE}} + if (errno) {} // no warning + } else { + clang_analyzer_eval(Dir == template); // expected-warning{{TRUE}} + if (errno) {} // expected-warning{{An undefined value may be read from 'errno'}} + } +} From 771fd1ad2a988e8bb586c2165d6877f06e5ed19b Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 3 Jan 2024 10:52:01 +0000 Subject: [PATCH 118/313] [DAG] Extend input types if needed in combineShiftToAVG. (#76791) This atempts to fix #76734 which is a crash in invalid TRUNC nodes types from unoptimized input code in combineShiftToAVG. The NVT can be VT if the larger type was legal and the adds will not overflow, in which case the inputs should be extended. From what I can tell this appears to be valid (if not optimal for this case): https://alive2.llvm.org/ce/z/fRieHR The result has also been changed to getExtOrTrunc in case that VT==NVT, which is not handled by SEXT/ZEXT. 
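
The scalar fact the combine relies on, sketched outside the patch: as long as NVT keeps one bit of headroom so the add cannot wrap, the rounding average can be evaluated at the narrower width on the (zero/sign-)extended operands and then widened back to VT. A minimal standalone C++ check of the unsigned case, with 9 bits standing in for NVT (illustrative only, not code from this change):

  #include <cassert>

  int main() {
    // For 8-bit unsigned a and b, a + b + 1 <= 511 fits in 9 bits, so the
    // rounding average computed modulo 2^9 matches the full-width result.
    for (unsigned a = 0; a < 256; ++a)
      for (unsigned b = 0; b < 256; ++b) {
        unsigned wide = (a + b + 1) >> 1;             // full-width compute
        unsigned narrow = ((a + b + 1) & 0x1ff) >> 1; // "NVT"-width compute
        assert(wide == narrow);
      }
    return 0;
  }
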
--- .../CodeGen/SelectionDAG/TargetLowering.cpp | 7 +++---- llvm/test/CodeGen/AArch64/arm64-vhadd.ll | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 4581bb19e97ec..66cdd75259087 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1064,10 +1064,9 @@ static SDValue combineShiftToAVG(SDValue Op, SelectionDAG &DAG, SDLoc DL(Op); SDValue ResultAVG = - DAG.getNode(AVGOpc, DL, NVT, DAG.getNode(ISD::TRUNCATE, DL, NVT, ExtOpA), - DAG.getNode(ISD::TRUNCATE, DL, NVT, ExtOpB)); - return DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, - ResultAVG); + DAG.getNode(AVGOpc, DL, NVT, DAG.getExtOrTrunc(IsSigned, ExtOpA, DL, NVT), + DAG.getExtOrTrunc(IsSigned, ExtOpB, DL, NVT)); + return DAG.getExtOrTrunc(IsSigned, ResultAVG, DL, VT); } /// Look at Op. At this point, we know that only the OriginalDemandedBits of the diff --git a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll index e287eff5abb94..dda610e5dd3cb 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll @@ -1392,6 +1392,24 @@ define <8 x i8> @sextmask3v8i8(<8 x i16> %src1, <8 x i8> %src2) { ret <8 x i8> %result } +define <4 x i16> @ext_via_i19(<4 x i16> %a) { +; CHECK-LABEL: ext_via_i19: +; CHECK: // %bb.0: +; CHECK-NEXT: movi.4s v1, #1 +; CHECK-NEXT: uaddw.4s v0, v1, v0 +; CHECK-NEXT: uhadd.4s v0, v0, v1 +; CHECK-NEXT: xtn.4h v0, v0 +; CHECK-NEXT: ret + %t3 = zext <4 x i16> %a to <4 x i32> + %t4 = add <4 x i32> %t3, + %t5 = trunc <4 x i32> %t4 to <4 x i19> + %new0 = add <4 x i19> %t5, + %new1 = lshr <4 x i19> %new0, + %last = zext <4 x i19> %new1 to <4 x i32> + %t6 = trunc <4 x i32> %last to <4 x i16> + ret <4 x i16> %t6 +} + declare <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>) declare <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16>, <4 x i16>) declare <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32>, <2 x i32>) From 4444a7e89a1e1c750f25e1673d54627a56fe0f9f Mon Sep 17 00:00:00 2001 From: ChipsSpectre Date: Wed, 3 Jan 2024 12:01:02 +0100 Subject: [PATCH 119/313] [InstSimplify] Simplify the expression `(a^c)&(a^~c)` to zero and (a^c) | (a^~c) to minus one (#76637) Changes the InstSimplify pass of the LLVM optimizer, such that the aforementioned expression is reduced to zero if c2==~c1. Alive2: https://alive2.llvm.org/ce/z/xkQiid Fixes https://github.com/llvm/llvm-project/issues/75692. 
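
The identity is easy to sanity-check outside Alive2: (a ^ c1) and (a ^ ~c1) are bitwise complements of each other, since their XOR is c1 ^ ~c1 == all-ones, so their AND is 0 and their OR is -1. A small standalone C++ sketch (illustrative only, not part of the patch) verifying this exhaustively for 8-bit values:

  #include <cassert>
  #include <cstdint>

  int main() {
    for (unsigned a = 0; a < 256; ++a)
      for (unsigned c = 0; c < 256; ++c) {
        uint8_t x = uint8_t(a) ^ uint8_t(c);   // a ^ c1
        uint8_t y = uint8_t(a) ^ uint8_t(~c);  // a ^ ~c1
        assert(uint8_t(x & y) == 0x00);        // (a ^ c1) & (a ^ ~c1) -> 0
        assert(uint8_t(x | y) == 0xff);        // (a ^ c1) | (a ^ ~c1) -> -1
      }
    return 0;
  }
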
--- llvm/lib/Analysis/InstructionSimplify.cpp | 12 ++++++ .../Transforms/InstCombine/and-xor-merge.ll | 39 +++++++++++++++++++ llvm/test/Transforms/InstCombine/or-xor.ll | 39 +++++++++++++++++++ 3 files changed, 90 insertions(+) diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 78a8334763340..241bdd81b75a9 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -2204,6 +2204,13 @@ static Value *simplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, match(Op1, m_c_Xor(m_Specific(Or), m_Specific(Y)))) return Constant::getNullValue(Op0->getType()); + const APInt *C1; + Value *A; + // (A ^ C) & (A ^ ~C) -> 0 + if (match(Op0, m_Xor(m_Value(A), m_APInt(C1))) && + match(Op1, m_Xor(m_Specific(A), m_SpecificInt(~*C1)))) + return Constant::getNullValue(Op0->getType()); + if (Op0->getType()->isIntOrIntVectorTy(1)) { if (std::optional Implied = isImpliedCondition(Op0, Op1, Q.DL)) { // If Op0 is true implies Op1 is true, then Op0 is a subset of Op1. @@ -2473,6 +2480,11 @@ static Value *simplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, if (Value *V = threadBinOpOverPHI(Instruction::Or, Op0, Op1, Q, MaxRecurse)) return V; + // (A ^ C) | (A ^ ~C) -> -1, i.e. all bits set to one. + if (match(Op0, m_Xor(m_Value(A), m_APInt(C1))) && + match(Op1, m_Xor(m_Specific(A), m_SpecificInt(~*C1)))) + return Constant::getAllOnesValue(Op0->getType()); + if (Op0->getType()->isIntOrIntVectorTy(1)) { if (std::optional Implied = isImpliedCondition(Op0, Op1, Q.DL, false)) { diff --git a/llvm/test/Transforms/InstCombine/and-xor-merge.ll b/llvm/test/Transforms/InstCombine/and-xor-merge.ll index 543336468ff0a..e6df4e32bae36 100644 --- a/llvm/test/Transforms/InstCombine/and-xor-merge.ll +++ b/llvm/test/Transforms/InstCombine/and-xor-merge.ll @@ -40,3 +40,42 @@ define i32 @PR38781(i32 %a, i32 %b) { %and = and i32 %b.lobit.not, %a.lobit.not ret i32 %and } + +; (a ^ 4) & (a ^ ~4) -> 0 +define i32 @PR75692_1(i32 %x) { +; CHECK-LABEL: @PR75692_1 +; CHECK-NEXT: ret i32 0 +; + %t2 = xor i32 %x, 4 + %t3 = xor i32 %x, -5 + %t4 = and i32 %t2, %t3 + ret i32 %t4 +} + +; (a ^ 4) & (a ^ 3) is not zero +define i32 @PR75692_2(i32 %x) { +; CHECK-LABEL: @PR75692_2 +; CHECK-NEXT: %t2 = xor i32 %x, 4 +; CHECK-NEXT: %t3 = xor i32 %x, -4 +; CHECK-NEXT: %t4 = and i32 %t2, %t3 +; CHECK-NEXT: ret i32 %t4 +; + %t2 = xor i32 %x, 4 + %t3 = xor i32 %x, -4 + %t4 = and i32 %t2, %t3 + ret i32 %t4 +} + +; (a ^ 4) & (b ^ ~4) is not zero, since a != b is possible +define i32 @PR75692_3(i32 %x, i32 %y) { +; CHECK-LABEL: @PR75692_3 +; CHECK-NEXT: %t2 = xor i32 %x, 4 +; CHECK-NEXT: %t3 = xor i32 %y, -5 +; CHECK-NEXT: %t4 = and i32 %t2, %t3 +; CHECK-NEXT: ret i32 %t4 +; + %t2 = xor i32 %x, 4 + %t3 = xor i32 %y, -5 + %t4 = and i32 %t2, %t3 + ret i32 %t4 +} diff --git a/llvm/test/Transforms/InstCombine/or-xor.ll b/llvm/test/Transforms/InstCombine/or-xor.ll index 1d35332d061bb..361aab6c21e27 100644 --- a/llvm/test/Transforms/InstCombine/or-xor.ll +++ b/llvm/test/Transforms/InstCombine/or-xor.ll @@ -1055,3 +1055,42 @@ define i8 @or_nand_xor_common_op_commute3_use3(i8 %x, i8 %y, i8 %z) { %r = or i8 %xor, %nand ret i8 %r } + +; (a ^ 4) & (a ^ ~4) -> -1 +define i32 @PR75692_1(i32 %x) { +; CHECK-LABEL: @PR75692_1( +; CHECK-NEXT: ret i32 -1 +; + %t2 = xor i32 %x, 4 + %t3 = xor i32 %x, -5 + %t4 = or i32 %t2, %t3 + ret i32 %t4 +} + +; (a ^ 4) & (a ^ 3) is not -1 +define i32 @PR75692_2(i32 %x) { +; CHECK-LABEL: @PR75692_2 +; CHECK-NEXT: %t2 = xor i32 %x, 
4 +; CHECK-NEXT: %t3 = xor i32 %x, -4 +; CHECK-NEXT: %t4 = or i32 %t2, %t3 +; CHECK-NEXT: ret i32 %t4 +; + %t2 = xor i32 %x, 4 + %t3 = xor i32 %x, -4 + %t4 = or i32 %t2, %t3 + ret i32 %t4 +} + +; (a ^ 4) & (b ^ ~4) is not -1, since a != b is possible +define i32 @PR75692_3(i32 %x, i32 %y) { +; CHECK-LABEL: @PR75692_3 +; CHECK-NEXT: %t2 = xor i32 %x, 4 +; CHECK-NEXT: %t3 = xor i32 %y, -5 +; CHECK-NEXT: %t4 = or i32 %t2, %t3 +; CHECK-NEXT: ret i32 %t4 +; + %t2 = xor i32 %x, 4 + %t3 = xor i32 %y, -5 + %t4 = or i32 %t2, %t3 + ret i32 %t4 +} From 58a335a3f12d55f48b86c03a33f414f593eb38d0 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 2 Jan 2024 17:44:59 +0000 Subject: [PATCH 120/313] [X86] Fold concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x)) Handle a common subvector shuffle pattern in combineConcatVectorOps --- llvm/lib/Target/X86/X86ISelLowering.cpp | 8 +++ .../vector-interleaved-store-i8-stride-7.ll | 16 ++--- .../zero_extend_vector_inreg_of_broadcast.ll | 64 +++++++++---------- 3 files changed, 45 insertions(+), 43 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 1e4b1361f98a6..e643e54496f70 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -54572,6 +54572,14 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, Op0.getValueType() == cast(SrcVec)->getMemoryVT()) return Op0.getOperand(0); } + + // concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x)) + if (Op0.getOpcode() == X86ISD::VPERMI && Subtarget.useAVX512Regs() && + !X86::mayFoldLoad(Op0.getOperand(0), Subtarget)) + return DAG.getNode(Op0.getOpcode(), DL, VT, + DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, + Op0.getOperand(0), Op0.getOperand(0)), + Op0.getOperand(1)); } // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128. 
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll index d6cd02709e6a0..9f292dc64b532 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -725,19 +725,17 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zmm1[18],zero,zero,zero,zero,zero,zero,zmm1[19],zero,zero,zero,zero,zmm1[36,44],zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46,54],zero,zero,zero,zero,zero,zero,zmm1[55],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm0[0,2,1,3,4,6,5,7] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zmm1[18],zero,zero,zero,zero,zero,zero,zmm1[19],zero,zero,zero,zero,zmm1[36,44],zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46,54],zero,zero,zero,zero,zero,zero,zmm1[55],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,0,2,5,7,4,6] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-SLOW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm1 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46],zero,zero,zero,zmm2[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-SLOW-NEXT: vporq %zmm1, %zmm2, %zmm1 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm2 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; 
AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-SLOW-NEXT: vporq %zmm2, %zmm1, %zmm1 ; AVX512BW-SLOW-NEXT: movabsq $63546854584629360, %rcx # imm = 0xE1C3870E1C3870 ; AVX512BW-SLOW-NEXT: kmovq %rcx, %k1 ; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll index 86e878261dcce..a7b1fa24ce17d 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll @@ -5684,10 +5684,9 @@ define void @vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32(ptr %in ; ; AVX512BW-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -5797,10 +5796,9 @@ define void @vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16(ptr %in ; ; AVX512BW-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zero,zero,zmm0[0],zero,zero,zero,zmm0[0],zero,zero,zero,zmm0[0],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[48],zero,zero,zero,zmm0[48],zero,zero,zero,zmm0[48],zero,zero,zero,zmm0[48],zero,zero,zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -5910,10 +5908,9 @@ define void @vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8(ptr %in.v ; ; AVX512BW-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, 
%zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zero,zmm0[16],zero,zero,zero,zero,zero,zero,zero,zmm0[16],zero,zero,zero,zero,zero,zero,zero,zmm0[32],zero,zero,zero,zero,zero,zero,zero,zmm0[32],zero,zero,zero,zero,zero,zero,zero,zmm0[48],zero,zero,zero,zero,zero,zero,zero,zmm0[48],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -6004,10 +6001,9 @@ define void @vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4(ptr %i ; ; AVX512BW-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -6211,12 +6207,12 @@ define void @vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16(ptr %i ; ; AVX512BW-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,1],zero,zero,zmm0[0,1],zero,zero,zmm0[0,1],zero,zero,zmm0[0,1],zero,zero,zmm0[16,17],zero,zero,zmm0[16,17],zero,zero,zmm0[16,17],zero,zero,zmm0[16,17],zero,zero,zmm0[32,33],zero,zero,zmm0[32,33],zero,zero,zmm0[32,33],zero,zero,zmm0[32,33],zero,zero,zmm0[48,49],zero,zero,zmm0[48,49],zero,zero,zmm0[48,49],zero,zero,zmm0[48,49],zero,zero -; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,33,0,35,0,37,0,39,0,41,0,43,0,45,0,47,0,49,0,51,0,53,0,55,0,57,0,59,0,61,0,63] +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -6330,12 +6326,12 @@ define void @vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8(ptr %in. 
; ; AVX512BW-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,1],zero,zero,zero,zero,zero,zero,zmm0[0,1],zero,zero,zero,zero,zero,zero,zmm0[16,17],zero,zero,zero,zero,zero,zero,zmm0[16,17],zero,zero,zero,zero,zero,zero,zmm0[32,33],zero,zero,zero,zero,zero,zero,zmm0[32,33],zero,zero,zero,zero,zero,zero,zmm0[48,49],zero,zero,zero,zero,zero,zero,zmm0[48,49],zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,33,34,35,0,37,38,39,0,41,42,43,0,45,46,47,0,49,50,51,0,53,54,55,0,57,58,59,0,61,62,63] +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -6449,12 +6445,12 @@ define void @vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4(ptr %i ; ; AVX512BW-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,33,34,35,36,37,38,39,0,41,42,43,44,45,46,47,0,49,50,51,52,53,54,55,0,57,58,59,60,61,62,63] +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq From 39be138cb771f36a13ccbf96b55ae025ba821841 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 3 Jan 2024 11:08:59 +0000 Subject: [PATCH 121/313] [X86] combineTargetShuffle - fold SHUF128(CONCAT(),CONCAT()) to peek through upper subvectors If SHUF128 is accessing only the upper half of a vector source that is a concatenation/insert_subvector then try to access the subvector directly and adjust the element mask accordingly. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 28 + .../vector-interleaved-load-i16-stride-4.ll | 575 +++++++-------- .../vector-interleaved-load-i8-stride-4.ll | 14 +- .../vector-interleaved-store-i16-stride-7.ll | 671 +++++++++--------- .../vector-interleaved-store-i8-stride-7.ll | 144 ++-- 5 files changed, 713 insertions(+), 719 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index e643e54496f70..3ca1c808fff7a 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -40221,6 +40221,34 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, } return SDValue(); } + case X86ISD::SHUF128: { + // If we're permuting the upper 256-bits subvectors of a concatenation, then + // see if we can peek through and access the subvector directly. 
+ if (VT.is512BitVector()) { + // 512-bit mask uses 4 x i2 indices - if the msb is always set then only the + // upper subvector is used. + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + uint64_t Mask = N->getConstantOperandVal(2); + SmallVector<SDValue> LHSOps, RHSOps; + SDValue NewLHS, NewRHS; + if ((Mask & 0x0A) == 0x0A && + collectConcatOps(LHS.getNode(), LHSOps, DAG) && LHSOps.size() == 2) { + NewLHS = widenSubVector(LHSOps[1], false, Subtarget, DAG, DL, 512); + Mask &= ~0x0A; + } + if ((Mask & 0xA0) == 0xA0 && + collectConcatOps(RHS.getNode(), RHSOps, DAG) && RHSOps.size() == 2) { + NewRHS = widenSubVector(RHSOps[1], false, Subtarget, DAG, DL, 512); + Mask &= ~0xA0; + } + if (NewLHS || NewRHS) + return DAG.getNode(X86ISD::SHUF128, DL, VT, NewLHS ? NewLHS : LHS, + NewRHS ? NewRHS : RHS, + DAG.getTargetConstant(Mask, DL, MVT::i8)); + } + return SDValue(); + } case X86ISD::VPERM2X128: { // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)). SDValue LHS = N->getOperand(0); diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll index 22e353f2502d6..44f2d58cb4e96 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll @@ -2258,7 +2258,6 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpmovqw %zmm1, %xmm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX512F-SLOW-NEXT: vpmovqw %ymm3, %xmm3 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm10 @@ -2273,7 +2272,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] ; AVX512F-SLOW-NEXT: vpmovqw %zmm0, %xmm11 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm10[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm10[0,1,2,3],zmm2[0,1,2,3] ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm10 ; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm11 ; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm13 @@ -2292,7 +2291,6 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm1, %zmm9 ; AVX512F-SLOW-NEXT: vpmovqw %zmm9, %xmm9 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm12[0,1,1,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] @@ -2307,7 +2305,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm0, %zmm9 ; AVX512F-SLOW-NEXT: vpmovqw %zmm9, %xmm9 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[0,1,2,3] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,1,2,0,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufd 
{{.*#+}} xmm7 = xmm7[3,1,2,3] @@ -2324,7 +2322,6 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm1, %zmm13 ; AVX512F-SLOW-NEXT: vpmovqw %zmm13, %xmm13 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[0,1,2,0,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] @@ -2341,7 +2338,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm0, %zmm14 ; AVX512F-SLOW-NEXT: vpmovqw %zmm14, %xmm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm13[0,1,2,3],zmm12[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm13[0,1,2,3],zmm12[0,1,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] @@ -2354,7 +2351,6 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm1, %zmm1 ; AVX512F-SLOW-NEXT: vpmovqw %zmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] @@ -2367,7 +2363,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,2,3] ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, (%rcx) @@ -2393,7 +2389,6 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpermt2d %ymm7, %ymm11, %ymm10 ; AVX512F-FAST-NEXT: vpmovqw %zmm1, %xmm7 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm10 ; AVX512F-FAST-NEXT: vpermd %ymm10, %ymm4, %ymm12 ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm13 @@ -2403,7 +2398,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpermt2d %ymm13, %ymm11, %ymm4 ; AVX512F-FAST-NEXT: vpmovqw %zmm0, %xmm13 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm4[0,1,2,3],zmm7[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm4[0,1,2,3],zmm7[0,1,2,3] ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] ; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm13 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> @@ -2412,14 +2407,13 @@ 
define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpsrlq $16, %zmm1, %zmm13 ; AVX512F-FAST-NEXT: vpmovqw %zmm13, %xmm13 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 ; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm12, %ymm12 ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm13 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] ; AVX512F-FAST-NEXT: vpsrlq $16, %zmm0, %zmm13 ; AVX512F-FAST-NEXT: vpmovqw %zmm13, %xmm13 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm12[0,1,2,3],zmm9[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm12[0,1,2,3],zmm9[0,1,2,3] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [1,3,2,3,1,3,5,7] ; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm12, %ymm6 ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm13 @@ -2429,7 +2423,6 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpsrlq $32, %zmm1, %zmm13 ; AVX512F-FAST-NEXT: vpmovqw %zmm13, %xmm13 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm13 ; AVX512F-FAST-NEXT: vpermd %ymm10, %ymm12, %ymm10 ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm2 ; AVX512F-FAST-NEXT: vpermd %ymm14, %ymm12, %ymm12 @@ -2438,21 +2431,20 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpsrlq $32, %zmm0, %zmm2 ; AVX512F-FAST-NEXT: vpmovqw %zmm2, %xmm2 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm13[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm13[0,1,2,3] ; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm3 ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm6 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-FAST-NEXT: vpsrlq $48, %zmm1, %zmm1 ; AVX512F-FAST-NEXT: vpmovqw %zmm1, %xmm1 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 ; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm3 ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm4 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-FAST-NEXT: vpsrlq $48, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,2,3] ; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, (%rcx) @@ -4909,285 +4901,276 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512F-SLOW-LABEL: load_i16_stride4_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $104, %rsp -; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: subq $200, %rsp +; AVX512F-SLOW-NEXT: vmovdqa64 256(%rdi), 
%zmm26 +; AVX512F-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm27 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm28 +; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm29 +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vpmovqw %ymm0, %xmm0 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm14 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm13 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,0,2,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 112(%rdi), %xmm20 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm20[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 496(%rdi), %xmm19 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm19[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 480(%rdi), %xmm21 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm21[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 368(%rdi), %xmm17 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm17[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 352(%rdi), %xmm24 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm24[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm18 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] -; AVX512F-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm25 -; AVX512F-SLOW-NEXT: vmovdqa64 336(%rdi), %xmm29 -; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 464(%rdi), %xmm16 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm6 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: 
vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm27 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm16[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpmovqw %zmm29, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpmovqw %ymm1, %xmm1 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm12 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm11 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpmovqw %zmm28, %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vpmovqw %ymm0, %xmm0 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa64 496(%rdi), %xmm24 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm24[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 480(%rdi), %xmm23 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm23[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpmovqw %zmm27, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpmovqw %ymm1, %xmm1 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 368(%rdi), %xmm31 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm31[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 352(%rdi), %xmm25 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm25[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = 
xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpmovqw %zmm26, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[0,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm30 +; AVX512F-SLOW-NEXT: vmovdqa64 336(%rdi), %xmm17 +; AVX512F-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm18 +; AVX512F-SLOW-NEXT: vmovdqa64 464(%rdi), %xmm19 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm20 +; AVX512F-SLOW-NEXT: vmovdqa64 80(%rdi), %xmm21 +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm29, %zmm6 +; AVX512F-SLOW-NEXT: vpmovqw %zmm6, %xmm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm21[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm20[0,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm22 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm26 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm29[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm25[0,2,2,3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm28, %zmm6 +; AVX512F-SLOW-NEXT: vpmovqw %zmm6, %xmm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512F-SLOW-NEXT: vinserti128 
$1, %xmm2, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm19[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm18[0,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm30 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm28 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm31 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm20[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm23 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm20 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm19[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm21[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm27, %zmm3 +; AVX512F-SLOW-NEXT: vpmovqw %zmm3, %xmm3 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm17[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm30[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm26, %zmm4 +; AVX512F-SLOW-NEXT: vpmovqw %zmm4, %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, 
%xmm22 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm21 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm16[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,1,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm19 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm17[3,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm24[3,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm17 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm29[3,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm25[3,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm16 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vpmovqw %ymm0, %xmm0 +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512F-SLOW-NEXT: vpmovqw %zmm0, %xmm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX512F-SLOW-NEXT: vpmovqw %ymm5, %xmm5 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm29 -; AVX512F-SLOW-NEXT: vpmovqw %zmm29, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %ymm4 -; AVX512F-SLOW-NEXT: vpmovqw %ymm4, %xmm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm25 -; AVX512F-SLOW-NEXT: vpmovqw 
%zmm25, %xmm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %ymm5 -; AVX512F-SLOW-NEXT: vpmovqw %ymm5, %xmm5 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512F-SLOW-NEXT: vpmovqw %zmm18, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm5[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm29, %zmm8 -; AVX512F-SLOW-NEXT: vpmovqw %zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm5[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm25, %zmm5 -; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm26, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm30, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm18, %zmm8 -; AVX512F-SLOW-NEXT: vpmovqw %zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm5[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm29, %zmm1 +; AVX512F-SLOW-NEXT: vpmovqw %zmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,1,2,0,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm0, %zmm8 -; AVX512F-SLOW-NEXT: vpmovqw 
%zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm29, %zmm8 -; AVX512F-SLOW-NEXT: vpmovqw %zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm25, %zmm5 -; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm18, %zmm9 -; AVX512F-SLOW-NEXT: vpmovqw %zmm9, %xmm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm8[0,1,2,3],zmm3[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm12[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm29, %zmm5 -; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; 
AVX512F-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm5 = mem[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm25, %zmm5 -; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm21[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm21 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm20[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm16 +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm28, %zmm8 +; AVX512F-SLOW-NEXT: vpmovqw %zmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm1[0,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm24[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm23[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm20 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm19[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm18[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm18 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm31[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm25[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm19 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm17[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm30[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm9[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm14 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm14[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm27, %zmm14 +; AVX512F-SLOW-NEXT: vpmovqw %zmm14, %xmm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = 
ymm14[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm14 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm26, %zmm14 +; AVX512F-SLOW-NEXT: vpmovqw %zmm14, %xmm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm0[0,1,2,3],zmm5[0,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm0 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm13[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm29, %zmm3 +; AVX512F-SLOW-NEXT: vpmovqw %zmm3, %xmm3 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm3 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm5 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm28, %zmm3 +; AVX512F-SLOW-NEXT: vpmovqw %zmm3, %xmm3 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[0,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm18, %zmm5 -; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, 64(%rsi) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte 
Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm4, (%rsi) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, 64(%rdx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, (%rdx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rcx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, (%rcx) +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm27, %zmm2 +; AVX512F-SLOW-NEXT: vpmovqw %zmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm26, %zmm3 +; AVX512F-SLOW-NEXT: vpmovqw %zmm3, %xmm3 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[0,1,2,3] +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, 64(%rsi) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%rsi) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, 64(%rdx) +; AVX512F-SLOW-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%rdx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rcx) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%rcx) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r8) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512F-SLOW-NEXT: addq $104, %rsp +; AVX512F-SLOW-NEXT: addq $200, %rsp ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; @@ -5211,7 +5194,6 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpermt2d %ymm0, %ymm7, %ymm3 ; AVX512F-FAST-NEXT: vpmovqw %zmm4, %xmm0 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqa64 96(%rdi), %ymm27 ; AVX512F-FAST-NEXT: vpermd %ymm27, %ymm1, %ymm3 ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm9 @@ -5221,7 +5203,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpermt2d %ymm9, %ymm7, %ymm12 ; AVX512F-FAST-NEXT: vpmovqw %zmm30, %xmm9 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm9[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm9[0,1,2,3],zmm0[0,1,2,3] ; AVX512F-FAST-NEXT: vmovdqa64 480(%rdi), %ymm16 ; AVX512F-FAST-NEXT: vpermd %ymm16, %ymm1, %ymm0 ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm9 @@ -5231,7 +5213,6 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpermt2d %ymm9, %ymm7, %ymm13 ; AVX512F-FAST-NEXT: vpmovqw %zmm26, %xmm9 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 ; AVX512F-FAST-NEXT: vmovdqa64 352(%rdi), %ymm18 ; AVX512F-FAST-NEXT: vpermd %ymm18, %ymm1, %ymm13 ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm13, %ymm14 @@ -5241,7 
+5222,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpermt2d %ymm14, %ymm7, %ymm15 ; AVX512F-FAST-NEXT: vpmovqw %zmm23, %xmm14 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm14[0,1,2,3],zmm9[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm14[0,1,2,3],zmm9[0,1,2,3] ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] ; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm14 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> @@ -5250,28 +5231,26 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpsrlq $16, %zmm4, %zmm14 ; AVX512F-FAST-NEXT: vpmovqw %zmm14, %xmm14 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11 ; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 ; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm8 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-FAST-NEXT: vpsrlq $16, %zmm30, %zmm8 ; AVX512F-FAST-NEXT: vpmovqw %zmm8, %xmm8 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm3[0,1,2,3],zmm11[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm3[0,1,2,3],zmm11[0,1,2,3] ; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm12, %ymm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-FAST-NEXT: vpsrlq $16, %zmm26, %zmm3 ; AVX512F-FAST-NEXT: vpmovqw %zmm3, %xmm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm3 ; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-FAST-NEXT: vpsrlq $16, %zmm23, %zmm3 ; AVX512F-FAST-NEXT: vpmovqw %zmm3, %xmm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm1[0,1,2,3],zmm0[0,1,2,3] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [1,3,2,3,1,3,5,7] ; AVX512F-FAST-NEXT: vpermd %ymm24, %ymm15, %ymm3 ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm0 @@ -5280,8 +5259,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpermt2d %ymm0, %ymm7, %ymm1 ; AVX512F-FAST-NEXT: vpsrlq $32, %zmm4, %zmm0 ; AVX512F-FAST-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-FAST-NEXT: vpermd %ymm27, %ymm15, %ymm0 ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm13 ; AVX512F-FAST-NEXT: vpermd %ymm28, %ymm15, %ymm12 @@ -5290,7 +5268,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpsrlq $32, %zmm30, %zmm13 ; AVX512F-FAST-NEXT: vpmovqw %zmm13, %xmm13 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm24 
= zmm13[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm13[0,1,2,3],zmm1[0,1,2,3] ; AVX512F-FAST-NEXT: vpermd %ymm16, %ymm15, %ymm13 ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm13, %ymm1 ; AVX512F-FAST-NEXT: vpermd %ymm17, %ymm15, %ymm14 @@ -5298,8 +5276,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpermt2d %ymm1, %ymm7, %ymm11 ; AVX512F-FAST-NEXT: vpsrlq $32, %zmm26, %zmm1 ; AVX512F-FAST-NEXT: vpmovqw %zmm1, %xmm1 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm11 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm11[4,5,6,7] ; AVX512F-FAST-NEXT: vpermd %ymm18, %ymm15, %ymm1 ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm2 ; AVX512F-FAST-NEXT: vpermd %ymm20, %ymm15, %ymm5 @@ -5308,35 +5285,33 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpsrlq $32, %zmm23, %zmm2 ; AVX512F-FAST-NEXT: vpmovqw %zmm2, %xmm2 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm11[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm11[0,1,2,3] ; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 ; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm6 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-FAST-NEXT: vpsrlq $48, %zmm4, %zmm4 ; AVX512F-FAST-NEXT: vpmovqw %zmm4, %xmm4 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 ; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm12, %ymm4 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-FAST-NEXT: vpsrlq $48, %zmm30, %zmm4 ; AVX512F-FAST-NEXT: vpmovqw %zmm4, %xmm4 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3] ; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm3 ; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm14, %ymm4 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-FAST-NEXT: vpsrlq $48, %zmm26, %zmm4 ; AVX512F-FAST-NEXT: vpmovqw %zmm4, %xmm4 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 ; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm4 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vpsrlq $48, %zmm23, %zmm4 ; AVX512F-FAST-NEXT: vpmovqw %zmm4, %xmm4 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm3[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm3[0,1,2,3] ; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, 64(%rsi) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, (%rsi) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, 64(%rdx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll index e2195f1fc25a1..e2311246e711f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll @@ -1943,8 +1943,7 @@ define void 
@load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,0,4,0,4,8,12] ; AVX512F-NEXT: vpermt2d %ymm5, %ymm1, %ymm6 ; AVX512F-NEXT: vpmovdb %zmm2, %xmm5 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm8 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-NEXT: vmovdqa 96(%rdi), %ymm5 ; AVX512F-NEXT: vpshufb %ymm7, %ymm5, %ymm9 ; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm6 @@ -1952,7 +1951,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vpermt2d %ymm9, %ymm1, %ymm7 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm9 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm8[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm8[0,1,2,3] ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512F-NEXT: vpshufb %ymm8, %ymm3, %ymm9 ; AVX512F-NEXT: vpshufb %ymm8, %ymm4, %ymm10 @@ -1960,14 +1959,13 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vpsrld $8, %zmm2, %zmm9 ; AVX512F-NEXT: vpmovdb %zmm9, %xmm9 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 ; AVX512F-NEXT: vpshufb %ymm8, %ymm5, %ymm10 ; AVX512F-NEXT: vpshufb %ymm8, %ymm6, %ymm8 ; AVX512F-NEXT: vpermt2d %ymm10, %ymm1, %ymm8 ; AVX512F-NEXT: vpsrld $8, %zmm0, %zmm10 ; AVX512F-NEXT: vpmovdb %zmm10, %xmm10 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm9[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm9[0,1,2,3] ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX512F-NEXT: vpshufb %ymm9, %ymm3, %ymm10 ; AVX512F-NEXT: vpshufb %ymm9, %ymm4, %ymm11 @@ -1975,14 +1973,13 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vpsrld $16, %zmm2, %zmm10 ; AVX512F-NEXT: vpmovdb %zmm10, %xmm10 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 ; AVX512F-NEXT: vpshufb %ymm9, %ymm5, %ymm11 ; AVX512F-NEXT: vpshufb %ymm9, %ymm6, %ymm9 ; AVX512F-NEXT: vpermt2d %ymm11, %ymm1, %ymm9 ; AVX512F-NEXT: vpsrld $16, %zmm0, %zmm11 ; AVX512F-NEXT: vpmovdb %zmm11, %xmm11 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm10[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm10[0,1,2,3] ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX512F-NEXT: vpshufb %ymm10, %ymm3, %ymm3 ; AVX512F-NEXT: vpshufb %ymm10, %ymm4, %ymm4 @@ -1990,14 +1987,13 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vpsrld $24, %zmm2, %zmm2 ; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 ; AVX512F-NEXT: vpshufb %ymm10, %ymm5, %ymm3 ; AVX512F-NEXT: vpshufb %ymm10, %ymm6, %ymm4 ; AVX512F-NEXT: vpermt2d %ymm3, %ymm1, %ymm4 ; AVX512F-NEXT: vpsrld $24, %zmm0, %zmm0 ; AVX512F-NEXT: 
vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm8, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm9, (%rcx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll index d253dd117b109..f3ec4420e4dda 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll @@ -5100,7 +5100,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7] @@ -5139,8 +5139,8 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm0 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm15 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm15[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3],xmm0[4],xmm11[5,6],xmm0[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm15[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1],xmm10[2,3],xmm0[4],xmm10[5,6],xmm0[7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm26 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] @@ -5165,13 +5165,13 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm6 ; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm3, %zmm3 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm28, %ymm7 -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm6, %ymm11 +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm6, %ymm10 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] ; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm11, %zmm6 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm10, %zmm6 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm10 ; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm13 ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm14 @@ -5195,10 +5195,10 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,1,2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} 
xmm15 = xmm15[0,1,2,3,4,5,5,4] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm30[2,2,2,3] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm23[0,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm19[2,1,3,3] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm18[0,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] @@ -5217,21 +5217,21 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6,7,8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14,15] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm12 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm12, %zmm10, %zmm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm11, %zmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm12, %zmm11, %zmm7 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm30, %zmm0 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm31, %zmm1 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm10, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm11, %zmm1 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm0 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm9 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm10, %zmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm11, %zmm9 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm11[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm11 = mem[2,1,3,2] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm10[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm10 = mem[2,1,3,2] ; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm12 = mem[2,2,2,3] ; AVX512F-SLOW-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload @@ -5258,7 +5258,7 @@ define void @store_i16_stride7_vf32(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm0 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm9 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm9 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm9 ; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm3 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm15[2,1,3,2] @@ -5269,7 +5269,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm7 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm17, %zmm1 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm18, %zmm8 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm10, %zmm8 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm11, %zmm8 ; AVX512F-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm1 ; AVX512F-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm9 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 @@ -5281,9 +5281,8 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm9, %zmm9 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm9 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm8 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm8[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm29[0,1,2,3] ; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm6 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm6 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm8 @@ -10685,14 +10684,14 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-SLOW-LABEL: store_i16_stride7_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $2200, %rsp # imm = 0x898 +; AVX512F-SLOW-NEXT: subq $2168, %rsp # imm = 0x878 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %ymm2 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %ymm9 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm7 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %ymm8 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] ; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm19 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm18 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = ; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm9, %ymm2 ; AVX512F-SLOW-NEXT: vporq %ymm1, %ymm2, %ymm16 @@ -10705,16 +10704,16 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %ymm2 ; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm20 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 ; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %ymm3 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = ; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm3, %ymm2 -; 
AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm24 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm20 ; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %ymm2 ; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm18 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm23 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %ymm6 ; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm6, %ymm2 ; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 @@ -10744,10 +10743,10 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm2 ; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm23 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm28 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm10 ; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm10, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm10, %ymm25 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm10, %ymm22 ; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm2 @@ -10780,7 +10779,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] ; AVX512F-SLOW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, %ymm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, %ymm14 ; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm14, %ymm10 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm12, %ymm19 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm9[2,2,2,2,6,6,6,6] @@ -10814,54 +10813,53 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm16, %zmm8 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm17, %zmm7 ; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %ymm12 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %ymm10 +; AVX512F-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[16,17,u,u,u,u],zero,zero +; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm26, %ymm7, %ymm8 +; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %ymm12 ; AVX512F-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm12[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[16,17,u,u,u,u],zero,zero -; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm17, %ymm7, %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %ymm14 -; AVX512F-SLOW-NEXT: vmovdqu %ymm14, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm14, %ymm9 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm10, %ymm8, %ymm9 +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm12, %ymm9 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm14, %ymm8, %ymm9 ; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm7, %ymm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm10[0,0,2,1,4,4,6,5] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] ; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm7, %ymm10, %ymm8 -; AVX512F-SLOW-NEXT: vprold $16, %ymm14, %ymm7 +; AVX512F-SLOW-NEXT: vprold $16, %ymm12, %ymm7 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm8, %ymm10, %ymm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, %zmm16 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm8, %ymm14, %ymm7 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm9[0,1,2,3],zmm7[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7] -; AVX512F-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7] +; AVX512F-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512F-SLOW-NEXT: vmovdqa 96(%rax), %ymm7 -; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm8, %zmm9 +; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm18, %zmm9 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm7[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] ; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm7, %ymm7 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX512F-SLOW-NEXT: vpandnq %ymm9, %ymm17, %ymm9 +; AVX512F-SLOW-NEXT: vpandnq %ymm9, %ymm26, %ymm9 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = 
[65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] ; AVX512F-SLOW-NEXT: vpbroadcastd 72(%rax), %ymm7 -; AVX512F-SLOW-NEXT: vpandn %ymm7, %ymm14, %ymm9 +; AVX512F-SLOW-NEXT: vpandnq %ymm7, %ymm16, %ymm9 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rax), %ymm7 ; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm7, %ymm12 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm9, %zmm9 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm9 -; AVX512F-SLOW-NEXT: vpandn %ymm9, %ymm14, %ymm9 -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm14 -; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm14, %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm14, %ymm22 +; AVX512F-SLOW-NEXT: vpandnq %ymm9, %ymm16, %ymm9 +; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm8 +; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm8, %ymm12 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm24 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm9, %zmm9 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] @@ -10878,35 +10876,35 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm9, %ymm10 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm9[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] -; AVX512F-SLOW-NEXT: vpandnq %ymm12, %ymm17, %ymm12 +; AVX512F-SLOW-NEXT: vpandnq %ymm12, %ymm26, %ymm12 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[0,0,2,1,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7,8,9,10],ymm10[11],ymm12[12,13],ymm10[14],ymm12[15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7,8,9,10],ymm10[11],ymm12[12,13],ymm10[14],ymm12[15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, %ymm13 ; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm10 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6,7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm31 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[2,2,2,2,6,6,6,6] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm31 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[3,3,3,3,7,7,7,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm8, %zmm0 +; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm18, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] @@ -10918,31 +10916,31 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,2,2,2,6,6,6,6] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm30 ; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm3 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,2,2,2,6,6,6,6] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm8 -; AVX512F-SLOW-NEXT: vprold $16, %ymm20, %ymm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm24[1,2,2,3,5,6,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, %ymm8 +; AVX512F-SLOW-NEXT: vprold $16, %ymm21, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[1,2,2,3,5,6,6,7] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = 
ymm24[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[0,0,2,1,4,4,6,5] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,2,3,3,10,9,11,10] ; AVX512F-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] ; AVX512F-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512F-SLOW-NEXT: vpermd 64(%rax), %zmm11, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm1, %zmm17, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm1, %zmm26, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[3,3,3,3,7,7,7,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] @@ -10954,61 +10952,61 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm24[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm20[3,3,3,3,7,7,7,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19] ; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %xmm1 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm12, %zmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm12, %zmm3 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[2,3,3,3,6,7,7,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2] ; AVX512F-SLOW-NEXT: vpbroadcastd 96(%rax), %ymm5 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm2, %zmm5, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm3, %zmm5, %zmm4 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vprold $16, %xmm2, %xmm5 +; AVX512F-SLOW-NEXT: vprold $16, %xmm3, %xmm5 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,3] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = 
xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7] ; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm18 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm23 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %xmm4 ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm5 ; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm7 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7] ; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5,7,6] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> -; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm28, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm29, %zmm1 ; AVX512F-SLOW-NEXT: vpbroadcastd 100(%rax), %ymm2 ; AVX512F-SLOW-NEXT: vpbroadcastd 104(%rax), %ymm3 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm1, %zmm16, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm1, %zmm14, %zmm2 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm30 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm27 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: 
vpshufb %xmm7, %xmm1, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm7, %xmm8 @@ -11023,37 +11021,37 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm26 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm25 ; AVX512F-SLOW-NEXT: vprold $16, %xmm4, %xmm4 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4],xmm4[5],xmm2[6,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %xmm4 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm16 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5,7,6] +; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %xmm2 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm17 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,7,6] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm6, %zmm1 -; AVX512F-SLOW-NEXT: vpbroadcastd 64(%rax), %ymm4 +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm6, %zmm1 +; AVX512F-SLOW-NEXT: vpbroadcastd 64(%rax), %ymm2 ; AVX512F-SLOW-NEXT: vpbroadcastd 68(%rax), %ymm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm1, %zmm9, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm1, %zmm9, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = 
xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm19 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm20 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm29 +; AVX512F-SLOW-NEXT: vmovdqa %xmm8, %xmm14 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3],xmm1[4],xmm5[5,6],xmm1[7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[0,1,3,2,4,5,6,7] @@ -11063,150 +11061,153 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm7 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm19 ; AVX512F-SLOW-NEXT: vprold $16, %xmm5, %xmm5 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm3 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,7,6] +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm3 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,7,6] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm6, %zmm1 -; AVX512F-SLOW-NEXT: vpbroadcastd (%rax), %ymm2 -; AVX512F-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 +; AVX512F-SLOW-NEXT: vpbroadcastd (%rax), %ymm3 +; AVX512F-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm2 ; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm1, %zmm9, %zmm2 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm28, %ymm7 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufd 
{{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm25[1,1,1,1,5,5,5,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm20 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm22[1,1,1,1,5,5,5,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm22 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm25[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm27 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm9 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6,7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-SLOW-NEXT: vprold $16, %ymm2, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-SLOW-NEXT: vprold $16, %ymm3, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[1,2,2,3,5,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm6[3],ymm2[4,5],ymm6[6],ymm2[7,8,9,10],ymm6[11],ymm2[12,13],ymm6[14],ymm2[15] -; AVX512F-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[1,2,2,3,5,6,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4],ymm1[5],ymm4[6,7,8,9],ymm1[10],ymm4[11,12],ymm1[13],ymm4[14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7,8,9,10],ymm6[11],ymm4[12,13],ymm6[14],ymm4[15] +; AVX512F-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm4 ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7,8,9],ymm1[10],ymm6[11,12],ymm1[13],ymm6[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm24 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm16 ; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm8, %ymm1 ; 
AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm10[2,2,2,2,6,6,6,6] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7,8,9],ymm6[10],ymm1[11,12],ymm6[13],ymm1[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm18 ; AVX512F-SLOW-NEXT: vpermd (%rax), %zmm11, %zmm1 -; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm2, %zmm17, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm4, %zmm26, %zmm1 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6,7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6,7,8],ymm4[9],ymm1[10,11],ymm4[12],ymm1[13,14,15] ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm25[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm7[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7,8,9,10],ymm4[11],ymm1[12,13],ymm4[14],ymm1[15] ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4],ymm1[5],ymm4[6,7,8,9],ymm1[10],ymm4[11,12],ymm1[13],ymm4[14,15] ; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm6[0,1,3,2,4,5,6,7] ; AVX512F-SLOW-NEXT: vpermt2d %zmm11, %zmm12, %zmm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm22[2,3,3,3,6,7,7,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm24[2,3,3,3,6,7,7,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] ; AVX512F-SLOW-NEXT: vpbroadcastd 32(%rax), 
%ymm12 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm25 -; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm1, %zmm15, %zmm25 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm26 +; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm1, %zmm15, %zmm26 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm12 ; AVX512F-SLOW-NEXT: vprold $16, %xmm12, %xmm15 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[1,1,2,3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm13[0,1],xmm15[2],xmm13[3,4],xmm15[5],xmm13[6,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm13[0,1],xmm15[2],xmm13[3,4],xmm15[5],xmm13[6,7] +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm1[4],xmm12[5],xmm1[5],xmm12[6],xmm1[6],xmm12[7],xmm1[7] ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm1 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm1 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm23 = mem[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm22 = mem[0,2,2,3] -; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm18 = mem[0,2,2,3] +; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm24 = mem[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm23 = mem[0,2,2,3] +; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm7 = mem[0,2,2,3] ; AVX512F-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm17 = mem[2,1,3,3] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,7,6] -; AVX512F-SLOW-NEXT: vpermt2d %zmm0, %zmm28, %zmm2 +; AVX512F-SLOW-NEXT: vpermt2d %zmm0, %zmm29, %zmm4 ; AVX512F-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm0 ; AVX512F-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm6 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm28 -; AVX512F-SLOW-NEXT: vpternlogd 
$216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm28 +; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm28 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm6, %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm2[1],xmm7[2,3],xmm2[4],xmm7[5,6],xmm2[7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0],xmm4[1],xmm12[2,3],xmm4[4],xmm12[5,6],xmm4[7] +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm0[3,3,3,3] -; AVX512F-SLOW-NEXT: vpshufd $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm2 = mem[1,2,2,3,5,6,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm4[2,1,2,3,6,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm10[2,2,2,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm4 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[0,2,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm0 = mem[1,2,2,3,5,6,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm1[2,1,2,3,6,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm14[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm1 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[0,2,3,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm4 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[2,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm1 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,1,2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm4 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm4[0,2,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm1 +; 
AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm1[0,2,3,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm14[2,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm1 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[2,1,2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3] ; AVX512F-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload @@ -11215,176 +11216,174 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: # ymm5 = mem[2,2,2,3] ; AVX512F-SLOW-NEXT: vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm6 = mem[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm7 = mem[2,2,2,3] -; AVX512F-SLOW-NEXT: vmovups %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm1 = mem[2,2,2,3] +; AVX512F-SLOW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm1 = mem[0,2,2,3] +; AVX512F-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm31[0,2,2,3] -; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm30 = mem[0,2,2,3] -; AVX512F-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm26 = mem[2,1,3,3] -; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm19 = mem[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm27 = mem[2,1,3,3] +; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm20 = mem[2,2,2,3] ; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm11 = mem[0,2,2,3] -; AVX512F-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm16 = mem[2,1,3,2] -; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm7 = mem[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm27[0,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm24[2,1,3,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm21[2,2,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm23, %zmm22 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm18, %zmm23 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm22, %zmm27, %zmm23 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; 
AVX512F-SLOW-NEXT: vpternlogq $184, %ymm23, %ymm17, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm2, %ymm17, %ymm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm1[0,1,2,3],zmm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm2 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm3 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm30[2,1,3,2] +; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm1 = mem[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm22[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm21[0,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,1,3,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm18[2,2,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm24, %zmm23 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm7, %zmm24 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm23, %zmm30, %zmm24 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm24, %ymm7, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm0, %ymm18, %ymm14 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm2 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm3 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm10, %zmm22, %zmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = 
[65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm10, %zmm23, %zmm8 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] ; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm17, %zmm8 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm2 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm10 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm22, %zmm10 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm23, %zmm10 ; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm17, %zmm10 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm18 -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm18 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm23, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm1, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm21 +; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm21 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm24, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] ; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm2, %ymm3, %ymm29 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm21 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm22 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm0[0,1,2,3],zmm6[0,1,2,3] ; AVX512F-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm24 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; 
AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm0, %zmm1, %zmm23 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm14[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm29 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm2, %zmm0, %zmm29 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm18 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm19, %zmm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm16, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm3, %zmm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm20, %zmm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm14, %zmm4 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm3, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm0, %zmm19 +; AVX512F-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm17 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm20, %zmm2 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm19, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm4, %zmm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm25, %zmm2 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm16, %zmm5 +; 
AVX512F-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm4, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm0, %zmm20 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm2, %zmm20 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm2, %zmm25 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm30, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm27, %zmm2 -; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm0 = mem[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm3 = mem[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm3 = mem[2,1,3,3] -; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,0,1,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512F-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,0,2,1] -; AVX512F-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm5 = mem[2,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,3] -; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm26 = mem[0,0,1,1] -; AVX512F-SLOW-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm7 = mem[0,2,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm9 = mem[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm11 = mem[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm12 = mem[2,1,3,3] -; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm13 = mem[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm14 = mem[0,0,2,1] -; AVX512F-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm15 = mem[2,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti64x4 
$1, %ymm27, %zmm31, %zmm3 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm30, %zmm3 +; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm2 = mem[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm4 = mem[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm4 = mem[2,1,3,3] +; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm5 = mem[0,0,1,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512F-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm5 = mem[0,0,2,1] +; AVX512F-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm6 = mem[2,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,5,4] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3] +; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm19 = mem[0,0,1,1] +; AVX512F-SLOW-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm9 = mem[0,2,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm11 = mem[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm12 = mem[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm13 = mem[2,1,3,3] +; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm14 = mem[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm15 = mem[0,0,2,1] +; AVX512F-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm0 = mem[2,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] ; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm16 = mem[0,0,1,1] ; AVX512F-SLOW-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm1 = mem[0,2,3,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm27, %zmm3 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm11[0,1,1,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm9 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm27, %zmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; 
AVX512F-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm0, %zmm11 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm9, %zmm0, %zmm25 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm26, %zmm3 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm22, %zmm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm30, %zmm4 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm12[0,1,1,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm11, %zmm2 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm11 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm30, %zmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm2, %zmm12 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm11, %zmm2, %zmm26 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm2 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm19, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm23, %zmm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm15, %zmm0 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm16, %zmm1 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm22, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm23, %zmm1 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm0, %zmm4 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm0, %zmm2 ; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm28 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm24 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm24 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, 320(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, 128(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, 64(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 448(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 704(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 640(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, 576(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, 448(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 704(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, 640(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, 576(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, 512(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, 768(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, 832(%rax) -; AVX512F-SLOW-NEXT: addq $2200, %rsp # imm = 0x898 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 
384(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, 768(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm29, 832(%rax) +; AVX512F-SLOW-NEXT: addq $2168, %rsp # imm = 0x878 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll index 9f292dc64b532..b7baf7b40d714 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -7625,17 +7625,15 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 ; AVX512F-SLOW-NEXT: vporq %ymm15, %ymm18, %ymm5 ; AVX512F-SLOW-NEXT: vporq %ymm19, %ymm20, %ymm6 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[0,1,2,3] ; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm16 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm16 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-SLOW-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm26 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm17, %zmm26 ; AVX512F-SLOW-NEXT: vporq %ymm21, %ymm22, %ymm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm5[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm5[0,1,2,3],zmm1[0,1,2,3] ; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 ; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload @@ -7704,9 +7702,9 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] ; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm26 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm3 ; AVX512F-FAST-NEXT: vpor %ymm0, %ymm3, %ymm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm0 @@ -7751,78 +7749,78 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vporq %xmm9, %xmm12, %xmm22 ; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm7 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm13, %ymm20 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm2 -; AVX512F-FAST-NEXT: vpor %ymm7, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm1 +; 
AVX512F-FAST-NEXT: vpor %ymm7, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm7 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero,zero,zero,zero,zero,ymm7[18] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero,zero,zero,zero,zero,ymm7[18] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm7 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,14],zero,ymm7[12,13,0,1,14,15],zero,ymm7[3,12,13,2,3,16],zero,ymm7[30,31,28,29,16,17],zero,ymm7[31,18,19,28,29,18],zero -; AVX512F-FAST-NEXT: vpor %ymm2, %ymm7, %ymm2 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512F-FAST-NEXT: vpor %ymm1, %ymm7, %ymm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm1 ; AVX512F-FAST-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm7 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm0 ; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-FAST-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm13 ; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm2 -; AVX512F-FAST-NEXT: vporq %xmm0, %xmm2, %xmm31 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm1 +; AVX512F-FAST-NEXT: vporq %xmm0, %xmm1, %xmm31 ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm14 ; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX512F-FAST-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX512F-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512F-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm12 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm12[21],zero,ymm12[19],zero,zero,zero,zero,ymm12[22],zero,ymm12[20],zero,zero -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm12[21],zero,ymm12[19],zero,zero,zero,zero,ymm12[22],zero,ymm12[20],zero,zero +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm11 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm11[25],zero,ymm11[23],zero,zero,zero,zero,ymm11[26],zero,ymm11[24],zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[18],zero,zero,zero,zero,ymm11[21],zero,ymm11[19],zero,zero,zero,zero,ymm11[22],zero,ymm11[20] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[18],zero,zero,zero,zero,ymm11[21],zero,ymm11[19],zero,zero,zero,zero,ymm11[22],zero,ymm11[20] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] -; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] +; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128] ; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm19 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm1 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm30 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,0,1],zmm0[4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vbroadcasti128 
{{.*#+}} ymm2 = [2,2,3,3,2,2,3,3] -; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,2,3,3,2,2,3,3] +; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vmovdqa (%rax), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6] -; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa (%rax), %ymm4 ; AVX512F-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7830,52 +7828,52 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm18 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm24 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero ; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14] ; AVX512F-FAST-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm25 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm26 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] ; AVX512F-FAST-NEXT: # ymm26 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpternlogq $248, %ymm26, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpternlogq $248, %ymm26, %ymm0, %ymm2 ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3],xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm1[0,1,2,3],zmm0[0,1,0,1] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm2[0,1,2,3],zmm0[0,1,0,1] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm28, %xmm1 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm8[8],xmm14[9],xmm8[9],xmm14[10],xmm8[10],xmm14[11],xmm8[11],xmm14[12],xmm8[12],xmm14[13],xmm8[13],xmm14[14],xmm8[14],xmm14[15],xmm8[15] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm28, %xmm2 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm14[8],xmm8[8],xmm14[9],xmm8[9],xmm14[10],xmm8[10],xmm14[11],xmm8[11],xmm14[12],xmm8[12],xmm14[13],xmm8[13],xmm14[14],xmm8[14],xmm14[15],xmm8[15] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm28 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm28 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm6, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa %xmm6, %xmm2 ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> ; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm27 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm22[0,1,0,1],zmm1[0,1,0,1] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm27 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} 
xmm2 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm22[0,1,0,1],zmm2[0,1,0,1] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm0 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,6] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,5,6] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm29 -; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm0 +; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm5 @@ -8063,9 +8061,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm0 ; AVX512F-FAST-NEXT: vpor %ymm12, %ymm15, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[0,1,2,3] ; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm16 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm16 ; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload @@ -8077,8 +8074,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 ; AVX512F-FAST-NEXT: vpor %ymm1, %ymm13, %ymm1 ; AVX512F-FAST-NEXT: vpor %ymm11, %ymm14, %ymm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm5[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm5[0,1,2,3],zmm1[0,1,2,3] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm20, %zmm4 ; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm4 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm4 From 43a5c4a10d19e7ecca4232966495aabc4e901559 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Wed, 3 Jan 2024 11:24:58 +0000 Subject: [PATCH 122/313] [lldb][test] Skip other Global Module Cache tests on Arm/AArch64 Linux These are expected to fail but sometimes crash during the test leaving them as unresolved. Same failure message and likely same cause as the other test in this file. --- .../API/python_api/global_module_cache/TestGlobalModuleCache.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lldb/test/API/python_api/global_module_cache/TestGlobalModuleCache.py b/lldb/test/API/python_api/global_module_cache/TestGlobalModuleCache.py index 5b6e9e8a588a3..1264df61f2be4 100644 --- a/lldb/test/API/python_api/global_module_cache/TestGlobalModuleCache.py +++ b/lldb/test/API/python_api/global_module_cache/TestGlobalModuleCache.py @@ -52,11 +52,13 @@ def test_OneTargetOneDebugger(self): # This test tests for the desired behavior as an expected fail. 
@skipIfWindows @expectedFailureAll + @skipIf(oslist=["linux"], archs=["arm", "aarch64"]) def test_TwoTargetsOneDebugger(self): self.do_test(False, True) @skipIfWindows @expectedFailureAll + @skipIf(oslist=["linux"], archs=["arm", "aarch64"]) def test_OneTargetTwoDebuggers(self): self.do_test(True, False) From aba40fb34a27f1d36b4b541bf04d414dca7d1c4c Mon Sep 17 00:00:00 2001 From: Utkarsh Saxena Date: Wed, 3 Jan 2024 12:41:10 +0100 Subject: [PATCH 123/313] [coroutines] Do not check coroutine wrappers for skipped function bodies (#76729) Without function bodies, we cannot tell whether a function is a coroutine or not. The analysis of coroutine wrappers is not useful when this information is not available. We therefore now skip this analysis for skipped function bodies. --- .../clangd/unittests/DiagnosticsTests.cpp | 56 +++++++++++++++++++ clang/lib/Sema/SemaDecl.cpp | 5 +- 2 files changed, 58 insertions(+), 3 deletions(-) diff --git a/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp b/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp index 37643e5afa230..f302dcf5f09db 100644 --- a/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp +++ b/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp @@ -420,6 +420,62 @@ TEST(DiagnosticTest, MakeUnique) { "no matching constructor for initialization of 'S'"))); } +TEST(DiagnosticTest, CoroutineInHeader) { + StringRef CoroutineH = R"cpp( +namespace std { +template +struct coroutine_traits { using promise_type = typename Ret::promise_type; }; + +template +struct coroutine_handle { + static coroutine_handle from_address(void *) noexcept; + static coroutine_handle from_promise(Promise &promise); + constexpr void* address() const noexcept; +}; +template <> +struct coroutine_handle { + template + coroutine_handle(coroutine_handle) noexcept; + static coroutine_handle from_address(void *); + constexpr void* address() const noexcept; +}; + +struct awaitable { + bool await_ready() noexcept { return false; } + void await_suspend(coroutine_handle<>) noexcept {} + void await_resume() noexcept {} +}; +} // namespace std + )cpp"; + + StringRef Header = R"cpp( +#include "coroutine.h" +template struct [[clang::coro_return_type]] Gen { + struct promise_type { + Gen get_return_object() { + return {}; + } + std::awaitable initial_suspend(); + std::awaitable final_suspend() noexcept; + void unhandled_exception(); + void return_value(T t); + }; +}; + +Gen foo_coro(int b) { co_return b; } + )cpp"; + Annotations Main(R"cpp( +// error-ok +#include "header.hpp" +Gen $[[bar_coro]](int b) { return foo_coro(b); } + )cpp"); + TestTU TU = TestTU::withCode(Main.code()); + TU.AdditionalFiles["coroutine.h"] = std::string(CoroutineH); + TU.AdditionalFiles["header.hpp"] = std::string(Header); + TU.ExtraArgs.push_back("--std=c++20"); + EXPECT_THAT(TU.build().getDiagnostics(), ElementsAre(hasRange(Main.range()))); +} + TEST(DiagnosticTest, MakeShared) { // We usually miss diagnostics from header functions as we don't parse them. 
// std::make_shared is only parsed when --parse-forwarding-functions is set diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index ffbe317d55999..2de631941325f 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -15845,8 +15845,6 @@ static void diagnoseImplicitlyRetainedSelf(Sema &S) { } void Sema::CheckCoroutineWrapper(FunctionDecl *FD) { - if (!FD) - return; RecordDecl *RD = FD->getReturnType()->getAsRecordDecl(); if (!RD || !RD->getUnderlyingDecl()->hasAttr()) return; @@ -15869,7 +15867,8 @@ Decl *Sema::ActOnFinishFunctionBody(Decl *dcl, Stmt *Body, sema::AnalysisBasedWarnings::Policy WP = AnalysisWarnings.getDefaultPolicy(); sema::AnalysisBasedWarnings::Policy *ActivePolicy = nullptr; - if (getLangOpts().Coroutines) { + // If we skip function body, we can't tell if a function is a coroutine. + if (getLangOpts().Coroutines && FD && !FD->hasSkippedBody()) { if (FSI->isCoroutine()) CheckCompletedCoroutineBody(FD, Body); else From df1b5ae31de5166cd525d4365bcbab9c58dea0f6 Mon Sep 17 00:00:00 2001 From: Saiyedul Islam Date: Wed, 3 Jan 2024 17:53:47 +0530 Subject: [PATCH 124/313] [AMDGPU][GlobalISel] Update tests to check for COV5 (#76257) Update GlobalISel tests to assume ABI to be code object version 5. --- .../GlobalISel/crash-stack-address-O0.ll | 5 +- .../GlobalISel/dropped_debug_info_assert.ll | 88 +- .../GlobalISel/irtranslator-assert-align.ll | 19 +- .../GlobalISel/irtranslator-atomicrmw.ll | 7 +- .../irtranslator-call-abi-attribute-hints.ll | 111 +- .../irtranslator-call-implicit-args.ll | 954 ++-- .../irtranslator-call-return-values.ll | 3204 +++++++------ .../GlobalISel/irtranslator-call-sret.ll | 66 +- .../AMDGPU/GlobalISel/irtranslator-call.ll | 4091 ++++++++--------- .../irtranslator-constant-fold-vector-op.ll | 7 +- .../GlobalISel/irtranslator-indirect-call.ll | 66 +- .../GlobalISel/irtranslator-inline-asm.ll | 116 +- .../GlobalISel/irtranslator-sibling-call.ll | 39 +- .../GlobalISel/irtranslator-tail-call.ll | 7 +- .../GlobalISel/llvm.amdgcn.is.private.ll | 14 +- .../GlobalISel/llvm.amdgcn.is.shared.ll | 14 +- .../AMDGPU/GlobalISel/non-entry-alloca.ll | 14 +- .../AMDGPU/addrspacecast-constantexpr.ll | 19 +- 18 files changed, 4383 insertions(+), 4458 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll index 0df80d67e7715..9580326d7b78f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @stack_write_fi() { ; CHECK-LABEL: stack_write_fi: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_add_u32 s0, s0, s17 +; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: s_mov_b32 s4, 0 @@ -23,3 +23,6 @@ entry: store volatile i64 0, ptr addrspace(5) %alloca, align 4 ret void } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dropped_debug_info_assert.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dropped_debug_info_assert.ll index c4e383c3708b3..44c4910bac7ea 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dropped_debug_info_assert.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dropped_debug_info_assert.ll @@ -7,48 +7,47 @@ declare void @callee() define amdgpu_kernel void @call_debug_loc() { ; CHECK-LABEL: name: call_debug_loc ; CHECK: bb.1.entry: - ; CHECK-NEXT: liveins: $sgpr14, 
$sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2, debug-location !6 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1, debug-location !6 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0, debug-location !6 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16, debug-location !6 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15, debug-location !6 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14, debug-location !6 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11, debug-location !6 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7, debug-location !6 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5, debug-location !6 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, debug-location !6 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[COPY8]], debug-location !6 - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[COPY7]], debug-location !6 - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[COPY6]], debug-location !6 - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY5]], debug-location !6 - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[COPY4]], debug-location !6 - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY3]], debug-location !6 - ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF debug-location !6 - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 10, debug-location !6 - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]], debug-location !6 - ; CHECK-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY16]], [[COPY1]], implicit $exec, debug-location !6 - ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 20, debug-location !6 - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]], debug-location !6 - ; CHECK-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY17]], [[COPY]], implicit $exec, debug-location !6 - ; CHECK-NEXT: [[V_OR3_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR3_B32_e64 [[COPY2]], [[V_LSHLREV_B32_e64_]], [[V_LSHLREV_B32_e64_1]], implicit $exec, debug-location !6 - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3, debug-location !6 - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]], debug-location !6 - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]], debug-location !6 - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]], debug-location !6 - ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[COPY9]], debug-location !6 - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY12]], debug-location !6 - ; CHECK-NEXT: $sgpr12 = COPY [[COPY13]], debug-location !6 - ; CHECK-NEXT: $sgpr13 = COPY [[COPY14]], debug-location !6 - ; CHECK-NEXT: $sgpr14 = COPY [[COPY15]], debug-location !6 - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]], debug-location !6 - ; CHECK-NEXT: $vgpr31 = COPY [[V_OR3_B32_e64_]], debug-location !6 - ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee, target-flags(amdgpu-gotprel32-hi) @callee, implicit-def $scc, debug-location !6 - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0, debug-location !6 :: (dereferenceable invariant load (p0) from got, addrspace 4) - ; CHECK-NEXT: $sgpr30_sgpr31 = 
noconvergent SI_CALL [[S_LOAD_DWORDX2_IMM]], @callee, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, debug-location !6 - ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, debug-location !6 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2, debug-location !7 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1, debug-location !7 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0, debug-location !7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14, debug-location !7 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13, debug-location !7 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12, debug-location !7 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9, debug-location !7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5, debug-location !7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, debug-location !7 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[COPY7]], debug-location !7 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF debug-location !7 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[COPY6]], debug-location !7 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[COPY5]], debug-location !7 + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[COPY4]], debug-location !7 + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY3]], debug-location !7 + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF debug-location !7 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 10, debug-location !7 + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]], debug-location !7 + ; CHECK-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY14]], [[COPY1]], implicit $exec, debug-location !7 + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 20, debug-location !7 + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]], debug-location !7 + ; CHECK-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY15]], [[COPY]], implicit $exec, debug-location !7 + ; CHECK-NEXT: [[V_OR3_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR3_B32_e64 [[COPY2]], [[V_LSHLREV_B32_e64_]], [[V_LSHLREV_B32_e64_1]], implicit $exec, debug-location !7 + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3, debug-location !7 + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]], debug-location !7 + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]], debug-location !7 + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]], debug-location !7 + ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[COPY8]], debug-location !7 + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY10]], debug-location !7 + ; CHECK-NEXT: $sgpr12 = COPY [[COPY11]], debug-location !7 + ; CHECK-NEXT: $sgpr13 = COPY [[COPY12]], debug-location !7 + ; CHECK-NEXT: $sgpr14 = COPY [[COPY13]], debug-location !7 + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]], debug-location !7 + ; CHECK-NEXT: $vgpr31 = COPY [[V_OR3_B32_e64_]], debug-location !7 + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee, target-flags(amdgpu-gotprel32-hi) @callee, implicit-def $scc, debug-location !7 + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0, debug-location !7 :: (dereferenceable invariant load (p0) from 
got, addrspace 4) + ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent SI_CALL [[S_LOAD_DWORDX2_IMM]], @callee, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, debug-location !7 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, debug-location !7 ; CHECK-NEXT: S_ENDPGM 0 entry: call void @callee(), !dbg !6 @@ -60,11 +59,11 @@ define void @returnaddress_debug_loc(ptr addrspace(1) %ptr) { ; CHECK: bb.1.entry: ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31, debug-location !6 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31, debug-location !7 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[COPY]], debug-location !6 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[COPY]], debug-location !7 ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[REG_SEQUENCE]], [[COPY3]], 0, 0, implicit $exec :: (store (p0) into %ir.ptr, addrspace 1) ; CHECK-NEXT: SI_RETURN entry: @@ -78,7 +77,7 @@ declare ptr @llvm.returnaddress(i32 immarg) #0 attributes #0 = { nofree nosync nounwind readnone willreturn } !llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!2, !3, !4, !5} +!llvm.module.flags = !{!2, !3, !4, !5, !10} !0 = distinct !DICompileUnit(language: DW_LANG_OpenCL, file: !1, producer: "clang version 14.0.0 (git@github.com:llvm/llvm-project.git 4132dc917eddb446405cc5afef41167b8bce360b)", isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug, splitDebugInlining: false, nameTableKind: None) !1 = !DIFile(filename: "gisel_1_gfx1031.cl", directory: "/home/matt/builds/conformance/2.0") @@ -90,3 +89,4 @@ attributes #0 = { nofree nosync nounwind readnone willreturn } !7 = distinct !DISubprogram(name: "call_debug_loc", scope: !1, file: !1, line: 8, type: !8, scopeLine: 9, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !9) !8 = !DISubroutineType(types: !9) !9 = !{} +!10 = !{i32 1, !"amdgpu_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-assert-align.ll index a2f1308fdbedd..83c331c5573c9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-assert-align.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-assert-align.ll @@ -32,13 +32,13 @@ define void @call_result_align_1() { ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 0 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @returns_ptr ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY 
[[COPY5]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -81,13 +81,13 @@ define void @call_result_align_8() { ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 0 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @returns_ptr ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -131,13 +131,13 @@ define void @declaration_result_align_8() { ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 0 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @returns_ptr_align8 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -181,11 +181,11 @@ define ptr addrspace(1) @tail_call_assert_align() { ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @returns_ptr_align8 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -209,3 +209,6 @@ entry: %call = tail call ptr addrspace(1) @returns_ptr_align8() ret ptr addrspace(1) %call } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll index 9359bde339bb4..8069698526bf0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll @@ -30,8 +30,8 @@ define float @test_atomicrmw_fsub(ptr addrspace(3) %addr) { ; CHECK-NEXT: bb.2.atomicrmw.start: ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: 
[[PHI:%[0-9]+]]:_(s64) = G_PHI %16(s64), %bb.2, [[C1]](s64), %bb.1 - ; CHECK-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[LOAD]](s32), %bb.1, %14(s32), %bb.2 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s64) = G_PHI %15(s64), %bb.2, [[C1]](s64), %bb.1 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[LOAD]](s32), %bb.1, %13(s32), %bb.2 ; CHECK-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[PHI1]], [[C]] ; CHECK-NEXT: [[ATOMIC_CMPXCHG_WITH_SUCCESS:%[0-9]+]]:_(s32), [[ATOMIC_CMPXCHG_WITH_SUCCESS1:%[0-9]+]]:_(s1) = G_ATOMIC_CMPXCHG_WITH_SUCCESS [[COPY]](p3), [[PHI1]], [[FSUB]] :: (load store seq_cst seq_cst (s32) on %ir.addr, addrspace 3) ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[ATOMIC_CMPXCHG_WITH_SUCCESS1]](s1), [[PHI]](s64) @@ -48,3 +48,6 @@ define float @test_atomicrmw_fsub(ptr addrspace(3) %addr) { %oldval = atomicrmw fsub ptr addrspace(3) %addr, float 1.0 seq_cst ret float %oldval } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-abi-attribute-hints.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-abi-attribute-hints.ll index 4cf02c935f1b0..ca889de30c650 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-abi-attribute-hints.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-abi-attribute-hints.ll @@ -9,37 +9,36 @@ declare hidden void @extern() define amdgpu_kernel void @kernel_call_no_workitem_ids() { ; CHECK-LABEL: name: kernel_call_no_workitem_ids ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @extern - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY [[COPY4]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY5]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY9]], [[C]](s64) - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s64) = COPY [[COPY3]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY1]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; 
CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY14]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY7]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY7]], [[C]](s64) + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s64) = COPY [[COPY3]] + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY12]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY6]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY10]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY11]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY8]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY9]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY10]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY11]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @extern, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; CHECK-NEXT: S_ENDPGM 0 @@ -50,40 +49,39 @@ define amdgpu_kernel void @kernel_call_no_workitem_ids() { define amdgpu_kernel void @kernel_call_no_workgroup_ids() { ; CHECK-LABEL: name: kernel_call_no_workgroup_ids ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @extern - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY [[COPY4]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY5]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY9]], [[C]](s64) - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s64) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; 
CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY7]], [[C]](s64) + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s64) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY12]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY11]], [[SHL]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY10]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY9]], [[SHL]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY13]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY11]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY14]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY7]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY12]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY6]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY10]](s64) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY8]](s64) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @extern, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -95,12 +93,12 @@ define amdgpu_kernel void @kernel_call_no_workgroup_ids() { define amdgpu_kernel void @kernel_call_no_other_sgprs() { ; CHECK-LABEL: name: kernel_call_no_other_sgprs ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr6_sgpr7 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @extern ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(p4) = COPY [[COPY3]](p4) @@ -139,12 +137,12 @@ define void @func_call_no_workitem_ids() { ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @extern ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: 
[[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY4]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]] @@ -177,12 +175,12 @@ define void @func_call_no_workgroup_ids() { ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @extern ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY4]](p4) ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY3]] ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s64) = COPY [[COPY2]] ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY1]] @@ -226,3 +224,6 @@ define void @func_call_no_other_sgprs() { call void @extern() "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" ret void } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll index 711e1b7cb213a..213598b2e8126 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll @@ -11,52 +11,51 @@ declare hidden void @external_void_func_v32i32(<32 x i32>) #0 define amdgpu_kernel void @test_call_external_void_func_i32([17 x i8]) #0 { ; GFX900-LABEL: name: test_call_external_void_func_i32 ; GFX900: bb.1 (%ir-block.1): - ; GFX900-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GFX900-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GFX900-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GFX900-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX900-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GFX900-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GFX900-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GFX900-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GFX900-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX900-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GFX900-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; GFX900-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; GFX900-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX900-NEXT: 
[[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 - ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GFX900-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 20 - ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) - ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GFX900-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GFX900-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GFX900-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX900-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GFX900-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) + ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GFX900-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX900-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX900-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GFX900-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GFX900-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GFX900-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) - ; GFX900-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GFX900-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX900-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) + ; GFX900-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GFX900-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GFX900-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GFX900-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) + ; GFX900-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) ; GFX900-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; GFX900-NEXT: $vgpr0 = COPY [[C]](s32) - ; GFX900-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GFX900-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; GFX900-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GFX900-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GFX900-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GFX900-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GFX900-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GFX900-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GFX900-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GFX900-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GFX900-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GFX900-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GFX900-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit 
$sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GFX900-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -64,52 +63,51 @@ define amdgpu_kernel void @test_call_external_void_func_i32([17 x i8]) #0 { ; ; GFX908-LABEL: name: test_call_external_void_func_i32 ; GFX908: bb.1 (%ir-block.1): - ; GFX908-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GFX908-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GFX908-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX908-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GFX908-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; GFX908-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; GFX908-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX908-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 - ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GFX908-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 20 - ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) - ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GFX908-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GFX908-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX908-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GFX908-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) + ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX908-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX908-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GFX908-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GFX908-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GFX908-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) - ; GFX908-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GFX908-NEXT: [[COPY19:%[0-9]+]]:_(s32) 
= COPY [[COPY]](s32) + ; GFX908-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) + ; GFX908-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GFX908-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GFX908-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GFX908-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) + ; GFX908-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) ; GFX908-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; GFX908-NEXT: $vgpr0 = COPY [[C]](s32) - ; GFX908-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GFX908-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; GFX908-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GFX908-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GFX908-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GFX908-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GFX908-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GFX908-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GFX908-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GFX908-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GFX908-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GFX908-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GFX908-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GFX908-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -130,13 +128,13 @@ define void @test_func_call_external_void_func_i32() #0 { ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; GFX900-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GFX900-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GFX900-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 ; GFX900-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GFX900-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 99 ; GFX900-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX900-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -171,13 +169,13 @@ define void @test_func_call_external_void_func_i32() #0 { ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GFX908-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 99 ; GFX908-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; 
GFX908-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -209,41 +207,40 @@ define void @test_func_call_external_void_func_i32() #0 { define amdgpu_kernel void @test_call_external_void_func_v32i32([17 x i8]) #0 { ; GFX900-LABEL: name: test_call_external_void_func_v32i32 ; GFX900: bb.1 (%ir-block.1): - ; GFX900-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GFX900-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GFX900-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GFX900-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX900-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GFX900-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GFX900-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GFX900-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GFX900-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX900-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GFX900-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX900-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32) ; GFX900-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; GFX900-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX900-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32 - ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GFX900-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 20 - ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) - ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GFX900-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GFX900-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GFX900-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX900-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GFX900-NEXT: [[COPY18:%[0-9]+]]:_(s32) = 
COPY [[COPY1]](s32) + ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) + ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GFX900-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX900-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX900-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GFX900-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GFX900-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GFX900-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) - ; GFX900-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GFX900-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX900-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) + ; GFX900-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GFX900-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GFX900-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GFX900-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) + ; GFX900-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) ; GFX900-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; GFX900-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<32 x s32>) ; GFX900-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg @@ -281,16 +278,16 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32([17 x i8]) #0 { ; GFX900-NEXT: $vgpr28 = COPY [[UV28]](s32) ; GFX900-NEXT: $vgpr29 = COPY [[UV29]](s32) ; GFX900-NEXT: $vgpr30 = COPY [[UV30]](s32) - ; GFX900-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GFX900-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; GFX900-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GFX900-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GFX900-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GFX900-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GFX900-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GFX900-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GFX900-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GFX900-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GFX900-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GFX900-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GFX900-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL 
[[GV]](p0), @external_void_func_v32i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GFX900-NEXT: ADJCALLSTACKDOWN 0, 4, implicit-def $scc @@ -298,41 +295,40 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32([17 x i8]) #0 { ; ; GFX908-LABEL: name: test_call_external_void_func_v32i32 ; GFX908: bb.1 (%ir-block.1): - ; GFX908-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GFX908-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GFX908-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX908-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GFX908-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX908-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32) ; GFX908-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; GFX908-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX908-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32 - ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GFX908-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 20 - ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) - ; GFX908-NEXT: 
[[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GFX908-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GFX908-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX908-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GFX908-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) + ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX908-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX908-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GFX908-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GFX908-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GFX908-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) - ; GFX908-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GFX908-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX908-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) + ; GFX908-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GFX908-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GFX908-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GFX908-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) + ; GFX908-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) ; GFX908-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; GFX908-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<32 x s32>) ; GFX908-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg @@ -370,16 +366,16 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32([17 x i8]) #0 { ; GFX908-NEXT: $vgpr28 = COPY [[UV28]](s32) ; GFX908-NEXT: $vgpr29 = COPY [[UV29]](s32) ; GFX908-NEXT: $vgpr30 = COPY [[UV30]](s32) - ; GFX908-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GFX908-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; GFX908-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GFX908-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GFX908-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GFX908-NEXT: $sgpr14 = COPY 
[[COPY16]](s32) - ; GFX908-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GFX908-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GFX908-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GFX908-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GFX908-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GFX908-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GFX908-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v32i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GFX908-NEXT: ADJCALLSTACKDOWN 0, 4, implicit-def $scc @@ -400,7 +396,7 @@ define void @test_func_call_external_void_func_v32i32([17 x i8]) #0 { ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; GFX900-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GFX900-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GFX900-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 ; GFX900-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX900-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32) @@ -458,7 +454,7 @@ define void @test_func_call_external_void_func_v32i32([17 x i8]) #0 { ; GFX900-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX900-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32 ; GFX900-NEXT: [[COPY26:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GFX900-NEXT: [[COPY27:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GFX900-NEXT: [[COPY27:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; GFX900-NEXT: [[COPY28:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; GFX900-NEXT: [[COPY29:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; GFX900-NEXT: [[COPY30:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -528,7 +524,7 @@ define void @test_func_call_external_void_func_v32i32([17 x i8]) #0 { ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX908-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32) @@ -586,7 +582,7 @@ define void @test_func_call_external_void_func_v32i32([17 x i8]) #0 { ; GFX908-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX908-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32 ; GFX908-NEXT: [[COPY26:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GFX908-NEXT: [[COPY27:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GFX908-NEXT: [[COPY27:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; GFX908-NEXT: [[COPY28:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; GFX908-NEXT: [[COPY29:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; GFX908-NEXT: 
[[COPY30:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -651,84 +647,82 @@ define void @test_func_call_external_void_func_v32i32([17 x i8]) #0 { define amdgpu_kernel void @test_only_workitem_id_x() #0 !reqd_work_group_size !0 { ; GFX900-LABEL: name: test_only_workitem_id_x ; GFX900: bb.1 (%ir-block.0): - ; GFX900-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GFX900-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GFX900-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GFX900-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GFX900-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX900-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GFX900-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX900-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX900-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GFX900-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; GFX900-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX900-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 - ; GFX900-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]] - ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY5]] - ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; GFX900-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY5]] + ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GFX900-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) ; GFX900-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) - ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY4]] - ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GFX900-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]] - ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX900-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY8]], [[C1]](s64) + ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(s64) = COPY [[COPY4]] + ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY1]] + ; GFX900-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GFX900-NEXT: $vgpr0 = COPY [[C]](s32) - ; GFX900-NEXT: [[COPY16:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]](<4 x s32>) - ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY8]](p4) - ; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[COPY9]](p4) + ; GFX900-NEXT: [[COPY14:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY14]](<4 x s32>) + ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY7]](p4) + ; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; GFX900-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GFX900-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; 
GFX900-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GFX900-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GFX900-NEXT: $sgpr15 = COPY [[DEF]](s32) - ; GFX900-NEXT: $vgpr31 = COPY [[COPY15]](s32) + ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY9]](s64) + ; GFX900-NEXT: $sgpr12 = COPY [[COPY10]](s32) + ; GFX900-NEXT: $sgpr13 = COPY [[COPY11]](s32) + ; GFX900-NEXT: $sgpr14 = COPY [[COPY12]](s32) + ; GFX900-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GFX900-NEXT: $vgpr31 = COPY [[COPY13]](s32) ; GFX900-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GFX900-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GFX900-NEXT: S_ENDPGM 0 ; ; GFX908-LABEL: name: test_only_workitem_id_x ; GFX908: bb.1 (%ir-block.0): - ; GFX908-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GFX908-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GFX908-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX908-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GFX908-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX908-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GFX908-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; GFX908-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX908-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 - ; GFX908-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]] - ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY5]] - ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; GFX908-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY5]] + ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GFX908-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) ; GFX908-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) - ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY4]] - ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]] - ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX908-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY8]], [[C1]](s64) + ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(s64) = COPY [[COPY4]] + ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY1]] + ; GFX908-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; 
GFX908-NEXT: $vgpr0 = COPY [[C]](s32) - ; GFX908-NEXT: [[COPY16:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]](<4 x s32>) - ; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY8]](p4) - ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[COPY9]](p4) + ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY14]](<4 x s32>) + ; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY7]](p4) + ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; GFX908-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GFX908-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GFX908-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GFX908-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GFX908-NEXT: $sgpr15 = COPY [[DEF]](s32) - ; GFX908-NEXT: $vgpr31 = COPY [[COPY15]](s32) + ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY9]](s64) + ; GFX908-NEXT: $sgpr12 = COPY [[COPY10]](s32) + ; GFX908-NEXT: $sgpr13 = COPY [[COPY11]](s32) + ; GFX908-NEXT: $sgpr14 = COPY [[COPY12]](s32) + ; GFX908-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GFX908-NEXT: $vgpr31 = COPY [[COPY13]](s32) ; GFX908-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GFX908-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GFX908-NEXT: S_ENDPGM 0 @@ -739,45 +733,44 @@ define amdgpu_kernel void @test_only_workitem_id_x() #0 !reqd_work_group_size !0 define amdgpu_kernel void @test_only_workitem_id_y() #0 !reqd_work_group_size !1 { ; GFX900-LABEL: name: test_only_workitem_id_y ; GFX900: bb.1 (%ir-block.0): - ; GFX900-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr1, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GFX900-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr1, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 - ; GFX900-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GFX900-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GFX900-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX900-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GFX900-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX900-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX900-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GFX900-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; GFX900-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX900-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 - ; GFX900-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]] - ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY5]] - ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; GFX900-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY5]] + ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GFX900-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) ; GFX900-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX900-NEXT: 
[[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) - ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY4]] - ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GFX900-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]] - ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY8]], [[C1]](s64) + ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(s64) = COPY [[COPY4]] + ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY1]] + ; GFX900-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX900-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX900-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GFX900-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GFX900-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY15]], [[C3]](s32) + ; GFX900-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY13]], [[C3]](s32) ; GFX900-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL]] ; GFX900-NEXT: $vgpr0 = COPY [[C]](s32) - ; GFX900-NEXT: [[COPY16:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]](<4 x s32>) - ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY8]](p4) - ; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[COPY9]](p4) + ; GFX900-NEXT: [[COPY14:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY14]](<4 x s32>) + ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY7]](p4) + ; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; GFX900-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GFX900-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GFX900-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GFX900-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GFX900-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY9]](s64) + ; GFX900-NEXT: $sgpr12 = COPY [[COPY10]](s32) + ; GFX900-NEXT: $sgpr13 = COPY [[COPY11]](s32) + ; GFX900-NEXT: $sgpr14 = COPY [[COPY12]](s32) + ; GFX900-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GFX900-NEXT: $vgpr31 = COPY [[OR]](s32) ; GFX900-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GFX900-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -785,45 +778,44 @@ define amdgpu_kernel void @test_only_workitem_id_y() #0 !reqd_work_group_size !1 ; ; GFX908-LABEL: name: test_only_workitem_id_y ; GFX908: bb.1 (%ir-block.0): - ; GFX908-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr1, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GFX908-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr1, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 - ; GFX908-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX908-NEXT: 
[[COPY7:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GFX908-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX908-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GFX908-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; GFX908-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX908-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 - ; GFX908-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]] - ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY5]] - ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; GFX908-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY5]] + ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GFX908-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) ; GFX908-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) - ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY4]] - ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]] - ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY8]], [[C1]](s64) + ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(s64) = COPY [[COPY4]] + ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY1]] + ; GFX908-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX908-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX908-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GFX908-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GFX908-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY15]], [[C3]](s32) + ; GFX908-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY13]], [[C3]](s32) ; GFX908-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL]] ; GFX908-NEXT: $vgpr0 = COPY [[C]](s32) - ; GFX908-NEXT: [[COPY16:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]](<4 x s32>) - ; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY8]](p4) - ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[COPY9]](p4) + ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY14]](<4 x s32>) + ; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY7]](p4) + ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; GFX908-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GFX908-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GFX908-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GFX908-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GFX908-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY9]](s64) + ; GFX908-NEXT: $sgpr12 = COPY [[COPY10]](s32) + ; GFX908-NEXT: $sgpr13 = COPY [[COPY11]](s32) + ; GFX908-NEXT: $sgpr14 = COPY [[COPY12]](s32) + ; GFX908-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GFX908-NEXT: $vgpr31 = COPY [[OR]](s32) ; GFX908-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit 
$sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GFX908-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -835,45 +827,44 @@ define amdgpu_kernel void @test_only_workitem_id_y() #0 !reqd_work_group_size !1 define amdgpu_kernel void @test_only_workitem_id_z() #0 !reqd_work_group_size !2 { ; GFX900-LABEL: name: test_only_workitem_id_z ; GFX900: bb.1 (%ir-block.0): - ; GFX900-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GFX900-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 - ; GFX900-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GFX900-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GFX900-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX900-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GFX900-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX900-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX900-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GFX900-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; GFX900-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX900-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 - ; GFX900-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]] - ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY5]] - ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; GFX900-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY5]] + ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GFX900-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) ; GFX900-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) - ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY4]] - ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GFX900-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]] - ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY8]], [[C1]](s64) + ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(s64) = COPY [[COPY4]] + ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY1]] + ; GFX900-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX900-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX900-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GFX900-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GFX900-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY15]], [[C3]](s32) + ; GFX900-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY13]], [[C3]](s32) ; GFX900-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL]] ; GFX900-NEXT: $vgpr0 = COPY [[C]](s32) - ; GFX900-NEXT: [[COPY16:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]](<4 x s32>) - ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY8]](p4) - ; 
GFX900-NEXT: $sgpr6_sgpr7 = COPY [[COPY9]](p4) + ; GFX900-NEXT: [[COPY14:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY14]](<4 x s32>) + ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY7]](p4) + ; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; GFX900-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GFX900-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GFX900-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GFX900-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GFX900-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY9]](s64) + ; GFX900-NEXT: $sgpr12 = COPY [[COPY10]](s32) + ; GFX900-NEXT: $sgpr13 = COPY [[COPY11]](s32) + ; GFX900-NEXT: $sgpr14 = COPY [[COPY12]](s32) + ; GFX900-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GFX900-NEXT: $vgpr31 = COPY [[OR]](s32) ; GFX900-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GFX900-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -881,45 +872,44 @@ define amdgpu_kernel void @test_only_workitem_id_z() #0 !reqd_work_group_size !2 ; ; GFX908-LABEL: name: test_only_workitem_id_z ; GFX908: bb.1 (%ir-block.0): - ; GFX908-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GFX908-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 - ; GFX908-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX908-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GFX908-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX908-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GFX908-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; GFX908-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX908-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 - ; GFX908-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]] - ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY5]] - ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; GFX908-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY5]] + ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GFX908-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) ; GFX908-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) - ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY4]] - ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]] - ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) 
= G_PTR_ADD [[COPY8]], [[C1]](s64) + ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(s64) = COPY [[COPY4]] + ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY1]] + ; GFX908-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX908-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX908-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GFX908-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GFX908-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY15]], [[C3]](s32) + ; GFX908-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY13]], [[C3]](s32) ; GFX908-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL]] ; GFX908-NEXT: $vgpr0 = COPY [[C]](s32) - ; GFX908-NEXT: [[COPY16:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]](<4 x s32>) - ; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY8]](p4) - ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[COPY9]](p4) + ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY14]](<4 x s32>) + ; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY7]](p4) + ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; GFX908-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GFX908-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GFX908-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GFX908-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GFX908-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY9]](s64) + ; GFX908-NEXT: $sgpr12 = COPY [[COPY10]](s32) + ; GFX908-NEXT: $sgpr13 = COPY [[COPY11]](s32) + ; GFX908-NEXT: $sgpr14 = COPY [[COPY12]](s32) + ; GFX908-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GFX908-NEXT: $vgpr31 = COPY [[OR]](s32) ; GFX908-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GFX908-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -931,46 +921,45 @@ define amdgpu_kernel void @test_only_workitem_id_z() #0 !reqd_work_group_size !2 define amdgpu_kernel void @test_only_workitem_id_xy() #0 !reqd_work_group_size !3 { ; GFX900-LABEL: name: test_only_workitem_id_xy ; GFX900: bb.1 (%ir-block.0): - ; GFX900-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GFX900-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GFX900-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GFX900-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GFX900-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GFX900-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX900-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GFX900-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GFX900-NEXT: 
[[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GFX900-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX900-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GFX900-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; GFX900-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX900-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 - ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]] - ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GFX900-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; GFX900-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY11]], [[C1]](s64) - ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] - ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GFX900-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX900-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX900-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) - ; GFX900-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY9]], [[C1]](s64) + ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(s64) = COPY [[COPY5]] + ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GFX900-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX900-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GFX900-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GFX900-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GFX900-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) - ; GFX900-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY16]], [[SHL]] + ; GFX900-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY15]], [[C2]](s32) + ; GFX900-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY14]], [[SHL]] ; GFX900-NEXT: $vgpr0 = COPY [[C]](s32) - ; GFX900-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[COPY10]](p4) + ; GFX900-NEXT: [[COPY16:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]](<4 x s32>) + ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY8]](p4) + ; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; GFX900-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY12]](s64) - ; GFX900-NEXT: $sgpr12 = COPY [[COPY13]](s32) - ; GFX900-NEXT: $sgpr13 = COPY [[COPY14]](s32) - ; GFX900-NEXT: $sgpr14 = COPY [[COPY15]](s32) - ; GFX900-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY10]](s64) + ; GFX900-NEXT: $sgpr12 = COPY [[COPY11]](s32) + ; GFX900-NEXT: $sgpr13 = COPY [[COPY12]](s32) + ; GFX900-NEXT: $sgpr14 = COPY [[COPY13]](s32) + ; GFX900-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GFX900-NEXT: $vgpr31 = COPY [[OR]](s32) ; GFX900-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, 
implicit $vgpr31 ; GFX900-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -978,46 +967,45 @@ define amdgpu_kernel void @test_only_workitem_id_xy() #0 !reqd_work_group_size ! ; ; GFX908-LABEL: name: test_only_workitem_id_xy ; GFX908: bb.1 (%ir-block.0): - ; GFX908-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GFX908-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX908-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX908-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GFX908-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; GFX908-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX908-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 - ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]] - ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GFX908-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; GFX908-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY11]], [[C1]](s64) - ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] - ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX908-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX908-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) - ; GFX908-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY9]], [[C1]](s64) + ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(s64) = COPY [[COPY5]] + ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GFX908-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GFX908-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GFX908-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GFX908-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) - ; GFX908-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY16]], [[SHL]] + ; GFX908-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY15]], [[C2]](s32) + ; GFX908-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY14]], [[SHL]] ; GFX908-NEXT: $vgpr0 = COPY [[C]](s32) - ; GFX908-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; 
GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[COPY10]](p4) + ; GFX908-NEXT: [[COPY16:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]](<4 x s32>) + ; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY8]](p4) + ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; GFX908-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY12]](s64) - ; GFX908-NEXT: $sgpr12 = COPY [[COPY13]](s32) - ; GFX908-NEXT: $sgpr13 = COPY [[COPY14]](s32) - ; GFX908-NEXT: $sgpr14 = COPY [[COPY15]](s32) - ; GFX908-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY10]](s64) + ; GFX908-NEXT: $sgpr12 = COPY [[COPY11]](s32) + ; GFX908-NEXT: $sgpr13 = COPY [[COPY12]](s32) + ; GFX908-NEXT: $sgpr14 = COPY [[COPY13]](s32) + ; GFX908-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GFX908-NEXT: $vgpr31 = COPY [[OR]](s32) ; GFX908-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GFX908-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1029,50 +1017,49 @@ define amdgpu_kernel void @test_only_workitem_id_xy() #0 !reqd_work_group_size ! define amdgpu_kernel void @test_only_workitem_id_yz() #0 !reqd_work_group_size !4 { ; GFX900-LABEL: name: test_only_workitem_id_yz ; GFX900: bb.1 (%ir-block.0): - ; GFX900-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GFX900-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GFX900-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 - ; GFX900-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GFX900-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GFX900-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX900-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GFX900-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GFX900-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX900-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GFX900-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; GFX900-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX900-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 - ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]] - ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GFX900-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; GFX900-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY11]], [[C1]](s64) - ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] - ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; 
GFX900-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX900-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY9]], [[C1]](s64) + ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(s64) = COPY [[COPY5]] + ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GFX900-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX900-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX900-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GFX900-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GFX900-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GFX900-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C3]](s32) + ; GFX900-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY14]], [[C3]](s32) ; GFX900-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL]] - ; GFX900-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX900-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GFX900-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GFX900-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C4]](s32) + ; GFX900-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY15]], [[C4]](s32) ; GFX900-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; GFX900-NEXT: $vgpr0 = COPY [[C]](s32) - ; GFX900-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[COPY10]](p4) + ; GFX900-NEXT: [[COPY16:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]](<4 x s32>) + ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY8]](p4) + ; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; GFX900-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY12]](s64) - ; GFX900-NEXT: $sgpr12 = COPY [[COPY13]](s32) - ; GFX900-NEXT: $sgpr13 = COPY [[COPY14]](s32) - ; GFX900-NEXT: $sgpr14 = COPY [[COPY15]](s32) - ; GFX900-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY10]](s64) + ; GFX900-NEXT: $sgpr12 = COPY [[COPY11]](s32) + ; GFX900-NEXT: $sgpr13 = COPY [[COPY12]](s32) + ; GFX900-NEXT: $sgpr14 = COPY [[COPY13]](s32) + ; GFX900-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GFX900-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GFX900-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GFX900-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1080,50 +1067,49 @@ define amdgpu_kernel void @test_only_workitem_id_yz() #0 !reqd_work_group_size ! 
; ; GFX908-LABEL: name: test_only_workitem_id_yz ; GFX908: bb.1 (%ir-block.0): - ; GFX908-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GFX908-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 - ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX908-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX908-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GFX908-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; GFX908-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX908-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 - ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]] - ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GFX908-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; GFX908-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY11]], [[C1]](s64) - ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] - ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX908-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY9]], [[C1]](s64) + ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(s64) = COPY [[COPY5]] + ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GFX908-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX908-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX908-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GFX908-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GFX908-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C3]](s32) + ; GFX908-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY14]], [[C3]](s32) ; GFX908-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL]] - ; GFX908-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX908-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GFX908-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GFX908-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C4]](s32) + ; GFX908-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY15]], [[C4]](s32) ; GFX908-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; GFX908-NEXT: $vgpr0 = COPY [[C]](s32) - ; GFX908-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; 
GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[COPY10]](p4) + ; GFX908-NEXT: [[COPY16:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]](<4 x s32>) + ; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY8]](p4) + ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; GFX908-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY12]](s64) - ; GFX908-NEXT: $sgpr12 = COPY [[COPY13]](s32) - ; GFX908-NEXT: $sgpr13 = COPY [[COPY14]](s32) - ; GFX908-NEXT: $sgpr14 = COPY [[COPY15]](s32) - ; GFX908-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY10]](s64) + ; GFX908-NEXT: $sgpr12 = COPY [[COPY11]](s32) + ; GFX908-NEXT: $sgpr13 = COPY [[COPY12]](s32) + ; GFX908-NEXT: $sgpr14 = COPY [[COPY13]](s32) + ; GFX908-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GFX908-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GFX908-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GFX908-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1135,46 +1121,45 @@ define amdgpu_kernel void @test_only_workitem_id_yz() #0 !reqd_work_group_size ! define amdgpu_kernel void @test_only_workitem_id_xz() #0 !reqd_work_group_size !5 { ; GFX900-LABEL: name: test_only_workitem_id_xz ; GFX900: bb.1 (%ir-block.0): - ; GFX900-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GFX900-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GFX900-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GFX900-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GFX900-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GFX900-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX900-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GFX900-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GFX900-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX900-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GFX900-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; GFX900-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX900-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 - ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]] - ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GFX900-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; GFX900-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY11]], [[C1]](s64) - ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY 
[[COPY5]] - ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GFX900-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX900-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX900-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) - ; GFX900-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY9]], [[C1]](s64) + ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(s64) = COPY [[COPY5]] + ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GFX900-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX900-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GFX900-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GFX900-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GFX900-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) - ; GFX900-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY16]], [[SHL]] + ; GFX900-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY15]], [[C2]](s32) + ; GFX900-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY14]], [[SHL]] ; GFX900-NEXT: $vgpr0 = COPY [[C]](s32) - ; GFX900-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[COPY10]](p4) + ; GFX900-NEXT: [[COPY16:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]](<4 x s32>) + ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY8]](p4) + ; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; GFX900-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY12]](s64) - ; GFX900-NEXT: $sgpr12 = COPY [[COPY13]](s32) - ; GFX900-NEXT: $sgpr13 = COPY [[COPY14]](s32) - ; GFX900-NEXT: $sgpr14 = COPY [[COPY15]](s32) - ; GFX900-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY10]](s64) + ; GFX900-NEXT: $sgpr12 = COPY [[COPY11]](s32) + ; GFX900-NEXT: $sgpr13 = COPY [[COPY12]](s32) + ; GFX900-NEXT: $sgpr14 = COPY [[COPY13]](s32) + ; GFX900-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GFX900-NEXT: $vgpr31 = COPY [[OR]](s32) ; GFX900-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GFX900-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1182,46 +1167,45 @@ define amdgpu_kernel void @test_only_workitem_id_xz() #0 !reqd_work_group_size ! 
; ; GFX908-LABEL: name: test_only_workitem_id_xz ; GFX908: bb.1 (%ir-block.0): - ; GFX908-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GFX908-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX908-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX908-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GFX908-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; GFX908-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX908-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 - ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]] - ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GFX908-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; GFX908-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY11]], [[C1]](s64) - ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] - ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX908-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX908-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) - ; GFX908-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY9]], [[C1]](s64) + ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(s64) = COPY [[COPY5]] + ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GFX908-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GFX908-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GFX908-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GFX908-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) - ; GFX908-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY16]], [[SHL]] + ; GFX908-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY15]], [[C2]](s32) + ; GFX908-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY14]], [[SHL]] ; GFX908-NEXT: $vgpr0 = COPY [[C]](s32) - ; GFX908-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[COPY10]](p4) + ; GFX908-NEXT: [[COPY16:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; 
GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]](<4 x s32>) + ; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY8]](p4) + ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; GFX908-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY12]](s64) - ; GFX908-NEXT: $sgpr12 = COPY [[COPY13]](s32) - ; GFX908-NEXT: $sgpr13 = COPY [[COPY14]](s32) - ; GFX908-NEXT: $sgpr14 = COPY [[COPY15]](s32) - ; GFX908-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY10]](s64) + ; GFX908-NEXT: $sgpr12 = COPY [[COPY11]](s32) + ; GFX908-NEXT: $sgpr13 = COPY [[COPY12]](s32) + ; GFX908-NEXT: $sgpr14 = COPY [[COPY13]](s32) + ; GFX908-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GFX908-NEXT: $vgpr31 = COPY [[OR]](s32) ; GFX908-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GFX908-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1237,9 +1221,11 @@ declare i32 @llvm.amdgcn.workitem.id.z() #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone speculatable willreturn } +!llvm.module.flags = !{!6} !0 = !{i32 64, i32 1, i32 1} !1 = !{i32 1, i32 64, i32 1} !2 = !{i32 1, i32 1, i32 64} !3 = !{i32 32, i32 2, i32 1} !4 = !{i32 1, i32 32, i32 2} !5 = !{i32 32, i32 1, i32 2} +!6 = !{i32 1, !"amdgpu_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll index 609883c190223..8b0a006e29c00 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll @@ -70,58 +70,57 @@ declare hidden i32 @external_gfx_i32_func_i32(i32) #0 define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %out) #0 { ; GCN-LABEL: name: test_call_external_i32_func_i32_imm ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; GCN-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; GCN-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[INT]](p4) :: 
(dereferenceable invariant load (p1) from %ir.out.kernarg.offset1, align 16, addrspace 4) ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i32_func_i32 - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; GCN-NEXT: $vgpr0 = COPY [[C]](s32) - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i32_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit 
$sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc - ; GCN-NEXT: G_STORE [[COPY21]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out.load, addrspace 1) + ; GCN-NEXT: G_STORE [[COPY19]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out.load, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 %val = call i32 @external_i32_func_i32(i32 42) store volatile i32 %val, ptr addrspace(1) %out @@ -155,54 +154,53 @@ define amdgpu_gfx void @test_gfx_call_external_i32_func_i32_imm(ptr addrspace(1) define amdgpu_kernel void @test_call_external_i1_func_void() #0 { ; GCN-LABEL: name: test_call_external_i1_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i1_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: 
[[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32) + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -233,54 +231,53 @@ define amdgpu_gfx void @test_gfx_call_external_i1_func_void() #0 { define amdgpu_kernel void @test_call_external_i1_zeroext_func_void() #0 { ; GCN-LABEL: name: test_call_external_i1_zeroext_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY 
$sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i1_zeroext_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: 
$sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_zeroext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY21]], 1 + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY19]], 1 ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERT_ZEXT]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s1) @@ -295,54 +292,53 @@ define amdgpu_kernel void @test_call_external_i1_zeroext_func_void() #0 { define amdgpu_kernel void @test_call_external_i1_signext_func_void() #0 { ; GCN-LABEL: name: test_call_external_i1_signext_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i1_signext_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: 
[[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_signext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY21]], 1 + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY19]], 1 ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERT_SEXT]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s1) @@ -357,54 +353,53 @@ define amdgpu_kernel void @test_call_external_i1_signext_func_void() #0 { define amdgpu_kernel void @test_call_external_i8_func_void() #0 { ; GCN-LABEL: name: test_call_external_i8_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY 
$sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i8_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; 
GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i8_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY21]](s32) + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY19]](s32) ; GCN-NEXT: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC]](s16) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[TRUNC1]](s8), [[DEF]](p1) :: (volatile store (s8) into `ptr addrspace(1) undef`, addrspace 1) @@ -437,54 +432,53 @@ define amdgpu_gfx void @test_gfx_call_external_i8_func_void() #0 { define amdgpu_kernel void @test_call_external_i8_zeroext_func_void() #0 { ; GCN-LABEL: name: test_call_external_i8_zeroext_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i8_zeroext_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: 
[[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i8_zeroext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY21]], 8 + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY19]], 8 ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[ASSERT_ZEXT]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s8) @@ -499,54 +493,53 @@ define amdgpu_kernel void @test_call_external_i8_zeroext_func_void() #0 { define amdgpu_kernel void @test_call_external_i8_signext_func_void() #0 { ; GCN-LABEL: name: test_call_external_i8_signext_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: 
$sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i8_signext_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY 
$private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i8_signext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY21]], 8 + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY19]], 8 ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[ASSERT_SEXT]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s8) @@ -561,54 +554,53 @@ define amdgpu_kernel void @test_call_external_i8_signext_func_void() #0 { define amdgpu_kernel void @test_call_external_i16_func_void() #0 { ; GCN-LABEL: name: test_call_external_i16_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i16_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = 
G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i16_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY21]](s32) + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY19]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[TRUNC]](s16), [[DEF]](p1) :: (volatile store (s16) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -620,54 +612,53 @@ define amdgpu_kernel void @test_call_external_i16_func_void() #0 { define amdgpu_kernel void 
@test_call_external_i16_zeroext_func_void() #0 { ; GCN-LABEL: name: test_call_external_i16_zeroext_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i16_zeroext_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - 
; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i16_zeroext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY21]], 16 + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY19]], 16 ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[ASSERT_ZEXT]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s16) @@ -682,54 +673,53 @@ define amdgpu_kernel void @test_call_external_i16_zeroext_func_void() #0 { define amdgpu_kernel void @test_call_external_i16_signext_func_void() #0 { ; GCN-LABEL: name: test_call_external_i16_signext_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i16_signext_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; 
GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i16_signext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY21]], 16 + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT 
[[COPY19]], 16 ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[ASSERT_SEXT]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s16) @@ -744,55 +734,54 @@ define amdgpu_kernel void @test_call_external_i16_signext_func_void() #0 { define amdgpu_kernel void @test_call_external_i32_func_void() #0 { ; GCN-LABEL: name: test_call_external_i32_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i32_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: 
[[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i32_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc - ; GCN-NEXT: G_STORE [[COPY21]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) + ; GCN-NEXT: G_STORE [[COPY19]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 %val = call i32 @external_i32_func_void() store volatile i32 %val, ptr addrspace(1) undef @@ -820,55 +809,54 @@ define amdgpu_gfx void @test_gfx_call_external_i32_func_void() #0 { define amdgpu_kernel void @test_call_external_i48_func_void() #0 { ; GCN-LABEL: name: test_call_external_i48_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: 
[[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i48_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i48_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, 
implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY19]](s32), [[COPY20]](s32) ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s48) = G_TRUNC [[MV]](s64) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[TRUNC]](s48), [[DEF]](p1) :: (volatile store (s48) into `ptr addrspace(1) undef`, align 8, addrspace 1) @@ -881,55 +869,54 @@ define amdgpu_kernel void @test_call_external_i48_func_void() #0 { define amdgpu_kernel void @test_call_external_i48_zeroext_func_void() #0 { ; GCN-LABEL: name: test_call_external_i48_zeroext_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i48_zeroext_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) 
= G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i48_zeroext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY19]](s32), [[COPY20]](s32) ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s48) = G_TRUNC [[MV]](s64) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s48) @@ -944,55 +931,54 @@ define amdgpu_kernel void @test_call_external_i48_zeroext_func_void() #0 { define amdgpu_kernel void @test_call_external_i48_signext_func_void() #0 { ; GCN-LABEL: name: test_call_external_i48_signext_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: 
[[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i48_signext_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = 
COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i48_signext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY19]](s32), [[COPY20]](s32) ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s48) = G_TRUNC [[MV]](s64) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[TRUNC]](s48) @@ -1007,55 +993,54 @@ define amdgpu_kernel void @test_call_external_i48_signext_func_void() #0 { define amdgpu_kernel void @test_call_external_i64_func_void() #0 { ; GCN-LABEL: name: test_call_external_i64_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i64_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; 
GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i64_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY19]](s32), [[COPY20]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[MV]](s64), [[DEF]](p1) :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -1067,55 +1052,54 @@ define amdgpu_kernel void @test_call_external_i64_func_void() #0 { define amdgpu_kernel void @test_call_external_p1_func_void() #0 { ; GCN-LABEL: name: 
test_call_external_p1_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_p1_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: 
$sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_p1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY19]](s32), [[COPY20]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[MV]](p1), [[DEF]](p1) :: (volatile store (p1) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -1127,58 +1111,57 @@ define amdgpu_kernel void @test_call_external_p1_func_void() #0 { define amdgpu_kernel void @test_call_external_v2p1_func_void() #0 { ; GCN-LABEL: name: test_call_external_v2p1_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v2p1_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: 
[[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v2p1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = 
COPY $vgpr2 - ; GCN-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GCN-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) - ; GCN-NEXT: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY23]](s32), [[COPY24]](s32) + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY19]](s32), [[COPY20]](s32) + ; GCN-NEXT: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[BUILD_VECTOR]](<2 x p1>), [[DEF]](p1) :: (volatile store (<2 x p1>) into `ptr addrspace(1) undef`, addrspace 1) @@ -1191,55 +1174,54 @@ define amdgpu_kernel void @test_call_external_v2p1_func_void() #0 { define amdgpu_kernel void @test_call_external_p3_func_void() #0 { ; GCN-LABEL: name: test_call_external_p3_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p3) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_p3_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY 
[[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_p3_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc - ; GCN-NEXT: G_STORE [[COPY21]](p3), [[DEF]](p3) :: (volatile store (p3) into `ptr addrspace(3) undef`, addrspace 3) + ; GCN-NEXT: G_STORE [[COPY19]](p3), [[DEF]](p3) :: (volatile store (p3) into `ptr addrspace(3) undef`, addrspace 3) ; GCN-NEXT: S_ENDPGM 0 %val = call ptr addrspace(3) @external_p3_func_void() store volatile ptr addrspace(3) %val, ptr addrspace(3) undef @@ -1249,55 +1231,54 @@ define amdgpu_kernel void @test_call_external_p3_func_void() #0 { define amdgpu_kernel void @test_call_external_v2p3_func_void() #0 { ; GCN-LABEL: name: test_call_external_v2p3_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY 
$vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p3) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v2p3_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - 
; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v2p3_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(p3) = COPY $vgpr1 - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[COPY21]](p3), [[COPY22]](p3) + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(p3) = COPY $vgpr1 + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[COPY19]](p3), [[COPY20]](p3) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[BUILD_VECTOR]](<2 x p3>), [[DEF]](p3) :: (volatile store (<2 x p3>) into `ptr addrspace(3) undef`, addrspace 3) ; GCN-NEXT: S_ENDPGM 0 @@ -1309,54 +1290,53 @@ define amdgpu_kernel void @test_call_external_v2p3_func_void() #0 { define amdgpu_kernel void @test_call_external_f16_func_void() #0 { ; GCN-LABEL: name: test_call_external_f16_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_f16_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; 
GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_f16_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY21]](s32) + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY19]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[TRUNC]](s16), [[DEF]](p1) :: (volatile store (s16) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -1368,55 +1348,54 @@ define amdgpu_kernel void @test_call_external_f16_func_void() #0 { define amdgpu_kernel void @test_call_external_f32_func_void() #0 { ; GCN-LABEL: name: test_call_external_f32_func_void ; GCN: bb.1 (%ir-block.0): - ; 
GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_f32_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; 
GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_f32_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc - ; GCN-NEXT: G_STORE [[COPY21]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) + ; GCN-NEXT: G_STORE [[COPY19]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 %val = call float @external_f32_func_void() store volatile float %val, ptr addrspace(1) undef @@ -1426,55 +1405,54 @@ define amdgpu_kernel void @test_call_external_f32_func_void() #0 { define amdgpu_kernel void @test_call_external_f64_func_void() #0 { ; GCN-LABEL: name: test_call_external_f64_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_f64_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY 
[[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_f64_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = 
G_MERGE_VALUES [[COPY19]](s32), [[COPY20]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[MV]](s64), [[DEF]](p1) :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -1486,58 +1464,57 @@ define amdgpu_kernel void @test_call_external_f64_func_void() #0 { define amdgpu_kernel void @test_call_external_v2f64_func_void() #0 { ; GCN-LABEL: name: test_call_external_v2f64_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v2f64_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = 
G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v2f64_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GCN-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) - ; GCN-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY23]](s32), [[COPY24]](s32) + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY19]](s32), [[COPY20]](s32) + ; GCN-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[BUILD_VECTOR]](<2 x s64>), [[DEF]](p1) :: (volatile store (<2 x s64>) into `ptr addrspace(1) undef`, addrspace 1) @@ -1550,55 +1527,54 @@ define amdgpu_kernel void @test_call_external_v2f64_func_void() #0 { define amdgpu_kernel void @test_call_external_v2i32_func_void() #0 { ; GCN-LABEL: name: test_call_external_v2i32_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY 
$sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v2i32_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; 
GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v2i32_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32) + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY19]](s32), [[COPY20]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[BUILD_VECTOR]](<2 x s32>), [[DEF]](p1) :: (volatile store (<2 x s32>) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -1610,56 +1586,55 @@ define amdgpu_kernel void @test_call_external_v2i32_func_void() #0 { define amdgpu_kernel void @test_call_external_v3i32_func_void() #0 { ; GCN-LABEL: name: test_call_external_v3i32_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v3i32_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: 
[[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v3i32_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32) + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE 
[[BUILD_VECTOR]](<3 x s32>), [[DEF]](p1) :: (volatile store (<3 x s32>) into `ptr addrspace(1) undef`, align 8, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -1671,57 +1646,56 @@ define amdgpu_kernel void @test_call_external_v3i32_func_void() #0 { define amdgpu_kernel void @test_call_external_v4i32_func_void() #0 { ; GCN-LABEL: name: test_call_external_v4i32_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v4i32_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; 
GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v4i32_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GCN-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32) + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[BUILD_VECTOR]](<4 x s32>), [[DEF]](p1) :: (volatile store (<4 x s32>) into `ptr addrspace(1) undef`, align 8, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -1733,58 +1707,57 @@ define amdgpu_kernel void @test_call_external_v4i32_func_void() #0 { define amdgpu_kernel void @test_call_external_v5i32_func_void() #0 { ; GCN-LABEL: name: test_call_external_v5i32_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY 
$sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v5i32_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; 
GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v5i32_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GCN-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GCN-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32) + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[BUILD_VECTOR]](<5 x s32>), [[DEF]](p1) :: (volatile store (<5 x s32>) into `ptr addrspace(1) undef`, align 8, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -1796,61 +1769,60 @@ define amdgpu_kernel void @test_call_external_v5i32_func_void() #0 { define amdgpu_kernel void @test_call_external_v8i32_func_void() #0 { ; GCN-LABEL: name: test_call_external_v8i32_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v8i32_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; 
GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v8i32_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GCN-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GCN-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; 
GCN-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GCN-NEXT: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GCN-NEXT: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32) + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GCN-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GCN-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GCN-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[BUILD_VECTOR]](<8 x s32>), [[DEF]](p1) :: (volatile store (<8 x s32>) into `ptr addrspace(1) undef`, align 8, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -1862,69 +1834,68 @@ define amdgpu_kernel void @test_call_external_v8i32_func_void() #0 { define amdgpu_kernel void @test_call_external_v16i32_func_void() #0 { ; GCN-LABEL: name: test_call_external_v16i32_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v16i32_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: 
[[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v16i32_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7, implicit-def $vgpr8, implicit-def $vgpr9, implicit-def $vgpr10, implicit-def $vgpr11, implicit-def $vgpr12, implicit-def $vgpr13, implicit-def $vgpr14, implicit-def $vgpr15 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GCN-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GCN-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GCN-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GCN-NEXT: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GCN-NEXT: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GCN-NEXT: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GCN-NEXT: [[COPY30:%[0-9]+]]:_(s32) = 
COPY $vgpr9 - ; GCN-NEXT: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; GCN-NEXT: [[COPY32:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; GCN-NEXT: [[COPY33:%[0-9]+]]:_(s32) = COPY $vgpr12 - ; GCN-NEXT: [[COPY34:%[0-9]+]]:_(s32) = COPY $vgpr13 - ; GCN-NEXT: [[COPY35:%[0-9]+]]:_(s32) = COPY $vgpr14 - ; GCN-NEXT: [[COPY36:%[0-9]+]]:_(s32) = COPY $vgpr15 - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32), [[COPY32]](s32), [[COPY33]](s32), [[COPY34]](s32), [[COPY35]](s32), [[COPY36]](s32) + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GCN-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GCN-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GCN-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GCN-NEXT: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GCN-NEXT: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; GCN-NEXT: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GCN-NEXT: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GCN-NEXT: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr12 + ; GCN-NEXT: [[COPY32:%[0-9]+]]:_(s32) = COPY $vgpr13 + ; GCN-NEXT: [[COPY33:%[0-9]+]]:_(s32) = COPY $vgpr14 + ; GCN-NEXT: [[COPY34:%[0-9]+]]:_(s32) = COPY $vgpr15 + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32), [[COPY32]](s32), [[COPY33]](s32), [[COPY34]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[BUILD_VECTOR]](<16 x s32>), [[DEF]](p1) :: (volatile store (<16 x s32>) into `ptr addrspace(1) undef`, align 8, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -1936,85 +1907,84 @@ define amdgpu_kernel void @test_call_external_v16i32_func_void() #0 { define amdgpu_kernel void @test_call_external_v32i32_func_void() #0 { ; GCN-LABEL: name: test_call_external_v32i32_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; 
GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v32i32_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v32i32_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, 
implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7, implicit-def $vgpr8, implicit-def $vgpr9, implicit-def $vgpr10, implicit-def $vgpr11, implicit-def $vgpr12, implicit-def $vgpr13, implicit-def $vgpr14, implicit-def $vgpr15, implicit-def $vgpr16, implicit-def $vgpr17, implicit-def $vgpr18, implicit-def $vgpr19, implicit-def $vgpr20, implicit-def $vgpr21, implicit-def $vgpr22, implicit-def $vgpr23, implicit-def $vgpr24, implicit-def $vgpr25, implicit-def $vgpr26, implicit-def $vgpr27, implicit-def $vgpr28, implicit-def $vgpr29, implicit-def $vgpr30, implicit-def $vgpr31 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GCN-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GCN-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GCN-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GCN-NEXT: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GCN-NEXT: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GCN-NEXT: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GCN-NEXT: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr9 - ; GCN-NEXT: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; GCN-NEXT: [[COPY32:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; GCN-NEXT: [[COPY33:%[0-9]+]]:_(s32) = COPY $vgpr12 - ; GCN-NEXT: [[COPY34:%[0-9]+]]:_(s32) = COPY $vgpr13 - ; GCN-NEXT: [[COPY35:%[0-9]+]]:_(s32) = COPY $vgpr14 - ; GCN-NEXT: [[COPY36:%[0-9]+]]:_(s32) = COPY $vgpr15 - ; GCN-NEXT: [[COPY37:%[0-9]+]]:_(s32) = COPY $vgpr16 - ; GCN-NEXT: [[COPY38:%[0-9]+]]:_(s32) = COPY $vgpr17 - ; GCN-NEXT: [[COPY39:%[0-9]+]]:_(s32) = COPY $vgpr18 - ; GCN-NEXT: [[COPY40:%[0-9]+]]:_(s32) = COPY $vgpr19 - ; GCN-NEXT: [[COPY41:%[0-9]+]]:_(s32) = COPY $vgpr20 - ; GCN-NEXT: [[COPY42:%[0-9]+]]:_(s32) = COPY $vgpr21 - ; GCN-NEXT: [[COPY43:%[0-9]+]]:_(s32) = COPY $vgpr22 - ; GCN-NEXT: [[COPY44:%[0-9]+]]:_(s32) = COPY $vgpr23 - ; GCN-NEXT: [[COPY45:%[0-9]+]]:_(s32) = COPY $vgpr24 - ; GCN-NEXT: [[COPY46:%[0-9]+]]:_(s32) = COPY $vgpr25 - ; GCN-NEXT: [[COPY47:%[0-9]+]]:_(s32) = COPY $vgpr26 - ; GCN-NEXT: [[COPY48:%[0-9]+]]:_(s32) = COPY $vgpr27 - ; GCN-NEXT: [[COPY49:%[0-9]+]]:_(s32) = COPY $vgpr28 - ; GCN-NEXT: [[COPY50:%[0-9]+]]:_(s32) = COPY $vgpr29 - ; GCN-NEXT: [[COPY51:%[0-9]+]]:_(s32) = COPY $vgpr30 - ; GCN-NEXT: [[COPY52:%[0-9]+]]:_(s32) = COPY $vgpr31 - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32), [[COPY32]](s32), [[COPY33]](s32), [[COPY34]](s32), [[COPY35]](s32), [[COPY36]](s32), [[COPY37]](s32), [[COPY38]](s32), [[COPY39]](s32), [[COPY40]](s32), [[COPY41]](s32), [[COPY42]](s32), [[COPY43]](s32), [[COPY44]](s32), [[COPY45]](s32), [[COPY46]](s32), [[COPY47]](s32), [[COPY48]](s32), [[COPY49]](s32), [[COPY50]](s32), [[COPY51]](s32), [[COPY52]](s32) + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GCN-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GCN-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GCN-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GCN-NEXT: [[COPY27:%[0-9]+]]:_(s32) = COPY 
$vgpr8 + ; GCN-NEXT: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; GCN-NEXT: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GCN-NEXT: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GCN-NEXT: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr12 + ; GCN-NEXT: [[COPY32:%[0-9]+]]:_(s32) = COPY $vgpr13 + ; GCN-NEXT: [[COPY33:%[0-9]+]]:_(s32) = COPY $vgpr14 + ; GCN-NEXT: [[COPY34:%[0-9]+]]:_(s32) = COPY $vgpr15 + ; GCN-NEXT: [[COPY35:%[0-9]+]]:_(s32) = COPY $vgpr16 + ; GCN-NEXT: [[COPY36:%[0-9]+]]:_(s32) = COPY $vgpr17 + ; GCN-NEXT: [[COPY37:%[0-9]+]]:_(s32) = COPY $vgpr18 + ; GCN-NEXT: [[COPY38:%[0-9]+]]:_(s32) = COPY $vgpr19 + ; GCN-NEXT: [[COPY39:%[0-9]+]]:_(s32) = COPY $vgpr20 + ; GCN-NEXT: [[COPY40:%[0-9]+]]:_(s32) = COPY $vgpr21 + ; GCN-NEXT: [[COPY41:%[0-9]+]]:_(s32) = COPY $vgpr22 + ; GCN-NEXT: [[COPY42:%[0-9]+]]:_(s32) = COPY $vgpr23 + ; GCN-NEXT: [[COPY43:%[0-9]+]]:_(s32) = COPY $vgpr24 + ; GCN-NEXT: [[COPY44:%[0-9]+]]:_(s32) = COPY $vgpr25 + ; GCN-NEXT: [[COPY45:%[0-9]+]]:_(s32) = COPY $vgpr26 + ; GCN-NEXT: [[COPY46:%[0-9]+]]:_(s32) = COPY $vgpr27 + ; GCN-NEXT: [[COPY47:%[0-9]+]]:_(s32) = COPY $vgpr28 + ; GCN-NEXT: [[COPY48:%[0-9]+]]:_(s32) = COPY $vgpr29 + ; GCN-NEXT: [[COPY49:%[0-9]+]]:_(s32) = COPY $vgpr30 + ; GCN-NEXT: [[COPY50:%[0-9]+]]:_(s32) = COPY $vgpr31 + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32), [[COPY32]](s32), [[COPY33]](s32), [[COPY34]](s32), [[COPY35]](s32), [[COPY36]](s32), [[COPY37]](s32), [[COPY38]](s32), [[COPY39]](s32), [[COPY40]](s32), [[COPY41]](s32), [[COPY42]](s32), [[COPY43]](s32), [[COPY44]](s32), [[COPY45]](s32), [[COPY46]](s32), [[COPY47]](s32), [[COPY48]](s32), [[COPY49]](s32), [[COPY50]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store (<32 x s32>) into `ptr addrspace(1) undef`, align 8, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -2026,55 +1996,54 @@ define amdgpu_kernel void @test_call_external_v32i32_func_void() #0 { define amdgpu_kernel void @test_call_external_v2i16_func_void() #0 { ; GCN-LABEL: name: test_call_external_v2i16_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: 
[[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v2i16_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v2i16_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit 
$vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc - ; GCN-NEXT: G_STORE [[COPY21]](<2 x s16>), [[DEF]](p1) :: (volatile store (<2 x s16>) into `ptr addrspace(1) undef`, addrspace 1) + ; GCN-NEXT: G_STORE [[COPY19]](<2 x s16>), [[DEF]](p1) :: (volatile store (<2 x s16>) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 %val = call <2 x i16> @external_v2i16_func_void() store volatile <2 x i16> %val, ptr addrspace(1) undef @@ -2084,55 +2053,54 @@ define amdgpu_kernel void @test_call_external_v2i16_func_void() #0 { define amdgpu_kernel void @test_call_external_v3i16_func_void() #0 { ; GCN-LABEL: name: test_call_external_v3i16_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v3i16_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) 
; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v3i16_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY21]](<2 x s16>), [[COPY22]](<2 x s16>) + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY19]](<2 x s16>), [[COPY20]](<2 x s16>) ; GCN-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s16>) ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2146,55 +2114,54 @@ define amdgpu_kernel void @test_call_external_v3i16_func_void() #0 { define amdgpu_kernel void @test_call_external_v4i16_func_void() #0 { ; GCN-LABEL: name: test_call_external_v4i16_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; 
GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v4i16_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; 
GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v4i16_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY21]](<2 x s16>), [[COPY22]](<2 x s16>) + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY19]](<2 x s16>), [[COPY20]](<2 x s16>) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[CONCAT_VECTORS]](<4 x s16>), [[DEF]](p1) :: (volatile store (<4 x s16>) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -2206,55 +2173,54 @@ define amdgpu_kernel void @test_call_external_v4i16_func_void() #0 { define amdgpu_kernel void @test_call_external_v2f16_func_void() #0 { ; GCN-LABEL: name: test_call_external_v2f16_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v2f16_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD 
[[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v2f16_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc - ; GCN-NEXT: G_STORE [[COPY21]](<2 x s16>), [[DEF]](p1) :: (volatile store (<2 x s16>) into `ptr addrspace(1) undef`, addrspace 1) + ; GCN-NEXT: G_STORE [[COPY19]](<2 x s16>), [[DEF]](p1) :: (volatile store (<2 x s16>) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 %val = call <2 x half> @external_v2f16_func_void() store volatile <2 x half> %val, ptr addrspace(1) undef @@ -2264,55 +2230,54 
@@ define amdgpu_kernel void @test_call_external_v2f16_func_void() #0 { define amdgpu_kernel void @test_call_external_v3f16_func_void() #0 { ; GCN-LABEL: name: test_call_external_v3f16_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v3f16_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL 
[[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v3f16_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY21]](<2 x s16>), [[COPY22]](<2 x s16>) + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY19]](<2 x s16>), [[COPY20]](<2 x s16>) ; GCN-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s16>) ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2326,55 +2291,54 @@ define amdgpu_kernel void @test_call_external_v3f16_func_void() #0 { define amdgpu_kernel void @test_call_external_v4f16_func_void() #0 { ; GCN-LABEL: name: test_call_external_v4f16_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY 
$sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v4f16_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v4f16_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit 
$sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY21]](<2 x s16>), [[COPY22]](<2 x s16>) + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY19]](<2 x s16>), [[COPY20]](<2 x s16>) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[CONCAT_VECTORS]](<4 x s16>), [[DEF]](p1) :: (volatile store (<4 x s16>) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -2386,56 +2350,55 @@ define amdgpu_kernel void @test_call_external_v4f16_func_void() #0 { define amdgpu_kernel void @test_call_external_v3f32_func_void() #0 { ; GCN-LABEL: name: test_call_external_v3f32_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v3f32_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; 
GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v3f32_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32) + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[BUILD_VECTOR]](<3 x s32>), [[DEF]](p1) :: (volatile store (<3 x s32>) into `ptr addrspace(1) undef`, align 16, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -2447,58 +2410,57 @@ define amdgpu_kernel void @test_call_external_v3f32_func_void() #0 { define amdgpu_kernel void @test_call_external_v5f32_func_void() #0 { ; GCN-LABEL: name: test_call_external_v5f32_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; 
GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v5f32_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x 
s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v5f32_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GCN-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GCN-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32) + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[BUILD_VECTOR]](<5 x s32>), [[DEF]](p1) :: (volatile store (<5 x s32>) into `ptr addrspace(1) undef`, align 32, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -2511,58 +2473,57 @@ define amdgpu_kernel void @test_call_external_v5f32_func_void() #0 { define amdgpu_kernel void @test_call_external_i32_i64_func_void() #0 { ; GCN-LABEL: name: test_call_external_i32_i64_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; 
GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i32_i64_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i32_i64_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, 
implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32) + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc - ; GCN-NEXT: G_STORE [[COPY21]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) + ; GCN-NEXT: G_STORE [[COPY19]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: G_STORE [[MV]](s64), [[DEF]](p1) :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 %val = call { i32, i64 } @external_i32_i64_func_void() @@ -2601,57 +2562,56 @@ define amdgpu_gfx void @test_gfx_call_external_i32_i64_func_void() #0 { define amdgpu_kernel void @test_call_external_a2i32_func_void() #0 { ; GCN-LABEL: name: test_call_external_a2i32_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_a2i32_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: 
[[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_a2i32_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc - ; GCN-NEXT: G_STORE [[COPY21]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; GCN-NEXT: G_STORE [[COPY22]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) + ; GCN-NEXT: G_STORE [[COPY19]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) + ; GCN-NEXT: G_STORE [[COPY20]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 %val = call [2 x i32] @external_a2i32_func_void() %val.0 = extractvalue [2 x i32] %val, 0 @@ -2664,66 +2624,65 @@ 
define amdgpu_kernel void @test_call_external_a2i32_func_void() #0 { define amdgpu_kernel void @test_call_external_a5i8_func_void() #0 { ; GCN-LABEL: name: test_call_external_a5i8_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_a5i8_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL 
[[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_a5i8_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY21]](s32) + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY19]](s32) ; GCN-NEXT: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC]](s16) - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY22]](s32) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY20]](s32) ; GCN-NEXT: [[TRUNC3:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC2]](s16) - ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GCN-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY23]](s32) + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY21]](s32) ; GCN-NEXT: [[TRUNC5:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC4]](s16) - ; GCN-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GCN-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[COPY24]](s32) + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[COPY22]](s32) ; GCN-NEXT: [[TRUNC7:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC6]](s16) - ; GCN-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GCN-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[COPY25]](s32) + ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GCN-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[COPY23]](s32) ; GCN-NEXT: [[TRUNC9:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC8]](s16) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[TRUNC1]](s8), [[DEF]](p1) :: (volatile store (s8) into `ptr addrspace(1) undef`, addrspace 1) @@ -2749,52 +2708,51 @@ define amdgpu_kernel void @test_call_external_a5i8_func_void() #0 { define amdgpu_kernel void @test_call_external_v32i32_i32_func_void() #0 { ; GCN-LABEL: name: test_call_external_v32i32_i32_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, 
$sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0 ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v32i32_i32_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; GCN-NEXT: $vgpr0 = COPY [[FRAME_INDEX]](p5) - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = 
COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v32i32_i32_func_void, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2816,52 +2774,51 @@ define amdgpu_kernel void @test_call_external_v32i32_i32_func_void() #0 { define amdgpu_kernel void @test_call_external_i32_v32i32_func_void() #0 { ; GCN-LABEL: name: test_call_external_i32_v32i32_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0 ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i32_v32i32_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: 
[[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; GCN-NEXT: $vgpr0 = COPY [[FRAME_INDEX]](p5) - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i32_v32i32_func_void, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2883,52 +2840,51 @@ define amdgpu_kernel void @test_call_external_i32_v32i32_func_void() #0 { define amdgpu_kernel void @test_call_external_v33i32_func_void() #0 { ; GCN-LABEL: name: test_call_external_v33i32_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: 
[[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0 ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v33i32_func_void - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; GCN-NEXT: $vgpr0 = COPY [[FRAME_INDEX]](p5) - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY 
[[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v33i32_func_void, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2943,18 +2899,17 @@ define amdgpu_kernel void @test_call_external_v33i32_func_void() #0 { define amdgpu_kernel void @test_call_external_v33i32_func_v33i32_i32(ptr addrspace(1) %p, i32 %idx) #0 { ; GCN-LABEL: name: test_call_external_v33i32_func_v33i32_i32 ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; GCN-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[INT]](p4) :: (dereferenceable invariant load (p1) from %ir.p.kernarg.offset1, align 16, addrspace 4) @@ -2964,40 +2919,40 @@ define amdgpu_kernel void @test_call_external_v33i32_func_v33i32_i32(ptr addrspa ; GCN-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0 ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v33i32_func_v33i32_i32 - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GCN-NEXT: 
[[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](p1) ; GCN-NEXT: $vgpr0 = COPY [[FRAME_INDEX]](p5) ; GCN-NEXT: $vgpr1 = COPY [[UV]](s32) ; GCN-NEXT: $vgpr2 = COPY [[UV1]](s32) ; GCN-NEXT: $vgpr3 = COPY [[LOAD1]](s32) - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD1]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v33i32_func_v33i32_i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -3012,3 +2967,6 @@ define amdgpu_kernel void @test_call_external_v33i32_func_v33i32_i32(ptr addrspa attributes #0 = { nounwind } attributes #1 = { nounwind readnone } attributes #2 
= { nounwind noinline } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll index 820e41de575e5..ed68d82997f54 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll @@ -6,18 +6,17 @@ declare hidden void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(p define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(i32) #0 { ; GCN-LABEL: name: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 ; GCN: bb.1 (%ir-block.1): - ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 3 ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF @@ -30,24 +29,24 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; GCN-NEXT: G_STORE [[C1]](s32), [[PTR_ADD]](p5) :: (store (s32) into %ir.in.gep1, addrspace 5) ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C3]](s64) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C3]](s64) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY 
[[COPY4]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C4]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C4]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C5]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C5]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; GCN-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg ; GCN-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 @@ -55,16 +54,16 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; GCN-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; GCN-NEXT: G_MEMCPY [[PTR_ADD2]](p5), [[FRAME_INDEX]](p5), [[C7]](s32), 0 :: (dereferenceable store (s64) into stack, align 4, addrspace 5), (dereferenceable load (s64) from %ir.in.val, align 4, addrspace 5) ; GCN-NEXT: $vgpr0 = COPY [[FRAME_INDEX1]](p5) - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD1]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GCN-NEXT: ADJCALLSTACKDOWN 0, 8, implicit-def $scc @@ -89,3 +88,6 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval store volatile i32 %out.val1, ptr addrspace(1) undef ret void } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll index 2ede504223cb8..e29d1781d3dff 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll @@ -97,49 +97,48 @@ declare hidden amdgpu_gfx void 
@external_gfx_void_func_struct_i8_i32_inreg({ i8, define amdgpu_kernel void @test_call_external_void_func_void() #0 { ; CHECK-LABEL: name: test_call_external_void_func_void ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_void - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: 
[[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -174,12 +173,12 @@ define void @test_func_call_external_void_func_void() #0 { ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_void ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -208,51 +207,50 @@ define void @test_func_call_external_void_func_void() #0 { define amdgpu_kernel void @test_call_external_void_func_empty_struct() #0 { ; CHECK-LABEL: name: test_call_external_void_func_empty_struct ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: 
[[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 23 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_empty_struct - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: $vgpr0 = COPY [[C]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: 
$sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_empty_struct, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -264,51 +262,50 @@ define amdgpu_kernel void @test_call_external_void_func_empty_struct() #0 { define amdgpu_kernel void @test_call_external_void_func_empty_array() #0 { ; CHECK-LABEL: name: test_call_external_void_func_empty_array ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 23 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_empty_array - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY 
[[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: $vgpr0 = COPY [[C]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_empty_array, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -320,52 +317,51 @@ define amdgpu_kernel void @test_call_external_void_func_empty_array() #0 { define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_i1_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: 
[[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i1 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[C]](s1) ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = 
COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -377,54 +373,53 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { ; CHECK-LABEL: name: test_call_external_void_func_i1_signext ; CHECK: bb.1 (%ir-block.1): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s1) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i1_signext - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: 
[[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s1) ; CHECK-NEXT: $vgpr0 = COPY [[SEXT]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1_signext, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -437,54 +432,53 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; CHECK-LABEL: name: test_call_external_void_func_i1_zeroext ; CHECK: bb.1 (%ir-block.1): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = 
COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s1) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i1_zeroext - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s1) ; CHECK-NEXT: $vgpr0 = COPY [[ZEXT]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x 
s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1_zeroext, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -497,54 +491,53 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { ; CHECK-LABEL: name: test_call_external_void_func_i8_imm ; CHECK: bb.1 (%ir-block.1): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 123 ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i8 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: 
[[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[C]](s8) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16) ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT1]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i8, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -556,55 +549,54 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { ; CHECK-LABEL: name: test_call_external_void_func_i8_signext ; CHECK: bb.1 (%ir-block.1): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, 
$sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s8) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i8_signext - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: 
[[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s16) = G_SEXT [[LOAD]](s8) ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[SEXT]](s16) ; CHECK-NEXT: $vgpr0 = COPY [[SEXT1]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i8_signext, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -617,55 +609,54 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { ; CHECK-LABEL: name: test_call_external_void_func_i8_zeroext ; CHECK: bb.1 (%ir-block.1): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s8) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE 
@external_void_func_i8_zeroext - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s16) = G_ZEXT [[LOAD]](s8) ; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ZEXT]](s16) ; CHECK-NEXT: $vgpr0 = COPY [[ZEXT1]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i8_zeroext, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit 
$sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -678,52 +669,51 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_i16_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 123 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i16 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; 
CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[C]](s16) ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i16, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -735,54 +725,53 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { ; CHECK-LABEL: name: test_call_external_void_func_i16_signext ; CHECK: bb.1 (%ir-block.1): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; 
CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s16) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i16_signext - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s16) ; CHECK-NEXT: $vgpr0 = COPY [[SEXT]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY 
[[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i16_signext, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -795,54 +784,53 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { ; CHECK-LABEL: name: test_call_external_void_func_i16_zeroext ; CHECK: bb.1 (%ir-block.1): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s16) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i16_zeroext - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: 
[[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s16) ; CHECK-NEXT: $vgpr0 = COPY [[ZEXT]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i16_zeroext, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -855,52 +843,51 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { ; CHECK-LABEL: name: test_call_external_void_func_i32_imm ; CHECK: bb.1 (%ir-block.1): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = 
COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: $vgpr0 = COPY [[C]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY 
[[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -950,53 +937,52 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_i64_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 123 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i64 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: 
[[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C]](s64) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i64, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1008,56 +994,55 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v2i64 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = 
COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(p1) = G_CONSTANT i64 0 ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[C]](p1) :: ("amdgpu-noclobber" load (<2 x s64>) from `ptr addrspace(1) null`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2i64 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) ; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32) ; CHECK-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY 
[[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2i64, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1070,57 +1055,56 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v2i64_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8589934593 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 17179869187 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C1]](s64) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2i64 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT 
i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C2]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C2]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C3]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C3]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C4]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C4]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s64>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) ; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32) ; CHECK-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2i64, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1132,56 
+1116,55 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_i48(i32) #0 { ; CHECK-LABEL: name: test_call_external_void_func_i48 ; CHECK: bb.1 (%ir-block.1): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s48) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s48) from `ptr addrspace(1) undef`, align 8, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i48 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; 
CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s48) ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ANYEXT]](s64) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i48, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1194,56 +1177,55 @@ define amdgpu_kernel void @test_call_external_void_func_i48(i32) #0 { define amdgpu_kernel void @test_call_external_void_func_i48_signext(i32) #0 { ; CHECK-LABEL: name: test_call_external_void_func_i48_signext ; CHECK: bb.1 (%ir-block.1): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: 
[[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s48) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s48) from `ptr addrspace(1) undef`, align 8, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i48_signext - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[LOAD]](s48) ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SEXT]](s64) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: 
$sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i48_signext, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1256,56 +1238,55 @@ define amdgpu_kernel void @test_call_external_void_func_i48_signext(i32) #0 { define amdgpu_kernel void @test_call_external_void_func_i48_zeroext(i32) #0 { ; CHECK-LABEL: name: test_call_external_void_func_i48_zeroext ; CHECK: bb.1 (%ir-block.1): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s48) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s48) from `ptr addrspace(1) undef`, align 8, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i48_zeroext - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: 
[[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[LOAD]](s48) ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ZEXT]](s64) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i48_zeroext, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1318,54 +1299,53 @@ define amdgpu_kernel void @test_call_external_void_func_i48_zeroext(i32) #0 { define amdgpu_kernel void @test_call_external_void_func_p0_imm(ptr %arg) #0 { ; CHECK-LABEL: name: test_call_external_void_func_p0_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; 
CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p0) = G_LOAD [[INT]](p4) :: (dereferenceable invariant load (p0) from %ir.arg.kernarg.offset1, align 16, addrspace 4) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_p0 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](p0) ; 
CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p0, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1377,56 +1357,55 @@ define amdgpu_kernel void @test_call_external_void_func_p0_imm(ptr %arg) #0 { define amdgpu_kernel void @test_call_external_void_func_v2p0() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v2p0 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(p1) = G_CONSTANT i64 0 ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p0>) = G_LOAD [[C]](p1) :: ("amdgpu-noclobber" load (<2 x p0>) from `ptr addrspace(1) null`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2p0 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: 
[[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x p0>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) ; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32) ; CHECK-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2p0, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit 
$sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1439,18 +1418,17 @@ define amdgpu_kernel void @test_call_external_void_func_v2p0() #0 { define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v3i64 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(p1) = G_CONSTANT i64 0 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8589934593 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF @@ -1459,24 +1437,24 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<3 x s64>) = G_SHUFFLE_VECTOR [[LOAD]](<2 x s64>), [[BUILD_VECTOR]], shufflemask(0, 1, 2) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v3i64 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C2]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C2]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = 
COPY [[COPY1]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C3]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C3]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C4]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C4]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHUF]](<3 x s64>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -1485,16 +1463,16 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; CHECK-NEXT: $vgpr3 = COPY [[UV3]](s32) ; CHECK-NEXT: $vgpr4 = COPY [[UV4]](s32) ; CHECK-NEXT: $vgpr5 = COPY [[UV5]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v3i64, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1509,18 +1487,17 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v4i64 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: 
[[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(p1) = G_CONSTANT i64 0 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8589934593 ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 17179869187 @@ -1529,24 +1506,24 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s64>) = G_SHUFFLE_VECTOR [[LOAD]](<2 x s64>), [[BUILD_VECTOR]], shufflemask(0, 1, 2, 3) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v4i64 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C3]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C3]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C4]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C4]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C5]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C5]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHUF]](<4 x s64>) ; CHECK-NEXT: 
$vgpr0 = COPY [[UV]](s32) @@ -1557,16 +1534,16 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; CHECK-NEXT: $vgpr5 = COPY [[UV5]](s32) ; CHECK-NEXT: $vgpr6 = COPY [[UV6]](s32) ; CHECK-NEXT: $vgpr7 = COPY [[UV7]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v4i64, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1580,52 +1557,51 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_f16_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH4400 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_f16 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: 
[[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[C]](s16) ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f16, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; 
CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1637,51 +1613,50 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_f32_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 4.000000e+00 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_f32 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: 
[[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: $vgpr0 = COPY [[C]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1693,55 +1668,54 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v2f32_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 2.000000e+00 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def 
$scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2f32 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C2]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C2]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C3]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C3]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C4]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C4]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2f32, csr_amdgpu, 
implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1753,57 +1727,56 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v3f32_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 2.000000e+00 ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 4.000000e+00 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32), [[C2]](s32) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v3f32 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C3]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C3]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY 
[[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C4]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C4]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C5]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C5]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) ; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v3f32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1815,18 +1788,17 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v5f32_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: 
[[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 2.000000e+00 ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 4.000000e+00 @@ -1835,24 +1807,24 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32), [[C2]](s32), [[C3]](s32), [[C4]](s32) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v5f32 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C5]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C5]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C6]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C6]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C7]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C7]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<5 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -1860,16 +1832,16 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32) ; CHECK-NEXT: $vgpr3 = COPY [[UV3]](s32) ; 
CHECK-NEXT: $vgpr4 = COPY [[UV4]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v5f32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1881,53 +1853,52 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_f64_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 4.000000e+00 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_f64 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: 
[[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C]](s64) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f64, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1939,57 +1910,56 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { define 
amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v2f64_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 2.000000e+00 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_FCONSTANT double 4.000000e+00 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C1]](s64) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2f64 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C2]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C2]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C3]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C3]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: 
[[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C4]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C4]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s64>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) ; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32) ; CHECK-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2f64, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2001,42 +1971,41 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v3f64_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; 
CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 2.000000e+00 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_FCONSTANT double 4.000000e+00 ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_FCONSTANT double 8.000000e+00 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C1]](s64), [[C2]](s64) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v3f64 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C3]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C3]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C4]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C4]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C5]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C5]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s64>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -2045,16 +2014,16 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; CHECK-NEXT: $vgpr3 = COPY [[UV3]](s32) ; CHECK-NEXT: $vgpr4 = COPY [[UV4]](s32) ; CHECK-NEXT: $vgpr5 = COPY [[UV5]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = 
COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v3f64, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2066,52 +2035,51 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v2i16 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<2 x s16>) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2i16 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: 
[[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2i16, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2124,57 +2092,56 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v3i16 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: 
[[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<3 x s16>) from `ptr addrspace(1) undef`, align 8, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v3i16 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[LOAD]](<3 x s16>) - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF - ; CHECK-NEXT: 
[[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[DEF2]](s16) + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[DEF3]](s16) ; CHECK-NEXT: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s16>) ; CHECK-NEXT: $vgpr0 = COPY [[UV3]](<2 x s16>) ; CHECK-NEXT: $vgpr1 = COPY [[UV4]](<2 x s16>) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v3i16, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2187,57 +2154,56 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v3f16 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s16>) = G_LOAD 
[[DEF]](p1) :: ("amdgpu-noclobber" load (<3 x s16>) from `ptr addrspace(1) undef`, align 8, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v3f16 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[LOAD]](<3 x s16>) - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[DEF2]](s16) + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[DEF3]](s16) ; CHECK-NEXT: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s16>) ; CHECK-NEXT: $vgpr0 = COPY [[UV3]](<2 x s16>) ; CHECK-NEXT: $vgpr1 = COPY [[UV4]](<2 x s16>) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; 
CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v3f16, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2250,54 +2216,53 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v4i16 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<4 x s16>) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v4i16 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = 
G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v4i16, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2310,18 +2275,17 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v4i16_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ 
$}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 2 ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 3 @@ -2329,38 +2293,38 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C1]](s16), [[C2]](s16), [[C3]](s16) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v4i16 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C4]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C4]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C5]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C5]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C6]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C6]](s32) ; CHECK-NEXT: 
[[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s16>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v4i16, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2372,58 +2336,57 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_v5i16() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v5i16 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<5 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<5 x s16>) from `ptr addrspace(1) undef`, align 16, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v5i16 - ; 
CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16), [[UV4:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[LOAD]](<5 x s16>) - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<6 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[DEF2]](s16) + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<6 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[DEF3]](s16) ; CHECK-NEXT: [[UV5:%[0-9]+]]:_(<2 x s16>), [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<6 x s16>) ; CHECK-NEXT: $vgpr0 = COPY [[UV5]](<2 x s16>) ; CHECK-NEXT: $vgpr1 = COPY [[UV6]](<2 x s16>) ; CHECK-NEXT: $vgpr2 = COPY [[UV7]](<2 x s16>) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: 
$sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v5i16, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2436,59 +2399,58 @@ define amdgpu_kernel void @test_call_external_void_func_v5i16() #0 { define amdgpu_kernel void @test_call_external_void_func_v7i16() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v7i16 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<7 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<7 x s16>) from `ptr addrspace(1) undef`, align 16, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v7i16 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: 
[[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16), [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[LOAD]](<7 x s16>) - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16), [[UV6]](s16), [[DEF2]](s16) + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16), [[UV6]](s16), [[DEF3]](s16) ; CHECK-NEXT: [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>), [[UV10:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s16>) ; CHECK-NEXT: $vgpr0 = COPY [[UV7]](<2 x s16>) ; CHECK-NEXT: $vgpr1 = COPY [[UV8]](<2 x s16>) ; CHECK-NEXT: $vgpr2 = COPY [[UV9]](<2 x s16>) ; CHECK-NEXT: $vgpr3 = COPY [[UV10]](<2 x s16>) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v7i16, 
csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2501,44 +2463,43 @@ define amdgpu_kernel void @test_call_external_void_func_v7i16() #0 { define amdgpu_kernel void @test_call_external_void_func_v63i16() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v63i16 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<63 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<63 x s16>) from `ptr addrspace(1) undef`, align 128, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v63i16 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = 
COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16), [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16), [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16), [[UV10:%[0-9]+]]:_(s16), [[UV11:%[0-9]+]]:_(s16), [[UV12:%[0-9]+]]:_(s16), [[UV13:%[0-9]+]]:_(s16), [[UV14:%[0-9]+]]:_(s16), [[UV15:%[0-9]+]]:_(s16), [[UV16:%[0-9]+]]:_(s16), [[UV17:%[0-9]+]]:_(s16), [[UV18:%[0-9]+]]:_(s16), [[UV19:%[0-9]+]]:_(s16), [[UV20:%[0-9]+]]:_(s16), [[UV21:%[0-9]+]]:_(s16), [[UV22:%[0-9]+]]:_(s16), [[UV23:%[0-9]+]]:_(s16), [[UV24:%[0-9]+]]:_(s16), [[UV25:%[0-9]+]]:_(s16), [[UV26:%[0-9]+]]:_(s16), [[UV27:%[0-9]+]]:_(s16), [[UV28:%[0-9]+]]:_(s16), [[UV29:%[0-9]+]]:_(s16), [[UV30:%[0-9]+]]:_(s16), [[UV31:%[0-9]+]]:_(s16), [[UV32:%[0-9]+]]:_(s16), [[UV33:%[0-9]+]]:_(s16), [[UV34:%[0-9]+]]:_(s16), [[UV35:%[0-9]+]]:_(s16), [[UV36:%[0-9]+]]:_(s16), [[UV37:%[0-9]+]]:_(s16), [[UV38:%[0-9]+]]:_(s16), [[UV39:%[0-9]+]]:_(s16), [[UV40:%[0-9]+]]:_(s16), [[UV41:%[0-9]+]]:_(s16), [[UV42:%[0-9]+]]:_(s16), [[UV43:%[0-9]+]]:_(s16), [[UV44:%[0-9]+]]:_(s16), [[UV45:%[0-9]+]]:_(s16), [[UV46:%[0-9]+]]:_(s16), [[UV47:%[0-9]+]]:_(s16), [[UV48:%[0-9]+]]:_(s16), [[UV49:%[0-9]+]]:_(s16), [[UV50:%[0-9]+]]:_(s16), [[UV51:%[0-9]+]]:_(s16), [[UV52:%[0-9]+]]:_(s16), [[UV53:%[0-9]+]]:_(s16), [[UV54:%[0-9]+]]:_(s16), [[UV55:%[0-9]+]]:_(s16), [[UV56:%[0-9]+]]:_(s16), [[UV57:%[0-9]+]]:_(s16), [[UV58:%[0-9]+]]:_(s16), [[UV59:%[0-9]+]]:_(s16), [[UV60:%[0-9]+]]:_(s16), [[UV61:%[0-9]+]]:_(s16), [[UV62:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[LOAD]](<63 x s16>) - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<64 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16), [[UV6]](s16), [[UV7]](s16), [[UV8]](s16), [[UV9]](s16), [[UV10]](s16), [[UV11]](s16), [[UV12]](s16), [[UV13]](s16), [[UV14]](s16), [[UV15]](s16), [[UV16]](s16), [[UV17]](s16), [[UV18]](s16), [[UV19]](s16), [[UV20]](s16), [[UV21]](s16), [[UV22]](s16), [[UV23]](s16), [[UV24]](s16), [[UV25]](s16), [[UV26]](s16), [[UV27]](s16), [[UV28]](s16), [[UV29]](s16), [[UV30]](s16), [[UV31]](s16), [[UV32]](s16), [[UV33]](s16), [[UV34]](s16), [[UV35]](s16), [[UV36]](s16), [[UV37]](s16), [[UV38]](s16), [[UV39]](s16), [[UV40]](s16), [[UV41]](s16), [[UV42]](s16), [[UV43]](s16), [[UV44]](s16), [[UV45]](s16), [[UV46]](s16), [[UV47]](s16), [[UV48]](s16), [[UV49]](s16), [[UV50]](s16), [[UV51]](s16), [[UV52]](s16), [[UV53]](s16), [[UV54]](s16), [[UV55]](s16), [[UV56]](s16), [[UV57]](s16), [[UV58]](s16), [[UV59]](s16), [[UV60]](s16), [[UV61]](s16), [[UV62]](s16), [[DEF2]](s16) + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<64 x s16>) = 
G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16), [[UV6]](s16), [[UV7]](s16), [[UV8]](s16), [[UV9]](s16), [[UV10]](s16), [[UV11]](s16), [[UV12]](s16), [[UV13]](s16), [[UV14]](s16), [[UV15]](s16), [[UV16]](s16), [[UV17]](s16), [[UV18]](s16), [[UV19]](s16), [[UV20]](s16), [[UV21]](s16), [[UV22]](s16), [[UV23]](s16), [[UV24]](s16), [[UV25]](s16), [[UV26]](s16), [[UV27]](s16), [[UV28]](s16), [[UV29]](s16), [[UV30]](s16), [[UV31]](s16), [[UV32]](s16), [[UV33]](s16), [[UV34]](s16), [[UV35]](s16), [[UV36]](s16), [[UV37]](s16), [[UV38]](s16), [[UV39]](s16), [[UV40]](s16), [[UV41]](s16), [[UV42]](s16), [[UV43]](s16), [[UV44]](s16), [[UV45]](s16), [[UV46]](s16), [[UV47]](s16), [[UV48]](s16), [[UV49]](s16), [[UV50]](s16), [[UV51]](s16), [[UV52]](s16), [[UV53]](s16), [[UV54]](s16), [[UV55]](s16), [[UV56]](s16), [[UV57]](s16), [[UV58]](s16), [[UV59]](s16), [[UV60]](s16), [[UV61]](s16), [[UV62]](s16), [[DEF3]](s16) ; CHECK-NEXT: [[UV63:%[0-9]+]]:_(<2 x s16>), [[UV64:%[0-9]+]]:_(<2 x s16>), [[UV65:%[0-9]+]]:_(<2 x s16>), [[UV66:%[0-9]+]]:_(<2 x s16>), [[UV67:%[0-9]+]]:_(<2 x s16>), [[UV68:%[0-9]+]]:_(<2 x s16>), [[UV69:%[0-9]+]]:_(<2 x s16>), [[UV70:%[0-9]+]]:_(<2 x s16>), [[UV71:%[0-9]+]]:_(<2 x s16>), [[UV72:%[0-9]+]]:_(<2 x s16>), [[UV73:%[0-9]+]]:_(<2 x s16>), [[UV74:%[0-9]+]]:_(<2 x s16>), [[UV75:%[0-9]+]]:_(<2 x s16>), [[UV76:%[0-9]+]]:_(<2 x s16>), [[UV77:%[0-9]+]]:_(<2 x s16>), [[UV78:%[0-9]+]]:_(<2 x s16>), [[UV79:%[0-9]+]]:_(<2 x s16>), [[UV80:%[0-9]+]]:_(<2 x s16>), [[UV81:%[0-9]+]]:_(<2 x s16>), [[UV82:%[0-9]+]]:_(<2 x s16>), [[UV83:%[0-9]+]]:_(<2 x s16>), [[UV84:%[0-9]+]]:_(<2 x s16>), [[UV85:%[0-9]+]]:_(<2 x s16>), [[UV86:%[0-9]+]]:_(<2 x s16>), [[UV87:%[0-9]+]]:_(<2 x s16>), [[UV88:%[0-9]+]]:_(<2 x s16>), [[UV89:%[0-9]+]]:_(<2 x s16>), [[UV90:%[0-9]+]]:_(<2 x s16>), [[UV91:%[0-9]+]]:_(<2 x s16>), [[UV92:%[0-9]+]]:_(<2 x s16>), [[UV93:%[0-9]+]]:_(<2 x s16>), [[UV94:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<64 x s16>) ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 @@ -2575,16 +2536,16 @@ define amdgpu_kernel void @test_call_external_void_func_v63i16() #0 { ; CHECK-NEXT: $vgpr28 = COPY [[UV91]](<2 x s16>) ; CHECK-NEXT: $vgpr29 = COPY [[UV92]](<2 x s16>) ; CHECK-NEXT: $vgpr30 = COPY [[UV93]](<2 x s16>) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v63i16, 
csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 4, implicit-def $scc @@ -2597,44 +2558,43 @@ define amdgpu_kernel void @test_call_external_void_func_v63i16() #0 { define amdgpu_kernel void @test_call_external_void_func_v65i16() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v65i16 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<65 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<65 x s16>) from `ptr addrspace(1) undef`, align 256, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v65i16 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY 
[[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16), [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16), [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16), [[UV10:%[0-9]+]]:_(s16), [[UV11:%[0-9]+]]:_(s16), [[UV12:%[0-9]+]]:_(s16), [[UV13:%[0-9]+]]:_(s16), [[UV14:%[0-9]+]]:_(s16), [[UV15:%[0-9]+]]:_(s16), [[UV16:%[0-9]+]]:_(s16), [[UV17:%[0-9]+]]:_(s16), [[UV18:%[0-9]+]]:_(s16), [[UV19:%[0-9]+]]:_(s16), [[UV20:%[0-9]+]]:_(s16), [[UV21:%[0-9]+]]:_(s16), [[UV22:%[0-9]+]]:_(s16), [[UV23:%[0-9]+]]:_(s16), [[UV24:%[0-9]+]]:_(s16), [[UV25:%[0-9]+]]:_(s16), [[UV26:%[0-9]+]]:_(s16), [[UV27:%[0-9]+]]:_(s16), [[UV28:%[0-9]+]]:_(s16), [[UV29:%[0-9]+]]:_(s16), [[UV30:%[0-9]+]]:_(s16), [[UV31:%[0-9]+]]:_(s16), [[UV32:%[0-9]+]]:_(s16), [[UV33:%[0-9]+]]:_(s16), [[UV34:%[0-9]+]]:_(s16), [[UV35:%[0-9]+]]:_(s16), [[UV36:%[0-9]+]]:_(s16), [[UV37:%[0-9]+]]:_(s16), [[UV38:%[0-9]+]]:_(s16), [[UV39:%[0-9]+]]:_(s16), [[UV40:%[0-9]+]]:_(s16), [[UV41:%[0-9]+]]:_(s16), [[UV42:%[0-9]+]]:_(s16), [[UV43:%[0-9]+]]:_(s16), [[UV44:%[0-9]+]]:_(s16), [[UV45:%[0-9]+]]:_(s16), [[UV46:%[0-9]+]]:_(s16), [[UV47:%[0-9]+]]:_(s16), [[UV48:%[0-9]+]]:_(s16), [[UV49:%[0-9]+]]:_(s16), [[UV50:%[0-9]+]]:_(s16), [[UV51:%[0-9]+]]:_(s16), [[UV52:%[0-9]+]]:_(s16), [[UV53:%[0-9]+]]:_(s16), [[UV54:%[0-9]+]]:_(s16), [[UV55:%[0-9]+]]:_(s16), [[UV56:%[0-9]+]]:_(s16), [[UV57:%[0-9]+]]:_(s16), [[UV58:%[0-9]+]]:_(s16), [[UV59:%[0-9]+]]:_(s16), [[UV60:%[0-9]+]]:_(s16), [[UV61:%[0-9]+]]:_(s16), [[UV62:%[0-9]+]]:_(s16), [[UV63:%[0-9]+]]:_(s16), [[UV64:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[LOAD]](<65 x s16>) - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<66 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16), [[UV6]](s16), [[UV7]](s16), [[UV8]](s16), [[UV9]](s16), [[UV10]](s16), [[UV11]](s16), [[UV12]](s16), [[UV13]](s16), [[UV14]](s16), [[UV15]](s16), [[UV16]](s16), [[UV17]](s16), [[UV18]](s16), [[UV19]](s16), [[UV20]](s16), [[UV21]](s16), [[UV22]](s16), [[UV23]](s16), [[UV24]](s16), [[UV25]](s16), [[UV26]](s16), [[UV27]](s16), [[UV28]](s16), [[UV29]](s16), [[UV30]](s16), [[UV31]](s16), [[UV32]](s16), [[UV33]](s16), [[UV34]](s16), [[UV35]](s16), 
[[UV36]](s16), [[UV37]](s16), [[UV38]](s16), [[UV39]](s16), [[UV40]](s16), [[UV41]](s16), [[UV42]](s16), [[UV43]](s16), [[UV44]](s16), [[UV45]](s16), [[UV46]](s16), [[UV47]](s16), [[UV48]](s16), [[UV49]](s16), [[UV50]](s16), [[UV51]](s16), [[UV52]](s16), [[UV53]](s16), [[UV54]](s16), [[UV55]](s16), [[UV56]](s16), [[UV57]](s16), [[UV58]](s16), [[UV59]](s16), [[UV60]](s16), [[UV61]](s16), [[UV62]](s16), [[UV63]](s16), [[UV64]](s16), [[DEF2]](s16) + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<66 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16), [[UV6]](s16), [[UV7]](s16), [[UV8]](s16), [[UV9]](s16), [[UV10]](s16), [[UV11]](s16), [[UV12]](s16), [[UV13]](s16), [[UV14]](s16), [[UV15]](s16), [[UV16]](s16), [[UV17]](s16), [[UV18]](s16), [[UV19]](s16), [[UV20]](s16), [[UV21]](s16), [[UV22]](s16), [[UV23]](s16), [[UV24]](s16), [[UV25]](s16), [[UV26]](s16), [[UV27]](s16), [[UV28]](s16), [[UV29]](s16), [[UV30]](s16), [[UV31]](s16), [[UV32]](s16), [[UV33]](s16), [[UV34]](s16), [[UV35]](s16), [[UV36]](s16), [[UV37]](s16), [[UV38]](s16), [[UV39]](s16), [[UV40]](s16), [[UV41]](s16), [[UV42]](s16), [[UV43]](s16), [[UV44]](s16), [[UV45]](s16), [[UV46]](s16), [[UV47]](s16), [[UV48]](s16), [[UV49]](s16), [[UV50]](s16), [[UV51]](s16), [[UV52]](s16), [[UV53]](s16), [[UV54]](s16), [[UV55]](s16), [[UV56]](s16), [[UV57]](s16), [[UV58]](s16), [[UV59]](s16), [[UV60]](s16), [[UV61]](s16), [[UV62]](s16), [[UV63]](s16), [[UV64]](s16), [[DEF3]](s16) ; CHECK-NEXT: [[UV65:%[0-9]+]]:_(<2 x s16>), [[UV66:%[0-9]+]]:_(<2 x s16>), [[UV67:%[0-9]+]]:_(<2 x s16>), [[UV68:%[0-9]+]]:_(<2 x s16>), [[UV69:%[0-9]+]]:_(<2 x s16>), [[UV70:%[0-9]+]]:_(<2 x s16>), [[UV71:%[0-9]+]]:_(<2 x s16>), [[UV72:%[0-9]+]]:_(<2 x s16>), [[UV73:%[0-9]+]]:_(<2 x s16>), [[UV74:%[0-9]+]]:_(<2 x s16>), [[UV75:%[0-9]+]]:_(<2 x s16>), [[UV76:%[0-9]+]]:_(<2 x s16>), [[UV77:%[0-9]+]]:_(<2 x s16>), [[UV78:%[0-9]+]]:_(<2 x s16>), [[UV79:%[0-9]+]]:_(<2 x s16>), [[UV80:%[0-9]+]]:_(<2 x s16>), [[UV81:%[0-9]+]]:_(<2 x s16>), [[UV82:%[0-9]+]]:_(<2 x s16>), [[UV83:%[0-9]+]]:_(<2 x s16>), [[UV84:%[0-9]+]]:_(<2 x s16>), [[UV85:%[0-9]+]]:_(<2 x s16>), [[UV86:%[0-9]+]]:_(<2 x s16>), [[UV87:%[0-9]+]]:_(<2 x s16>), [[UV88:%[0-9]+]]:_(<2 x s16>), [[UV89:%[0-9]+]]:_(<2 x s16>), [[UV90:%[0-9]+]]:_(<2 x s16>), [[UV91:%[0-9]+]]:_(<2 x s16>), [[UV92:%[0-9]+]]:_(<2 x s16>), [[UV93:%[0-9]+]]:_(<2 x s16>), [[UV94:%[0-9]+]]:_(<2 x s16>), [[UV95:%[0-9]+]]:_(<2 x s16>), [[UV96:%[0-9]+]]:_(<2 x s16>), [[UV97:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<66 x s16>) ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 @@ -2674,16 +2634,16 @@ define amdgpu_kernel void @test_call_external_void_func_v65i16() #0 { ; CHECK-NEXT: $vgpr28 = COPY [[UV93]](<2 x s16>) ; CHECK-NEXT: $vgpr29 = COPY [[UV94]](<2 x s16>) ; CHECK-NEXT: $vgpr30 = COPY [[UV95]](<2 x s16>) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY 
[[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v65i16, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 8, implicit-def $scc @@ -2696,40 +2656,39 @@ define amdgpu_kernel void @test_call_external_void_func_v65i16() #0 { define amdgpu_kernel void @test_call_external_void_func_v66i16() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v66i16 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<66 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<66 x s16>) from `ptr addrspace(1) undef`, align 256, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v66i16 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: 
[[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>), [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>), [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>), [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>), [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>), [[UV16:%[0-9]+]]:_(<2 x s16>), [[UV17:%[0-9]+]]:_(<2 x s16>), [[UV18:%[0-9]+]]:_(<2 x s16>), [[UV19:%[0-9]+]]:_(<2 x s16>), [[UV20:%[0-9]+]]:_(<2 x s16>), [[UV21:%[0-9]+]]:_(<2 x s16>), [[UV22:%[0-9]+]]:_(<2 x s16>), [[UV23:%[0-9]+]]:_(<2 x s16>), [[UV24:%[0-9]+]]:_(<2 x s16>), [[UV25:%[0-9]+]]:_(<2 x s16>), [[UV26:%[0-9]+]]:_(<2 x s16>), [[UV27:%[0-9]+]]:_(<2 x s16>), [[UV28:%[0-9]+]]:_(<2 x s16>), [[UV29:%[0-9]+]]:_(<2 x s16>), [[UV30:%[0-9]+]]:_(<2 x s16>), [[UV31:%[0-9]+]]:_(<2 x s16>), [[UV32:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<66 x s16>) ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg @@ -2770,16 +2729,16 @@ define amdgpu_kernel void @test_call_external_void_func_v66i16() #0 { ; CHECK-NEXT: $vgpr28 = COPY [[UV28]](<2 x s16>) ; CHECK-NEXT: $vgpr29 = COPY [[UV29]](<2 x s16>) ; CHECK-NEXT: $vgpr30 = COPY [[UV30]](<2 x s16>) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = 
COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v66i16, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 8, implicit-def $scc @@ -2792,52 +2751,51 @@ define amdgpu_kernel void @test_call_external_void_func_v66i16() #0 { define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v2f16 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<2 x s16>) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2f16 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; 
CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2f16, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def 
$scc @@ -2850,54 +2808,53 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v2i32 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<2 x s32>) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2i32 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: 
[[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2910,55 +2867,54 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v2i32_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2i32 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C2]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C2]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C3]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C3]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C4]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C4]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY 
[[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2970,18 +2926,17 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { ; CHECK-LABEL: name: test_call_external_void_func_v3i32_imm ; CHECK: bb.1 (%ir-block.1): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 @@ -2989,39 +2944,39 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v3i32 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C3]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C3]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: 
[[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C4]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C4]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C5]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C5]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) ; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v3i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -3033,18 +2988,17 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; CHECK-LABEL: name: test_call_external_void_func_v3i32_i32 ; CHECK: bb.1 (%ir-block.1): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 
= COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 @@ -3053,40 +3007,40 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v3i32_i32 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C4]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C4]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C5]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C5]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C6]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C6]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY 
[[UV1]](s32) ; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32) ; CHECK-NEXT: $vgpr3 = COPY [[C3]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v3i32_i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -3098,56 +3052,55 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v4i32 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<4 x s32>) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v4i32 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: 
[[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<4 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) ; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32) ; CHECK-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v4i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit 
$sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -3160,18 +3113,17 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v4i32_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 @@ -3179,40 +3131,40 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32), [[C2]](s32), [[C3]](s32) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v4i32 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C4]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C4]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: 
[[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C5]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C5]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C6]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C6]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) ; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32) ; CHECK-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v4i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -3224,18 +3176,17 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v5i32_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: 
[[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 @@ -3244,24 +3195,24 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32), [[C2]](s32), [[C3]](s32), [[C4]](s32) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v5i32 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C5]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C5]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C6]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C6]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C7]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C7]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<5 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -3269,16 +3220,16 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; 
CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32) ; CHECK-NEXT: $vgpr3 = COPY [[UV3]](s32) ; CHECK-NEXT: $vgpr4 = COPY [[UV4]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v5i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -3290,41 +3241,40 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v8i32 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (invariant load (p1) from `ptr addrspace(4) undef`, addrspace 4) ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (<8 x s32>) from %ir.ptr, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE 
@external_void_func_v8i32 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<8 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -3335,16 +3285,16 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ; CHECK-NEXT: $vgpr5 = COPY [[UV5]](s32) ; CHECK-NEXT: $vgpr6 = COPY [[UV6]](s32) ; CHECK-NEXT: $vgpr7 = COPY [[UV7]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY 
[[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v8i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -3358,18 +3308,17 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v8i32_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 @@ -3381,24 +3330,24 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32), [[C2]](s32), [[C3]](s32), [[C4]](s32), [[C5]](s32), [[C6]](s32), [[C7]](s32) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v8i32 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C8]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; 
CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C8]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C9]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C9]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C10]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C10]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -3409,16 +3358,16 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; CHECK-NEXT: $vgpr5 = COPY [[UV5]](s32) ; CHECK-NEXT: $vgpr6 = COPY [[UV6]](s32) ; CHECK-NEXT: $vgpr7 = COPY [[UV7]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v8i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -3430,41 +3379,40 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; CHECK-LABEL: name: 
test_call_external_void_func_v16i32 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (invariant load (p1) from `ptr addrspace(4) undef`, addrspace 4) ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (<16 x s32>) from %ir.ptr, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v16i32 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: 
[[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<16 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -3483,16 +3431,16 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; CHECK-NEXT: $vgpr13 = COPY [[UV13]](s32) ; CHECK-NEXT: $vgpr14 = COPY [[UV14]](s32) ; CHECK-NEXT: $vgpr15 = COPY [[UV15]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v16i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -3506,41 +3454,40 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v32i32 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; 
CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (invariant load (p1) from `ptr addrspace(4) undef`, addrspace 4) ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<32 x s32>) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (<32 x s32>) from %ir.ptr, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), 
[[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<32 x s32>) ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg @@ -3578,16 +3525,16 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; CHECK-NEXT: $vgpr28 = COPY [[UV28]](s32) ; CHECK-NEXT: $vgpr29 = COPY [[UV29]](s32) ; CHECK-NEXT: $vgpr30 = COPY [[UV30]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v32i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 4, implicit-def $scc @@ -3601,18 +3548,17 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; CHECK-LABEL: name: test_call_external_void_func_v32i32_i32 ; CHECK: bb.1 (%ir-block.1): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: 
[[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) @@ -3621,24 +3567,24 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (s32) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32_i32 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), 
[[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<32 x s32>) ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg @@ -3679,16 +3625,16 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; CHECK-NEXT: $vgpr28 = COPY [[UV28]](s32) ; CHECK-NEXT: $vgpr29 = COPY [[UV29]](s32) ; CHECK-NEXT: $vgpr30 = COPY [[UV30]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF2]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF3]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v32i32_i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 8, implicit-def $scc @@ -3703,18 +3649,17 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { define amdgpu_kernel void @test_call_external_void_func_v32i32_i8_i8_i16() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v32i32_i8_i8_i16 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, 
$sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (invariant load (p1) from `ptr addrspace(4) undef`, addrspace 4) @@ -3723,24 +3668,24 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i8_i8_i16() #0 { ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(s16) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (s16) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32_i8_i8_i16 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; 
CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<32 x s32>) ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg @@ -3751,10 +3696,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i8_i8_i16() #0 { ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C4]](s32) ; CHECK-NEXT: G_STORE [[ANYEXT]](s16), [[PTR_ADD2]](p5) :: (store (s16) into stack + 4, align 4, addrspace 5) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(s16) = COPY [[ANYEXT]](s16) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s16) = COPY [[ANYEXT]](s16) ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C5]](s32) - ; CHECK-NEXT: G_STORE [[COPY20]](s16), [[PTR_ADD3]](p5) :: (store (s16) into stack + 8, align 8, addrspace 5) + ; CHECK-NEXT: G_STORE [[COPY18]](s16), [[PTR_ADD3]](p5) :: (store (s16) into stack + 8, align 8, addrspace 5) ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 ; CHECK-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C6]](s32) ; CHECK-NEXT: G_STORE [[LOAD3]](s16), [[PTR_ADD4]](p5) :: (store (s16) into stack + 12, align 4, addrspace 5) @@ -3789,16 +3734,16 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i8_i8_i16() #0 { ; CHECK-NEXT: $vgpr28 = COPY [[UV28]](s32) ; CHECK-NEXT: $vgpr29 = COPY [[UV29]](s32) ; CHECK-NEXT: $vgpr30 = COPY [[UV30]](s32) - ; CHECK-NEXT: [[COPY21:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY21]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF2]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF3]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: 
$sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v32i32_i8_i8_i16, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 16, implicit-def $scc @@ -3815,18 +3760,17 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i8_i8_i16() #0 { define amdgpu_kernel void @test_call_external_void_func_v32i32_p3_p5() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v32i32_p3_p5 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (invariant load (p1) from `ptr addrspace(4) undef`, addrspace 4) @@ -3835,24 +3779,24 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_p3_p5() #0 { ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(p5) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (p5) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32_p3_p5 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = 
COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<32 x s32>) ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg @@ -3896,16 +3840,16 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_p3_p5() #0 { ; CHECK-NEXT: $vgpr28 = COPY [[UV28]](s32) ; CHECK-NEXT: $vgpr29 = COPY [[UV29]](s32) ; CHECK-NEXT: $vgpr30 = COPY [[UV30]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF2]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 
= COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF3]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v32i32_p3_p5, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 12, implicit-def $scc @@ -3921,18 +3865,17 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_p3_p5() #0 { define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; CHECK-LABEL: name: test_call_external_void_func_struct_i8_i32 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (invariant load (p1) from `ptr addrspace(4) undef`, addrspace 4) ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (s8) from %ir.ptr0, align 4, addrspace 1) @@ -3941,39 +3884,39 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: ("amdgpu-noclobber" load (s32) from %ir.ptr0 + 4, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_struct_i8_i32 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: 
[[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16) ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT1]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[LOAD2]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD1]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_struct_i8_i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit 
$sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -4039,18 +3982,17 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() # define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 { ; CHECK-LABEL: name: test_call_external_void_func_byval_struct_i8_i32 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 3 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0.val @@ -4060,40 +4002,40 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; CHECK-NEXT: G_STORE [[C1]](s32), [[PTR_ADD]](p5) :: (store (s32) into %ir.gep1, addrspace 5) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_byval_struct_i8_i32 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C3]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C3]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: 
[[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C4]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C4]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C5]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C5]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C6]](s32) ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; CHECK-NEXT: G_MEMCPY [[PTR_ADD2]](p5), [[FRAME_INDEX]](p5), [[C7]](s32), 0 :: (dereferenceable store (s64) into stack, align 4, addrspace 5), (dereferenceable load (s64) from %ir.val, align 4, addrspace 5) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD1]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_byval_struct_i8_i32, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 8, implicit-def $scc @@ -4121,7 +4063,7 @@ define void @call_byval_3ai32_byval_i8_align32(ptr addrspace(5) %incoming0, ptr ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p5) = COPY $vgpr0 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p5) = COPY $vgpr1 @@ -4129,7 +4071,7 @@ define void @call_byval_3ai32_byval_i8_align32(ptr addrspace(5) %incoming0, ptr ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_byval_a3i32_byval_i8_align32 ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: 
[[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -4181,13 +4123,13 @@ define void @call_byval_a4i64_align4_higher_source_align(ptr addrspace(5) align ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p5) = COPY $vgpr0 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_byval_a4i64_align4 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -4221,41 +4163,40 @@ define void @call_byval_a4i64_align4_higher_source_align(ptr addrspace(5) align define amdgpu_kernel void @test_call_external_void_func_v2i8() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v2i8 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (invariant load (p1) from `ptr addrspace(4) undef`, addrspace 4) ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s8>) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (<2 x s8>) from %ir.ptr, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2i8 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; 
CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[LOAD1]](<2 x s8>) ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[UV]](s8) @@ -4264,16 +4205,16 @@ define amdgpu_kernel void @test_call_external_void_func_v2i8() #0 { ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT2]](s32) ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT1]](s16) ; CHECK-NEXT: $vgpr1 = COPY [[ANYEXT3]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2i8, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, 
implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -4287,41 +4228,40 @@ define amdgpu_kernel void @test_call_external_void_func_v2i8() #0 { define amdgpu_kernel void @test_call_external_void_func_v3i8() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v3i8 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (invariant load (p1) from `ptr addrspace(4) undef`, addrspace 4) ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s8>) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (<3 x s8>) from %ir.ptr, align 4, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v3i8 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: 
[[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[LOAD1]](<3 x s8>) ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[UV]](s8) @@ -4333,16 +4273,16 @@ define amdgpu_kernel void @test_call_external_void_func_v3i8() #0 { ; CHECK-NEXT: $vgpr1 = COPY [[ANYEXT4]](s32) ; CHECK-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT2]](s16) ; CHECK-NEXT: $vgpr2 = COPY [[ANYEXT5]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v3i8, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -4356,41 +4296,40 @@ define amdgpu_kernel void @test_call_external_void_func_v3i8() #0 { define amdgpu_kernel void @test_call_external_void_func_v4i8() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v4i8 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; 
CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (invariant load (p1) from `ptr addrspace(4) undef`, addrspace 4) ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s8>) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (<4 x s8>) from %ir.ptr, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v4i8 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8), [[UV3:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[LOAD1]](<4 x s8>) ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[UV]](s8) @@ -4405,16 +4344,16 @@ define amdgpu_kernel void @test_call_external_void_func_v4i8() #0 { ; CHECK-NEXT: $vgpr2 = COPY [[ANYEXT6]](s32) ; CHECK-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT3]](s16) ; CHECK-NEXT: $vgpr3 = COPY [[ANYEXT7]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY 
[[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v4i8, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -4428,41 +4367,40 @@ define amdgpu_kernel void @test_call_external_void_func_v4i8() #0 { define amdgpu_kernel void @test_call_external_void_func_v8i8() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v8i8 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (invariant load (p1) from `ptr addrspace(4) undef`, addrspace 4) ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<8 x s8>) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (<8 x s8>) from %ir.ptr, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v8i8 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: 
[[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8), [[UV3:%[0-9]+]]:_(s8), [[UV4:%[0-9]+]]:_(s8), [[UV5:%[0-9]+]]:_(s8), [[UV6:%[0-9]+]]:_(s8), [[UV7:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[LOAD1]](<8 x s8>) ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[UV]](s8) @@ -4489,16 +4427,16 @@ define amdgpu_kernel void @test_call_external_void_func_v8i8() #0 { ; CHECK-NEXT: $vgpr6 = COPY [[ANYEXT14]](s32) ; CHECK-NEXT: [[ANYEXT15:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT7]](s16) ; CHECK-NEXT: $vgpr7 = COPY [[ANYEXT15]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), 
@external_void_func_v8i8, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -4512,41 +4450,40 @@ define amdgpu_kernel void @test_call_external_void_func_v8i8() #0 { define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v16i8 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (invariant load (p1) from `ptr addrspace(4) undef`, addrspace 4) ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s8>) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (<16 x s8>) from %ir.ptr, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v16i8 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: 
[[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8), [[UV3:%[0-9]+]]:_(s8), [[UV4:%[0-9]+]]:_(s8), [[UV5:%[0-9]+]]:_(s8), [[UV6:%[0-9]+]]:_(s8), [[UV7:%[0-9]+]]:_(s8), [[UV8:%[0-9]+]]:_(s8), [[UV9:%[0-9]+]]:_(s8), [[UV10:%[0-9]+]]:_(s8), [[UV11:%[0-9]+]]:_(s8), [[UV12:%[0-9]+]]:_(s8), [[UV13:%[0-9]+]]:_(s8), [[UV14:%[0-9]+]]:_(s8), [[UV15:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[LOAD1]](<16 x s8>) ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[UV]](s8) @@ -4597,16 +4534,16 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { ; CHECK-NEXT: $vgpr14 = COPY [[ANYEXT30]](s32) ; CHECK-NEXT: [[ANYEXT31:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT15]](s16) ; CHECK-NEXT: $vgpr15 = COPY [[ANYEXT31]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v16i8, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -4620,18 +4557,17 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { define amdgpu_kernel void 
@stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 { ; CHECK-LABEL: name: stack_passed_arg_alignment_v32i32_f64 ; CHECK: bb.1.entry: - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<32 x s32>) = G_LOAD [[INT]](p4) :: (dereferenceable invariant load (<32 x s32>) from %ir.val.kernarg.offset1, align 16, addrspace 4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 128 @@ -4639,24 +4575,24 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s64) from %ir.tmp.kernarg.offset, align 16, addrspace 4) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @stack_passed_f64_arg - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 136 - ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; 
CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<32 x s32>) ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg @@ -4701,16 +4637,16 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; CHECK-NEXT: $vgpr28 = COPY [[UV28]](s32) ; CHECK-NEXT: $vgpr29 = COPY [[UV29]](s32) ; CHECK-NEXT: $vgpr30 = COPY [[UV30]](s32) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD1]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @stack_passed_f64_arg, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit 
$sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 12, implicit-def $scc @@ -4732,7 +4668,7 @@ define void @stack_12xv3i32() #0 { ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32) @@ -4765,7 +4701,7 @@ define void @stack_12xv3i32() #0 { ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_12xv3i32 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -4875,7 +4811,7 @@ define void @stack_12xv3f32() #0 { ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32) @@ -4908,7 +4844,7 @@ define void @stack_12xv3f32() #0 { ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_12xv3f32 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -5018,7 +4954,7 @@ define void @stack_8xv5i32() #0 { ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32) @@ -5047,7 +4983,7 @@ define void @stack_8xv5i32() #0 { ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_8xv5i32 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: 
[[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -5161,7 +5097,7 @@ define void @stack_8xv5f32() #0 { ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32) @@ -5190,7 +5126,7 @@ define void @stack_8xv5f32() #0 { ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_8xv5f32 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -5330,3 +5266,6 @@ main_body: attributes #0 = { nounwind } attributes #1 = { nounwind readnone } attributes #2 = { nounwind noinline } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constant-fold-vector-op.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constant-fold-vector-op.ll index 21e280e9ba559..ab407079abc66 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constant-fold-vector-op.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constant-fold-vector-op.ll @@ -7,9 +7,9 @@ define amdgpu_kernel void @constant_fold_vector_add() { ; CHECK-LABEL: name: constant_fold_vector_add ; CHECK: bb.1.entry: - ; CHECK-NEXT: liveins: $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C]](s64), [[C]](s64), [[C]](s64) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0 @@ -22,3 +22,6 @@ entry: store <4 x i64> %add, ptr addrspace(1) null, align 32 ret void } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll index 4eba84f61c2d8..0fb13bf9c8a2b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll @@ -4,50 +4,49 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(ptr %fptr) { ; CHECK-LABEL: name: test_indirect_call_sgpr_ptr ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; 
CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p0) = G_LOAD [[INT]](p4) :: (dereferenceable invariant load (p0) from %ir.fptr.kernarg.offset1, align 16, addrspace 4) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY 
[[COPY18]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[LOAD]](p0), 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -73,3 +72,6 @@ define amdgpu_gfx void @test_gfx_indirect_call_sgpr_ptr(ptr %fptr) { call amdgpu_gfx void %fptr() ret void } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll index bb37e54e3b566..ceff84ea18122 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll @@ -4,10 +4,10 @@ define amdgpu_kernel void @asm_convergent() convergent{ ; CHECK-LABEL: name: asm_convergent ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: INLINEASM &s_barrier, 33 /* sideeffect isconvergent attdialect */, !0 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: INLINEASM &s_barrier, 33 /* sideeffect isconvergent attdialect */, !1 ; CHECK-NEXT: S_ENDPGM 0 call void asm sideeffect "s_barrier", ""() convergent, !srcloc !0 ret void @@ -16,11 +16,11 @@ define amdgpu_kernel void @asm_convergent() convergent{ define amdgpu_kernel void @asm_simple_memory_clobber() { ; CHECK-LABEL: name: asm_simple_memory_clobber ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: INLINEASM &"", 25 /* sideeffect mayload maystore attdialect */, !0 - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, !0 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: INLINEASM &"", 25 /* sideeffect mayload maystore attdialect */, !1 + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, !1 ; CHECK-NEXT: S_ENDPGM 0 call void asm sideeffect "", "~{memory}"(), !srcloc !0 call void asm sideeffect "", ""(), !srcloc !0 @@ -30,10 +30,10 @@ define amdgpu_kernel void @asm_simple_memory_clobber() { define amdgpu_kernel void @asm_simple_vgpr_clobber() { ; CHECK-LABEL: name: asm_simple_vgpr_clobber ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: INLINEASM &"v_mov_b32 v0, 7", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $vgpr0, !0 + ; 
CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 v0, 7", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $vgpr0, !1 ; CHECK-NEXT: S_ENDPGM 0 call void asm sideeffect "v_mov_b32 v0, 7", "~{v0}"(), !srcloc !0 ret void @@ -42,10 +42,10 @@ define amdgpu_kernel void @asm_simple_vgpr_clobber() { define amdgpu_kernel void @asm_simple_sgpr_clobber() { ; CHECK-LABEL: name: asm_simple_sgpr_clobber ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: INLINEASM &"s_mov_b32 s0, 7", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $sgpr0, !0 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 s0, 7", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $sgpr0, !1 ; CHECK-NEXT: S_ENDPGM 0 call void asm sideeffect "s_mov_b32 s0, 7", "~{s0}"(), !srcloc !0 ret void @@ -54,10 +54,10 @@ define amdgpu_kernel void @asm_simple_sgpr_clobber() { define amdgpu_kernel void @asm_simple_agpr_clobber() { ; CHECK-LABEL: name: asm_simple_agpr_clobber ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: INLINEASM &"; def a0", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $agpr0, !0 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: INLINEASM &"; def a0", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $agpr0, !1 ; CHECK-NEXT: S_ENDPGM 0 call void asm sideeffect "; def a0", "~{a0}"(), !srcloc !0 ret void @@ -66,9 +66,9 @@ define amdgpu_kernel void @asm_simple_agpr_clobber() { define i32 @asm_vgpr_early_clobber() { ; CHECK-LABEL: name: asm_vgpr_early_clobber ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %8, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %9, !0 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %7, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %8, !1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %7 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]] ; CHECK-NEXT: $vgpr0 = COPY [[ADD]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -94,8 +94,8 @@ entry: define i32 @test_single_vgpr_output() nounwind { ; CHECK-LABEL: name: test_single_vgpr_output ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %8 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %7 ; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 entry: @@ -106,8 +106,8 @@ entry: define i32 @test_single_sgpr_output_s32() nounwind { ; CHECK-LABEL: name: test_single_sgpr_output_s32 ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2359306 /* 
regdef:SReg_32 */, def %8 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %7 ; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 entry: @@ -119,9 +119,9 @@ entry: define float @test_multiple_register_outputs_same() #0 { ; CHECK-LABEL: name: test_multiple_register_outputs_same ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %8, 2228234 /* regdef:VGPR_32 */, def %9 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %7, 2228234 /* regdef:VGPR_32 */, def %8 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %7 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY]], [[COPY1]] ; CHECK-NEXT: $vgpr0 = COPY [[FADD]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -136,9 +136,9 @@ define float @test_multiple_register_outputs_same() #0 { define double @test_multiple_register_outputs_mixed() #0 { ; CHECK-LABEL: name: test_multiple_register_outputs_mixed ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %8, 3538954 /* regdef:VReg_64 */, def %9 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY %9 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %7, 3538954 /* regdef:VReg_64 */, def %8 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %7 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY %8 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -166,9 +166,9 @@ define float @test_vector_output() nounwind { define amdgpu_kernel void @test_input_vgpr_imm() { ; CHECK-LABEL: name: test_input_vgpr_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[C]](s32) ; CHECK-NEXT: INLINEASM &"v_mov_b32 v0, $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY1]] @@ -180,9 +180,9 @@ define amdgpu_kernel void @test_input_vgpr_imm() { define amdgpu_kernel void @test_input_sgpr_imm() { ; CHECK-LABEL: name: test_input_sgpr_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[C]](s32) ; CHECK-NEXT: INLINEASM &"s_mov_b32 s0, $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[COPY1]] @@ -194,9 +194,9 @@ define amdgpu_kernel void @test_input_sgpr_imm() { define amdgpu_kernel void @test_input_imm() { ; CHECK-LABEL: name: test_input_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: 
$sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: INLINEASM &"s_mov_b32 s0, $0", 9 /* sideeffect mayload attdialect */, 13 /* imm */, 42 ; CHECK-NEXT: INLINEASM &"s_mov_b64 s[0:1], $0", 9 /* sideeffect mayload attdialect */, 13 /* imm */, 42 ; CHECK-NEXT: S_ENDPGM 0 @@ -212,8 +212,8 @@ define float @test_input_vgpr(i32 %src) nounwind { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32) - ; CHECK-NEXT: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %9, 2228233 /* reguse:VGPR_32 */, [[COPY1]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %9 + ; CHECK-NEXT: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %8, 2228233 /* reguse:VGPR_32 */, [[COPY1]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 entry: @@ -227,8 +227,8 @@ define i32 @test_memory_constraint(ptr addrspace(3) %a) nounwind { ; CHECK-NEXT: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; CHECK-NEXT: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 2228234 /* regdef:VGPR_32 */, def %9, 262158 /* mem:m */, [[COPY]](p3) - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9 + ; CHECK-NEXT: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 2228234 /* regdef:VGPR_32 */, def %8, 262158 /* mem:m */, [[COPY]](p3) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: $vgpr0 = COPY [[COPY1]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %1 = tail call i32 asm "ds_read_b32 $0, $1", "=v,*m"(ptr addrspace(3) elementtype(i32) %a) @@ -244,8 +244,8 @@ define i32 @test_vgpr_matching_constraint(i32 %a) nounwind { ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[AND]](s32) - ; CHECK-NEXT: INLINEASM &";", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def %11, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %11 + ; CHECK-NEXT: INLINEASM &";", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def %10, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %10 ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %and = and i32 %a, 1 @@ -256,14 +256,14 @@ define i32 @test_vgpr_matching_constraint(i32 %a) nounwind { define i32 @test_sgpr_matching_constraint() nounwind { ; CHECK-LABEL: name: test_sgpr_matching_constraint ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %8 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %10 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %10 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %7 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %9 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]](s32) ; 
CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY1]](s32) - ; CHECK-NEXT: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %12, 2359305 /* reguse:SReg_32 */, [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3) - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY %12 + ; CHECK-NEXT: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %11, 2359305 /* reguse:SReg_32 */, [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY %11 ; CHECK-NEXT: $vgpr0 = COPY [[COPY4]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 entry: @@ -285,10 +285,10 @@ define void @test_many_matching_constraints(i32 %a, i32 %b, i32 %c) nounwind { ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32) ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY1]](s32) - ; CHECK-NEXT: INLINEASM &"; ", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def %11, 2228234 /* regdef:VGPR_32 */, def %12, 2228234 /* regdef:VGPR_32 */, def %13, 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY4]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY5]](tied-def 5) - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY %11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY %12 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY %13 + ; CHECK-NEXT: INLINEASM &"; ", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def %10, 2228234 /* regdef:VGPR_32 */, def %11, 2228234 /* regdef:VGPR_32 */, def %12, 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY4]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY5]](tied-def 5) + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY %10 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY %11 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY %12 ; CHECK-NEXT: G_STORE [[COPY6]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: G_STORE [[COPY7]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: G_STORE [[COPY8]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) @@ -306,11 +306,11 @@ define void @test_many_matching_constraints(i32 %a, i32 %b, i32 %c) nounwind { define i32 @test_sgpr_to_vgpr_move_matching_constraint() nounwind { ; CHECK-LABEL: name: test_sgpr_to_vgpr_move_matching_constraint ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %8 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %7 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32) - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %10, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %10 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %9, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 entry: @@ -322,13 +322,15 @@ entry: define amdgpu_kernel void @asm_constraint_n_n() { ; CHECK-LABEL: name: 
asm_constraint_n_n ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: INLINEASM &"s_trap ${0:n}", 1 /* sideeffect attdialect */, 13 /* imm */, 10 ; CHECK-NEXT: S_ENDPGM 0 tail call void asm sideeffect "s_trap ${0:n}", "n"(i32 10) #1 ret void } +!llvm.module.flags = !{!1} !0 = !{i32 70} +!1 = !{i32 1, !"amdgpu_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll index cc4796e534aba..b45b307f890d0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll @@ -816,7 +816,7 @@ define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 { ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr1 @@ -933,7 +933,7 @@ define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 { ; GCN-NEXT: G_STORE [[C1]](s64), [[PTR_ADD2]](p5) :: (store (s64) into %ir.alloca1 + 8, addrspace 5) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @void_fastcc_multi_byval ; GCN-NEXT: [[COPY40:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY41:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY41:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; GCN-NEXT: [[COPY42:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; GCN-NEXT: [[COPY43:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; GCN-NEXT: [[COPY44:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -984,7 +984,7 @@ define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64 ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr1 @@ -1097,7 +1097,7 @@ define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64 ; GCN-NEXT: G_STORE [[C]](s32), [[PTR_ADD1]](p5) :: (store (s32) into %ir.alloca + 8, addrspace 5) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @void_fastcc_byval_and_stack_passed ; GCN-NEXT: [[COPY40:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY41:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY41:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; GCN-NEXT: [[COPY42:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; GCN-NEXT: [[COPY43:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; GCN-NEXT: [[COPY44:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -1176,14 +1176,14 @@ define hidden fastcc i64 @sibling_call_i64_fastcc_i64(i64 %a) #1 { ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY 
$sgpr6_sgpr7 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY9]](s32), [[COPY10]](s32) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @i64_fastcc_i64 ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -1225,14 +1225,14 @@ define hidden fastcc ptr addrspace(1) @sibling_call_p1i8_fastcc_p1i8(ptr addrspa ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GCN-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY9]](s32), [[COPY10]](s32) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @p1i8_fastcc_p1i8 ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -1274,13 +1274,13 @@ define hidden fastcc i16 @sibling_call_i16_fastcc_i16(i16 %a) #1 { ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @i16_fastcc_i16 ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -1321,13 +1321,13 @@ define hidden fastcc half @sibling_call_f16_fastcc_f16(half %a) #1 { ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @f16_fastcc_f16 ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY 
[[COPY7]](p4) ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -1368,7 +1368,7 @@ define hidden fastcc <3 x i16> @sibling_call_v3i16_fastcc_v3i16(<3 x i16> %a) #1 ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 @@ -1377,7 +1377,7 @@ define hidden fastcc <3 x i16> @sibling_call_v3i16_fastcc_v3i16(<3 x i16> %a) #1 ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @v3i16_fastcc_v3i16 ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -1422,14 +1422,14 @@ define hidden fastcc <4 x i16> @sibling_call_v4i16_fastcc_v4i16(<4 x i16> %a) #1 ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY9]](<2 x s16>), [[COPY10]](<2 x s16>) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @v4i16_fastcc_v4i16 ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -1471,7 +1471,7 @@ define hidden fastcc <2 x i64> @sibling_call_v2i64_fastcc_v2i64(<2 x i64> %a) #1 ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr1 @@ -1482,7 +1482,7 @@ define hidden fastcc <2 x i64> @sibling_call_v2i64_fastcc_v2i64(<2 x i64> %a) #1 ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @v2i64_fastcc_v2i64 ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; GCN-NEXT: 
[[COPY16:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -1514,3 +1514,6 @@ entry: attributes #0 = { nounwind } attributes #1 = { nounwind noinline } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-tail-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-tail-call.ll index edcb4622d5d7e..1ead1e443dfe1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-tail-call.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-tail-call.ll @@ -15,11 +15,11 @@ define void @tail_call_void_func_void() { ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @external_void_func_void ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -42,3 +42,6 @@ define void @tail_call_void_func_void() { tail call void @external_void_func_void() ret void } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll index 387c5c7078032..a0f54bfee2dca 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll @@ -9,7 +9,8 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; CI-LABEL: is_private_vgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x32 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -18,9 +19,7 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: s_load_dword s0, s[4:5], 0x11 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v1 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, s2, v1 ; CI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; CI-NEXT: flat_store_dword v[0:1], v0 ; CI-NEXT: s_endpgm @@ -79,9 +78,9 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; CI-LABEL: is_private_sgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s0, s[4:5], 0x11 +; CI-NEXT: s_load_dword s0, s[4:5], 0x32 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_lg_u32 s1, s0 ; CI-NEXT: s_cbranch_scc1 .LBB1_2 @@ -150,3 +149,6 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 declare i1 @llvm.amdgcn.is.private(ptr nocapture) #0 attributes #0 = { nounwind readnone speculatable } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", 
i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll index 75f1fea9e09da..4b3a475891031 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll @@ -9,7 +9,8 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; CI-LABEL: is_local_vgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x33 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -18,9 +19,7 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: s_load_dword s0, s[4:5], 0x10 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v1 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, s2, v1 ; CI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; CI-NEXT: flat_store_dword v[0:1], v0 ; CI-NEXT: s_endpgm @@ -79,9 +78,9 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; CI-LABEL: is_local_sgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s0, s[4:5], 0x10 +; CI-NEXT: s_load_dword s0, s[4:5], 0x33 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_lg_u32 s1, s0 ; CI-NEXT: s_cbranch_scc1 .LBB1_2 @@ -150,3 +149,6 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 declare i1 @llvm.amdgcn.is.shared(ptr nocapture) #0 attributes #0 = { nounwind readnone speculatable } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll index fec564f9b45db..e3779ecb1da67 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -75,8 +75,8 @@ bb.2: store volatile i32 0, ptr addrspace(1) undef ret void } -; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4112 -; DEFAULTSIZE: ; ScratchSize: 4112 +; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 16 +; DEFAULTSIZE: ; ScratchSize: 16 ; ASSUME1024: .amdhsa_private_segment_fixed_size 1040 ; ASSUME1024: ; ScratchSize: 1040 @@ -137,8 +137,8 @@ bb.1: ret void } -; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4160 -; DEFAULTSIZE: ; ScratchSize: 4160 +; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 64 +; DEFAULTSIZE: ; ScratchSize: 64 ; ASSUME1024: .amdhsa_private_segment_fixed_size 1088 ; ASSUME1024: ; ScratchSize: 1088 @@ -265,3 +265,9 @@ bb.1: declare i32 @llvm.amdgcn.workitem.id.x() #0 attributes #0 = { nounwind readnone speculatable } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; ASSUME1024: {{.*}} +; DEFAULTSIZE: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll index 089799383294e..b356b9af479ee 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll @@ -11,10 +11,10 @@ declare void @llvm.memcpy.p1.p4.i32(ptr addrspace(1) nocapture, ptr addrspace(4) @global.arr = unnamed_addr addrspace(1) global [256 x i32] undef, align 4 ;. -; HSA: @[[LDS_I32:[a-zA-Z0-9_$"\\.-]+]] = unnamed_addr addrspace(3) global i32 undef, align 4 -; HSA: @[[LDS_ARR:[a-zA-Z0-9_$"\\.-]+]] = unnamed_addr addrspace(3) global [256 x i32] undef, align 4 -; HSA: @[[GLOBAL_I32:[a-zA-Z0-9_$"\\.-]+]] = unnamed_addr addrspace(1) global i32 undef, align 4 -; HSA: @[[GLOBAL_ARR:[a-zA-Z0-9_$"\\.-]+]] = unnamed_addr addrspace(1) global [256 x i32] undef, align 4 +; HSA: @lds.i32 = unnamed_addr addrspace(3) global i32 undef, align 4 +; HSA: @lds.arr = unnamed_addr addrspace(3) global [256 x i32] undef, align 4 +; HSA: @global.i32 = unnamed_addr addrspace(1) global i32 undef, align 4 +; HSA: @global.arr = unnamed_addr addrspace(1) global [256 x i32] undef, align 4 ;. define amdgpu_kernel void @store_cast_0_flat_to_group_addrspacecast() #1 { ; HSA-LABEL: define {{[^@]+}}@store_cast_0_flat_to_group_addrspacecast @@ -225,12 +225,19 @@ define ptr addrspace(3) @ret_constant_cast_group_gv_gep_to_flat_to_group() #1 { attributes #0 = { argmemonly nounwind } attributes #1 = { nounwind } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} ;. ; AKF_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } ; AKF_HSA: attributes #[[ATTR1]] = { nounwind } ;. 
; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } ; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +;. +; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdgpu_code_object_version", i32 500} +;. +; ATTRIBUTOR_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdgpu_code_object_version", i32 500} ;. From f90b6090048b277cad44dd49067c26deec3e110c Mon Sep 17 00:00:00 2001 From: "Oleksandr \"Alex\" Zinenko" Date: Wed, 3 Jan 2024 13:33:18 +0100 Subject: [PATCH 125/313] [mlir] introduce transform.num_associations (#76723) Add a new transform operation that creates a new parameter containing the number of payload objects (operations, values or attributes) associated with the argument. This is useful in matching and for debugging purposes. This replaces three ad-hoc operations previously provided by the test extension. 
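A minimal usage sketch, mirroring the rewritten tests below — the `arith.constant` match, the explicit `!transform.param<i64>` element type, and the `transform.test_print_param` consumer are simply the choices those tests happen to use, not requirements of the op:

  transform.sequence failures(propagate) {
  ^bb0(%arg0: !transform.any_op):
    // Bind every arith.constant payload op under the root to one handle.
    %consts = transform.structured.match ops{["arith.constant"]} in %arg0
        : (!transform.any_op) -> !transform.any_op
    // New op: produce a parameter holding the number of payload ops behind %consts.
    %n = transform.num_associations %consts
        : (!transform.any_op) -> !transform.param<i64>
    // Test-extension op used by the updated tests to surface the value as a remark.
    transform.test_print_param %n : !transform.param<i64>
    transform.yield
  }

Because the count comes back as an ordinary parameter, it can feed any parameter-consuming transform rather than only being printed, which is what allows the three test-only counting ops removed below to be retired.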
--- .../mlir/Dialect/Transform/IR/TransformOps.td | 22 +++++ .../lib/Dialect/Transform/IR/TransformOps.cpp | 37 +++++++ .../transform-op-bufferize-to-allocation.mlir | 15 ++- .../Dialect/Linalg/transform-op-match.mlir | 5 +- .../test/Dialect/Linalg/transform-op-pad.mlir | 3 +- mlir/test/Dialect/Transform/ops-invalid.mlir | 8 ++ .../Dialect/Transform/test-interpreter.mlir | 96 ++++++++++++------- .../Transform/test-loop-transforms.mlir | 9 +- .../TestTransformDialectExtension.cpp | 45 --------- .../TestTransformDialectExtension.td | 27 ------ 10 files changed, 152 insertions(+), 115 deletions(-) diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td index 307257f4a582b..fcdb21d21503a 100644 --- a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td +++ b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td @@ -438,6 +438,28 @@ def CastOp : TransformDialectOp<"cast", }]; } +def NumAssociationsOp : TransformDialectOp<"num_associations", + [MemoryEffectsOpInterface, ParamProducerTransformOpTrait, + DeclareOpInterfaceMethods, + MatchOpInterface]> { + let summary = + "Returns the number of payload objects associated with the argument"; + let description = [{ + Given an argument, handle or parameter, returns a new parameter associated + with a single 64-bit number that corresponds to the number of payload + objects (operations or values for a handle, attributes for a parameter) + associated with the argument. + + Always succeeds. + }]; + let arguments = (ins Transform_AnyHandleOrParamType:$handle); + let results = (outs TransformParamTypeInterface:$num); + let assemblyFormat = [{ + $handle attr-dict `:` functional-type(operands, results) + }]; + let hasVerifier = 1; +} + def ForeachMatchOp : TransformDialectOp<"foreach_match", [ DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, diff --git a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp index 7136e423470a2..aa4694c88d3b2 100644 --- a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp +++ b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp @@ -32,6 +32,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/Debug.h" #include @@ -1974,6 +1975,42 @@ void transform::NamedSequenceOp::build(OpBuilder &builder, /*extraBindingTypes=*/TypeRange(), bodyBuilder); } +//===----------------------------------------------------------------------===// +// NumAssociationsOp +//===----------------------------------------------------------------------===// + +DiagnosedSilenceableFailure +transform::NumAssociationsOp::apply(transform::TransformRewriter &rewriter, + transform::TransformResults &results, + transform::TransformState &state) { + size_t numAssociations = + llvm::TypeSwitch(getHandle().getType()) + .Case([&](TransformHandleTypeInterface opHandle) { + return llvm::range_size(state.getPayloadOps(getHandle())); + }) + .Case([&](TransformValueHandleTypeInterface valueHandle) { + return llvm::range_size(state.getPayloadValues(getHandle())); + }) + .Case([&](TransformParamTypeInterface param) { + return llvm::range_size(state.getParams(getHandle())); + }) + .Default([](Type) { + llvm_unreachable("unknown kind of transform dialect type"); + return 0; + }); + results.setParams(getNum().cast(), + rewriter.getI64IntegerAttr(numAssociations)); + return DiagnosedSilenceableFailure::success(); +} + +LogicalResult 
transform::NumAssociationsOp::verify() { + // Verify that the result type accepts an i64 attribute as payload. + auto resultType = getNum().getType().cast(); + return resultType + .checkPayload(getLoc(), {Builder(getContext()).getI64IntegerAttr(0)}) + .checkAndReport(); +} + //===----------------------------------------------------------------------===// // SelectOp //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/Linalg/transform-op-bufferize-to-allocation.mlir b/mlir/test/Dialect/Linalg/transform-op-bufferize-to-allocation.mlir index 49a52ba9e06f8..aa15ccf0beeee 100644 --- a/mlir/test/Dialect/Linalg/transform-op-bufferize-to-allocation.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-bufferize-to-allocation.mlir @@ -36,13 +36,15 @@ module attributes {transform.with_named_sequence} { // Ensure that one linalg.fill was generated. %fill_op = transform.select "linalg.fill" in %new : (!transform.any_op) -> !transform.any_op + %p = transform.num_associations %fill_op : (!transform.any_op) -> !transform.param // expected-remark @below{{1}} - transform.test_print_number_of_associated_payload_ir_ops %fill_op : !transform.any_op + transform.test_print_param %p : !transform.param // Ensure that one linalg.copy was generated. %mat = transform.select "bufferization.materialize_in_destination" in %new : (!transform.any_op) -> !transform.any_op + %p2 = transform.num_associations %mat : (!transform.any_op) -> !transform.param // expected-remark @below{{1}} - transform.test_print_number_of_associated_payload_ir_ops %mat : !transform.any_op + transform.test_print_param %p2 : !transform.param transform.yield } } @@ -73,18 +75,21 @@ module attributes {transform.with_named_sequence} { // Ensure that one linalg.fill was generated. %fill_op = transform.select "linalg.fill" in %new : (!transform.any_op) -> !transform.any_op + %p = transform.num_associations %fill_op : (!transform.any_op) -> !transform.param // expected-remark @below{{1}} - transform.test_print_number_of_associated_payload_ir_ops %fill_op : !transform.any_op + transform.test_print_param %p : !transform.param // Ensure that one linalg.copy was generated. %linalg_copy = transform.select "linalg.copy" in %new : (!transform.any_op) -> !transform.any_op + %p2 = transform.num_associations %linalg_copy : (!transform.any_op) -> !transform.param // expected-remark @below{{1}} - transform.test_print_number_of_associated_payload_ir_ops %linalg_copy : !transform.any_op + transform.test_print_param %p2 : !transform.param // Ensure that one memref.alloca was generated. %alloca = transform.select "memref.alloca" in %new : (!transform.any_op) -> !transform.any_op + %p3 = transform.num_associations %alloca : (!transform.any_op) -> !transform.param // expected-remark @below{{1}} - transform.test_print_number_of_associated_payload_ir_ops %alloca : !transform.any_op + transform.test_print_param %p3 : !transform.param // Make sure that One-Shot Bufferize can bufferize the rest. 
%4 = transform.bufferization.one_shot_bufferize %arg1 : (!transform.any_op) -> !transform.any_op diff --git a/mlir/test/Dialect/Linalg/transform-op-match.mlir b/mlir/test/Dialect/Linalg/transform-op-match.mlir index 15942db9b5db2..db5b5f1c78677 100644 --- a/mlir/test/Dialect/Linalg/transform-op-match.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-match.mlir @@ -134,8 +134,9 @@ module attributes {transform.with_named_sequence} { #linalg.iterator_type, #linalg.iterator_type]} in %arg1 : (!transform.any_op) -> !transform.any_op - // expected-remark @below {{0}} - transform.test_print_number_of_associated_payload_ir_ops %no_match : !transform.any_op + %p = transform.num_associations %no_match : (!transform.any_op) -> !transform.param + // expected-remark @below {{0}} + transform.test_print_param %p : !transform.param transform.yield } } diff --git a/mlir/test/Dialect/Linalg/transform-op-pad.mlir b/mlir/test/Dialect/Linalg/transform-op-pad.mlir index 6bca6c1fd6bf1..1f9d81a819e7f 100644 --- a/mlir/test/Dialect/Linalg/transform-op-pad.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-pad.mlir @@ -41,8 +41,9 @@ module attributes {transform.with_named_sequence} { padding_dimensions=[0, 1, 2], pack_paddings=[1, 1, 0] } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.op<"bufferization.materialize_in_destination">) + %p = transform.num_associations %copy_back : (!transform.op<"bufferization.materialize_in_destination">) -> !transform.param // expected-remark @below {{1}} - transform.test_print_number_of_associated_payload_ir_ops %copy_back : !transform.op<"bufferization.materialize_in_destination"> + transform.test_print_param %p : !transform.param transform.yield } } diff --git a/mlir/test/Dialect/Transform/ops-invalid.mlir b/mlir/test/Dialect/Transform/ops-invalid.mlir index 0964161588798..5123958b02bfb 100644 --- a/mlir/test/Dialect/Transform/ops-invalid.mlir +++ b/mlir/test/Dialect/Transform/ops-invalid.mlir @@ -696,3 +696,11 @@ transform.sequence failures(propagate) { transform.named_sequence @foo() } : !transform.any_op } + +// ----- + +transform.sequence failures(propagate) { +^bb0(%arg0: !transform.any_op): + // expected-error @below {{expected the type of the parameter attribute ('i64') to match the parameter type ('i32')}} + transform.num_associations %arg0 : (!transform.any_op) -> !transform.param +} diff --git a/mlir/test/Dialect/Transform/test-interpreter.mlir b/mlir/test/Dialect/Transform/test-interpreter.mlir index d9a11994eb9d9..a39e6f94cb34f 100644 --- a/mlir/test/Dialect/Transform/test-interpreter.mlir +++ b/mlir/test/Dialect/Transform/test-interpreter.mlir @@ -575,8 +575,9 @@ transform.with_pdl_patterns { %0 = pdl_match @addi in %arg1 : (!transform.any_op) -> !transform.any_op %1 = pdl_match @addi in %arg1 : (!transform.any_op) -> !transform.any_op %2 = merge_handles deduplicate %0, %1 : !transform.any_op + %3 = num_associations %2 : (!transform.any_op) -> !transform.param // expected-remark @below {{1}} - test_print_number_of_associated_payload_ir_ops %2 : !transform.any_op + test_print_param %3 : !transform.param } } @@ -676,11 +677,13 @@ module { ^bb0(%arg1: !transform.any_op): %0 = pdl_match @func in %arg1 : (!transform.any_op) -> !transform.any_op %1 = replicate num(%0) %arg1 : !transform.any_op, !transform.any_op + %p = num_associations %1 : (!transform.any_op) -> !transform.param // expected-remark @below {{2}} - test_print_number_of_associated_payload_ir_ops %1 : !transform.any_op + test_print_param %p : !transform.param %2 = replicate num(%0) %1 
: !transform.any_op, !transform.any_op + %p2 = num_associations %2 : (!transform.any_op) -> !transform.param // expected-remark @below {{4}} - test_print_number_of_associated_payload_ir_ops %2 : !transform.any_op + test_print_param %p2 : !transform.param } } } @@ -708,8 +711,9 @@ transform.with_pdl_patterns { %f = pdl_match @const in %arg1 : (!transform.any_op) -> !transform.any_op transform.foreach %f : !transform.any_op { ^bb2(%arg2: !transform.any_op): + %p = transform.num_associations %arg2 : (!transform.any_op) -> !transform.param // expected-remark @below {{1}} - transform.test_print_number_of_associated_payload_ir_ops %arg2 : !transform.any_op + transform.test_print_param %p : !transform.param transform.test_print_remark_at_operand %arg2, "transform applied" : !transform.any_op } } @@ -780,8 +784,9 @@ transform.with_pdl_patterns { transform.yield %g : !transform.any_op } + %p = transform.num_associations %results : (!transform.any_op) -> !transform.param // expected-remark @below {{3}} - transform.test_print_number_of_associated_payload_ir_ops %results : !transform.any_op + transform.test_print_param %p : !transform.param transform.test_print_remark_at_operand %results, "transform applied" : !transform.any_op } } @@ -877,8 +882,9 @@ transform.sequence failures(propagate) { ^bb1(%fun: !transform.any_op): %muli = transform.structured.match ops{["arith.muli"]} in %fun : (!transform.any_op) -> !transform.any_op %h:2 = split_handle %muli : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %p = transform.num_associations %h#0 : (!transform.any_op) -> !transform.param // expected-remark @below {{1}} - transform.test_print_number_of_associated_payload_ir_ops %h#0 : !transform.any_op + transform.test_print_param %p : !transform.param %muli_2 = transform.structured.match ops{["arith.muli"]} in %fun : (!transform.any_op) -> !transform.any_op // expected-error @below {{expected to contain 3 payload ops but it contains 2 payload ops}} %h_2:3 = split_handle %muli_2 : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) @@ -896,13 +902,15 @@ transform.sequence failures(suppress) { ^bb1(%fun: !transform.any_op): %muli = transform.structured.match ops{["arith.muli"]} in %fun : (!transform.any_op) -> !transform.any_op %h:2 = split_handle %muli : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %p = transform.num_associations %h#0 : (!transform.any_op) -> !transform.param // expected-remark @below {{1}} - transform.test_print_number_of_associated_payload_ir_ops %h#0 : !transform.any_op + transform.test_print_param %p : !transform.param %muli_2 = transform.structured.match ops{["arith.muli"]} in %fun : (!transform.any_op) -> !transform.any_op // Silenceable failure and all handles are now empty. %h_2:3 = split_handle %muli_2 : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + %p2 = transform.num_associations %h_2#0 : (!transform.any_op) -> !transform.param // expected-remark @below {{0}} - transform.test_print_number_of_associated_payload_ir_ops %h_2#0 : !transform.any_op + transform.test_print_param %p2 : !transform.param } // ----- @@ -918,12 +926,15 @@ transform.sequence failures(propagate) { %muli_2 = transform.structured.match ops{["arith.muli"]} in %fun : (!transform.any_op) -> !transform.any_op // No error, last result handle is empty. 
%h:3 = split_handle %muli_2 {fail_on_payload_too_small = false} : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + %p = transform.num_associations %h#0 : (!transform.any_op) -> !transform.param // expected-remark @below {{1}} - transform.test_print_number_of_associated_payload_ir_ops %h#0 : !transform.any_op + transform.test_print_param %p : !transform.param + %p2 = transform.num_associations %h#1 : (!transform.any_op) -> !transform.param // expected-remark @below {{1}} - transform.test_print_number_of_associated_payload_ir_ops %h#1 : !transform.any_op + transform.test_print_param %p2 : !transform.param + %p3 = transform.num_associations %h#2 : (!transform.any_op) -> !transform.param // expected-remark @below {{0}} - transform.test_print_number_of_associated_payload_ir_ops %h#2 : !transform.any_op + transform.test_print_param %p3 : !transform.param } // ----- @@ -940,10 +951,12 @@ transform.sequence failures(propagate) { ^bb1(%fun: !transform.any_op): %muli_2 = transform.structured.match ops{["arith.muli"]} in %fun : (!transform.any_op) -> !transform.any_op %h:2 = split_handle %muli_2 {overflow_result = 0} : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %p = transform.num_associations %h#0 : (!transform.any_op) -> !transform.param // expected-remark @below {{3}} - transform.test_print_number_of_associated_payload_ir_ops %h#0 : !transform.any_op + transform.test_print_param %p : !transform.param + %p2 = transform.num_associations %h#1 : (!transform.any_op) -> !transform.param // expected-remark @below {{1}} - transform.test_print_number_of_associated_payload_ir_ops %h#1 : !transform.any_op + transform.test_print_param %p2 : !transform.param } // ----- @@ -1668,8 +1681,9 @@ transform.sequence failures(propagate) { // expected-remark @below {{2 iterations}} transform.test_tracked_rewrite %0 : (!transform.any_op) -> () // One replacement op (test.drop_mapping) is dropped from the mapping. 
+ %p = num_associations %0 : (!transform.any_op) -> !transform.param // expected-remark @below {{2}} - test_print_number_of_associated_payload_ir_ops %0 : !transform.any_op + test_print_param %p : !transform.param } // ----- @@ -1684,20 +1698,24 @@ module { %2 = transform.param.constant 1 -> !transform.param %3 = transform.param.constant 2 -> !transform.param %4 = transform.merge_handles %1, %2 { deduplicate } : !transform.param + %p = num_associations %4 : (!transform.param) -> !transform.param // expected-remark @below {{1}} - test_print_number_of_associated_payload_ir_params %4 : !transform.param + test_print_param %p : !transform.param %5 = transform.merge_handles %1, %1 { deduplicate } : !transform.param + %p2 = num_associations %5 : (!transform.param) -> !transform.param // expected-remark @below {{1}} - test_print_number_of_associated_payload_ir_params %5 : !transform.param + test_print_param %p2 : !transform.param %6 = transform.merge_handles %1, %3 { deduplicate } : !transform.param + %p3 = num_associations %6 : (!transform.param) -> !transform.param // expected-remark @below {{2}} - test_print_number_of_associated_payload_ir_params %6 : !transform.param + test_print_param %p3 : !transform.param %7 = transform.merge_handles %1, %1, %2, %3 : !transform.param + %p4 = num_associations %7 : (!transform.param) -> !transform.param // expected-remark @below {{4}} - test_print_number_of_associated_payload_ir_params %7 : !transform.param + test_print_param %p4 : !transform.param } } @@ -1712,21 +1730,25 @@ transform.sequence failures(propagate) { %3 = test_produce_value_handle_to_result %1, 1 : (!transform.any_op) -> !transform.any_value %4 = transform.merge_handles %2, %2 { deduplicate } : !transform.any_value + %p = num_associations %4 : (!transform.any_value) -> !transform.param // expected-remark @below {{1}} - test_print_number_of_associated_payload_ir_values %4 : !transform.any_value + test_print_param %p : !transform.param %5 = transform.merge_handles %2, %3 { deduplicate } : !transform.any_value + %p2 = num_associations %5 : (!transform.any_value) -> !transform.param // expected-remark @below {{2}} - test_print_number_of_associated_payload_ir_values %5 : !transform.any_value + test_print_param %p2 : !transform.param %6 = test_produce_value_handle_to_result %1, 0 : (!transform.any_op) -> !transform.any_value %7 = transform.merge_handles %2, %6 { deduplicate } : !transform.any_value + %p3 = num_associations %6 : (!transform.any_value) -> !transform.param // expected-remark @below {{1}} - test_print_number_of_associated_payload_ir_values %6 : !transform.any_value + test_print_param %p3 : !transform.param %8 = transform.merge_handles %2, %2, %3, %4 : !transform.any_value + %p4 = num_associations %8 : (!transform.any_value) -> !transform.param // expected-remark @below {{4}} - test_print_number_of_associated_payload_ir_values %8 : !transform.any_value + test_print_param %p4 : !transform.param } // ----- @@ -1820,31 +1842,37 @@ transform.sequence failures(propagate) { // There are 3 arith.constant ops. %all = transform.structured.match ops{["arith.constant"]} in %0 : (!transform.any_op) -> !transform.any_op + %p = num_associations %all : (!transform.any_op) -> !transform.param // expected-remark @below{{3}} - test_print_number_of_associated_payload_ir_ops %all : !transform.any_op + test_print_param %p : !transform.param // "deduplicate" has no effect because these are 3 different ops. 
%merged_before = transform.merge_handles deduplicate %all : !transform.any_op + %p2 = num_associations %merged_before : (!transform.any_op) -> !transform.param // expected-remark @below{{3}} - test_print_number_of_associated_payload_ir_ops %merged_before : !transform.any_op + test_print_param %p2 : !transform.param // Apply CSE. transform.apply_cse to %0 : !transform.any_op // The handle is still mapped to 3 arith.constant ops. + %p3 = num_associations %all : (!transform.any_op) -> !transform.param // expected-remark @below{{3}} - test_print_number_of_associated_payload_ir_ops %all : !transform.any_op + test_print_param %p3 : !transform.param // But they are all the same op. %merged_after = transform.merge_handles deduplicate %all : !transform.any_op + %p4 = num_associations %merged_after : (!transform.any_op) -> !transform.param // expected-remark @below{{1}} - test_print_number_of_associated_payload_ir_ops %merged_after : !transform.any_op + test_print_param %p4 : !transform.param // The other handles were also updated. test_print_remark_at_operand %elim_first, "eliminated 1" : !transform.any_op + %p5 = num_associations %elim_first : (!transform.any_op) -> !transform.param // expected-remark @below{{1}} - test_print_number_of_associated_payload_ir_ops %elim_first : !transform.any_op + test_print_param %p5 : !transform.param test_print_remark_at_operand %elim_second, "eliminated 2" : !transform.any_op + %p6 = num_associations %elim_second : (!transform.any_op) -> !transform.param // expected-remark @below{{1}} - test_print_number_of_associated_payload_ir_ops %elim_second : !transform.any_op + test_print_param %p6 : !transform.param } // ----- @@ -1907,14 +1935,16 @@ transform.sequence failures(propagate) { // Get immediate parent. %2 = transform.get_parent_op %0 : (!transform.any_op) -> !transform.any_op test_print_remark_at_operand %2, "direct parent" : !transform.any_op + %p = num_associations %2 : (!transform.any_op) -> !transform.param // expected-remark @below{{2}} - test_print_number_of_associated_payload_ir_ops %2 : !transform.any_op + test_print_param %p : !transform.param // Deduplicate results. %3 = transform.structured.match ops{["test.qux"]} in %arg1 : (!transform.any_op) -> !transform.any_op %4 = transform.get_parent_op %3 {deduplicate} : (!transform.any_op) -> !transform.any_op + %p2 = num_associations %4 : (!transform.any_op) -> !transform.param // expected-remark @below{{1}} - test_print_number_of_associated_payload_ir_ops %4 : !transform.any_op + test_print_param %p2 : !transform.param } @@ -2029,8 +2059,9 @@ transform.sequence failures(propagate) { // Match all ops inside the function (including the function itself). %func_op = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op %0 = transform.structured.match in %func_op : (!transform.any_op) -> !transform.any_op + %p = num_associations %0 : (!transform.any_op) -> !transform.param // expected-remark @below{{5}} - test_print_number_of_associated_payload_ir_ops %0 : !transform.any_op + test_print_param %p : !transform.param // Select "test.foo". 
%foo = transform.select "test.foo" in %0 : (!transform.any_op) -> !transform.any_op @@ -2060,8 +2091,9 @@ transform.sequence failures(propagate) { %empty_op = transform.structured.match ops{["tensor.empty"]} in %func_op : (!transform.any_op) -> !transform.any_op transform.apply_dce to %func_op : !transform.any_op + %p = num_associations %empty_op : (!transform.any_op) -> !transform.param // expected-remark @below{{0}} - test_print_number_of_associated_payload_ir_ops %empty_op : !transform.any_op + test_print_param %p : !transform.param } diff --git a/mlir/test/Dialect/Transform/test-loop-transforms.mlir b/mlir/test/Dialect/Transform/test-loop-transforms.mlir index 425962757f720..c34f4baf8cd97 100644 --- a/mlir/test/Dialect/Transform/test-loop-transforms.mlir +++ b/mlir/test/Dialect/Transform/test-loop-transforms.mlir @@ -37,13 +37,16 @@ module attributes {transform.with_named_sequence} { // Make sure that the handles are still valid (and were updated in case of // the loop). + %p = transform.num_associations %0 : (!transform.any_op) -> !transform.param // expected-remark @below{{1}} - transform.test_print_number_of_associated_payload_ir_ops %0 : !transform.any_op + transform.test_print_param %p : !transform.param transform.test_print_remark_at_operand %0, "new loop op" : !transform.any_op + %p2 = transform.num_associations %1 : (!transform.any_op) -> !transform.param // expected-remark @below{{1}} - transform.test_print_number_of_associated_payload_ir_ops %1 : !transform.any_op + transform.test_print_param %p2 : !transform.param + %p3 = transform.num_associations %2 : (!transform.any_op) -> !transform.param // expected-remark @below{{1}} - transform.test_print_number_of_associated_payload_ir_ops %2 : !transform.any_op + transform.test_print_param %p3 : !transform.param transform.yield } diff --git a/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.cpp b/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.cpp index e8c25aca23725..9c69164e33f60 100644 --- a/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.cpp +++ b/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.cpp @@ -456,51 +456,6 @@ mlir::test::TestMixedSuccessAndSilenceableOp::applyToOne( return emitDefaultSilenceableFailure(target); } -DiagnosedSilenceableFailure -mlir::test::TestPrintNumberOfAssociatedPayloadIROps::apply( - transform::TransformRewriter &rewriter, - transform::TransformResults &results, transform::TransformState &state) { - if (!getHandle()) - emitRemark() << 0; - emitRemark() << llvm::range_size(state.getPayloadOps(getHandle())); - return DiagnosedSilenceableFailure::success(); -} - -void mlir::test::TestPrintNumberOfAssociatedPayloadIROps::getEffects( - SmallVectorImpl &effects) { - transform::onlyReadsHandle(getHandle(), effects); -} - -DiagnosedSilenceableFailure -mlir::test::TestPrintNumberOfAssociatedPayloadIRValues::apply( - transform::TransformRewriter &rewriter, - transform::TransformResults &results, transform::TransformState &state) { - if (!getValueHandle()) - emitRemark() << 0; - emitRemark() << llvm::range_size(state.getPayloadValues(getValueHandle())); - return DiagnosedSilenceableFailure::success(); -} - -void mlir::test::TestPrintNumberOfAssociatedPayloadIRValues::getEffects( - SmallVectorImpl &effects) { - transform::onlyReadsHandle(getValueHandle(), effects); -} - -DiagnosedSilenceableFailure -mlir::test::TestPrintNumberOfAssociatedPayloadIRParams::apply( - transform::TransformRewriter &rewriter, - transform::TransformResults &results, 
transform::TransformState &state) { - if (!getParam()) - emitRemark() << 0; - emitRemark() << llvm::range_size(state.getParams(getParam())); - return DiagnosedSilenceableFailure::success(); -} - -void mlir::test::TestPrintNumberOfAssociatedPayloadIRParams::getEffects( - SmallVectorImpl &effects) { - transform::onlyReadsHandle(getParam(), effects); -} - DiagnosedSilenceableFailure mlir::test::TestCopyPayloadOp::apply(transform::TransformRewriter &rewriter, transform::TransformResults &results, diff --git a/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.td b/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.td index 41f318db68405..5cb47659fdbdf 100644 --- a/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.td +++ b/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.td @@ -343,33 +343,6 @@ def TestMixedSuccessAndSilenceableOp }]; } -def TestPrintNumberOfAssociatedPayloadIROps - : Op, - DeclareOpInterfaceMethods]> { - let arguments = (ins TransformHandleTypeInterface:$handle); - let assemblyFormat = "$handle attr-dict `:` type($handle)"; - let cppNamespace = "::mlir::test"; -} - -def TestPrintNumberOfAssociatedPayloadIRValues - : Op, - DeclareOpInterfaceMethods]> { - let arguments = (ins TransformValueHandleTypeInterface:$value_handle); - let assemblyFormat = "$value_handle attr-dict `:` type($value_handle)"; - let cppNamespace = "::mlir::test"; -} - -def TestPrintNumberOfAssociatedPayloadIRParams - : Op, - DeclareOpInterfaceMethods]> { - let arguments = (ins TransformParamTypeInterface:$param); - let assemblyFormat = "$param attr-dict `:` type($param)"; - let cppNamespace = "::mlir::test"; -} - def TestCopyPayloadOp : Op, From 0408a8578383539e9197cb27f59c876e474875f5 Mon Sep 17 00:00:00 2001 From: Ben Shi <2283975856@qq.com> Date: Wed, 3 Jan 2024 20:38:15 +0800 Subject: [PATCH 126/313] [clang][analyzer][NFC] Improve release note (#76805) --- clang/docs/ReleaseNotes.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 0c8fec691bf3c..e00fc5ba79169 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1127,9 +1127,10 @@ Improvements ^^^^^^^^^^^^ - Improved the ``unix.StdCLibraryFunctions`` checker by modeling more - functions like ``send``, ``recv``, ``readlink``, ``fflush`` and + functions like ``send``, ``recv``, ``readlink``, ``fflush``, ``mkdtemp`` and ``errno`` behavior. 
(`52ac71f92d38 `_, + `#76671 `_, `#71373 `_, `#76557 `_, `#71392 `_) From 1d27669e8ad07f8f2372d5012384764f20a6f0cb Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 3 Jan 2024 12:37:55 +0000 Subject: [PATCH 127/313] [X86] combineConcatVectorOps - fold 512-bit concat(blendi(x,y,c0),blendi(z,w,c1)) to AVX512BW mask select Yet another yak shaving regression fix for #73509 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 13 + .../vector-interleaved-store-i8-stride-8.ll | 393 ++++++++++++------ 2 files changed, 281 insertions(+), 125 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 3ca1c808fff7a..cd56529bfa0fd 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -55015,6 +55015,19 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2)); } break; + case X86ISD::BLENDI: + if (NumOps == 2 && VT.is512BitVector() && Subtarget.useBWIRegs()) { + uint64_t Mask0 = Ops[0].getConstantOperandVal(2); + uint64_t Mask1 = Ops[1].getConstantOperandVal(2); + uint64_t Mask = (Mask1 << (VT.getVectorNumElements() / 2)) | Mask0; + MVT MaskSVT = MVT::getIntegerVT(VT.getVectorNumElements()); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue Sel = + DAG.getBitcast(MaskVT, DAG.getConstant(Mask, DL, MaskSVT)); + return DAG.getSelect(DL, VT, Sel, ConcatSubOperand(VT, Ops, 1), + ConcatSubOperand(VT, Ops, 0)); + } + break; case ISD::VSELECT: if (!IsSplat && Subtarget.hasAVX512() && (VT.is256BitVector() || diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll index 257c9a2e4eaa5..7bb387481ff7b 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -572,45 +572,45 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; -; AVX512-FAST-LABEL: store_i8_stride8_vf8: -; AVX512-FAST: # %bb.0: -; AVX512-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX512-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX512-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 -; AVX512-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,3,5,7,1,3,5,7] -; AVX512-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm3 -; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX512-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2 -; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15,u,u,u,u> -; AVX512-FAST-NEXT: 
vpshufb %ymm5, %ymm2, %ymm2 -; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] -; AVX512-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6] -; AVX512-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 -; AVX512-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX512-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 -; AVX512-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX512-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512-FAST-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512-FAST-NEXT: vzeroupper -; AVX512-FAST-NEXT: retq +; AVX512F-FAST-LABEL: store_i8_stride8_vf8: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,3,5,7,1,3,5,7] +; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15,u,u,u,u> +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6] +; AVX512F-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq ; ; AVX512BW-SLOW-LABEL: store_i8_stride8_vf8: ; AVX512BW-SLOW: # %bb.0: @@ -629,32 +629,61 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512BW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm1[u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u,22,30,u,u,u,u,u,u,23,31] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,22,30,u,u,u,u,u,u,23,31,u,u] -; AVX512BW-SLOW-NEXT: movw $17544, %cx # imm = 0x4488 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm1 = zmm0[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: movl $287445282, %ecx # imm = 0x11221122 ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm4, %ymm2 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u,22,30,u,u,u,u,u,u,23,31,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm5[u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,22,30,u,u,u,u,u,u,23,31,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: movw $4386, %cx # imm = 0x1122 -; AVX512BW-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm6, %ymm4 {%k2} -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u] -; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm3, %ymm1 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm3, %ymm0 {%k2} -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[2,3,0,1,2,3,0,1] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] +; AVX512BW-SLOW-NEXT: movl $1149781128, %ecx # imm = 0x44884488 +; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm0, %zmm2 {%k1} +; AVX512BW-SLOW-NEXT: movw $-21846, %cx # imm = 0xAAAA +; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: store_i8_stride8_vf8: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512BW-FAST-NEXT: movq 
{{[0-9]+}}(%rsp), %r11 +; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512BW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512BW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512BW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512BW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,10,12,14,8,10,12,14,9,11,13,15,9,11,13,15] +; AVX512BW-FAST-NEXT: vpermd %zmm1, %zmm2, %zmm1 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,18,22,26,30,u,u,u,u,19,23,27,31,u,u,u,u,32,36,40,44,u,u,u,u,33,37,41,45,u,u,u,u,50,54,58,62,u,u,u,u,51,55,59,63] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,0,2,4,6,1,3,5,7,1,3,5,7] +; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,18,22,26,30,u,u,u,u,19,23,27,31,u,u,u,u,32,36,40,44,u,u,u,u,33,37,41,45,u,u,u,u,50,54,58,62,u,u,u,u,51,55,59,63,u,u,u,u] +; AVX512BW-FAST-NEXT: movw $-21846, %cx # imm = 0xAAAA +; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 +; AVX512BW-FAST-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq %in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64 %in.vec1 = load <8 x i8>, ptr %in.vecptr1, align 64 %in.vec2 = load <8 x i8>, ptr %in.vecptr2, align 64 @@ -1051,69 +1080,182 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; -; AVX512-LABEL: store_i8_stride8_vf16: -; AVX512: # %bb.0: -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512-NEXT: vmovdqa (%r8), %xmm2 -; AVX512-NEXT: vmovdqa (%r11), %xmm3 -; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX512-NEXT: vpshufb %ymm5, %ymm4, %ymm6 -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX512-NEXT: vpshufb %ymm8, %ymm7, %ymm9 -; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7],ymm9[8,9,10],ymm6[11],ymm9[12,13,14],ymm6[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX512-NEXT: vpshufb %ymm10, %ymm9, %ymm11 -; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = <4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u> -; AVX512-NEXT: vpshufb %ymm13, %ymm12, %ymm14 -; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = 
ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2],ymm6[3],ymm11[4],ymm6[5],ymm11[6],ymm6[7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX512-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX512-NEXT: vpshufb %ymm14, %ymm7, %ymm7 -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7],ymm7[8,9,10],ymm4[11],ymm7[12,13,14],ymm4[15] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX512-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = <0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u> -; AVX512-NEXT: vpshufb %ymm15, %ymm12, %ymm12 -; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7] -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,1,3] -; AVX512-NEXT: vpshufb %ymm5, %ymm3, %ymm5 -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] -; AVX512-NEXT: vpshufb %ymm8, %ymm2, %ymm6 -; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7],ymm6[8,9,10],ymm5[11],ymm6[12,13,14],ymm5[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] -; AVX512-NEXT: vpshufb %ymm10, %ymm1, %ymm6 -; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] -; AVX512-NEXT: vpshufb %ymm13, %ymm0, %ymm8 -; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7,8],ymm6[9],ymm8[10,11,12],ymm6[13],ymm8[14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] -; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX512-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7],ymm2[8,9,10],ymm3[11],ymm2[12,13,14],ymm3[15] -; AVX512-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: store_i8_stride8_vf16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512F-NEXT: vmovdqa (%r8), %xmm2 +; AVX512F-NEXT: vmovdqa (%r11), %xmm3 +; AVX512F-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512F-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512F-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 +; AVX512F-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm6 +; AVX512F-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512F-NEXT: vpshufb %ymm8, %ymm7, %ymm9 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7],ymm9[8,9,10],ymm6[11],ymm9[12,13,14],ymm6[15] +; AVX512F-NEXT: vpermq {{.*#+}} ymm9 = 
ymm1[0,2,0,2] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX512F-NEXT: vpshufb %ymm10, %ymm9, %ymm11 +; AVX512F-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = <4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %ymm13, %ymm12, %ymm14 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2],ymm6[3],ymm11[4],ymm6[5],ymm11[6],ymm6[7] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX512F-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX512F-NEXT: vpshufb %ymm14, %ymm7, %ymm7 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7],ymm7[8,9,10],ymm4[11],ymm7[12,13,14],ymm4[15] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512F-NEXT: vpshufb %ymm7, %ymm9, %ymm9 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %ymm15, %ymm12, %ymm12 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7] +; AVX512F-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,1,3] +; AVX512F-NEXT: vpshufb %ymm5, %ymm3, %ymm5 +; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] +; AVX512F-NEXT: vpshufb %ymm8, %ymm2, %ymm6 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7],ymm6[8,9,10],ymm5[11],ymm6[12,13,14],ymm5[15] +; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] +; AVX512F-NEXT: vpshufb %ymm10, %ymm1, %ymm6 +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] +; AVX512F-NEXT: vpshufb %ymm13, %ymm0, %ymm8 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7,8],ymm6[9],ymm8[10,11,12],ymm6[13],ymm8[14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] +; AVX512F-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512F-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7],ymm2[8,9,10],ymm3[11],ymm2[12,13,14],ymm3[15] +; AVX512F-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX512F-NEXT: vpshufb %ymm15, %ymm0, %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] +; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-SLOW-LABEL: store_i8_stride8_vf16: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa (%r11), %xmm3 +; AVX512BW-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512BW-SLOW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512BW-SLOW-NEXT: vinserti128 $1, 
(%r10), %ymm3, %ymm3 +; AVX512BW-SLOW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm3 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm4 = zmm2[0,2,0,2,4,6,4,6] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512BW-SLOW-NEXT: vpshufb %zmm5, %zmm4, %zmm4 +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[4,5,6,7,4,5,6,7] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm6 = zmm3[0,2,0,2,4,6,4,6] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512BW-SLOW-NEXT: vpshufb %zmm7, %zmm6, %zmm6 +; AVX512BW-SLOW-NEXT: movl $8913032, %ecx # imm = 0x880088 +; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm6, %zmm4 {%k1} +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm6 = zmm0[0,2,0,2,4,6,4,6] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u> +; AVX512BW-SLOW-NEXT: vpshufb %zmm8, %zmm6, %zmm6 +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm1[0,2,0,2,4,6,4,6] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-SLOW-NEXT: vpshufb %zmm10, %zmm9, %zmm9 +; AVX512BW-SLOW-NEXT: movl $2228258, %ecx # imm = 0x220022 +; AVX512BW-SLOW-NEXT: kmovd %ecx, %k2 +; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm9, %zmm6 {%k2} +; AVX512BW-SLOW-NEXT: movw $-21846, %cx # imm = 0xAAAA +; AVX512BW-SLOW-NEXT: kmovd %ecx, %k3 +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm4, %zmm6 {%k3} +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7] +; AVX512BW-SLOW-NEXT: vpshufb %zmm5, %zmm2, %zmm2 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm3 = zmm3[1,3,1,3,5,7,5,7] +; AVX512BW-SLOW-NEXT: vpshufb %zmm7, %zmm3, %zmm3 +; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm3, %zmm2 {%k1} +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7] +; AVX512BW-SLOW-NEXT: vpshufb %zmm8, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[1,3,1,3,5,7,5,7] +; AVX512BW-SLOW-NEXT: vpshufb %zmm10, %zmm1, %zmm1 +; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3} +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: store_i8_stride8_vf16: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512BW-FAST-NEXT: vmovdqa (%r8), %xmm2 +; AVX512BW-FAST-NEXT: vmovdqa (%r11), %xmm3 +; AVX512BW-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512BW-FAST-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 +; AVX512BW-FAST-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 +; AVX512BW-FAST-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,2,0,2,12,14,12,14] +; AVX512BW-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512BW-FAST-NEXT: vpshufb %zmm7, %zmm3, %zmm3 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm8 = 
zmm2[0,2,0,2,4,6,4,6] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm8, %zmm8 +; AVX512BW-FAST-NEXT: movl $8913032, %ecx # imm = 0x880088 +; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm3, %zmm8 {%k1} +; AVX512BW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-FAST-NEXT: vpshufb %zmm3, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm6 = zmm0[0,2,0,2,4,6,4,6] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u> +; AVX512BW-FAST-NEXT: vpshufb %zmm10, %zmm6, %zmm6 +; AVX512BW-FAST-NEXT: movl $2228258, %ecx # imm = 0x220022 +; AVX512BW-FAST-NEXT: kmovd %ecx, %k2 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm1, %zmm6 {%k2} +; AVX512BW-FAST-NEXT: movw $-21846, %cx # imm = 0xAAAA +; AVX512BW-FAST-NEXT: kmovd %ecx, %k3 +; AVX512BW-FAST-NEXT: vmovdqa32 %zmm8, %zmm6 {%k3} +; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [5,7,5,7,5,7,5,7] +; AVX512BW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpermq %zmm5, %zmm1, %zmm5 +; AVX512BW-FAST-NEXT: vpshufb %zmm7, %zmm5, %zmm5 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7] +; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm2, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm5, %zmm2 {%k1} +; AVX512BW-FAST-NEXT: vpermq %zmm4, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vpshufb %zmm3, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7] +; AVX512BW-FAST-NEXT: vpshufb %zmm10, %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq %in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64 %in.vec1 = load <16 x i8>, ptr %in.vecptr1, align 64 %in.vec2 = load <16 x i8>, ptr %in.vecptr2, align 64 @@ -6988,6 +7130,8 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; AVX1: {{.*}} +; AVX512: {{.*}} +; AVX512-FAST: {{.*}} ; AVX512-SLOW: {{.*}} ; AVX512BW: {{.*}} ; AVX512BW-ONLY: {{.*}} @@ -6997,7 +7141,6 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-FAST: {{.*}} ; AVX512DQBW-ONLY: {{.*}} ; AVX512DQBW-SLOW: {{.*}} -; AVX512F: {{.*}} ; AVX512F-ONLY: {{.*}} ; FALLBACK0: {{.*}} ; FALLBACK1: {{.*}} From d09315d986fea239e462da454334482abc4ad461 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Pettersson?= Date: Wed, 3 Jan 2024 13:46:29 +0100 Subject: [PATCH 128/313] [opt][NewPM] Add isRequired to passes named as *PrinterPass (#76516) Passes that print the result of analysis passes should be of interest, and are expected to run even if a function for example is marked as optnone. So when adding such passes explicitly to a pipeline it makes sense to run the pass regardless of standard instrumentation gates such as OptNoneInstrumentation. In this patch all passes named as *PrinterPass are marked as required. That should make sure that those passes are executed without being skipped due to standard instrumentations. The polly passes are not touched in this patch. 
Partial fix for: https://github.com/llvm/llvm-project/issues/76762 --- llvm/include/llvm/Analysis/AliasSetTracker.h | 1 + llvm/include/llvm/Analysis/AssumptionCache.h | 2 ++ llvm/include/llvm/Analysis/BlockFrequencyInfo.h | 2 ++ llvm/include/llvm/Analysis/BranchProbabilityInfo.h | 2 ++ llvm/include/llvm/Analysis/CFGSCCPrinter.h | 1 + llvm/include/llvm/Analysis/CallGraph.h | 4 ++++ llvm/include/llvm/Analysis/CallPrinter.h | 2 ++ llvm/include/llvm/Analysis/CostModel.h | 2 ++ llvm/include/llvm/Analysis/CycleAnalysis.h | 2 ++ llvm/include/llvm/Analysis/DDG.h | 1 + llvm/include/llvm/Analysis/DDGPrinter.h | 1 + llvm/include/llvm/Analysis/Delinearization.h | 1 + llvm/include/llvm/Analysis/DemandedBits.h | 2 ++ llvm/include/llvm/Analysis/DependenceAnalysis.h | 2 ++ llvm/include/llvm/Analysis/DominanceFrontier.h | 2 ++ llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h | 2 ++ llvm/include/llvm/Analysis/IRSimilarityIdentifier.h | 1 + llvm/include/llvm/Analysis/InlineAdvisor.h | 3 ++- llvm/include/llvm/Analysis/InlineCost.h | 1 + llvm/include/llvm/Analysis/InlineSizeEstimatorAnalysis.h | 2 ++ llvm/include/llvm/Analysis/LazyCallGraph.h | 4 ++++ llvm/include/llvm/Analysis/LazyValueInfo.h | 2 ++ llvm/include/llvm/Analysis/LoopCacheAnalysis.h | 2 ++ llvm/include/llvm/Analysis/LoopInfo.h | 1 + llvm/include/llvm/Analysis/LoopNestAnalysis.h | 2 ++ llvm/include/llvm/Analysis/MemDerefPrinter.h | 1 + llvm/include/llvm/Analysis/MemorySSA.h | 4 ++++ llvm/include/llvm/Analysis/ModuleDebugInfoPrinter.h | 1 + llvm/include/llvm/Analysis/MustExecute.h | 2 ++ llvm/include/llvm/Analysis/PhiValues.h | 1 + llvm/include/llvm/Analysis/PostDominators.h | 2 ++ llvm/include/llvm/Analysis/ProfileSummaryInfo.h | 1 + llvm/include/llvm/Analysis/RegionInfo.h | 2 ++ llvm/include/llvm/Analysis/ScalarEvolution.h | 2 ++ llvm/include/llvm/Analysis/StackLifetime.h | 1 + llvm/include/llvm/Analysis/StackSafetyAnalysis.h | 2 ++ llvm/include/llvm/Analysis/StructuralHash.h | 2 ++ llvm/include/llvm/Analysis/UniformityAnalysis.h | 2 ++ llvm/include/llvm/IR/Dominators.h | 2 ++ llvm/include/llvm/Transforms/Scalar/IVUsersPrinter.h | 1 + .../llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h | 1 + llvm/include/llvm/Transforms/Utils/PredicateInfo.h | 1 + llvm/lib/Target/DirectX/DXILResourceAnalysis.h | 1 + 43 files changed, 75 insertions(+), 1 deletion(-) diff --git a/llvm/include/llvm/Analysis/AliasSetTracker.h b/llvm/include/llvm/Analysis/AliasSetTracker.h index 4a952ccae7a0a..8afe455e2f082 100644 --- a/llvm/include/llvm/Analysis/AliasSetTracker.h +++ b/llvm/include/llvm/Analysis/AliasSetTracker.h @@ -411,6 +411,7 @@ class AliasSetsPrinterPass : public PassInfoMixin { public: explicit AliasSetsPrinterPass(raw_ostream &OS); PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + static bool isRequired() { return true; } }; } // end namespace llvm diff --git a/llvm/include/llvm/Analysis/AssumptionCache.h b/llvm/include/llvm/Analysis/AssumptionCache.h index 12dd9b04c9323..96ae32da6743a 100644 --- a/llvm/include/llvm/Analysis/AssumptionCache.h +++ b/llvm/include/llvm/Analysis/AssumptionCache.h @@ -189,6 +189,8 @@ class AssumptionPrinterPass : public PassInfoMixin { explicit AssumptionPrinterPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + + static bool isRequired() { return true; } }; /// An immutable pass that tracks lazily created \c AssumptionCache diff --git a/llvm/include/llvm/Analysis/BlockFrequencyInfo.h b/llvm/include/llvm/Analysis/BlockFrequencyInfo.h index 
95d75b0e1854f..179fd06addec8 100644 --- a/llvm/include/llvm/Analysis/BlockFrequencyInfo.h +++ b/llvm/include/llvm/Analysis/BlockFrequencyInfo.h @@ -134,6 +134,8 @@ class BlockFrequencyPrinterPass explicit BlockFrequencyPrinterPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + + static bool isRequired() { return true; } }; /// Legacy analysis pass which computes \c BlockFrequencyInfo. diff --git a/llvm/include/llvm/Analysis/BranchProbabilityInfo.h b/llvm/include/llvm/Analysis/BranchProbabilityInfo.h index fb02997371bfb..6b9d178182011 100644 --- a/llvm/include/llvm/Analysis/BranchProbabilityInfo.h +++ b/llvm/include/llvm/Analysis/BranchProbabilityInfo.h @@ -436,6 +436,8 @@ class BranchProbabilityPrinterPass explicit BranchProbabilityPrinterPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + + static bool isRequired() { return true; } }; /// Legacy analysis pass which computes \c BranchProbabilityInfo. diff --git a/llvm/include/llvm/Analysis/CFGSCCPrinter.h b/llvm/include/llvm/Analysis/CFGSCCPrinter.h index d98071461f578..0ea0b46c46269 100644 --- a/llvm/include/llvm/Analysis/CFGSCCPrinter.h +++ b/llvm/include/llvm/Analysis/CFGSCCPrinter.h @@ -19,6 +19,7 @@ class CFGSCCPrinterPass : public PassInfoMixin { public: explicit CFGSCCPrinterPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + static bool isRequired() { return true; } }; } // namespace llvm diff --git a/llvm/include/llvm/Analysis/CallGraph.h b/llvm/include/llvm/Analysis/CallGraph.h index 9413b39978e3f..887743774175d 100644 --- a/llvm/include/llvm/Analysis/CallGraph.h +++ b/llvm/include/llvm/Analysis/CallGraph.h @@ -322,6 +322,8 @@ class CallGraphPrinterPass : public PassInfoMixin { explicit CallGraphPrinterPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + + static bool isRequired() { return true; } }; /// Printer pass for the summarized \c CallGraphAnalysis results. 
@@ -333,6 +335,8 @@ class CallGraphSCCsPrinterPass explicit CallGraphSCCsPrinterPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + + static bool isRequired() { return true; } }; /// The \c ModulePass which wraps up a \c CallGraph and the logic to diff --git a/llvm/include/llvm/Analysis/CallPrinter.h b/llvm/include/llvm/Analysis/CallPrinter.h index d325d00103717..95cb5cc3ca862 100644 --- a/llvm/include/llvm/Analysis/CallPrinter.h +++ b/llvm/include/llvm/Analysis/CallPrinter.h @@ -24,12 +24,14 @@ class ModulePass; class CallGraphDOTPrinterPass : public PassInfoMixin { public: PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + static bool isRequired() { return true; } }; /// Pass for viewing the call graph class CallGraphViewerPass : public PassInfoMixin { public: PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + static bool isRequired() { return true; } }; ModulePass *createCallGraphViewerPass(); diff --git a/llvm/include/llvm/Analysis/CostModel.h b/llvm/include/llvm/Analysis/CostModel.h index 649168050cec6..9b127c27ba7ef 100644 --- a/llvm/include/llvm/Analysis/CostModel.h +++ b/llvm/include/llvm/Analysis/CostModel.h @@ -20,6 +20,8 @@ class CostModelPrinterPass : public PassInfoMixin { explicit CostModelPrinterPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + + static bool isRequired() { return true; } }; } // end namespace llvm diff --git a/llvm/include/llvm/Analysis/CycleAnalysis.h b/llvm/include/llvm/Analysis/CycleAnalysis.h index 099d7611dedc4..ce939eff8ff8f 100644 --- a/llvm/include/llvm/Analysis/CycleAnalysis.h +++ b/llvm/include/llvm/Analysis/CycleAnalysis.h @@ -68,6 +68,8 @@ class CycleInfoPrinterPass : public PassInfoMixin { explicit CycleInfoPrinterPass(raw_ostream &OS); PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + + static bool isRequired() { return true; } }; } // end namespace llvm diff --git a/llvm/include/llvm/Analysis/DDG.h b/llvm/include/llvm/Analysis/DDG.h index bc599cb1f9a15..bd559f3fb69b1 100644 --- a/llvm/include/llvm/Analysis/DDG.h +++ b/llvm/include/llvm/Analysis/DDG.h @@ -427,6 +427,7 @@ class DDGAnalysisPrinterPass : public PassInfoMixin { explicit DDGAnalysisPrinterPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U); + static bool isRequired() { return true; } private: raw_ostream &OS; diff --git a/llvm/include/llvm/Analysis/DDGPrinter.h b/llvm/include/llvm/Analysis/DDGPrinter.h index d93c28280bac3..4aa154d173bad 100644 --- a/llvm/include/llvm/Analysis/DDGPrinter.h +++ b/llvm/include/llvm/Analysis/DDGPrinter.h @@ -29,6 +29,7 @@ class DDGDotPrinterPass : public PassInfoMixin { public: PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U); + static bool isRequired() { return true; } }; //===--------------------------------------------------------------------===// diff --git a/llvm/include/llvm/Analysis/Delinearization.h b/llvm/include/llvm/Analysis/Delinearization.h index 95a36b8b79a4b..a00adb2896041 100644 --- a/llvm/include/llvm/Analysis/Delinearization.h +++ b/llvm/include/llvm/Analysis/Delinearization.h @@ -140,6 +140,7 @@ struct DelinearizationPrinterPass : public PassInfoMixin { explicit DelinearizationPrinterPass(raw_ostream &OS); PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + static bool isRequired() { return true; } private: raw_ostream &OS; diff --git 
a/llvm/include/llvm/Analysis/DemandedBits.h b/llvm/include/llvm/Analysis/DemandedBits.h index 6e4bfcf899c90..aac7382528f06 100644 --- a/llvm/include/llvm/Analysis/DemandedBits.h +++ b/llvm/include/llvm/Analysis/DemandedBits.h @@ -120,6 +120,8 @@ class DemandedBitsPrinterPass : public PassInfoMixin { explicit DemandedBitsPrinterPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + + static bool isRequired() { return true; } }; } // end namespace llvm diff --git a/llvm/include/llvm/Analysis/DependenceAnalysis.h b/llvm/include/llvm/Analysis/DependenceAnalysis.h index 327315f831e11..f0a09644e0f4b 100644 --- a/llvm/include/llvm/Analysis/DependenceAnalysis.h +++ b/llvm/include/llvm/Analysis/DependenceAnalysis.h @@ -994,6 +994,8 @@ namespace llvm { PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); + static bool isRequired() { return true; } + private: raw_ostream &OS; bool NormalizeResults; diff --git a/llvm/include/llvm/Analysis/DominanceFrontier.h b/llvm/include/llvm/Analysis/DominanceFrontier.h index db0130e4804b8..b65cdc9cdb3c4 100644 --- a/llvm/include/llvm/Analysis/DominanceFrontier.h +++ b/llvm/include/llvm/Analysis/DominanceFrontier.h @@ -204,6 +204,8 @@ class DominanceFrontierPrinterPass explicit DominanceFrontierPrinterPass(raw_ostream &OS); PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + + static bool isRequired() { return true; } }; } // end namespace llvm diff --git a/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h b/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h index 3e9eb93745638..f5fbbdcb7143d 100644 --- a/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h +++ b/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h @@ -157,6 +157,8 @@ class FunctionPropertiesPrinterPass explicit FunctionPropertiesPrinterPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + + static bool isRequired() { return true; } }; /// Correctly update FunctionPropertiesInfo post-inlining. A diff --git a/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h b/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h index ad137baff5d42..0d19de6edc2a7 100644 --- a/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h +++ b/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h @@ -1198,6 +1198,7 @@ class IRSimilarityAnalysisPrinterPass public: explicit IRSimilarityAnalysisPrinterPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + static bool isRequired() { return true; } }; } // end namespace llvm diff --git a/llvm/include/llvm/Analysis/InlineAdvisor.h b/llvm/include/llvm/Analysis/InlineAdvisor.h index 2740106bc7db8..5f36ee6f68abb 100644 --- a/llvm/include/llvm/Analysis/InlineAdvisor.h +++ b/llvm/include/llvm/Analysis/InlineAdvisor.h @@ -341,7 +341,7 @@ class InlineAdvisorAnalysis : public AnalysisInfoMixin { Result run(Module &M, ModuleAnalysisManager &MAM) { return Result(M, MAM); } }; -/// Printer pass for the FunctionPropertiesAnalysis results. +/// Printer pass for the InlineAdvisorAnalysis results. 
class InlineAdvisorAnalysisPrinterPass : public PassInfoMixin { raw_ostream &OS; @@ -353,6 +353,7 @@ class InlineAdvisorAnalysisPrinterPass PreservedAnalyses run(LazyCallGraph::SCC &InitialC, CGSCCAnalysisManager &AM, LazyCallGraph &CG, CGSCCUpdateResult &UR); + static bool isRequired() { return true; } }; std::unique_ptr diff --git a/llvm/include/llvm/Analysis/InlineCost.h b/llvm/include/llvm/Analysis/InlineCost.h index 3f0bb879e021f..3a760e0a85cec 100644 --- a/llvm/include/llvm/Analysis/InlineCost.h +++ b/llvm/include/llvm/Analysis/InlineCost.h @@ -343,6 +343,7 @@ struct InlineCostAnnotationPrinterPass public: explicit InlineCostAnnotationPrinterPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); + static bool isRequired() { return true; } }; } // namespace llvm diff --git a/llvm/include/llvm/Analysis/InlineSizeEstimatorAnalysis.h b/llvm/include/llvm/Analysis/InlineSizeEstimatorAnalysis.h index 0aae696a98a92..b44edd370dd1c 100644 --- a/llvm/include/llvm/Analysis/InlineSizeEstimatorAnalysis.h +++ b/llvm/include/llvm/Analysis/InlineSizeEstimatorAnalysis.h @@ -40,6 +40,8 @@ class InlineSizeEstimatorAnalysisPrinterPass explicit InlineSizeEstimatorAnalysisPrinterPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + + static bool isRequired() { return true; } }; } // namespace llvm #endif // LLVM_ANALYSIS_INLINESIZEESTIMATORANALYSIS_H diff --git a/llvm/include/llvm/Analysis/LazyCallGraph.h b/llvm/include/llvm/Analysis/LazyCallGraph.h index 211a058aa0173..68c98b416ce96 100644 --- a/llvm/include/llvm/Analysis/LazyCallGraph.h +++ b/llvm/include/llvm/Analysis/LazyCallGraph.h @@ -1288,6 +1288,8 @@ class LazyCallGraphPrinterPass explicit LazyCallGraphPrinterPass(raw_ostream &OS); PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + + static bool isRequired() { return true; } }; /// A pass which prints the call graph as a DOT file to a \c raw_ostream. @@ -1301,6 +1303,8 @@ class LazyCallGraphDOTPrinterPass explicit LazyCallGraphDOTPrinterPass(raw_ostream &OS); PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + + static bool isRequired() { return true; } }; } // end namespace llvm diff --git a/llvm/include/llvm/Analysis/LazyValueInfo.h b/llvm/include/llvm/Analysis/LazyValueInfo.h index 25a2c9ffa534a..5611a2b98020f 100644 --- a/llvm/include/llvm/Analysis/LazyValueInfo.h +++ b/llvm/include/llvm/Analysis/LazyValueInfo.h @@ -157,6 +157,8 @@ class LazyValueInfoPrinterPass explicit LazyValueInfoPrinterPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + + static bool isRequired() { return true; } }; /// Wrapper around LazyValueInfo. 
diff --git a/llvm/include/llvm/Analysis/LoopCacheAnalysis.h b/llvm/include/llvm/Analysis/LoopCacheAnalysis.h index c9e853b9be8e6..4fd2485e39d6d 100644 --- a/llvm/include/llvm/Analysis/LoopCacheAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopCacheAnalysis.h @@ -291,6 +291,8 @@ class LoopCachePrinterPass : public PassInfoMixin { PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U); + + static bool isRequired() { return true; } }; } // namespace llvm diff --git a/llvm/include/llvm/Analysis/LoopInfo.h b/llvm/include/llvm/Analysis/LoopInfo.h index 3b106381fbca3..c3bfd9df86d07 100644 --- a/llvm/include/llvm/Analysis/LoopInfo.h +++ b/llvm/include/llvm/Analysis/LoopInfo.h @@ -580,6 +580,7 @@ class LoopPrinterPass : public PassInfoMixin { public: explicit LoopPrinterPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + static bool isRequired() { return true; } }; /// Verifier pass for the \c LoopAnalysis results. diff --git a/llvm/include/llvm/Analysis/LoopNestAnalysis.h b/llvm/include/llvm/Analysis/LoopNestAnalysis.h index 852a6c438d43a..3b33dd505ddeb 100644 --- a/llvm/include/llvm/Analysis/LoopNestAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopNestAnalysis.h @@ -217,6 +217,8 @@ class LoopNestPrinterPass : public PassInfoMixin { PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U); + + static bool isRequired() { return true; } }; } // namespace llvm diff --git a/llvm/include/llvm/Analysis/MemDerefPrinter.h b/llvm/include/llvm/Analysis/MemDerefPrinter.h index bafdc543eeaf4..ba376dadb2a75 100644 --- a/llvm/include/llvm/Analysis/MemDerefPrinter.h +++ b/llvm/include/llvm/Analysis/MemDerefPrinter.h @@ -18,6 +18,7 @@ class MemDerefPrinterPass : public PassInfoMixin { public: MemDerefPrinterPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + static bool isRequired() { return true; } }; } // namespace llvm diff --git a/llvm/include/llvm/Analysis/MemorySSA.h b/llvm/include/llvm/Analysis/MemorySSA.h index 94d7f1a78b847..531af5ac7380f 100644 --- a/llvm/include/llvm/Analysis/MemorySSA.h +++ b/llvm/include/llvm/Analysis/MemorySSA.h @@ -953,6 +953,8 @@ class MemorySSAPrinterPass : public PassInfoMixin { : OS(OS), EnsureOptimizedUses(EnsureOptimizedUses) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + + static bool isRequired() { return true; } }; /// Printer pass for \c MemorySSA via the walker. @@ -964,6 +966,8 @@ class MemorySSAWalkerPrinterPass explicit MemorySSAWalkerPrinterPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + + static bool isRequired() { return true; } }; /// Verifier pass for \c MemorySSA. 
diff --git a/llvm/include/llvm/Analysis/ModuleDebugInfoPrinter.h b/llvm/include/llvm/Analysis/ModuleDebugInfoPrinter.h index fa91e4f653d04..e69db780a2061 100644 --- a/llvm/include/llvm/Analysis/ModuleDebugInfoPrinter.h +++ b/llvm/include/llvm/Analysis/ModuleDebugInfoPrinter.h @@ -23,6 +23,7 @@ class ModuleDebugInfoPrinterPass public: explicit ModuleDebugInfoPrinterPass(raw_ostream &OS); PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + static bool isRequired() { return true; } }; } // end namespace llvm diff --git a/llvm/include/llvm/Analysis/MustExecute.h b/llvm/include/llvm/Analysis/MustExecute.h index 9c97bd1725ac2..468d94e7cd68b 100644 --- a/llvm/include/llvm/Analysis/MustExecute.h +++ b/llvm/include/llvm/Analysis/MustExecute.h @@ -547,6 +547,7 @@ class MustExecutePrinterPass : public PassInfoMixin { public: MustExecutePrinterPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + static bool isRequired() { return true; } }; class MustBeExecutedContextPrinterPass @@ -556,6 +557,7 @@ class MustBeExecutedContextPrinterPass public: MustBeExecutedContextPrinterPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + static bool isRequired() { return true; } }; } // namespace llvm diff --git a/llvm/include/llvm/Analysis/PhiValues.h b/llvm/include/llvm/Analysis/PhiValues.h index ecbb8874b378e..a749af30be9e3 100644 --- a/llvm/include/llvm/Analysis/PhiValues.h +++ b/llvm/include/llvm/Analysis/PhiValues.h @@ -132,6 +132,7 @@ class PhiValuesPrinterPass : public PassInfoMixin { public: explicit PhiValuesPrinterPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + static bool isRequired() { return true; } }; /// Wrapper pass for the legacy pass manager diff --git a/llvm/include/llvm/Analysis/PostDominators.h b/llvm/include/llvm/Analysis/PostDominators.h index 4383113c8db11..92e30f82501c1 100644 --- a/llvm/include/llvm/Analysis/PostDominators.h +++ b/llvm/include/llvm/Analysis/PostDominators.h @@ -68,6 +68,8 @@ class PostDominatorTreePrinterPass explicit PostDominatorTreePrinterPass(raw_ostream &OS); PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + + static bool isRequired() { return true; } }; struct PostDominatorTreeWrapperPass : public FunctionPass { diff --git a/llvm/include/llvm/Analysis/ProfileSummaryInfo.h b/llvm/include/llvm/Analysis/ProfileSummaryInfo.h index e49538bfaf80f..73be9e1d74a33 100644 --- a/llvm/include/llvm/Analysis/ProfileSummaryInfo.h +++ b/llvm/include/llvm/Analysis/ProfileSummaryInfo.h @@ -389,6 +389,7 @@ class ProfileSummaryPrinterPass public: explicit ProfileSummaryPrinterPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + static bool isRequired() { return true; } }; } // end namespace llvm diff --git a/llvm/include/llvm/Analysis/RegionInfo.h b/llvm/include/llvm/Analysis/RegionInfo.h index 612b977f1ffa4..c49bbff8d63d4 100644 --- a/llvm/include/llvm/Analysis/RegionInfo.h +++ b/llvm/include/llvm/Analysis/RegionInfo.h @@ -983,6 +983,8 @@ class RegionInfoPrinterPass : public PassInfoMixin { explicit RegionInfoPrinterPass(raw_ostream &OS); PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + + static bool isRequired() { return true; } }; /// Verifier pass for the \c RegionInfo. 
diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h index 4f1237c4b1f92..4d66bdcf2c010 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -2257,6 +2257,8 @@ class ScalarEvolutionPrinterPass explicit ScalarEvolutionPrinterPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + + static bool isRequired() { return true; } }; class ScalarEvolutionWrapperPass : public FunctionPass { diff --git a/llvm/include/llvm/Analysis/StackLifetime.h b/llvm/include/llvm/Analysis/StackLifetime.h index 7fd88362276a1..438407fb70561 100644 --- a/llvm/include/llvm/Analysis/StackLifetime.h +++ b/llvm/include/llvm/Analysis/StackLifetime.h @@ -190,6 +190,7 @@ class StackLifetimePrinterPass StackLifetimePrinterPass(raw_ostream &OS, StackLifetime::LivenessType Type) : Type(Type), OS(OS) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + static bool isRequired() { return true; } void printPipeline(raw_ostream &OS, function_ref MapClassName2PassName); }; diff --git a/llvm/include/llvm/Analysis/StackSafetyAnalysis.h b/llvm/include/llvm/Analysis/StackSafetyAnalysis.h index 751735f3e59f6..2966f0c7e1610 100644 --- a/llvm/include/llvm/Analysis/StackSafetyAnalysis.h +++ b/llvm/include/llvm/Analysis/StackSafetyAnalysis.h @@ -105,6 +105,7 @@ class StackSafetyPrinterPass : public PassInfoMixin { public: explicit StackSafetyPrinterPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + static bool isRequired() { return true; } }; /// StackSafetyInfo wrapper for the legacy pass manager @@ -143,6 +144,7 @@ class StackSafetyGlobalPrinterPass public: explicit StackSafetyGlobalPrinterPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + static bool isRequired() { return true; } }; /// This pass performs the global (interprocedural) stack safety analysis diff --git a/llvm/include/llvm/Analysis/StructuralHash.h b/llvm/include/llvm/Analysis/StructuralHash.h index 0eef17d637c8f..9f33c69aed345 100644 --- a/llvm/include/llvm/Analysis/StructuralHash.h +++ b/llvm/include/llvm/Analysis/StructuralHash.h @@ -24,6 +24,8 @@ class StructuralHashPrinterPass : OS(OS), EnableDetailedStructuralHash(Detailed) {} PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM); + + static bool isRequired() { return true; } }; } // namespace llvm diff --git a/llvm/include/llvm/Analysis/UniformityAnalysis.h b/llvm/include/llvm/Analysis/UniformityAnalysis.h index f42c4950ed649..c38d100d88b8a 100644 --- a/llvm/include/llvm/Analysis/UniformityAnalysis.h +++ b/llvm/include/llvm/Analysis/UniformityAnalysis.h @@ -47,6 +47,8 @@ class UniformityInfoPrinterPass explicit UniformityInfoPrinterPass(raw_ostream &OS); PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + + static bool isRequired() { return true; } }; /// Legacy analysis pass which computes a \ref CycleInfo. diff --git a/llvm/include/llvm/IR/Dominators.h b/llvm/include/llvm/IR/Dominators.h index 8784a425d2841..9e000f513d570 100644 --- a/llvm/include/llvm/IR/Dominators.h +++ b/llvm/include/llvm/IR/Dominators.h @@ -293,6 +293,8 @@ class DominatorTreePrinterPass explicit DominatorTreePrinterPass(raw_ostream &OS); PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + + static bool isRequired() { return true; } }; /// Verifier pass for the \c DominatorTree. 
diff --git a/llvm/include/llvm/Transforms/Scalar/IVUsersPrinter.h b/llvm/include/llvm/Transforms/Scalar/IVUsersPrinter.h index 4136c45e19051..6bc01ececcf33 100644 --- a/llvm/include/llvm/Transforms/Scalar/IVUsersPrinter.h +++ b/llvm/include/llvm/Transforms/Scalar/IVUsersPrinter.h @@ -25,6 +25,7 @@ class IVUsersPrinterPass : public PassInfoMixin { explicit IVUsersPrinterPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U); + static bool isRequired() { return true; } }; } diff --git a/llvm/include/llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h b/llvm/include/llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h index 4d1f934ae91d7..f445e0696b5f8 100644 --- a/llvm/include/llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h +++ b/llvm/include/llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h @@ -24,6 +24,7 @@ class LoopAccessInfoPrinterPass public: explicit LoopAccessInfoPrinterPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + static bool isRequired() { return true; } }; } // End llvm namespace diff --git a/llvm/include/llvm/Transforms/Utils/PredicateInfo.h b/llvm/include/llvm/Transforms/Utils/PredicateInfo.h index b433d2ec89dc0..0dabb97f6d11b 100644 --- a/llvm/include/llvm/Transforms/Utils/PredicateInfo.h +++ b/llvm/include/llvm/Transforms/Utils/PredicateInfo.h @@ -215,6 +215,7 @@ class PredicateInfoPrinterPass public: explicit PredicateInfoPrinterPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + static bool isRequired() { return true; } }; /// Verifier pass for \c PredicateInfo. diff --git a/llvm/lib/Target/DirectX/DXILResourceAnalysis.h b/llvm/lib/Target/DirectX/DXILResourceAnalysis.h index 8ffa1d7cd9b35..bce41160b95ec 100644 --- a/llvm/lib/Target/DirectX/DXILResourceAnalysis.h +++ b/llvm/lib/Target/DirectX/DXILResourceAnalysis.h @@ -36,6 +36,7 @@ class DXILResourcePrinterPass : public PassInfoMixin { public: explicit DXILResourcePrinterPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + static bool isRequired() { return true; } }; /// The legacy pass manager's analysis pass to compute DXIL resource From 6b21948f26d69bad8c282db375906a8e0712d5f8 Mon Sep 17 00:00:00 2001 From: Rik Huijzer Date: Wed, 3 Jan 2024 13:46:52 +0100 Subject: [PATCH 129/313] [mlir][vector] Fix invalid `LoadOp` indices being created (#76292) Fixes https://github.com/llvm/llvm-project/issues/71326. This is the second PR. The first PR at https://github.com/llvm/llvm-project/pull/75519 was reverted because an integration test failed. The failed integration test was simplified and added to the core MLIR tests. Compared to the first PR, the current PR uses a more reliable approach. In summary, the current PR determines the mask indices by looking up the _mask_ buffer load indices from the previous iteration, whereas `main` looks up the indices for the _data_ buffer. The mask and data indices can differ when using a `permutation_map`. The cause of the issue was that a new `LoadOp` was created which looked something like: ```mlir func.func main(%arg1 : index, %arg2 : index) { %alloca_0 = memref.alloca() : memref> %1 = vector.type_cast %alloca_0 : memref> to memref<1xvector<32xi1>> %2 = memref.load %1[%arg1, %arg2] : memref<1xvector<32xi1>> return } ``` which crashed inside the `LoadOp::verify`. 
Note here that `%alloca_0` is the mask as can be seen from the `i1` element type and note it is 0 dimensional. Next, `%1` has one dimension, but `memref.load` tries to index it with two indices. This issue occured in the following code (a simplified version of the bug report): ```mlir #map1 = affine_map<(d0, d1, d2, d3) -> (d0, 0, 0, d3)> func.func @main(%subview: memref<1x1x1x1xi32>, %mask: vector<1x1xi1>) -> vector<1x1x1x1xi32> { %c0 = arith.constant 0 : index %c0_i32 = arith.constant 0 : i32 %3 = vector.transfer_read %subview[%c0, %c0, %c0, %c0], %c0_i32, %mask {permutation_map = #map1} : memref<1x1x1x1xi32>, vector<1x1x1x1xi32> return %3 : vector<1x1x1x1xi32> } ``` After this patch, it is lowered to the following by `-convert-vector-to-scf`: ```mlir func.func @main(%arg0: memref<1x1x1x1xi32>, %arg1: vector<1x1xi1>) -> vector<1x1x1x1xi32> { %c0_i32 = arith.constant 0 : i32 %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %alloca = memref.alloca() : memref> %alloca_0 = memref.alloca() : memref> memref.store %arg1, %alloca_0[] : memref> %0 = vector.type_cast %alloca : memref> to memref<1xvector<1x1x1xi32>> %1 = vector.type_cast %alloca_0 : memref> to memref<1xvector<1xi1>> scf.for %arg2 = %c0 to %c1 step %c1 { %3 = vector.type_cast %0 : memref<1xvector<1x1x1xi32>> to memref<1x1xvector<1x1xi32>> scf.for %arg3 = %c0 to %c1 step %c1 { %4 = vector.type_cast %3 : memref<1x1xvector<1x1xi32>> to memref<1x1x1xvector<1xi32>> scf.for %arg4 = %c0 to %c1 step %c1 { %5 = memref.load %1[%arg2] : memref<1xvector<1xi1>> %6 = vector.transfer_read %arg0[%arg2, %c0, %c0, %c0], %c0_i32, %5 {in_bounds = [true]} : memref<1x1x1x1xi32>, vector<1xi32> memref.store %6, %4[%arg2, %arg3, %arg4] : memref<1x1x1xvector<1xi32>> } } } %2 = memref.load %alloca[] : memref> return %2 : vector<1x1x1x1xi32> } ``` What was causing the problems is that one dimension of the data buffer `%alloca` (eltype `i32`) is unpacked (`vector.type_cast`) inside the outmost loop (loop with index variable `%arg2`) and the nested loop (loop with index variable `%arg3`), whereas the mask buffer `%alloca_0` (eltype `i1`) is not unpacked in these loops. Before this patch, the load indices would be determined by looking up the load indices for the *data* buffer load op. However, as shown in the specific example, when a permutation map is specified then the load indices from the data buffer load op start to differ from the indices for the mask op. To fix this, this patch ensures that the load indices for the *mask* buffer are used instead. --------- Co-authored-by: Mehdi Amini --- .../Conversion/VectorToSCF/VectorToSCF.cpp | 48 +++++++++++++------ .../Conversion/VectorToSCF/vector-to-scf.mlir | 37 ++++++++++++++ 2 files changed, 71 insertions(+), 14 deletions(-) diff --git a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp index 2ee314e9fedfe..a1aff1ab36a52 100644 --- a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp +++ b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp @@ -866,6 +866,31 @@ struct TransferOpConversion : public VectorToSCFPattern { this->setHasBoundedRewriteRecursion(); } + static void getMaskBufferLoadIndices(OpTy xferOp, Value castedMaskBuffer, + SmallVectorImpl &loadIndices, + Value iv) { + assert(xferOp.getMask() && "Expected transfer op to have mask"); + + // Add load indices from the previous iteration. 
+ // The mask buffer depends on the permutation map, which makes determining + // the indices quite complex, so this is why we need to "look back" to the + // previous iteration to find the right indices. + Value maskBuffer = getMaskBuffer(xferOp); + for (Operation *user : maskBuffer.getUsers()) { + // If there is no previous load op, then the indices are empty. + if (auto loadOp = dyn_cast(user)) { + Operation::operand_range prevIndices = loadOp.getIndices(); + loadIndices.append(prevIndices.begin(), prevIndices.end()); + break; + } + } + + // In case of broadcast: Use same indices to load from memref + // as before. + if (!xferOp.isBroadcastDim(0)) + loadIndices.push_back(iv); + } + LogicalResult matchAndRewrite(OpTy xferOp, PatternRewriter &rewriter) const override { if (!xferOp->hasAttr(kPassLabel)) @@ -873,9 +898,9 @@ struct TransferOpConversion : public VectorToSCFPattern { // Find and cast data buffer. How the buffer can be found depends on OpTy. ImplicitLocOpBuilder locB(xferOp.getLoc(), rewriter); - auto dataBuffer = Strategy::getBuffer(xferOp); + Value dataBuffer = Strategy::getBuffer(xferOp); auto dataBufferType = dyn_cast(dataBuffer.getType()); - auto castedDataType = unpackOneDim(dataBufferType); + FailureOr castedDataType = unpackOneDim(dataBufferType); if (failed(castedDataType)) return failure(); @@ -885,8 +910,7 @@ struct TransferOpConversion : public VectorToSCFPattern { // If the xferOp has a mask: Find and cast mask buffer. Value castedMaskBuffer; if (xferOp.getMask()) { - auto maskBuffer = getMaskBuffer(xferOp); - auto maskBufferType = dyn_cast(maskBuffer.getType()); + Value maskBuffer = getMaskBuffer(xferOp); if (xferOp.isBroadcastDim(0) || xferOp.getMaskType().getRank() == 1) { // Do not unpack a dimension of the mask, if: // * To-be-unpacked transfer op dimension is a broadcast. @@ -897,7 +921,8 @@ struct TransferOpConversion : public VectorToSCFPattern { } else { // It's safe to assume the mask buffer can be unpacked if the data // buffer was unpacked. - auto castedMaskType = *unpackOneDim(maskBufferType); + auto maskBufferType = cast(maskBuffer.getType()); + MemRefType castedMaskType = *unpackOneDim(maskBufferType); castedMaskBuffer = locB.create(castedMaskType, maskBuffer); } @@ -929,21 +954,16 @@ struct TransferOpConversion : public VectorToSCFPattern { // If old transfer op has a mask: Set mask on new transfer op. // Special case: If the mask of the old transfer op is 1D and - // the - // unpacked dim is not a broadcast, no mask is - // needed on the new transfer op. + // the unpacked dim is not a broadcast, no mask is needed on + // the new transfer op. if (xferOp.getMask() && (xferOp.isBroadcastDim(0) || xferOp.getMaskType().getRank() > 1)) { OpBuilder::InsertionGuard guard(b); b.setInsertionPoint(newXfer); // Insert load before newXfer. SmallVector loadIndices; - Strategy::getBufferIndices(xferOp, loadIndices); - // In case of broadcast: Use same indices to load from memref - // as before. 
- if (!xferOp.isBroadcastDim(0)) - loadIndices.push_back(iv); - + getMaskBufferLoadIndices(xferOp, castedMaskBuffer, + loadIndices, iv); auto mask = b.create(loc, castedMaskBuffer, loadIndices); rewriter.updateRootInPlace(newXfer, [&]() { diff --git a/mlir/test/Conversion/VectorToSCF/vector-to-scf.mlir b/mlir/test/Conversion/VectorToSCF/vector-to-scf.mlir index ad78f0c945b24..8316b4005cc16 100644 --- a/mlir/test/Conversion/VectorToSCF/vector-to-scf.mlir +++ b/mlir/test/Conversion/VectorToSCF/vector-to-scf.mlir @@ -740,6 +740,43 @@ func.func @cannot_lower_transfer_read_with_leading_scalable(%arg0: memref (d0, 0, 0, d3)> +func.func @does_not_crash_on_unpack_one_dim(%subview: memref<1x1x1x1xi32>, %mask: vector<1x1xi1>) -> vector<1x1x1x1xi32> { + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %3 = vector.transfer_read %subview[%c0, %c0, %c0, %c0], %c0_i32, %mask {permutation_map = #map1} + : memref<1x1x1x1xi32>, vector<1x1x1x1xi32> + return %3 : vector<1x1x1x1xi32> +} +// CHECK-LABEL: func.func @does_not_crash_on_unpack_one_dim +// CHECK: %[[ALLOCA_0:.*]] = memref.alloca() : memref> +// CHECK: %[[MASK:.*]] = vector.type_cast %[[ALLOCA_0]] : memref> to memref<1xvector<1xi1>> +// CHECK: memref.load %[[MASK]][%{{.*}}] : memref<1xvector<1xi1>> + +// ----- + +// Check that the `TransferOpConversion` generates valid indices for the StoreOp. +// This test is pulled from an integration test for ArmSVE. + +func.func @add_arrays_of_scalable_vectors(%a: memref<1x2x?xf32>, %b: memref<1x2x?xf32>) -> vector<1x2x[4]xf32> { + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 2 : index + %cst = arith.constant 0.000000e+00 : f32 + %dim_a = memref.dim %a, %c2 : memref<1x2x?xf32> + %mask_a = vector.create_mask %c2, %c3, %dim_a : vector<1x2x[4]xi1> + %vector_a = vector.transfer_read %a[%c0, %c0, %c0], %cst, %mask_a {in_bounds = [true, true, true]} : memref<1x2x?xf32>, vector<1x2x[4]xf32> + return %vector_a : vector<1x2x[4]xf32> +} +// CHECK-LABEL: func.func @add_arrays_of_scalable_vectors +// CHECK: scf.for +// CHECK: scf.for +// CHECK: memref.load + +// ----- + // FULL-UNROLL-LABEL: @cannot_fully_unroll_transfer_write_of_nd_scalable_vector func.func @cannot_fully_unroll_transfer_write_of_nd_scalable_vector(%vec: vector<[4]x[4]xf32>, %memref: memref) { // FULL-UNROLL-NOT: vector.extract From 11a702d215bf3fe5d185b9005e76c76c3001dd0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Pettersson?= Date: Wed, 3 Jan 2024 13:47:53 +0100 Subject: [PATCH 130/313] [opt][NewPM] Add isRequired to passes named as *VerifierPass (#76517) In this patch all passes named as *VerifierPass are being marked as required. That should make sure that the passes are executed without being skipped due to standard instrumentations. For example opt -passes='verify,verify' ... will no longer skip running the verifications for functions marked as optnone. 
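To illustrate the mechanism with a hypothetical pass (not part of this patch): a new-PM pass opts out of being skipped by exposing a static `isRequired()` hook, which the pass manager's standard instrumentation (optnone handling, opt-bisect, etc.) consults before deciding whether to skip the pass.

```cpp
// Hypothetical verifier-style pass; only the isRequired() hook matters here.
// PassInfoMixin and PreservedAnalyses come from llvm/IR/PassManager.h.
#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"

namespace llvm {
struct ExampleVerifierPass : PassInfoMixin<ExampleVerifierPass> {
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
    // ... check invariants for F and report problems ...
    return PreservedAnalyses::all();
  }
  // Standard instrumentation checks this before skipping a pass;
  // returning true means "never skip me", even for optnone functions.
  static bool isRequired() { return true; }
};
} // namespace llvm
```

Without the hook a pass is treated as skippable, which is why a plain `verify` run could previously do nothing for `optnone` functions.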
Partial fix for: https://github.com/llvm/llvm-project/issues/76762 --- llvm/include/llvm/Analysis/LoopInfo.h | 1 + llvm/include/llvm/Analysis/MemorySSA.h | 1 + llvm/include/llvm/Analysis/RegionInfo.h | 1 + llvm/include/llvm/Analysis/ScalarEvolution.h | 1 + llvm/include/llvm/IR/Dominators.h | 1 + llvm/include/llvm/IR/SafepointIRVerifier.h | 2 ++ llvm/include/llvm/Transforms/Utils/PredicateInfo.h | 1 + 7 files changed, 8 insertions(+) diff --git a/llvm/include/llvm/Analysis/LoopInfo.h b/llvm/include/llvm/Analysis/LoopInfo.h index c3bfd9df86d07..52084630560c5 100644 --- a/llvm/include/llvm/Analysis/LoopInfo.h +++ b/llvm/include/llvm/Analysis/LoopInfo.h @@ -586,6 +586,7 @@ class LoopPrinterPass : public PassInfoMixin { /// Verifier pass for the \c LoopAnalysis results. struct LoopVerifierPass : public PassInfoMixin { PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + static bool isRequired() { return true; } }; /// The legacy pass manager's analysis pass to compute loop information. diff --git a/llvm/include/llvm/Analysis/MemorySSA.h b/llvm/include/llvm/Analysis/MemorySSA.h index 531af5ac7380f..caf0e31fd37d6 100644 --- a/llvm/include/llvm/Analysis/MemorySSA.h +++ b/llvm/include/llvm/Analysis/MemorySSA.h @@ -973,6 +973,7 @@ class MemorySSAWalkerPrinterPass /// Verifier pass for \c MemorySSA. struct MemorySSAVerifierPass : PassInfoMixin { PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + static bool isRequired() { return true; } }; /// Legacy analysis pass which computes \c MemorySSA. diff --git a/llvm/include/llvm/Analysis/RegionInfo.h b/llvm/include/llvm/Analysis/RegionInfo.h index c49bbff8d63d4..fc8df36ec287d 100644 --- a/llvm/include/llvm/Analysis/RegionInfo.h +++ b/llvm/include/llvm/Analysis/RegionInfo.h @@ -990,6 +990,7 @@ class RegionInfoPrinterPass : public PassInfoMixin { /// Verifier pass for the \c RegionInfo. struct RegionInfoVerifierPass : PassInfoMixin { PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + static bool isRequired() { return true; } }; template <> diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h index 4d66bdcf2c010..af3ad822e0b0d 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -2246,6 +2246,7 @@ class ScalarEvolutionVerifierPass : public PassInfoMixin { public: PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + static bool isRequired() { return true; } }; /// Printer pass for the \c ScalarEvolutionAnalysis results. diff --git a/llvm/include/llvm/IR/Dominators.h b/llvm/include/llvm/IR/Dominators.h index 9e000f513d570..42db4c4ea3a55 100644 --- a/llvm/include/llvm/IR/Dominators.h +++ b/llvm/include/llvm/IR/Dominators.h @@ -300,6 +300,7 @@ class DominatorTreePrinterPass /// Verifier pass for the \c DominatorTree. struct DominatorTreeVerifierPass : PassInfoMixin { PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + static bool isRequired() { return true; } }; /// Enables verification of dominator trees. 
diff --git a/llvm/include/llvm/IR/SafepointIRVerifier.h b/llvm/include/llvm/IR/SafepointIRVerifier.h index 246d236adb389..2ee998e4c68ff 100644 --- a/llvm/include/llvm/IR/SafepointIRVerifier.h +++ b/llvm/include/llvm/IR/SafepointIRVerifier.h @@ -40,6 +40,8 @@ class SafepointIRVerifierPass : public PassInfoMixin { explicit SafepointIRVerifierPass() = default; PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + + static bool isRequired() { return true; } }; } diff --git a/llvm/include/llvm/Transforms/Utils/PredicateInfo.h b/llvm/include/llvm/Transforms/Utils/PredicateInfo.h index 0dabb97f6d11b..365b215c051fc 100644 --- a/llvm/include/llvm/Transforms/Utils/PredicateInfo.h +++ b/llvm/include/llvm/Transforms/Utils/PredicateInfo.h @@ -221,6 +221,7 @@ class PredicateInfoPrinterPass /// Verifier pass for \c PredicateInfo. struct PredicateInfoVerifierPass : PassInfoMixin { PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + static bool isRequired() { return true; } }; } // end namespace llvm From 7dc0ba949c1527808b7ceea58bdd72c9f3e2e300 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Pettersson?= Date: Wed, 3 Jan 2024 13:49:13 +0100 Subject: [PATCH 131/313] [LoopInfo][NewPM] Print function name in LoopPrinterPass (#76527) The legacy pass manager printed the function name when printing loop info (via -analyze option). Like this: Printing analysis 'Natural Loop Information' for function 'func': Loop at depth 1 containing: ... Loop at depth 2 containing: ... Make sure we print such a first line including the function name also when using the new pass manager version of LoopPrinterPass. The format of the string is changed slightly, so now we say: Loop info for function 'func': Loop at depth 1 containing: ... Loop at depth 2 containing: ... 
This was originally requested in https://discourse.llvm.org/t/need-usage-help-w-new-pass-manager-for-opt-analysis-natural-loop-information/75874/7 and also mentioned in https://github.com/llvm/llvm-project/issues/76762 --- llvm/lib/Analysis/LoopInfo.cpp | 4 +++- llvm/test/Analysis/LoopInfo/annotated-parallel-simple.ll | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Analysis/LoopInfo.cpp b/llvm/lib/Analysis/LoopInfo.cpp index 87ddfe3e92ae9..59c96a3371e87 100644 --- a/llvm/lib/Analysis/LoopInfo.cpp +++ b/llvm/lib/Analysis/LoopInfo.cpp @@ -969,7 +969,9 @@ LoopInfo LoopAnalysis::run(Function &F, FunctionAnalysisManager &AM) { PreservedAnalyses LoopPrinterPass::run(Function &F, FunctionAnalysisManager &AM) { - AM.getResult(F).print(OS); + auto &LI = AM.getResult(F); + OS << "Loop info for function '" << F.getName() << "':\n"; + LI.print(OS); return PreservedAnalyses::all(); } diff --git a/llvm/test/Analysis/LoopInfo/annotated-parallel-simple.ll b/llvm/test/Analysis/LoopInfo/annotated-parallel-simple.ll index d96ff492c53a7..24ed59c49ccc2 100644 --- a/llvm/test/Analysis/LoopInfo/annotated-parallel-simple.ll +++ b/llvm/test/Analysis/LoopInfo/annotated-parallel-simple.ll @@ -33,5 +33,5 @@ for.end: !7 = distinct !{!7, !9} ; LoopID !9 = !{!"llvm.loop.parallel_accesses", !6} - -; CHECK: Parallel Loop +; CHECK: Loop info for function 'func': +; CHECK: Parallel Loop at depth 1 containing: From 98624914367bf9091919de330cf322fb6d5e468f Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 3 Jan 2024 14:24:09 +0100 Subject: [PATCH 132/313] [BasicAA] Add tests for #76789 (NFC) --- .../Analysis/BasicAA/inttoptr_constexpr.ll | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 llvm/test/Analysis/BasicAA/inttoptr_constexpr.ll diff --git a/llvm/test/Analysis/BasicAA/inttoptr_constexpr.ll b/llvm/test/Analysis/BasicAA/inttoptr_constexpr.ll new file mode 100644 index 0000000000000..afe3602d1c7e5 --- /dev/null +++ b/llvm/test/Analysis/BasicAA/inttoptr_constexpr.ll @@ -0,0 +1,49 @@ +; RUN: opt -passes=aa-eval -print-all-alias-modref-info -disable-output < %s 2>&1 | FileCheck %s + +; CHECK: NoAlias: i8* %a, i8* %gep +define void @inttoptr_alloca() { + %a = alloca i8 + %a.int = ptrtoint ptr %a to i64 + %a.int.1 = add i64 %a.int, 1 + %gep = getelementptr i8, ptr inttoptr (i64 -1 to ptr), i64 %a.int.1 + %load = load i8, ptr %gep + store i8 1, ptr %a + ret void +} + +; CHECK: NoAlias: i8* %a, i8* %gep +define void @inttoptr_alloca_unknown_relation(i64 %offset) { + %a = alloca i8 + %a.int = ptrtoint ptr %a to i64 + %gep = getelementptr i8, ptr inttoptr (i64 -1 to ptr), i64 %offset + %load = load i8, ptr %gep + store i8 1, ptr %a + ret void +} + +; CHECK: NoAlias: i8* %a, i8* %gep +define void @inttoptr_alloca_noescape(i64 %offset) { + %a = alloca i8 + %gep = getelementptr i8, ptr inttoptr (i64 -1 to ptr), i64 %offset + %load = load i8, ptr %gep + store i8 1, ptr %a + ret void +} + +; CHECK: NoAlias: i8* %a, i8* %gep +define void @inttoptr_noalias(ptr noalias %a) { + %a.int = ptrtoint ptr %a to i64 + %a.int.1 = add i64 %a.int, 1 + %gep = getelementptr i8, ptr inttoptr (i64 -1 to ptr), i64 %a.int.1 + %load = load i8, ptr %gep + store i8 1, ptr %a + ret void +} + +; CHECK: NoAlias: i8* %a, i8* %gep +define void @inttoptr_noalias_noescape(ptr noalias %a, i64 %offset) { + %gep = getelementptr i8, ptr inttoptr (i64 -1 to ptr), i64 %offset + %load = load i8, ptr %gep + store i8 1, ptr %a + ret void +} From 3fbef5b8e98f7365e9424339f53950eb6c1b3c3c Mon Sep 17 00:00:00 2001 
From: Ying Yi Date: Tue, 12 Dec 2023 12:13:47 +0000 Subject: [PATCH 133/313] Fixed a signed integer overflow in BitstreamWriter.h which is found by UBSAN. https://github.com/llvm/llvm-project/pull/75213 --- llvm/include/llvm/Bitstream/BitstreamWriter.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/Bitstream/BitstreamWriter.h b/llvm/include/llvm/Bitstream/BitstreamWriter.h index f7d362b5d70ce..c726508cd5285 100644 --- a/llvm/include/llvm/Bitstream/BitstreamWriter.h +++ b/llvm/include/llvm/Bitstream/BitstreamWriter.h @@ -239,7 +239,8 @@ class BitstreamWriter { // Emit the bits with VBR encoding, NumBits-1 bits at a time. while (Val >= Threshold) { - Emit((Val & ((1 << (NumBits-1))-1)) | (1 << (NumBits-1)), NumBits); + Emit((Val & ((1U << (NumBits - 1)) - 1)) | (1U << (NumBits - 1)), + NumBits); Val >>= NumBits-1; } @@ -255,7 +256,8 @@ class BitstreamWriter { // Emit the bits with VBR encoding, NumBits-1 bits at a time. while (Val >= Threshold) { - Emit(((uint32_t)Val & ((1 << (NumBits - 1)) - 1)) | (1 << (NumBits - 1)), + Emit(((uint32_t)Val & ((1U << (NumBits - 1)) - 1)) | + (1U << (NumBits - 1)), NumBits); Val >>= NumBits-1; } From 923ff5574826767af8145aae361ca5d710c16e65 Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Wed, 3 Jan 2024 14:33:20 +0100 Subject: [PATCH 134/313] [Tooling] Print the progress when there are multiple files to process (#75904) Running clang tools on a single file can be slow. It is even worse when running multiple files, to improve the user experience, we print the processing status. --- clang/lib/Tooling/Tooling.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/clang/lib/Tooling/Tooling.cpp b/clang/lib/Tooling/Tooling.cpp index d192c7f429396..d82cd5e886e46 100644 --- a/clang/lib/Tooling/Tooling.cpp +++ b/clang/lib/Tooling/Tooling.cpp @@ -554,6 +554,8 @@ int ClangTool::run(ToolAction *Action) { << CWD.getError().message() << "\n"; } + size_t NumOfTotalFiles = AbsolutePaths.size(); + unsigned ProcessedFileCounter = 0; for (llvm::StringRef File : AbsolutePaths) { // Currently implementations of CompilationDatabase::getCompileCommands can // change the state of the file system (e.g. prepare generated headers), so @@ -609,7 +611,11 @@ int ClangTool::run(ToolAction *Action) { // FIXME: We need a callback mechanism for the tool writer to output a // customized message for each file. - LLVM_DEBUG({ llvm::dbgs() << "Processing: " << File << ".\n"; }); + if (NumOfTotalFiles > 1) + llvm::errs() << "[" + std::to_string(++ProcessedFileCounter) + "/" + + std::to_string(NumOfTotalFiles) + + "] Processing file " + File + << ".\n"; ToolInvocation Invocation(std::move(CommandLine), Action, Files.get(), PCHContainerOps); Invocation.setDiagnosticConsumer(DiagConsumer); From 6023e2476b5cb2fd84dcb74d805ae2e322160111 Mon Sep 17 00:00:00 2001 From: NimishMishra <42909663+NimishMishra@users.noreply.github.com> Date: Wed, 3 Jan 2024 05:46:02 -0800 Subject: [PATCH 135/313] [flang] Error out when assumed rank variable in used as selector in SELECT TYPE statement (#74286) This patch adds a check to error out when an assumed rank variable is used as dummy argument. 
Fixes https://github.com/llvm/llvm-project/issues/74285 --- flang/lib/Semantics/check-select-type.cpp | 3 +++ flang/test/Semantics/selecttype01.f90 | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/flang/lib/Semantics/check-select-type.cpp b/flang/lib/Semantics/check-select-type.cpp index c67248ba62407..6515cf25e0d7d 100644 --- a/flang/lib/Semantics/check-select-type.cpp +++ b/flang/lib/Semantics/check-select-type.cpp @@ -258,6 +258,9 @@ void SelectTypeChecker::Enter(const parser::SelectTypeConstruct &construct) { if (IsProcedure(*selector)) { context_.Say( selectTypeStmt.source, "Selector may not be a procedure"_err_en_US); + } else if (evaluate::IsAssumedRank(*selector)) { + context_.Say(selectTypeStmt.source, + "Assumed-rank variable may only be used as actual argument"_err_en_US); } else if (auto exprType{selector->GetType()}) { const auto &typeCaseList{ std::get>( diff --git a/flang/test/Semantics/selecttype01.f90 b/flang/test/Semantics/selecttype01.f90 index e8699f20620ce..93fd130204886 100644 --- a/flang/test/Semantics/selecttype01.f90 +++ b/flang/test/Semantics/selecttype01.f90 @@ -288,4 +288,11 @@ subroutine CheckNotProcedure function f() result(res) class(shape), allocatable :: res end + +subroutine CheckAssumedRankInSelectType(var) + class(*), intent(in) :: var(..) + !ERROR: Assumed-rank variable may only be used as actual argument + select type(var) + end select + end end From 3ae3e40706aad2ce92002a0da24e39d271feb42b Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 3 Jan 2024 13:57:52 +0000 Subject: [PATCH 136/313] [ConstraintSystem] Drop left-over parentheses. (NFC) Remove left over redundant parentheses after 9b6127d7 as pointed out by @nikic. --- llvm/lib/Analysis/ConstraintSystem.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Analysis/ConstraintSystem.cpp b/llvm/lib/Analysis/ConstraintSystem.cpp index 35bdd869a88d1..1a9c7c21e9ced 100644 --- a/llvm/lib/Analysis/ConstraintSystem.cpp +++ b/llvm/lib/Analysis/ConstraintSystem.cpp @@ -95,14 +95,14 @@ bool ConstraintSystem::eliminateUsingFM() { IdxUpper++; } - if (MulOverflow(UpperV, ((-1) * LowerLast), M1)) + if (MulOverflow(UpperV, -1 * LowerLast, M1)) return false; if (IdxLower < LowerRow.size() && LowerRow[IdxLower].Id == CurrentId) { LowerV = LowerRow[IdxLower].Coefficient; IdxLower++; } - if (MulOverflow(LowerV, (UpperLast), M2)) + if (MulOverflow(LowerV, UpperLast, M2)) return false; if (AddOverflow(M1, M2, N)) return false; From a93c17c9ab7aa7b394313898e0db7aae6adeeaa4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= Date: Wed, 3 Jan 2024 15:12:31 +0100 Subject: [PATCH 137/313] [llvm-jitlink-executor] Drop else block after noreturn-if (NFC) (#76689) NFC follow-up from https://github.com/llvm/llvm-project/pull/76236 --- .../llvm-jitlink-executor.cpp | 69 +++++++++---------- 1 file changed, 33 insertions(+), 36 deletions(-) diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp index 034c43f9352a5..20205eed82aa7 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp @@ -54,9 +54,9 @@ void printErrorAndExit(Twine ErrMsg) { errs() << "error: " << ErrMsg.str() << "\n\n" << "Usage:\n" << " llvm-jitlink-executor " << DebugOption - << "filedescs=, [args...]\n" + << "[test-jitloadergdb] filedescs=, [args...]\n" << " llvm-jitlink-executor " << DebugOption - 
<< "listen=: [args...]\n"; + << "[test-jitloadergdb] listen=: [args...]\n"; exit(1); } @@ -132,51 +132,48 @@ int main(int argc, char *argv[]) { int InFD = 0; int OutFD = 0; - std::vector TestOutputFlags; - if (argc < 2) printErrorAndExit("insufficient arguments"); - else { - StringRef ConnectArg = argv[FirstProgramArg++]; + + StringRef NextArg = argv[FirstProgramArg++]; #ifndef NDEBUG - if (ConnectArg == "debug") { - DebugFlag = true; - ConnectArg = argv[FirstProgramArg++]; - } + if (NextArg == "debug") { + DebugFlag = true; + NextArg = argv[FirstProgramArg++]; + } #endif - while (ConnectArg.starts_with("test-")) { - TestOutputFlags.push_back(ConnectArg); - ConnectArg = argv[FirstProgramArg++]; - } - - StringRef SpecifierType, Specifier; - std::tie(SpecifierType, Specifier) = ConnectArg.split('='); - if (SpecifierType == "filedescs") { - StringRef FD1Str, FD2Str; - std::tie(FD1Str, FD2Str) = Specifier.split(','); - if (FD1Str.getAsInteger(10, InFD)) - printErrorAndExit(FD1Str + " is not a valid file descriptor"); - if (FD2Str.getAsInteger(10, OutFD)) - printErrorAndExit(FD2Str + " is not a valid file descriptor"); - } else if (SpecifierType == "listen") { - StringRef Host, PortStr; - std::tie(Host, PortStr) = Specifier.split(':'); - - int Port = 0; - if (PortStr.getAsInteger(10, Port)) - printErrorAndExit("port number '" + PortStr + - "' is not a valid integer"); - - InFD = OutFD = openListener(Host.str(), PortStr.str()); - } else - printErrorAndExit("invalid specifier type \"" + SpecifierType + "\""); + std::vector TestOutputFlags; + while (NextArg.starts_with("test-")) { + TestOutputFlags.push_back(NextArg); + NextArg = argv[FirstProgramArg++]; } if (llvm::is_contained(TestOutputFlags, "test-jitloadergdb")) fprintf(stderr, "__jit_debug_descriptor.last_entry = 0x%016" PRIx64 "\n", pointerToJITTargetAddress(findLastDebugDescriptorEntryPtr())); + StringRef SpecifierType, Specifier; + std::tie(SpecifierType, Specifier) = NextArg.split('='); + if (SpecifierType == "filedescs") { + StringRef FD1Str, FD2Str; + std::tie(FD1Str, FD2Str) = Specifier.split(','); + if (FD1Str.getAsInteger(10, InFD)) + printErrorAndExit(FD1Str + " is not a valid file descriptor"); + if (FD2Str.getAsInteger(10, OutFD)) + printErrorAndExit(FD2Str + " is not a valid file descriptor"); + } else if (SpecifierType == "listen") { + StringRef Host, PortStr; + std::tie(Host, PortStr) = Specifier.split(':'); + + int Port = 0; + if (PortStr.getAsInteger(10, Port)) + printErrorAndExit("port number '" + PortStr + "' is not a valid integer"); + + InFD = OutFD = openListener(Host.str(), PortStr.str()); + } else + printErrorAndExit("invalid specifier type \"" + SpecifierType + "\""); + auto Server = ExitOnErr(SimpleRemoteEPCServer::Create( [](SimpleRemoteEPCServer::Setup &S) -> Error { From ec7a231b30e5dcbaa2fe20ba9a793bffed4ea2cf Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Wed, 3 Jan 2024 14:28:52 +0000 Subject: [PATCH 138/313] [TLI] Use the VFABI demangling when declaring vector variants. (#76753) When creating a declaration for a vector variant, in order to determine the argument types we need to consult the VFABI demangler. This will allow us to add TLI mappings with linear arguments (see #76060). 
--- .../Transforms/Utils/InjectTLIMappings.cpp | 46 +++++++++---------- llvm/test/Transforms/Util/add-TLI-mappings.ll | 26 +++++++++++ 2 files changed, 49 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp b/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp index 0990c750af55f..ea31356306658 100644 --- a/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp +++ b/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp @@ -33,37 +33,37 @@ STATISTIC(NumVFDeclAdded, STATISTIC(NumCompUsedAdded, "Number of `@llvm.compiler.used` operands that have been added."); -/// A helper function that adds the vector function declaration that -/// vectorizes the CallInst CI with a vectorization factor of VF -/// lanes. The TLI assumes that all parameters and the return type of -/// CI (other than void) need to be widened to a VectorType of VF -/// lanes. +/// A helper function that adds the vector variant declaration for vectorizing +/// the CallInst \p CI with a vectorization factor of \p VF lanes. For each +/// mapping, TLI provides a VABI prefix, which contains all information required +/// to create vector function declaration. static void addVariantDeclaration(CallInst &CI, const ElementCount &VF, - bool Predicate, const StringRef VFName) { + const VecDesc *VD) { Module *M = CI.getModule(); + FunctionType *ScalarFTy = CI.getFunctionType(); - // Add function declaration. - Type *RetTy = ToVectorTy(CI.getType(), VF); - SmallVector Tys; - for (Value *ArgOperand : CI.args()) - Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); - assert(!CI.getFunctionType()->isVarArg() && - "VarArg functions are not supported."); - if (Predicate) - Tys.push_back(ToVectorTy(Type::getInt1Ty(RetTy->getContext()), VF)); - FunctionType *FTy = FunctionType::get(RetTy, Tys, /*isVarArg=*/false); - Function *VectorF = - Function::Create(FTy, Function::ExternalLinkage, VFName, M); - VectorF->copyAttributesFrom(CI.getCalledFunction()); + assert(!ScalarFTy->isVarArg() && "VarArg functions are not supported."); + + const std::optional Info = VFABI::tryDemangleForVFABI( + VD->getVectorFunctionABIVariantString(), ScalarFTy); + + assert(Info && "Failed to demangle vector variant"); + assert(Info->Shape.VF == VF && "Mangled name does not match VF"); + + const StringRef VFName = VD->getVectorFnName(); + FunctionType *VectorFTy = VFABI::createFunctionType(*Info, ScalarFTy); + Function *VecFunc = + Function::Create(VectorFTy, Function::ExternalLinkage, VFName, M); + VecFunc->copyAttributesFrom(CI.getCalledFunction()); ++NumVFDeclAdded; LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Added to the module: `" << VFName - << "` of type " << *(VectorF->getType()) << "\n"); + << "` of type " << *VectorFTy << "\n"); // Make function declaration (without a body) "sticky" in the IR by // listing it in the @llvm.compiler.used intrinsic. 
- assert(!VectorF->size() && "VFABI attribute requires `@llvm.compiler.used` " + assert(!VecFunc->size() && "VFABI attribute requires `@llvm.compiler.used` " "only on declarations."); - appendToCompilerUsed(*M, {VectorF}); + appendToCompilerUsed(*M, {VecFunc}); LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Adding `" << VFName << "` to `@llvm.compiler.used`.\n"); ++NumCompUsedAdded; @@ -100,7 +100,7 @@ static void addMappingsFromTLI(const TargetLibraryInfo &TLI, CallInst &CI) { } Function *VariantF = M->getFunction(VD->getVectorFnName()); if (!VariantF) - addVariantDeclaration(CI, VF, Predicate, VD->getVectorFnName()); + addVariantDeclaration(CI, VF, VD); } }; diff --git a/llvm/test/Transforms/Util/add-TLI-mappings.ll b/llvm/test/Transforms/Util/add-TLI-mappings.ll index a40798665a50d..67ca00b8e2cf4 100644 --- a/llvm/test/Transforms/Util/add-TLI-mappings.ll +++ b/llvm/test/Transforms/Util/add-TLI-mappings.ll @@ -65,6 +65,32 @@ define float @call_llvm.log10.f32(float %in) { } declare float @llvm.log10.f32(float) #0 + +; SVML: declare <2 x double> @__svml_sin2(<2 x double>) +; SVML: declare <4 x double> @__svml_sin4(<4 x double>) +; SVML: declare <8 x double> @__svml_sin8(<8 x double>) +; SVML: declare <4 x float> @__svml_log10f4(<4 x float>) +; SVML: declare <8 x float> @__svml_log10f8(<8 x float>) +; SVML: declare <16 x float> @__svml_log10f16(<16 x float>) + +; MASSV: declare <2 x double> @__sind2(<2 x double>) +; MASSV: declare <4 x float> @__log10f4(<4 x float>) + +; LIBMVEC-X86: declare <2 x double> @_ZGVbN2v_sin(<2 x double>) +; LIBMVEC-X86: declare <4 x double> @_ZGVdN4v_sin(<4 x double>) + +; ACCELERATE: declare <4 x float> @vlog10f(<4 x float>) + +; SLEEFGNUABI: declare <2 x double> @_ZGVnN2v_sin(<2 x double>) +; SLEEFGNUABI: declare @_ZGVsMxv_sin(, ) +; SLEEFGNUABI: declare <4 x float> @_ZGVnN4v_log10f(<4 x float>) +; SLEEFGNUABI: declare @_ZGVsMxv_log10f(, ) + +; ARMPL: declare <2 x double> @armpl_vsinq_f64(<2 x double>) +; ARMPL: declare @armpl_svsin_f64_x(, ) +; ARMPL: declare <4 x float> @armpl_vlog10q_f32(<4 x float>) +; ARMPL: declare @armpl_svlog10_f32_x(, ) + attributes #0 = { nounwind readnone } ; SVML: attributes #[[SIN]] = { "vector-function-abi-variant"= From b3d26426b06ee74bf79f766375a37c384aa0132b Mon Sep 17 00:00:00 2001 From: James Y Knight Date: Wed, 3 Jan 2024 09:31:29 -0500 Subject: [PATCH 139/313] NFC: Reflow comment for readability. --- clang/lib/AST/ASTContext.cpp | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 0395b3e47ab6f..b60dcfaabfd1a 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -2748,21 +2748,20 @@ bool ASTContext::hasUniqueObjectRepresentations( QualType Ty, bool CheckIfTriviallyCopyable) const { // C++17 [meta.unary.prop]: // The predicate condition for a template specialization - // has_unique_object_representations shall be - // satisfied if and only if: + // has_unique_object_representations shall be satisfied if and only if: // (9.1) - T is trivially copyable, and // (9.2) - any two objects of type T with the same value have the same - // object representation, where two objects - // of array or non-union class type are considered to have the same value - // if their respective sequences of - // direct subobjects have the same values, and two objects of union type - // are considered to have the same - // value if they have the same active member and the corresponding members - // have the same value. 
+ // object representation, where: + // - two objects of array or non-union class type are considered to have + // the same value if their respective sequences of direct subobjects + // have the same values, and + // - two objects of union type are considered to have the same value if + // they have the same active member and the corresponding members have + // the same value. // The set of scalar types for which this condition holds is - // implementation-defined. [ Note: If a type has padding - // bits, the condition does not hold; otherwise, the condition holds true - // for unsigned integral types. -- end note ] + // implementation-defined. [ Note: If a type has padding bits, the condition + // does not hold; otherwise, the condition holds true for unsigned integral + // types. -- end note ] assert(!Ty.isNull() && "Null QualType sent to unique object rep check"); // Arrays are unique only if their element type is unique. From 7837110ed8efdd510516c849178a7af28b93aea4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?kadir=20=C3=A7etinkaya?= Date: Wed, 3 Jan 2024 15:57:30 +0100 Subject: [PATCH 140/313] [clangd] Track IWYU pragmas for non-preamble includes (#75612) This makes PragmaIncldues copyable, and copies it from preamble when building a new AST. Fixes https://github.com/clangd/clangd/issues/1843 Fixes https://github.com/clangd/clangd/issues/1571 --- clang-tools-extra/clangd/Hover.cpp | 4 +-- clang-tools-extra/clangd/IncludeCleaner.cpp | 4 +-- clang-tools-extra/clangd/ParsedAST.cpp | 21 +++++++++------- clang-tools-extra/clangd/ParsedAST.h | 9 +++---- clang-tools-extra/clangd/XRefs.cpp | 2 +- clang-tools-extra/clangd/index/FileIndex.cpp | 2 +- .../clangd/unittests/FileIndexTests.cpp | 12 ++++----- .../clangd/unittests/IncludeCleanerTests.cpp | 2 ++ clang-tools-extra/clangd/unittests/TestTU.cpp | 4 +-- .../include/clang-include-cleaner/Record.h | 3 ++- .../include-cleaner/lib/Record.cpp | 7 +++--- .../include-cleaner/unittests/RecordTest.cpp | 25 +++++++++++++++++++ 12 files changed, 63 insertions(+), 32 deletions(-) diff --git a/clang-tools-extra/clangd/Hover.cpp b/clang-tools-extra/clangd/Hover.cpp index 82323fe16c82b..06b949bc4a2b5 100644 --- a/clang-tools-extra/clangd/Hover.cpp +++ b/clang-tools-extra/clangd/Hover.cpp @@ -1194,7 +1194,7 @@ void maybeAddSymbolProviders(ParsedAST &AST, HoverInfo &HI, const SourceManager &SM = AST.getSourceManager(); llvm::SmallVector RankedProviders = - include_cleaner::headersForSymbol(Sym, SM, AST.getPragmaIncludes().get()); + include_cleaner::headersForSymbol(Sym, SM, &AST.getPragmaIncludes()); if (RankedProviders.empty()) return; @@ -1254,7 +1254,7 @@ void maybeAddUsedSymbols(ParsedAST &AST, HoverInfo &HI, const Inclusion &Inc) { llvm::DenseSet UsedSymbols; include_cleaner::walkUsed( AST.getLocalTopLevelDecls(), collectMacroReferences(AST), - AST.getPragmaIncludes().get(), AST.getPreprocessor(), + &AST.getPragmaIncludes(), AST.getPreprocessor(), [&](const include_cleaner::SymbolReference &Ref, llvm::ArrayRef Providers) { if (Ref.RT != include_cleaner::RefType::Explicit || diff --git a/clang-tools-extra/clangd/IncludeCleaner.cpp b/clang-tools-extra/clangd/IncludeCleaner.cpp index 2f34c94934920..563a1f5d22f5b 100644 --- a/clang-tools-extra/clangd/IncludeCleaner.cpp +++ b/clang-tools-extra/clangd/IncludeCleaner.cpp @@ -311,7 +311,7 @@ getUnused(ParsedAST &AST, auto IncludeID = static_cast(*MFI.HeaderID); if (ReferencedFiles.contains(IncludeID)) continue; - if (!mayConsiderUnused(MFI, AST, AST.getPragmaIncludes().get())) { + if (!mayConsiderUnused(MFI, AST, 
&AST.getPragmaIncludes())) { dlog("{0} was not used, but is not eligible to be diagnosed as unused", MFI.Written); continue; @@ -403,7 +403,7 @@ IncludeCleanerFindings computeIncludeCleanerFindings(ParsedAST &AST) { .getBuiltinDir(); include_cleaner::walkUsed( AST.getLocalTopLevelDecls(), /*MacroRefs=*/Macros, - AST.getPragmaIncludes().get(), AST.getPreprocessor(), + &AST.getPragmaIncludes(), AST.getPreprocessor(), [&](const include_cleaner::SymbolReference &Ref, llvm::ArrayRef Providers) { bool Satisfied = false; diff --git a/clang-tools-extra/clangd/ParsedAST.cpp b/clang-tools-extra/clangd/ParsedAST.cpp index d91ce7283ecee..14a91797f4d2e 100644 --- a/clang-tools-extra/clangd/ParsedAST.cpp +++ b/clang-tools-extra/clangd/ParsedAST.cpp @@ -653,6 +653,7 @@ ParsedAST::build(llvm::StringRef Filename, const ParseInputs &Inputs, } IncludeStructure Includes; + include_cleaner::PragmaIncludes PI; // If we are using a preamble, copy existing includes. if (Preamble) { Includes = Preamble->Includes; @@ -660,11 +661,15 @@ ParsedAST::build(llvm::StringRef Filename, const ParseInputs &Inputs, // Replay the preamble includes so that clang-tidy checks can see them. ReplayPreamble::attach(Patch->preambleIncludes(), *Clang, Patch->modifiedBounds()); + PI = *Preamble->Pragmas; } // Important: collectIncludeStructure is registered *after* ReplayPreamble! // Otherwise we would collect the replayed includes again... // (We can't *just* use the replayed includes, they don't have Resolved path). Includes.collect(*Clang); + // Same for pragma-includes, we're already inheriting preamble includes, so we + // should only receive callbacks for non-preamble mainfile includes. + PI.record(*Clang); // Copy over the macros in the preamble region of the main file, and combine // with non-preamble macros below. 
MainFileMacros Macros; @@ -735,7 +740,7 @@ ParsedAST::build(llvm::StringRef Filename, const ParseInputs &Inputs, ParsedAST Result(Filename, Inputs.Version, std::move(Preamble), std::move(Clang), std::move(Action), std::move(Tokens), std::move(Macros), std::move(Marks), std::move(ParsedDecls), - std::move(Diags), std::move(Includes)); + std::move(Diags), std::move(Includes), std::move(PI)); llvm::move(getIncludeCleanerDiags(Result, Inputs.Contents), std::back_inserter(Result.Diags)); return std::move(Result); @@ -828,23 +833,21 @@ ParsedAST::ParsedAST(PathRef TUPath, llvm::StringRef Version, syntax::TokenBuffer Tokens, MainFileMacros Macros, std::vector Marks, std::vector LocalTopLevelDecls, - std::vector Diags, IncludeStructure Includes) + std::vector Diags, IncludeStructure Includes, + include_cleaner::PragmaIncludes PI) : TUPath(TUPath), Version(Version), Preamble(std::move(Preamble)), Clang(std::move(Clang)), Action(std::move(Action)), Tokens(std::move(Tokens)), Macros(std::move(Macros)), Marks(std::move(Marks)), Diags(std::move(Diags)), LocalTopLevelDecls(std::move(LocalTopLevelDecls)), - Includes(std::move(Includes)) { - Resolver = std::make_unique(getASTContext()); + Includes(std::move(Includes)), PI(std::move(PI)), + Resolver(std::make_unique(getASTContext())) { assert(this->Clang); assert(this->Action); } -std::shared_ptr -ParsedAST::getPragmaIncludes() const { - if (!Preamble) - return nullptr; - return Preamble->Pragmas; +const include_cleaner::PragmaIncludes &ParsedAST::getPragmaIncludes() const { + return PI; } std::optional ParsedAST::preambleVersion() const { diff --git a/clang-tools-extra/clangd/ParsedAST.h b/clang-tools-extra/clangd/ParsedAST.h index c68fdba6bd26c..63e564bd68a78 100644 --- a/clang-tools-extra/clangd/ParsedAST.h +++ b/clang-tools-extra/clangd/ParsedAST.h @@ -103,10 +103,8 @@ class ParsedAST { /// Tokens recorded while parsing the main file. /// (!) does not have tokens from the preamble. const syntax::TokenBuffer &getTokens() const { return Tokens; } - /// Returns the PramaIncludes from the preamble. - /// Might be null if AST is built without a preamble. - std::shared_ptr - getPragmaIncludes() const; + /// Returns the PramaIncludes for preamble + main file includes. + const include_cleaner::PragmaIncludes &getPragmaIncludes() const; /// Returns the version of the ParseInputs this AST was built from. llvm::StringRef version() const { return Version; } @@ -129,7 +127,7 @@ class ParsedAST { std::unique_ptr Action, syntax::TokenBuffer Tokens, MainFileMacros Macros, std::vector Marks, std::vector LocalTopLevelDecls, std::vector Diags, - IncludeStructure Includes); + IncludeStructure Includes, include_cleaner::PragmaIncludes PI); Path TUPath; std::string Version; // In-memory preambles must outlive the AST, it is important that this member @@ -159,6 +157,7 @@ class ParsedAST { // top-level decls from the preamble. 
std::vector LocalTopLevelDecls; IncludeStructure Includes; + include_cleaner::PragmaIncludes PI; std::unique_ptr Resolver; }; diff --git a/clang-tools-extra/clangd/XRefs.cpp b/clang-tools-extra/clangd/XRefs.cpp index f55d2a5623956..5f41f788a6939 100644 --- a/clang-tools-extra/clangd/XRefs.cpp +++ b/clang-tools-extra/clangd/XRefs.cpp @@ -1339,7 +1339,7 @@ maybeFindIncludeReferences(ParsedAST &AST, Position Pos, auto Converted = convertIncludes(AST); include_cleaner::walkUsed( AST.getLocalTopLevelDecls(), collectMacroReferences(AST), - AST.getPragmaIncludes().get(), AST.getPreprocessor(), + &AST.getPragmaIncludes(), AST.getPreprocessor(), [&](const include_cleaner::SymbolReference &Ref, llvm::ArrayRef Providers) { if (Ref.RT != include_cleaner::RefType::Explicit || diff --git a/clang-tools-extra/clangd/index/FileIndex.cpp b/clang-tools-extra/clangd/index/FileIndex.cpp index 5052c661cfc5e..eb9562d2b6bf8 100644 --- a/clang-tools-extra/clangd/index/FileIndex.cpp +++ b/clang-tools-extra/clangd/index/FileIndex.cpp @@ -223,7 +223,7 @@ FileShardedIndex::getShard(llvm::StringRef Uri) const { SlabTuple indexMainDecls(ParsedAST &AST) { return indexSymbols( AST.getASTContext(), AST.getPreprocessor(), AST.getLocalTopLevelDecls(), - &AST.getMacros(), *AST.getPragmaIncludes(), + &AST.getMacros(), AST.getPragmaIncludes(), /*IsIndexMainAST=*/true, AST.version(), /*CollectMainFileRefs=*/true); } diff --git a/clang-tools-extra/clangd/unittests/FileIndexTests.cpp b/clang-tools-extra/clangd/unittests/FileIndexTests.cpp index cf30b388d234d..9f713564b2c01 100644 --- a/clang-tools-extra/clangd/unittests/FileIndexTests.cpp +++ b/clang-tools-extra/clangd/unittests/FileIndexTests.cpp @@ -176,7 +176,7 @@ void update(FileIndex &M, llvm::StringRef Basename, llvm::StringRef Code) { auto AST = File.build(); M.updatePreamble(testPath(File.Filename), /*Version=*/"null", AST.getASTContext(), AST.getPreprocessor(), - *AST.getPragmaIncludes()); + AST.getPragmaIncludes()); } TEST(FileIndexTest, CustomizedURIScheme) { @@ -254,7 +254,7 @@ TEST(FileIndexTest, IWYUPragmaExport) { auto AST = File.build(); M.updatePreamble(testPath(File.Filename), /*Version=*/"null", AST.getASTContext(), AST.getPreprocessor(), - *AST.getPragmaIncludes()); + AST.getPragmaIncludes()); auto Symbols = runFuzzyFind(M, ""); EXPECT_THAT( @@ -446,7 +446,7 @@ TEST(FileIndexTest, Relations) { FileIndex Index; Index.updatePreamble(testPath(TU.Filename), /*Version=*/"null", AST.getASTContext(), AST.getPreprocessor(), - *AST.getPragmaIncludes()); + AST.getPragmaIncludes()); SymbolID A = findSymbol(TU.headerSymbols(), "A").ID; uint32_t Results = 0; RelationsRequest Req; @@ -567,7 +567,7 @@ TEST(FileIndexTest, StalePreambleSymbolsDeleted) { auto AST = File.build(); M.updatePreamble(testPath(File.Filename), /*Version=*/"null", AST.getASTContext(), AST.getPreprocessor(), - *AST.getPragmaIncludes()); + AST.getPragmaIncludes()); EXPECT_THAT(runFuzzyFind(M, ""), UnorderedElementsAre(qName("a"))); File.Filename = "f2.cpp"; @@ -575,7 +575,7 @@ TEST(FileIndexTest, StalePreambleSymbolsDeleted) { AST = File.build(); M.updatePreamble(testPath(File.Filename), /*Version=*/"null", AST.getASTContext(), AST.getPreprocessor(), - *AST.getPragmaIncludes()); + AST.getPragmaIncludes()); EXPECT_THAT(runFuzzyFind(M, ""), UnorderedElementsAre(qName("b"))); } @@ -720,7 +720,7 @@ TEST(FileIndexTest, Profile) { auto AST = TestTU::withHeaderCode("int a;").build(); FI.updateMain(FileName, AST); FI.updatePreamble(FileName, "v1", AST.getASTContext(), AST.getPreprocessor(), - 
*AST.getPragmaIncludes()); + AST.getPragmaIncludes()); llvm::BumpPtrAllocator Alloc; MemoryTree MT(&Alloc); diff --git a/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp b/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp index 5a6524dec2f09..7ed4a9103e1f2 100644 --- a/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp +++ b/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp @@ -316,8 +316,10 @@ TEST(IncludeCleaner, IWYUPragmas) { #include "public.h" void bar() { foo(); } + #include "keep_main_file.h" // IWYU pragma: keep )cpp"; TU.AdditionalFiles["behind_keep.h"] = guard(""); + TU.AdditionalFiles["keep_main_file.h"] = guard(""); TU.AdditionalFiles["exported.h"] = guard(""); TU.AdditionalFiles["public.h"] = guard("#include \"private.h\""); TU.AdditionalFiles["private.h"] = guard(R"cpp( diff --git a/clang-tools-extra/clangd/unittests/TestTU.cpp b/clang-tools-extra/clangd/unittests/TestTU.cpp index e65ae825b4167..1f02c04125b1e 100644 --- a/clang-tools-extra/clangd/unittests/TestTU.cpp +++ b/clang-tools-extra/clangd/unittests/TestTU.cpp @@ -164,7 +164,7 @@ SymbolSlab TestTU::headerSymbols() const { auto AST = build(); return std::get<0>(indexHeaderSymbols( /*Version=*/"null", AST.getASTContext(), AST.getPreprocessor(), - *AST.getPragmaIncludes())); + AST.getPragmaIncludes())); } RefSlab TestTU::headerRefs() const { @@ -177,7 +177,7 @@ std::unique_ptr TestTU::index() const { auto Idx = std::make_unique(); Idx->updatePreamble(testPath(Filename), /*Version=*/"null", AST.getASTContext(), AST.getPreprocessor(), - *AST.getPragmaIncludes()); + AST.getPragmaIncludes()); Idx->updateMain(testPath(Filename), AST); return std::move(Idx); } diff --git a/clang-tools-extra/include-cleaner/include/clang-include-cleaner/Record.h b/clang-tools-extra/include-cleaner/include/clang-include-cleaner/Record.h index 2e0b462ce16df..2dcb5ea2555c5 100644 --- a/clang-tools-extra/include-cleaner/include/clang-include-cleaner/Record.h +++ b/clang-tools-extra/include-cleaner/include/clang-include-cleaner/Record.h @@ -113,7 +113,8 @@ class PragmaIncludes { llvm::DenseSet ShouldKeep; /// Owns the strings. - llvm::BumpPtrAllocator Arena; + /// Each record() pushes a new one, while keeping all the old strings alive. 
+ std::vector> Arena; // FIXME: add support for clang use_instead pragma }; diff --git a/clang-tools-extra/include-cleaner/lib/Record.cpp b/clang-tools-extra/include-cleaner/lib/Record.cpp index bd726cff12a97..c93c56adf650d 100644 --- a/clang-tools-extra/include-cleaner/lib/Record.cpp +++ b/clang-tools-extra/include-cleaner/lib/Record.cpp @@ -178,7 +178,8 @@ class PragmaIncludes::RecordPragma : public PPCallbacks, public CommentHandler { : RecordPragma(CI.getPreprocessor(), Out) {} RecordPragma(const Preprocessor &P, PragmaIncludes *Out) : SM(P.getSourceManager()), HeaderInfo(P.getHeaderSearchInfo()), Out(Out), - UniqueStrings(Arena) {} + Arena(std::make_shared()), + UniqueStrings(*Arena) {} void FileChanged(SourceLocation Loc, FileChangeReason Reason, SrcMgr::CharacteristicKind FileType, @@ -204,7 +205,7 @@ class PragmaIncludes::RecordPragma : public PPCallbacks, public CommentHandler { std::unique(It.getSecond().begin(), It.getSecond().end()), It.getSecond().end()); } - Out->Arena = std::move(Arena); + Out->Arena.emplace_back(std::move(Arena)); } void InclusionDirective(SourceLocation HashLoc, const Token &IncludeTok, @@ -336,7 +337,7 @@ class PragmaIncludes::RecordPragma : public PPCallbacks, public CommentHandler { const SourceManager &SM; const HeaderSearch &HeaderInfo; PragmaIncludes *Out; - llvm::BumpPtrAllocator Arena; + std::shared_ptr Arena; /// Intern table for strings. Contents are on the arena. llvm::StringSaver UniqueStrings; diff --git a/clang-tools-extra/include-cleaner/unittests/RecordTest.cpp b/clang-tools-extra/include-cleaner/unittests/RecordTest.cpp index 0f2ded5f18345..d1f7590b22268 100644 --- a/clang-tools-extra/include-cleaner/unittests/RecordTest.cpp +++ b/clang-tools-extra/include-cleaner/unittests/RecordTest.cpp @@ -588,5 +588,30 @@ TEST_F(PragmaIncludeTest, OutlivesFMAndSM) { EXPECT_THAT(PI.getExporters(Private2FE.get(), FM), testing::ElementsAre(llvm::cantFail(FM.getFileRef("public.h")))); } + +TEST_F(PragmaIncludeTest, CanRecordManyTimes) { + Inputs.Code = R"cpp( + #include "public.h" + )cpp"; + Inputs.ExtraFiles["public.h"] = R"cpp( + #include "private.h" + )cpp"; + Inputs.ExtraFiles["private.h"] = R"cpp( + // IWYU pragma: private, include "public.h" + )cpp"; + + TestAST Processed = build(); + auto &FM = Processed.fileManager(); + auto PrivateFE = FM.getFile("private.h"); + llvm::StringRef Public = PI.getPublic(PrivateFE.get()); + EXPECT_EQ(Public, "\"public.h\""); + + // This build populates same PI during build, but this time we don't have + // any IWYU pragmas. Make sure strings from previous recordings are still + // alive. + Inputs.Code = ""; + build(); + EXPECT_EQ(Public, "\"public.h\""); +} } // namespace } // namespace clang::include_cleaner From 329ba523ccbbe68a12434926c92fd9a86494d958 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Wed, 3 Jan 2024 07:19:56 -0800 Subject: [PATCH 141/313] [LTO][NFC] Free the GlobalResolutions map after final use (#76780) The GlobalResolutions map was found to contribute ~9% of the peak memory of a large thin link. However, we are essentially done with it when we are about to compute cross module imports, which itself adds to the peak memory due to the import and export lists (there is one use just after importing but it can easily be moved before importing). Move the last use up above importing, and free the GlobalResolutions map after that (and before importing). To help guard against future inadvertent use after it has been released, change it to a std::optional. 
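The wrapper change is a general C++ idiom; a small stand-alone sketch (hypothetical names, not LTO code) of what wrapping the map in std::optional buys:

```cpp
#include <map>
#include <optional>
#include <string>

struct Linker {
  // Engaged up front (the LTO constructor does the same via make_optional).
  std::optional<std::map<std::string, int>> GlobalResolutions{std::in_place};

  void finishSymbolResolution() {
    (*GlobalResolutions)["main"] = 1; // ... last legitimate use ...
    // Done with it: free the map *before* the memory-heavy import phase.
    GlobalResolutions.reset();
  }

  void importPhase() {
    // GlobalResolutions is intentionally gone here. An accidental
    // (*GlobalResolutions) is undefined behaviour on a disengaged optional,
    // and GlobalResolutions.value() throws bad_optional_access, so a late
    // use fails loudly instead of silently reading logically dead data.
  }
};

int main() {
  Linker L;
  L.finishSymbolResolution(); // map memory released here
  L.importPhase();            // runs without the map, as intended
}
```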
--- llvm/include/llvm/LTO/LTO.h | 4 +++- llvm/lib/LTO/LTO.cpp | 36 +++++++++++++++++++++++------------- 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h index be85c40983475..1050f24161fb9 100644 --- a/llvm/include/llvm/LTO/LTO.h +++ b/llvm/include/llvm/LTO/LTO.h @@ -404,7 +404,9 @@ class LTO { }; // Global mapping from mangled symbol names to resolutions. - StringMap GlobalResolutions; + // Make this an optional to guard against accessing after it has been reset + // (to reduce memory after we're done with it). + std::optional> GlobalResolutions; void addModuleToGlobalRes(ArrayRef Syms, ArrayRef Res, unsigned Partition, diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 05836fd28f528..6a1e53b96998c 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -592,7 +592,9 @@ LTO::LTO(Config Conf, ThinBackend Backend, unsigned ParallelCodeGenParallelismLevel, LTOKind LTOMode) : Conf(std::move(Conf)), RegularLTO(ParallelCodeGenParallelismLevel, this->Conf), - ThinLTO(std::move(Backend)), LTOMode(LTOMode) {} + ThinLTO(std::move(Backend)), + GlobalResolutions(std::make_optional>()), + LTOMode(LTOMode) {} // Requires a destructor for MapVector. LTO::~LTO() = default; @@ -610,7 +612,7 @@ void LTO::addModuleToGlobalRes(ArrayRef Syms, assert(ResI != ResE); SymbolResolution Res = *ResI++; - auto &GlobalRes = GlobalResolutions[Sym.getName()]; + auto &GlobalRes = (*GlobalResolutions)[Sym.getName()]; GlobalRes.UnnamedAddr &= Sym.isUnnamedAddr(); if (Res.Prevailing) { assert(!GlobalRes.Prevailing && @@ -1125,7 +1127,7 @@ Error LTO::run(AddStreamFn AddStream, FileCache Cache) { // Compute "dead" symbols, we don't want to import/export these! DenseSet GUIDPreservedSymbols; DenseMap GUIDPrevailingResolutions; - for (auto &Res : GlobalResolutions) { + for (auto &Res : *GlobalResolutions) { // Normally resolution have IR name of symbol. We can do nothing here // otherwise. See comments in GlobalResolution struct for more details. if (Res.second.IRName.empty()) @@ -1169,6 +1171,8 @@ Error LTO::run(AddStreamFn AddStream, FileCache Cache) { Error Result = runRegularLTO(AddStream); if (!Result) + // This will reset the GlobalResolutions optional once done with it to + // reduce peak memory before importing. Result = runThinLTO(AddStream, Cache, GUIDPreservedSymbols); if (StatsFile) @@ -1273,8 +1277,8 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) { // This returns true when the name is local or not defined. Locals are // expected to be handled separately. auto IsVisibleToRegularObj = [&](StringRef name) { - auto It = GlobalResolutions.find(name); - return (It == GlobalResolutions.end() || It->second.VisibleOutsideSummary); + auto It = GlobalResolutions->find(name); + return (It == GlobalResolutions->end() || It->second.VisibleOutsideSummary); }; // If allowed, upgrade public vcall visibility metadata to linkage unit @@ -1291,7 +1295,7 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) { return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); if (!Conf.CodeGenOnly) { - for (const auto &R : GlobalResolutions) { + for (const auto &R : *GlobalResolutions) { GlobalValue *GV = RegularLTO.CombinedModule->getNamedValue(R.second.IRName); if (!R.second.isPrevailingIRSymbol()) @@ -1708,8 +1712,8 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, // This returns true when the name is local or not defined. Locals are // expected to be handled separately. 
auto IsVisibleToRegularObj = [&](StringRef name) { - auto It = GlobalResolutions.find(name); - return (It == GlobalResolutions.end() || + auto It = GlobalResolutions->find(name); + return (It == GlobalResolutions->end() || It->second.VisibleOutsideSummary); }; @@ -1739,15 +1743,11 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, ContextDisambiguation.run(ThinLTO.CombinedIndex, isPrevailing); } - if (Conf.OptLevel > 0) - ComputeCrossModuleImport(ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, - isPrevailing, ImportLists, ExportLists); - // Figure out which symbols need to be internalized. This also needs to happen // at -O0 because summary-based DCE is implemented using internalization, and // we must apply DCE consistently with the full LTO module in order to avoid // undefined references during the final link. - for (auto &Res : GlobalResolutions) { + for (auto &Res : *GlobalResolutions) { // If the symbol does not have external references or it is not prevailing, // then not need to mark it as exported from a ThinLTO partition. if (Res.second.Partition != GlobalResolution::External || @@ -1760,6 +1760,16 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, ExportedGUIDs.insert(GUID); } + // Reset the GlobalResolutions to deallocate the associated memory, as there + // are no further accesses. We specifically want to do this before computing + // cross module importing, which adds to peak memory via the computed import + // and export lists. + GlobalResolutions.reset(); + + if (Conf.OptLevel > 0) + ComputeCrossModuleImport(ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, + isPrevailing, ImportLists, ExportLists); + // Any functions referenced by the jump table in the regular LTO object must // be exported. for (auto &Def : ThinLTO.CombinedIndex.cfiFunctionDefs()) From 82e33d6203f62c0be16f8da8d4178e7fa9e790ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20Brku=C5=A1anin?= Date: Wed, 3 Jan 2024 16:32:00 +0100 Subject: [PATCH 142/313] [AMDGPU] Add VDSDIR instructions for GFX12 (#75197) --- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 16 ++ llvm/lib/Target/AMDGPU/DSDIRInstructions.td | 191 ++++++++++++++++ llvm/lib/Target/AMDGPU/LDSDIRInstructions.td | 116 ---------- .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 14 ++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 4 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 2 + llvm/lib/Target/AMDGPU/SIInstructions.td | 2 +- llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 4 +- .../AMDGPU/llvm.amdgcn.lds.direct.load.ll | 36 +-- .../AMDGPU/llvm.amdgcn.lds.param.load.ll | 39 ++-- llvm/test/MC/AMDGPU/gfx12_asm_vdsdir.s | 199 +++++++++++++++++ .../Disassembler/AMDGPU/gfx12_dasm_vdsdir.txt | 206 ++++++++++++++++++ 12 files changed, 679 insertions(+), 150 deletions(-) create mode 100644 llvm/lib/Target/AMDGPU/DSDIRInstructions.td delete mode 100644 llvm/lib/Target/AMDGPU/LDSDIRInstructions.td create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_vdsdir.s create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vdsdir.txt diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index abd7e911beef3..5f2b7c0b4b4ef 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -166,6 +166,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { ImmTyEndpgm, ImmTyWaitVDST, ImmTyWaitEXP, + ImmTyWaitVAVDst, + ImmTyWaitVMVSrc, }; // Immediate operand kind. 
@@ -909,6 +911,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool isEndpgm() const; bool isWaitVDST() const; bool isWaitEXP() const; + bool isWaitVAVDst() const; + bool isWaitVMVSrc() const; auto getPredicate(std::function P) const { return std::bind(P, *this); @@ -1029,6 +1033,7 @@ class AMDGPUOperand : public MCParsedAsmOperand { } static void printImmTy(raw_ostream& OS, ImmTy Type) { + // clang-format off switch (Type) { case ImmTyNone: OS << "None"; break; case ImmTyGDS: OS << "GDS"; break; @@ -1086,7 +1091,10 @@ class AMDGPUOperand : public MCParsedAsmOperand { case ImmTyEndpgm: OS << "Endpgm"; break; case ImmTyWaitVDST: OS << "WaitVDST"; break; case ImmTyWaitEXP: OS << "WaitEXP"; break; + case ImmTyWaitVAVDst: OS << "WaitVAVDst"; break; + case ImmTyWaitVMVSrc: OS << "WaitVMVSrc"; break; } + // clang-format on } void print(raw_ostream &OS) const override { @@ -9192,6 +9200,14 @@ bool AMDGPUOperand::isWaitVDST() const { return isImmTy(ImmTyWaitVDST) && isUInt<4>(getImm()); } +bool AMDGPUOperand::isWaitVAVDst() const { + return isImmTy(ImmTyWaitVAVDst) && isUInt<4>(getImm()); +} + +bool AMDGPUOperand::isWaitVMVSrc() const { + return isImmTy(ImmTyWaitVMVSrc) && isUInt<1>(getImm()); +} + //===----------------------------------------------------------------------===// // VINTERP //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/DSDIRInstructions.td b/llvm/lib/Target/AMDGPU/DSDIRInstructions.td new file mode 100644 index 0000000000000..54ef785cc608c --- /dev/null +++ b/llvm/lib/Target/AMDGPU/DSDIRInstructions.td @@ -0,0 +1,191 @@ +//===-- DSDIRInstructions.td - LDS/VDS Direct Instruction Definitions -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// LDSDIR/VDSDIR encoding (LDSDIR is gfx11, VDSDIR is gfx12+) +//===----------------------------------------------------------------------===// + +class LDSDIRe op, bit is_direct> : Enc32 { + // encoding fields + bits<2> attrchan; + bits<6> attr; + bits<4> waitvdst; + bits<8> vdst; + + // encoding + let Inst{31-24} = 0xce; // encoding + let Inst{23-22} = 0x0; // reserved + let Inst{21-20} = op; + let Inst{19-16} = waitvdst; + let Inst{15-10} = !if(is_direct, ?, attr); + let Inst{9-8} = !if(is_direct, ?, attrchan); + let Inst{7-0} = vdst; +} + +class VDSDIRe op, bit is_direct> : Enc32 { + // encoding fields + bits<2> attrchan; + bits<6> attr; + bits<4> waitvdst; + bits<8> vdst; + bits<1> waitvsrc; + + // encoding + let Inst{31-24} = 0xce; // encoding + let Inst{23} = waitvsrc; + let Inst{22} = 0x0; // reserved + let Inst{21-20} = op; + let Inst{19-16} = waitvdst; + let Inst{15-10} = !if(is_direct, ?, attr); + let Inst{9-8} = !if(is_direct, ?, attrchan); + let Inst{7-0} = vdst; +} + +//===----------------------------------------------------------------------===// +// LDSDIR/VDSDIR Classes +//===----------------------------------------------------------------------===// + +class LDSDIR_getIns { + dag ret = !if(direct, + (ins wait_vdst:$waitvdst), + (ins InterpAttr:$attr, InterpAttrChan:$attrchan, wait_vdst:$waitvdst) + ); +} + +class VDSDIR_getIns { + dag ret = !if(direct, + (ins wait_va_vdst:$waitvdst, wait_va_vsrc:$waitvsrc), + (ins InterpAttr:$attr, InterpAttrChan:$attrchan, wait_va_vdst:$waitvdst, + wait_va_vsrc:$waitvsrc) + ); +} + +class DSDIR_Common : + InstSI<(outs VGPR_32:$vdst), ins, asm> { + let LDSDIR = 1; + let EXP_CNT = 1; + + let hasSideEffects = 0; + let mayLoad = 1; + let mayStore = 0; + + string Mnemonic = opName; + let UseNamedOperandTable = 1; + + let Uses = [M0, EXEC]; + let DisableWQM = 0; + let SchedRW = [WriteLDS]; + + bit is_direct; + let is_direct = direct; +} + +class DSDIR_Pseudo : + DSDIR_Common, + SIMCInstr { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class LDSDIR_getAsm { + string ret = !if(direct, + " $vdst$waitvdst", + " $vdst, $attr$attrchan$waitvdst" + ); +} + +class VDSDIR_getAsm { + string ret = !if(direct, + " $vdst$waitvdst$waitvsrc", + " $vdst, $attr$attrchan$waitvdst$waitvsrc" + ); +} + +class DSDIR_Real : + DSDIR_Common, + SIMCInstr { + let isPseudo = 0; + let isCodeGenOnly = 0; +} + +//===----------------------------------------------------------------------===// +// LDS/VDS Direct Instructions +//===----------------------------------------------------------------------===// + +let SubtargetPredicate = isGFX11Only in { + +def LDS_DIRECT_LOAD : DSDIR_Pseudo<"lds_direct_load", LDSDIR_getIns<1>.ret, 1>; +def LDS_PARAM_LOAD : DSDIR_Pseudo<"lds_param_load", LDSDIR_getIns<0>.ret, 0>; + +def : GCNPat < + (f32 (int_amdgcn_lds_direct_load M0)), + (LDS_DIRECT_LOAD 0) +>; + +def : GCNPat < + (f32 (int_amdgcn_lds_param_load timm:$attrchan, timm:$attr, M0)), + (LDS_PARAM_LOAD timm:$attr, timm:$attrchan, 0) +>; + +} // End SubtargetPredicate = isGFX11Only + +let SubtargetPredicate = isGFX12Plus in { + +def DS_DIRECT_LOAD : DSDIR_Pseudo<"ds_direct_load", VDSDIR_getIns<1>.ret, 1>; +def DS_PARAM_LOAD : DSDIR_Pseudo<"ds_param_load", VDSDIR_getIns<0>.ret, 0>; + +def : GCNPat < + (f32 (int_amdgcn_lds_direct_load M0)), + (DS_DIRECT_LOAD 
0, 1) +>; + +def : GCNPat < + (f32 (int_amdgcn_lds_param_load timm:$attrchan, timm:$attr, M0)), + (DS_PARAM_LOAD timm:$attr, timm:$attrchan, 0, 1) +>; + +} // End SubtargetPredicate = isGFX12Only + +//===----------------------------------------------------------------------===// +// GFX11 +//===----------------------------------------------------------------------===// + +multiclass DSDIR_Real_gfx11 op, + DSDIR_Pseudo lds = !cast(NAME)> { + def _gfx11 : DSDIR_Real.ret, + SIEncodingFamily.GFX11>, + LDSDIRe { + let AssemblerPredicate = isGFX11Only; + let DecoderNamespace = "GFX11"; + } +} + +defm LDS_PARAM_LOAD : DSDIR_Real_gfx11<0x0>; +defm LDS_DIRECT_LOAD : DSDIR_Real_gfx11<0x1>; + +//===----------------------------------------------------------------------===// +// GFX12+ +//===----------------------------------------------------------------------===// + +multiclass DSDIR_Real_gfx12 op, + DSDIR_Pseudo lds = !cast(NAME)> { + def _gfx12 : DSDIR_Real.ret, + SIEncodingFamily.GFX12>, + VDSDIRe { + let AssemblerPredicate = isGFX12Plus; + let DecoderNamespace = "GFX12"; + } +} + +defm DS_PARAM_LOAD : DSDIR_Real_gfx12<0x0>; +defm DS_DIRECT_LOAD : DSDIR_Real_gfx12<0x1>; diff --git a/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td b/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td deleted file mode 100644 index 4956a15867749..0000000000000 --- a/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td +++ /dev/null @@ -1,116 +0,0 @@ -//===-- LDSDIRInstructions.td - LDS Direct Instruction Definitions --------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// LDSDIR encoding -//===----------------------------------------------------------------------===// - -class LDSDIRe op, bit is_direct> : Enc32 { - // encoding fields - bits<2> attrchan; - bits<6> attr; - bits<4> waitvdst; - bits<8> vdst; - - // encoding - let Inst{31-24} = 0xce; // encoding - let Inst{23-22} = 0x0; // reserved - let Inst{21-20} = op; - let Inst{19-16} = waitvdst; - let Inst{15-10} = !if(is_direct, ?, attr); - let Inst{9-8} = !if(is_direct, ?, attrchan); - let Inst{7-0} = vdst; -} - -//===----------------------------------------------------------------------===// -// LDSDIR Classes -//===----------------------------------------------------------------------===// - -class LDSDIR_getIns { - dag ret = !if(direct, - (ins wait_vdst:$waitvdst), - (ins InterpAttr:$attr, InterpAttrChan:$attrchan, wait_vdst:$waitvdst) - ); -} - -class LDSDIR_Common : InstSI< - (outs VGPR_32:$vdst), - LDSDIR_getIns.ret, - asm> { - let LDSDIR = 1; - let EXP_CNT = 1; - - let hasSideEffects = 0; - let mayLoad = 1; - let mayStore = 0; - - string Mnemonic = opName; - let UseNamedOperandTable = 1; - - let Uses = [M0, EXEC]; - let DisableWQM = 0; - let SchedRW = [WriteLDS]; - - bit is_direct; - let is_direct = direct; -} - -class LDSDIR_Pseudo : - LDSDIR_Common, - SIMCInstr { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class LDSDIR_getAsm { - string ret = !if(direct, - " $vdst$waitvdst", - " $vdst, $attr$attrchan$waitvdst" - ); -} - -class LDSDIR_Real op, LDSDIR_Pseudo lds, int subtarget> : - LDSDIR_Common.ret, - lds.is_direct>, - SIMCInstr , - LDSDIRe { - let isPseudo = 0; - let isCodeGenOnly = 0; -} - 
-//===----------------------------------------------------------------------===// -// LDS Direct Instructions -//===----------------------------------------------------------------------===// - -def LDS_DIRECT_LOAD : LDSDIR_Pseudo<"lds_direct_load", 1>; -def LDS_PARAM_LOAD : LDSDIR_Pseudo<"lds_param_load", 0>; - -def : GCNPat < - (f32 (int_amdgcn_lds_direct_load M0)), - (LDS_DIRECT_LOAD 0) ->; - -def : GCNPat < - (f32 (int_amdgcn_lds_param_load timm:$attrchan, timm:$attr, M0)), - (LDS_PARAM_LOAD timm:$attr, timm:$attrchan, 0) ->; - -//===----------------------------------------------------------------------===// -// GFX11+ -//===----------------------------------------------------------------------===// - -multiclass LDSDIR_Real_gfx11 op, LDSDIR_Pseudo lds = !cast(NAME)> { - def _gfx11 : LDSDIR_Real { - let AssemblerPredicate = isGFX11Plus; - let DecoderNamespace = "GFX11"; - } -} - -defm LDS_PARAM_LOAD : LDSDIR_Real_gfx11<0x0>; -defm LDS_DIRECT_LOAD : LDSDIR_Real_gfx11<0x1>; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index edc244db613d8..ef1b85f58d0ee 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -639,6 +639,20 @@ void AMDGPUInstPrinter::printWaitVDST(const MCInst *MI, unsigned OpNo, printU4ImmDecOperand(MI, OpNo, O); } +void AMDGPUInstPrinter::printWaitVAVDst(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + O << " wait_va_vdst:"; + printU4ImmDecOperand(MI, OpNo, O); +} + +void AMDGPUInstPrinter::printWaitVMVSrc(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + O << " wait_vm_vsrc:"; + printU4ImmDecOperand(MI, OpNo, O); +} + void AMDGPUInstPrinter::printWaitEXP(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index 95c26de6299ef..f2f985fa5b1a8 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -161,6 +161,10 @@ class AMDGPUInstPrinter : public MCInstPrinter { raw_ostream &O); void printWaitEXP(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printWaitVAVDst(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printWaitVMVSrc(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printExpSrcN(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O, unsigned N); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 173c877b8d29e..50724fd3c1cf1 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1144,6 +1144,8 @@ def exp_tgt : CustomOperand; def wait_vdst : NamedIntOperand; def wait_exp : NamedIntOperand; +def wait_va_vdst : NamedIntOperand; +def wait_va_vsrc : NamedIntOperand; class KImmFPOperand : ImmOperand { let OperandNamespace = "AMDGPU"; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 8310c6b57dad5..0f127276cfd1b 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -30,7 +30,7 @@ include "SMInstructions.td" include "FLATInstructions.td" include "BUFInstructions.td" include "EXPInstructions.td" 
-include "LDSDIRInstructions.td" +include "DSDIRInstructions.td" include "VINTERPInstructions.td" //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 59d6ccf513bb9..5e6c34992930b 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -553,7 +553,9 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, } continue; } else if (Opcode == AMDGPU::LDS_PARAM_LOAD || - Opcode == AMDGPU::LDS_DIRECT_LOAD) { + Opcode == AMDGPU::DS_PARAM_LOAD || + Opcode == AMDGPU::LDS_DIRECT_LOAD || + Opcode == AMDGPU::DS_DIRECT_LOAD) { // Mark these STRICTWQM, but only for the instruction, not its operands. // This avoid unnecessarily marking M0 as requiring WQM. InstrInfo &II = Instructions[&MI]; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll index 313bd8525c6fd..195c5dabb4d46 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll @@ -1,23 +1,27 @@ -; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX12 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX12 %s -; GFX11-LABEL: {{^}}lds_direct_load: -; GFX11: s_mov_b32 m0 +; GCN-LABEL: {{^}}lds_direct_load: +; GCN: s_mov_b32 m0 ; GFX11: lds_direct_load v{{[0-9]+}} -; GFX11: s_mov_b32 m0 +; GFX12: ds_direct_load v{{[0-9]+}} +; GCN: s_mov_b32 m0 ; GFX11: lds_direct_load v{{[0-9]+}} -; GFX11: s_mov_b32 m0 +; GFX12: ds_direct_load v{{[0-9]+}} +; GCN: s_mov_b32 m0 ; GFX11: lds_direct_load v{{[0-9]+}} -; GFX11: s_waitcnt expcnt(2) -; GFX11: v_add_f32 -; GFX11: buffer_store_b32 -; GFX11: s_waitcnt expcnt(1) -; GFX11: buffer_store_b32 -; GFX11: s_waitcnt expcnt(0) -; GFX11: buffer_store_b32 -; GFX11: buffer_store_b32 -; GFX11: buffer_store_b32 -; GFX11: buffer_store_b32 +; GCN: s_waitcnt expcnt(2) +; GCN: v_add_f32 +; GCN: buffer_store_b32 +; GCN: s_waitcnt expcnt(1) +; GCN: buffer_store_b32 +; GCN: s_waitcnt expcnt(0) +; GCN: buffer_store_b32 +; GCN: buffer_store_b32 +; GCN: buffer_store_b32 +; GCN: buffer_store_b32 define amdgpu_ps void @lds_direct_load(ptr addrspace(8) inreg %buf, i32 inreg %arg0, i32 inreg %arg1, i32 inreg %arg2) #0 { main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll index 5a8b03e50e2ee..1ab753d75fe03 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll @@ -1,25 +1,32 @@ -; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s +; RUN: 
llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX12 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX12 %s -; GFX11-LABEL: {{^}}lds_param_load: -; GFX11: s_mov_b32 m0 +; GCN-LABEL: {{^}}lds_param_load: +; GCN: s_mov_b32 m0 ; GFX11-DAG: lds_param_load v{{[0-9]+}}, attr0.x ; GFX11-DAG: lds_param_load v{{[0-9]+}}, attr0.y ; GFX11-DAG: lds_param_load v{{[0-9]+}}, attr0.z ; GFX11-DAG: lds_param_load v{{[0-9]+}}, attr0.w ; GFX11-DAG: lds_param_load v{{[0-9]+}}, attr1.x -; GFX11: s_waitcnt expcnt(4) -; GFX11: v_add_f32 -; GFX11: buffer_store_b32 -; GFX11: s_waitcnt expcnt(3) -; GFX11: buffer_store_b32 -; GFX11: s_waitcnt expcnt(2) -; GFX11: buffer_store_b32 -; GFX11: s_waitcnt expcnt(1) -; GFX11: buffer_store_b32 -; GFX11: s_waitcnt expcnt(0) -; GFX11: buffer_store_b32 -; GFX11: buffer_store_b32 +; GFX12-DAG: ds_param_load v{{[0-9]+}}, attr0.x +; GFX12-DAG: ds_param_load v{{[0-9]+}}, attr0.y +; GFX12-DAG: ds_param_load v{{[0-9]+}}, attr0.z +; GFX12-DAG: ds_param_load v{{[0-9]+}}, attr0.w +; GFX12-DAG: ds_param_load v{{[0-9]+}}, attr1.x +; GCN: s_waitcnt expcnt(4) +; GCN: v_add_f32 +; GCN: buffer_store_b32 +; GCN: s_waitcnt expcnt(3) +; GCN: buffer_store_b32 +; GCN: s_waitcnt expcnt(2) +; GCN: buffer_store_b32 +; GCN: s_waitcnt expcnt(1) +; GCN: buffer_store_b32 +; GCN: s_waitcnt expcnt(0) +; GCN: buffer_store_b32 +; GCN: buffer_store_b32 define amdgpu_ps void @lds_param_load(ptr addrspace(8) inreg %buf, i32 inreg %arg) #0 { main_body: %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %arg) diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vdsdir.s b/llvm/test/MC/AMDGPU/gfx12_asm_vdsdir.s new file mode 100644 index 0000000000000..dbd732f999922 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vdsdir.s @@ -0,0 +1,199 @@ +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck --strict-whitespace -check-prefix=GFX12 %s + +ds_direct_load v1 wait_va_vdst:15 +// GFX12: ds_direct_load v1 wait_va_vdst:15 wait_vm_vsrc:0 ; encoding: [0x01,0x00,0x1f,0xce] + +ds_direct_load v2 wait_va_vdst:14 +// GFX12: ds_direct_load v2 wait_va_vdst:14 wait_vm_vsrc:0 ; encoding: [0x02,0x00,0x1e,0xce] + +ds_direct_load v3 wait_va_vdst:13 +// GFX12: ds_direct_load v3 wait_va_vdst:13 wait_vm_vsrc:0 ; encoding: [0x03,0x00,0x1d,0xce] + +ds_direct_load v4 wait_va_vdst:12 +// GFX12: ds_direct_load v4 wait_va_vdst:12 wait_vm_vsrc:0 ; encoding: [0x04,0x00,0x1c,0xce] + +ds_direct_load v5 wait_va_vdst:11 +// GFX12: ds_direct_load v5 wait_va_vdst:11 wait_vm_vsrc:0 ; encoding: [0x05,0x00,0x1b,0xce] + +ds_direct_load v6 wait_va_vdst:10 +// GFX12: ds_direct_load v6 wait_va_vdst:10 wait_vm_vsrc:0 ; encoding: [0x06,0x00,0x1a,0xce] + +ds_direct_load v7 wait_va_vdst:9 +// GFX12: ds_direct_load v7 wait_va_vdst:9 wait_vm_vsrc:0 ; encoding: [0x07,0x00,0x19,0xce] + +ds_direct_load v8 wait_va_vdst:8 +// GFX12: ds_direct_load v8 wait_va_vdst:8 wait_vm_vsrc:0 ; encoding: [0x08,0x00,0x18,0xce] + +ds_direct_load v9 wait_va_vdst:7 +// GFX12: ds_direct_load v9 wait_va_vdst:7 wait_vm_vsrc:0 ; encoding: [0x09,0x00,0x17,0xce] + +ds_direct_load v10 wait_va_vdst:6 +// GFX12: ds_direct_load v10 wait_va_vdst:6 wait_vm_vsrc:0 ; encoding: [0x0a,0x00,0x16,0xce] + +ds_direct_load v11 wait_va_vdst:5 +// GFX12: ds_direct_load v11 wait_va_vdst:5 wait_vm_vsrc:0 ; encoding: [0x0b,0x00,0x15,0xce] + +ds_direct_load 
v12 wait_va_vdst:4 +// GFX12: ds_direct_load v12 wait_va_vdst:4 wait_vm_vsrc:0 ; encoding: [0x0c,0x00,0x14,0xce] + +ds_direct_load v13 wait_va_vdst:3 +// GFX12: ds_direct_load v13 wait_va_vdst:3 wait_vm_vsrc:0 ; encoding: [0x0d,0x00,0x13,0xce] + +ds_direct_load v14 wait_va_vdst:2 +// GFX12: ds_direct_load v14 wait_va_vdst:2 wait_vm_vsrc:0 ; encoding: [0x0e,0x00,0x12,0xce] + +ds_direct_load v15 wait_va_vdst:1 +// GFX12: ds_direct_load v15 wait_va_vdst:1 wait_vm_vsrc:0 ; encoding: [0x0f,0x00,0x11,0xce] + +ds_direct_load v16 wait_va_vdst:0 +// GFX12: ds_direct_load v16 wait_va_vdst:0 wait_vm_vsrc:0 ; encoding: [0x10,0x00,0x10,0xce] + +ds_direct_load v17 +// GFX12: ds_direct_load v17 wait_va_vdst:0 wait_vm_vsrc:0 ; encoding: [0x11,0x00,0x10,0xce] + +ds_param_load v1, attr0.x wait_va_vdst:15 +// GFX12: ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:0 ; encoding: [0x01,0x00,0x0f,0xce] + +ds_param_load v2, attr0.y wait_va_vdst:14 +// GFX12: ds_param_load v2, attr0.y wait_va_vdst:14 wait_vm_vsrc:0 ; encoding: [0x02,0x01,0x0e,0xce] + +ds_param_load v3, attr0.z wait_va_vdst:13 +// GFX12: ds_param_load v3, attr0.z wait_va_vdst:13 wait_vm_vsrc:0 ; encoding: [0x03,0x02,0x0d,0xce] + +ds_param_load v4, attr0.w wait_va_vdst:12 +// GFX12: ds_param_load v4, attr0.w wait_va_vdst:12 wait_vm_vsrc:0 ; encoding: [0x04,0x03,0x0c,0xce] + +ds_param_load v5, attr0.x wait_va_vdst:11 +// GFX12: ds_param_load v5, attr0.x wait_va_vdst:11 wait_vm_vsrc:0 ; encoding: [0x05,0x00,0x0b,0xce] + +ds_param_load v6, attr1.x wait_va_vdst:10 +// GFX12: ds_param_load v6, attr1.x wait_va_vdst:10 wait_vm_vsrc:0 ; encoding: [0x06,0x04,0x0a,0xce] + +ds_param_load v7, attr2.y wait_va_vdst:9 +// GFX12: ds_param_load v7, attr2.y wait_va_vdst:9 wait_vm_vsrc:0 ; encoding: [0x07,0x09,0x09,0xce] + +ds_param_load v8, attr3.z wait_va_vdst:8 +// GFX12: ds_param_load v8, attr3.z wait_va_vdst:8 wait_vm_vsrc:0 ; encoding: [0x08,0x0e,0x08,0xce] + +ds_param_load v9, attr4.w wait_va_vdst:7 +// GFX12: ds_param_load v9, attr4.w wait_va_vdst:7 wait_vm_vsrc:0 ; encoding: [0x09,0x13,0x07,0xce] + +ds_param_load v10, attr11.x wait_va_vdst:6 +// GFX12: ds_param_load v10, attr11.x wait_va_vdst:6 wait_vm_vsrc:0 ; encoding: [0x0a,0x2c,0x06,0xce] + +ds_param_load v11, attr22.y wait_va_vdst:5 +// GFX12: ds_param_load v11, attr22.y wait_va_vdst:5 wait_vm_vsrc:0 ; encoding: [0x0b,0x59,0x05,0xce] + +ds_param_load v13, attr32.x wait_va_vdst:3 +// GFX12: ds_param_load v13, attr32.x wait_va_vdst:3 wait_vm_vsrc:0 ; encoding: [0x0d,0x80,0x03,0xce] + +ds_param_load v14, attr32.y wait_va_vdst:2 +// GFX12: ds_param_load v14, attr32.y wait_va_vdst:2 wait_vm_vsrc:0 ; encoding: [0x0e,0x81,0x02,0xce] + +ds_param_load v15, attr32.z wait_va_vdst:1 +// GFX12: ds_param_load v15, attr32.z wait_va_vdst:1 wait_vm_vsrc:0 ; encoding: [0x0f,0x82,0x01,0xce] + +ds_param_load v16, attr32.w wait_va_vdst:0 +// GFX12: ds_param_load v16, attr32.w wait_va_vdst:0 wait_vm_vsrc:0 ; encoding: [0x10,0x83,0x00,0xce] + +ds_param_load v17, attr32.w +// GFX12: ds_param_load v17, attr32.w wait_va_vdst:0 wait_vm_vsrc:0 ; encoding: [0x11,0x83,0x00,0xce] + +ds_direct_load v1 wait_va_vdst:15 wait_vm_vsrc:1 +// GFX12: ds_direct_load v1 wait_va_vdst:15 wait_vm_vsrc:1 ; encoding: [0x01,0x00,0x9f,0xce] + +ds_direct_load v2 wait_va_vdst:14 wait_vm_vsrc:1 +// GFX12: ds_direct_load v2 wait_va_vdst:14 wait_vm_vsrc:1 ; encoding: [0x02,0x00,0x9e,0xce] + +ds_direct_load v3 wait_va_vdst:13 wait_vm_vsrc:1 +// GFX12: ds_direct_load v3 wait_va_vdst:13 wait_vm_vsrc:1 ; encoding: [0x03,0x00,0x9d,0xce] + +ds_direct_load 
v4 wait_va_vdst:12 wait_vm_vsrc:1 +// GFX12: ds_direct_load v4 wait_va_vdst:12 wait_vm_vsrc:1 ; encoding: [0x04,0x00,0x9c,0xce] + +ds_direct_load v5 wait_va_vdst:11 wait_vm_vsrc:1 +// GFX12: ds_direct_load v5 wait_va_vdst:11 wait_vm_vsrc:1 ; encoding: [0x05,0x00,0x9b,0xce] + +ds_direct_load v6 wait_va_vdst:10 wait_vm_vsrc:1 +// GFX12: ds_direct_load v6 wait_va_vdst:10 wait_vm_vsrc:1 ; encoding: [0x06,0x00,0x9a,0xce] + +ds_direct_load v7 wait_va_vdst:9 wait_vm_vsrc:1 +// GFX12: ds_direct_load v7 wait_va_vdst:9 wait_vm_vsrc:1 ; encoding: [0x07,0x00,0x99,0xce] + +ds_direct_load v8 wait_va_vdst:8 wait_vm_vsrc:1 +// GFX12: ds_direct_load v8 wait_va_vdst:8 wait_vm_vsrc:1 ; encoding: [0x08,0x00,0x98,0xce] + +ds_direct_load v9 wait_va_vdst:7 wait_vm_vsrc:1 +// GFX12: ds_direct_load v9 wait_va_vdst:7 wait_vm_vsrc:1 ; encoding: [0x09,0x00,0x97,0xce] + +ds_direct_load v10 wait_va_vdst:6 wait_vm_vsrc:1 +// GFX12: ds_direct_load v10 wait_va_vdst:6 wait_vm_vsrc:1 ; encoding: [0x0a,0x00,0x96,0xce] + +ds_direct_load v11 wait_va_vdst:5 wait_vm_vsrc:1 +// GFX12: ds_direct_load v11 wait_va_vdst:5 wait_vm_vsrc:1 ; encoding: [0x0b,0x00,0x95,0xce] + +ds_direct_load v12 wait_va_vdst:4 wait_vm_vsrc:1 +// GFX12: ds_direct_load v12 wait_va_vdst:4 wait_vm_vsrc:1 ; encoding: [0x0c,0x00,0x94,0xce] + +ds_direct_load v13 wait_va_vdst:3 wait_vm_vsrc:1 +// GFX12: ds_direct_load v13 wait_va_vdst:3 wait_vm_vsrc:1 ; encoding: [0x0d,0x00,0x93,0xce] + +ds_direct_load v14 wait_va_vdst:2 wait_vm_vsrc:1 +// GFX12: ds_direct_load v14 wait_va_vdst:2 wait_vm_vsrc:1 ; encoding: [0x0e,0x00,0x92,0xce] + +ds_direct_load v15 wait_va_vdst:1 wait_vm_vsrc:1 +// GFX12: ds_direct_load v15 wait_va_vdst:1 wait_vm_vsrc:1 ; encoding: [0x0f,0x00,0x91,0xce] + +ds_direct_load v16 wait_va_vdst:0 wait_vm_vsrc:1 +// GFX12: ds_direct_load v16 wait_va_vdst:0 wait_vm_vsrc:1 ; encoding: [0x10,0x00,0x90,0xce] + +ds_direct_load v17 wait_vm_vsrc:1 +// GFX12: ds_direct_load v17 wait_va_vdst:0 wait_vm_vsrc:1 ; encoding: [0x11,0x00,0x90,0xce] + +ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:1 +// GFX12: ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:1 ; encoding: [0x01,0x00,0x8f,0xce] + +ds_param_load v2, attr0.y wait_va_vdst:14 wait_vm_vsrc:1 +// GFX12: ds_param_load v2, attr0.y wait_va_vdst:14 wait_vm_vsrc:1 ; encoding: [0x02,0x01,0x8e,0xce] + +ds_param_load v3, attr0.z wait_va_vdst:13 wait_vm_vsrc:1 +// GFX12: ds_param_load v3, attr0.z wait_va_vdst:13 wait_vm_vsrc:1 ; encoding: [0x03,0x02,0x8d,0xce] + +ds_param_load v4, attr0.w wait_va_vdst:12 wait_vm_vsrc:1 +// GFX12: ds_param_load v4, attr0.w wait_va_vdst:12 wait_vm_vsrc:1 ; encoding: [0x04,0x03,0x8c,0xce] + +ds_param_load v5, attr0.x wait_va_vdst:11 wait_vm_vsrc:1 +// GFX12: ds_param_load v5, attr0.x wait_va_vdst:11 wait_vm_vsrc:1 ; encoding: [0x05,0x00,0x8b,0xce] + +ds_param_load v6, attr1.x wait_va_vdst:10 wait_vm_vsrc:1 +// GFX12: ds_param_load v6, attr1.x wait_va_vdst:10 wait_vm_vsrc:1 ; encoding: [0x06,0x04,0x8a,0xce] + +ds_param_load v7, attr2.y wait_va_vdst:9 wait_vm_vsrc:1 +// GFX12: ds_param_load v7, attr2.y wait_va_vdst:9 wait_vm_vsrc:1 ; encoding: [0x07,0x09,0x89,0xce] + +ds_param_load v8, attr3.z wait_va_vdst:8 wait_vm_vsrc:1 +// GFX12: ds_param_load v8, attr3.z wait_va_vdst:8 wait_vm_vsrc:1 ; encoding: [0x08,0x0e,0x88,0xce] + +ds_param_load v9, attr4.w wait_va_vdst:7 wait_vm_vsrc:1 +// GFX12: ds_param_load v9, attr4.w wait_va_vdst:7 wait_vm_vsrc:1 ; encoding: [0x09,0x13,0x87,0xce] + +ds_param_load v10, attr11.x wait_va_vdst:6 wait_vm_vsrc:1 +// GFX12: ds_param_load v10, 
attr11.x wait_va_vdst:6 wait_vm_vsrc:1 ; encoding: [0x0a,0x2c,0x86,0xce] + +ds_param_load v11, attr22.y wait_va_vdst:5 wait_vm_vsrc:1 +// GFX12: ds_param_load v11, attr22.y wait_va_vdst:5 wait_vm_vsrc:1 ; encoding: [0x0b,0x59,0x85,0xce] + +ds_param_load v13, attr32.x wait_va_vdst:3 wait_vm_vsrc:1 +// GFX12: ds_param_load v13, attr32.x wait_va_vdst:3 wait_vm_vsrc:1 ; encoding: [0x0d,0x80,0x83,0xce] + +ds_param_load v14, attr32.y wait_va_vdst:2 wait_vm_vsrc:1 +// GFX12: ds_param_load v14, attr32.y wait_va_vdst:2 wait_vm_vsrc:1 ; encoding: [0x0e,0x81,0x82,0xce] + +ds_param_load v15, attr32.z wait_va_vdst:1 wait_vm_vsrc:1 +// GFX12: ds_param_load v15, attr32.z wait_va_vdst:1 wait_vm_vsrc:1 ; encoding: [0x0f,0x82,0x81,0xce] + +ds_param_load v16, attr32.w wait_va_vdst:0 wait_vm_vsrc:1 +// GFX12: ds_param_load v16, attr32.w wait_va_vdst:0 wait_vm_vsrc:1 ; encoding: [0x10,0x83,0x80,0xce] + +ds_param_load v17, attr32.w wait_vm_vsrc:1 +// GFX12: ds_param_load v17, attr32.w wait_va_vdst:0 wait_vm_vsrc:1 ; encoding: [0x11,0x83,0x80,0xce] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vdsdir.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vdsdir.txt new file mode 100644 index 0000000000000..b7c0394429dc3 --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vdsdir.txt @@ -0,0 +1,206 @@ +# RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck --strict-whitespace -check-prefix=GFX12 %s +# RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -mattr=-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck --strict-whitespace -check-prefix=GFX12 %s + +# GFX12: ds_direct_load v10 wait_va_vdst:6 wait_vm_vsrc:0 ; encoding: [0x0a,0x00,0x16,0xce] +0x0a,0x00,0x16,0xce + +# GFX12: ds_direct_load v11 wait_va_vdst:5 wait_vm_vsrc:0 ; encoding: [0x0b,0x00,0x15,0xce] +0x0b,0x00,0x15,0xce + +# GFX12: ds_direct_load v12 wait_va_vdst:4 wait_vm_vsrc:0 ; encoding: [0x0c,0x00,0x14,0xce] +0x0c,0x00,0x14,0xce + +# GFX12: ds_direct_load v13 wait_va_vdst:3 wait_vm_vsrc:0 ; encoding: [0x0d,0x00,0x13,0xce] +0x0d,0x00,0x13,0xce + +# GFX12: ds_direct_load v14 wait_va_vdst:2 wait_vm_vsrc:0 ; encoding: [0x0e,0x00,0x12,0xce] +0x0e,0x00,0x12,0xce + +# GFX12: ds_direct_load v15 wait_va_vdst:1 wait_vm_vsrc:0 ; encoding: [0x0f,0x00,0x11,0xce] +0x0f,0x00,0x11,0xce + +# GFX12: ds_direct_load v16 wait_va_vdst:0 wait_vm_vsrc:0 ; encoding: [0x10,0x00,0x10,0xce] +0x10,0x00,0x10,0xce + +# GFX12: ds_direct_load v17 wait_va_vdst:0 wait_vm_vsrc:0 ; encoding: [0x11,0x00,0x10,0xce] +0x11,0x00,0x10,0xce + +# GFX12: ds_direct_load v1 wait_va_vdst:15 wait_vm_vsrc:0 ; encoding: [0x01,0x00,0x1f,0xce] +0x01,0x00,0x1f,0xce + +# GFX12: ds_direct_load v2 wait_va_vdst:14 wait_vm_vsrc:0 ; encoding: [0x02,0x00,0x1e,0xce] +0x02,0x00,0x1e,0xce + +# GFX12: ds_direct_load v3 wait_va_vdst:13 wait_vm_vsrc:0 ; encoding: [0x03,0x00,0x1d,0xce] +0x03,0x00,0x1d,0xce + +# GFX12: ds_direct_load v4 wait_va_vdst:12 wait_vm_vsrc:0 ; encoding: [0x04,0x00,0x1c,0xce] +0x04,0x00,0x1c,0xce + +# GFX12: ds_direct_load v5 wait_va_vdst:11 wait_vm_vsrc:0 ; encoding: [0x05,0x00,0x1b,0xce] +0x05,0x00,0x1b,0xce + +# GFX12: ds_direct_load v6 wait_va_vdst:10 wait_vm_vsrc:0 ; encoding: [0x06,0x00,0x1a,0xce] +0x06,0x00,0x1a,0xce + +# GFX12: ds_direct_load v7 wait_va_vdst:9 wait_vm_vsrc:0 ; encoding: [0x07,0x00,0x19,0xce] +0x07,0x00,0x19,0xce + +# GFX12: ds_direct_load v8 wait_va_vdst:8 wait_vm_vsrc:0 ; encoding: [0x08,0x00,0x18,0xce] +0x08,0x00,0x18,0xce + +# GFX12: ds_direct_load v9 wait_va_vdst:7 wait_vm_vsrc:0 ; encoding: 
[0x09,0x00,0x17,0xce] +0x09,0x00,0x17,0xce + +# GFX12: ds_param_load v10, attr11.x wait_va_vdst:6 wait_vm_vsrc:0 ; encoding: [0x0a,0x2c,0x06,0xce] +0x0a,0x2c,0x06,0xce + +# GFX12: ds_param_load v11, attr22.y wait_va_vdst:5 wait_vm_vsrc:0 ; encoding: [0x0b,0x59,0x05,0xce] +0x0b,0x59,0x05,0xce + +# GFX12: ds_param_load v12, attr33.z wait_va_vdst:4 wait_vm_vsrc:0 ; encoding: [0x0c,0x86,0x04,0xce] +0x0c,0x86,0x04,0xce + +# GFX12: ds_param_load v13, attr63.x wait_va_vdst:3 wait_vm_vsrc:0 ; encoding: [0x0d,0xfc,0x03,0xce] +0x0d,0xfc,0x03,0xce + +# GFX12: ds_param_load v14, attr63.y wait_va_vdst:2 wait_vm_vsrc:0 ; encoding: [0x0e,0xfd,0x02,0xce] +0x0e,0xfd,0x02,0xce + +# GFX12: ds_param_load v15, attr63.z wait_va_vdst:1 wait_vm_vsrc:0 ; encoding: [0x0f,0xfe,0x01,0xce] +0x0f,0xfe,0x01,0xce + +# GFX12: ds_param_load v16, attr63.w wait_va_vdst:0 wait_vm_vsrc:0 ; encoding: [0x10,0xff,0x00,0xce] +0x10,0xff,0x00,0xce + +# GFX12: ds_param_load v17, attr63.w wait_va_vdst:0 wait_vm_vsrc:0 ; encoding: [0x11,0xff,0x00,0xce] +0x11,0xff,0x00,0xce + +# GFX12: ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:0 ; encoding: [0x01,0x00,0x0f,0xce] +0x01,0x00,0x0f,0xce + +# GFX12: ds_param_load v2, attr0.y wait_va_vdst:14 wait_vm_vsrc:0 ; encoding: [0x02,0x01,0x0e,0xce] +0x02,0x01,0x0e,0xce + +# GFX12: ds_param_load v3, attr0.z wait_va_vdst:13 wait_vm_vsrc:0 ; encoding: [0x03,0x02,0x0d,0xce] +0x03,0x02,0x0d,0xce + +# GFX12: ds_param_load v4, attr0.w wait_va_vdst:12 wait_vm_vsrc:0 ; encoding: [0x04,0x03,0x0c,0xce] +0x04,0x03,0x0c,0xce + +# GFX12: ds_param_load v5, attr0.x wait_va_vdst:11 wait_vm_vsrc:0 ; encoding: [0x05,0x00,0x0b,0xce] +0x05,0x00,0x0b,0xce + +# GFX12: ds_param_load v6, attr1.x wait_va_vdst:10 wait_vm_vsrc:0 ; encoding: [0x06,0x04,0x0a,0xce] +0x06,0x04,0x0a,0xce + +# GFX12: ds_param_load v7, attr2.y wait_va_vdst:9 wait_vm_vsrc:0 ; encoding: [0x07,0x09,0x09,0xce] +0x07,0x09,0x09,0xce + +# GFX12: ds_param_load v8, attr3.z wait_va_vdst:8 wait_vm_vsrc:0 ; encoding: [0x08,0x0e,0x08,0xce] +0x08,0x0e,0x08,0xce + +# GFX12: ds_param_load v9, attr4.w wait_va_vdst:7 wait_vm_vsrc:0 ; encoding: [0x09,0x13,0x07,0xce] +0x09,0x13,0x07,0xce + +# GFX12: ds_direct_load v10 wait_va_vdst:6 wait_vm_vsrc:1 ; encoding: [0x0a,0x00,0x96,0xce] +0x0a,0x00,0x96,0xce + +# GFX12: ds_direct_load v11 wait_va_vdst:5 wait_vm_vsrc:1 ; encoding: [0x0b,0x00,0x95,0xce] +0x0b,0x00,0x95,0xce + +# GFX12: ds_direct_load v12 wait_va_vdst:4 wait_vm_vsrc:1 ; encoding: [0x0c,0x00,0x94,0xce] +0x0c,0x00,0x94,0xce + +# GFX12: ds_direct_load v13 wait_va_vdst:3 wait_vm_vsrc:1 ; encoding: [0x0d,0x00,0x93,0xce] +0x0d,0x00,0x93,0xce + +# GFX12: ds_direct_load v14 wait_va_vdst:2 wait_vm_vsrc:1 ; encoding: [0x0e,0x00,0x92,0xce] +0x0e,0x00,0x92,0xce + +# GFX12: ds_direct_load v15 wait_va_vdst:1 wait_vm_vsrc:1 ; encoding: [0x0f,0x00,0x91,0xce] +0x0f,0x00,0x91,0xce + +# GFX12: ds_direct_load v16 wait_va_vdst:0 wait_vm_vsrc:1 ; encoding: [0x10,0x00,0x90,0xce] +0x10,0x00,0x90,0xce + +# GFX12: ds_direct_load v17 wait_va_vdst:0 wait_vm_vsrc:1 ; encoding: [0x11,0x00,0x90,0xce] +0x11,0x00,0x90,0xce + +# GFX12: ds_direct_load v1 wait_va_vdst:15 wait_vm_vsrc:1 ; encoding: [0x01,0x00,0x9f,0xce] +0x01,0x00,0x9f,0xce + +# GFX12: ds_direct_load v2 wait_va_vdst:14 wait_vm_vsrc:1 ; encoding: [0x02,0x00,0x9e,0xce] +0x02,0x00,0x9e,0xce + +# GFX12: ds_direct_load v3 wait_va_vdst:13 wait_vm_vsrc:1 ; encoding: [0x03,0x00,0x9d,0xce] +0x03,0x00,0x9d,0xce + +# GFX12: ds_direct_load v4 wait_va_vdst:12 wait_vm_vsrc:1 ; encoding: [0x04,0x00,0x9c,0xce] +0x04,0x00,0x9c,0xce + +# 
GFX12: ds_direct_load v5 wait_va_vdst:11 wait_vm_vsrc:1 ; encoding: [0x05,0x00,0x9b,0xce] +0x05,0x00,0x9b,0xce + +# GFX12: ds_direct_load v6 wait_va_vdst:10 wait_vm_vsrc:1 ; encoding: [0x06,0x00,0x9a,0xce] +0x06,0x00,0x9a,0xce + +# GFX12: ds_direct_load v7 wait_va_vdst:9 wait_vm_vsrc:1 ; encoding: [0x07,0x00,0x99,0xce] +0x07,0x00,0x99,0xce + +# GFX12: ds_direct_load v8 wait_va_vdst:8 wait_vm_vsrc:1 ; encoding: [0x08,0x00,0x98,0xce] +0x08,0x00,0x98,0xce + +# GFX12: ds_direct_load v9 wait_va_vdst:7 wait_vm_vsrc:1 ; encoding: [0x09,0x00,0x97,0xce] +0x09,0x00,0x97,0xce + +# GFX12: ds_param_load v10, attr11.x wait_va_vdst:6 wait_vm_vsrc:1 ; encoding: [0x0a,0x2c,0x86,0xce] +0x0a,0x2c,0x86,0xce + +# GFX12: ds_param_load v11, attr22.y wait_va_vdst:5 wait_vm_vsrc:1 ; encoding: [0x0b,0x59,0x85,0xce] +0x0b,0x59,0x85,0xce + +# GFX12: ds_param_load v12, attr33.z wait_va_vdst:4 wait_vm_vsrc:1 ; encoding: [0x0c,0x86,0x84,0xce] +0x0c,0x86,0x84,0xce + +# GFX12: ds_param_load v13, attr63.x wait_va_vdst:3 wait_vm_vsrc:1 ; encoding: [0x0d,0xfc,0x83,0xce] +0x0d,0xfc,0x83,0xce + +# GFX12: ds_param_load v14, attr63.y wait_va_vdst:2 wait_vm_vsrc:1 ; encoding: [0x0e,0xfd,0x82,0xce] +0x0e,0xfd,0x82,0xce + +# GFX12: ds_param_load v15, attr63.z wait_va_vdst:1 wait_vm_vsrc:1 ; encoding: [0x0f,0xfe,0x81,0xce] +0x0f,0xfe,0x81,0xce + +# GFX12: ds_param_load v16, attr63.w wait_va_vdst:0 wait_vm_vsrc:1 ; encoding: [0x10,0xff,0x80,0xce] +0x10,0xff,0x80,0xce + +# GFX12: ds_param_load v17, attr63.w wait_va_vdst:0 wait_vm_vsrc:1 ; encoding: [0x11,0xff,0x80,0xce] +0x11,0xff,0x80,0xce + +# GFX12: ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:1 ; encoding: [0x01,0x00,0x8f,0xce] +0x01,0x00,0x8f,0xce + +# GFX12: ds_param_load v2, attr0.y wait_va_vdst:14 wait_vm_vsrc:1 ; encoding: [0x02,0x01,0x8e,0xce] +0x02,0x01,0x8e,0xce + +# GFX12: ds_param_load v3, attr0.z wait_va_vdst:13 wait_vm_vsrc:1 ; encoding: [0x03,0x02,0x8d,0xce] +0x03,0x02,0x8d,0xce + +# GFX12: ds_param_load v4, attr0.w wait_va_vdst:12 wait_vm_vsrc:1 ; encoding: [0x04,0x03,0x8c,0xce] +0x04,0x03,0x8c,0xce + +# GFX12: ds_param_load v5, attr0.x wait_va_vdst:11 wait_vm_vsrc:1 ; encoding: [0x05,0x00,0x8b,0xce] +0x05,0x00,0x8b,0xce + +# GFX12: ds_param_load v6, attr1.x wait_va_vdst:10 wait_vm_vsrc:1 ; encoding: [0x06,0x04,0x8a,0xce] +0x06,0x04,0x8a,0xce + +# GFX12: ds_param_load v7, attr2.y wait_va_vdst:9 wait_vm_vsrc:1 ; encoding: [0x07,0x09,0x89,0xce] +0x07,0x09,0x89,0xce + +# GFX12: ds_param_load v8, attr3.z wait_va_vdst:8 wait_vm_vsrc:1 ; encoding: [0x08,0x0e,0x88,0xce] +0x08,0x0e,0x88,0xce + +# GFX12: ds_param_load v9, attr4.w wait_va_vdst:7 wait_vm_vsrc:1 ; encoding: [0x09,0x13,0x87,0xce] +0x09,0x13,0x87,0xce From c1eab57673ef3eb2842c0fbe454d7878854cf54c Mon Sep 17 00:00:00 2001 From: "Oleksandr \"Alex\" Zinenko" Date: Wed, 3 Jan 2024 16:33:27 +0100 Subject: [PATCH 143/313] [mlir] fix Operation::getDiscardableAttrs in absence of properties (#76816) When properties are not enabled in an operation, inherent attributes are stored in the common dictionary with discardable attributes. However, `getDiscardableAttrs` and `getDiscardableAttrDictionary` were returning the entire dictionary, making the caller mistakenly believe that all inherent attributes are discardable. Fix this by filtering out attributes whose names are registered with the operation, i.e., inherent attributes. This requires an API change so `getDiscardableAttrs` returns a filter range. 
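For illustration only (not part of this change), a minimal sketch of how a caller adapts to the filter-range return type; `op` is assumed to be a valid `Operation *`, and the helpers are the same ones the diff below switches to (`llvm::range_size`, `llvm::to_vector`):

```cpp
// The result is a lazily filtered range: it has no size() and cannot be indexed.
auto discardable = op->getDiscardableAttrs();
size_t n = llvm::range_size(discardable);            // walks the range to count it
auto materialized = llvm::to_vector(discardable);    // copy when random access is needed
for (NamedAttribute attr : materialized)
  llvm::errs() << attr.getName().getValue() << "\n"; // discardable attributes only
```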
--- mlir/include/mlir/IR/Operation.h | 30 +++++++++++++------ mlir/lib/CAPI/IR/IR.cpp | 6 ++-- mlir/lib/IR/AsmPrinter.cpp | 2 +- mlir/unittests/IR/OpPropertiesTest.cpp | 40 +++++++++++++++++++++++++- 4 files changed, 66 insertions(+), 12 deletions(-) diff --git a/mlir/include/mlir/IR/Operation.h b/mlir/include/mlir/IR/Operation.h index 3ba0daeca0258..d2f52cf1afee8 100644 --- a/mlir/include/mlir/IR/Operation.h +++ b/mlir/include/mlir/IR/Operation.h @@ -475,19 +475,33 @@ class alignas(8) Operation final return removeDiscardableAttr(StringAttr::get(getContext(), name)); } - /// Return all of the discardable attributes on this operation. - ArrayRef getDiscardableAttrs() { return attrs.getValue(); } + /// Return a range of all of discardable attributes on this operation. Note + /// that for unregistered operations that are not storing inherent attributes + /// as properties, all attributes are considered discardable. + auto getDiscardableAttrs() { + std::optional opName = getRegisteredInfo(); + ArrayRef attributeNames = + opName ? getRegisteredInfo()->getAttributeNames() + : ArrayRef(); + return llvm::make_filter_range( + attrs.getValue(), + [this, attributeNames](const NamedAttribute attribute) { + return getPropertiesStorage() || + !llvm::is_contained(attributeNames, attribute.getName()); + }); + } /// Return all of the discardable attributes on this operation as a /// DictionaryAttr. - DictionaryAttr getDiscardableAttrDictionary() { return attrs; } + DictionaryAttr getDiscardableAttrDictionary() { + if (getPropertiesStorage()) + return attrs; + return DictionaryAttr::get(getContext(), + llvm::to_vector(getDiscardableAttrs())); + } /// Return all of the attributes on this operation. - ArrayRef getAttrs() { - if (!getPropertiesStorage()) - return getDiscardableAttrs(); - return getAttrDictionary().getValue(); - } + ArrayRef getAttrs() { return getAttrDictionary().getValue(); } /// Return all of the attributes on this operation as a DictionaryAttr. DictionaryAttr getAttrDictionary(); diff --git a/mlir/lib/CAPI/IR/IR.cpp b/mlir/lib/CAPI/IR/IR.cpp index ac9889df11f80..a97cfe5b0ea36 100644 --- a/mlir/lib/CAPI/IR/IR.cpp +++ b/mlir/lib/CAPI/IR/IR.cpp @@ -613,12 +613,14 @@ void mlirOperationSetInherentAttributeByName(MlirOperation op, } intptr_t mlirOperationGetNumDiscardableAttributes(MlirOperation op) { - return static_cast(unwrap(op)->getDiscardableAttrs().size()); + return static_cast( + llvm::range_size(unwrap(op)->getDiscardableAttrs())); } MlirNamedAttribute mlirOperationGetDiscardableAttribute(MlirOperation op, intptr_t pos) { - NamedAttribute attr = unwrap(op)->getDiscardableAttrs()[pos]; + NamedAttribute attr = + *std::next(unwrap(op)->getDiscardableAttrs().begin(), pos); return MlirNamedAttribute{wrap(attr.getName()), wrap(attr.getValue())}; } diff --git a/mlir/lib/IR/AsmPrinter.cpp b/mlir/lib/IR/AsmPrinter.cpp index 1f7cbf349255d..0cc6d11ddc576 100644 --- a/mlir/lib/IR/AsmPrinter.cpp +++ b/mlir/lib/IR/AsmPrinter.cpp @@ -3543,7 +3543,7 @@ void OperationPrinter::printGenericOp(Operation *op, bool printOpName) { } auto attrs = op->getDiscardableAttrs(); - printOptionalAttrDict(attrs); + printOptionalAttrDict(llvm::to_vector(attrs)); // Print the type signature of the operation. 
os << " : "; diff --git a/mlir/unittests/IR/OpPropertiesTest.cpp b/mlir/unittests/IR/OpPropertiesTest.cpp index 5c3e150f6b94c..6dcba16e656b3 100644 --- a/mlir/unittests/IR/OpPropertiesTest.cpp +++ b/mlir/unittests/IR/OpPropertiesTest.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "mlir/IR/Attributes.h" #include "mlir/IR/OpDefinition.h" #include "mlir/Parser/Parser.h" #include "gtest/gtest.h" @@ -132,6 +133,23 @@ class OpWithProperties : public Op { } }; +/// A custom operation for the purpose of showcasing how discardable attributes +/// are handled in absence of properties. +class OpWithoutProperties : public Op { +public: + // Begin boilerplate. + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(OpWithoutProperties) + using Op::Op; + static ArrayRef getAttributeNames() { + static StringRef attributeNames[] = {StringRef("inherent_attr")}; + return ArrayRef(attributeNames); + }; + static StringRef getOperationName() { + return "test_op_properties.op_without_properties"; + } + // End boilerplate. +}; + // A trivial supporting dialect to register the above operation. class TestOpPropertiesDialect : public Dialect { public: @@ -142,7 +160,7 @@ class TestOpPropertiesDialect : public Dialect { explicit TestOpPropertiesDialect(MLIRContext *context) : Dialect(getDialectNamespace(), context, TypeID::get()) { - addOperations(); + addOperations(); } }; @@ -359,4 +377,24 @@ TEST(OpPropertiesTest, getOrAddProperties) { op->erase(); } +constexpr StringLiteral withoutPropertiesAttrsSrc = R"mlir( + "test_op_properties.op_without_properties"() + {inherent_attr = 42, other_attr = 56} : () -> () +)mlir"; + +TEST(OpPropertiesTest, withoutPropertiesDiscardableAttrs) { + MLIRContext context; + context.getOrLoadDialect(); + ParserConfig config(&context); + OwningOpRef op = + parseSourceString(withoutPropertiesAttrsSrc, config); + ASSERT_EQ(llvm::range_size(op->getDiscardableAttrs()), 1u); + EXPECT_EQ(op->getDiscardableAttrs().begin()->getName().getValue(), + "other_attr"); + + EXPECT_EQ(op->getAttrs().size(), 2u); + EXPECT_TRUE(op->getInherentAttr("inherent_attr") != std::nullopt); + EXPECT_TRUE(op->getDiscardableAttr("other_attr") != Attribute()); +} + } // namespace From 21fe8b635cfb04de486a3c4ca22eb9f5a5ed78ad Mon Sep 17 00:00:00 2001 From: "Balaji V. Iyer" <43187390+bviyer@users.noreply.github.com> Date: Wed, 3 Jan 2024 10:00:15 -0600 Subject: [PATCH 144/313] [mlir] Check if the stride tensor is empty. (#76428) Added a check to see if the stride tensor is empty. If so then return false for isContiguousSlice function. Possible fix for #74463 --- mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp | 2 +- .../Dialect/Vector/vector-transfer-flatten.mlir | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp index 2ad992af989c9..c1c0f5483a6af 100644 --- a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp +++ b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp @@ -271,7 +271,7 @@ bool vector::isContiguousSlice(MemRefType memrefType, VectorType vectorType) { return false; // Cond 1: A contiguous memref will always have a unit trailing stride. - if (strides.back() != 1) + if (strides.empty() || strides.back() != 1) return false; // Cond 2: Strides of a contiguous memref have to match the flattened dims. 
diff --git a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir index 3708d741141be..ae457ea81ec5b 100644 --- a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir @@ -356,3 +356,18 @@ func.func @fold_unit_dims_entirely(%arg0 : vector<8xi32>, // CHECK: %[[VAL_3:.*]] = arith.muli %[[VAL_0]], %[[VAL_1]] : vector<8xi32> // CHECK: %[[VAL_4:.*]] = arith.addi %[[VAL_3]], %[[VAL_2]] : vector<8xi32> // CHECK: return %[[VAL_4]] : vector<8xi32> + +// ----- + +// This test is to make sure there is no crash for empty stride. +func.func @stride_empty_test(%1: memref) -> vector<32x256xi16> { + %c0_i16 = arith.constant 0 : i16 + %3 = vector.transfer_read %1[], %c0_i16 {permutation_map = affine_map<() -> (0, 0)>} : memref, vector<32x256xi16> + return %3 : vector<32x256xi16> + + // CHECK-LABEL: func.func @stride_empty_test + // CHECK: %[[VAL:.*]] = arith.constant 0 : i16 + // CHECK: %[[RET:.*]] = vector.transfer_read {{.*}} vector<32x256xi16> + // CHECK: return %[[RET]] + // CHECK-NOT: empty() +} From 55d5ba905da0db55282dd3985761ddf3dd452fd1 Mon Sep 17 00:00:00 2001 From: Ilya Biryukov Date: Wed, 3 Jan 2024 16:35:59 +0100 Subject: [PATCH 145/313] [NFC] Fix compilation in C++20 mode with GCC 12 I ran into the following compiler error when trying to build with GCC 12 and `-DCMAKE_CXX_STANDARD=20`: ``` llvm-project/clang/lib/Sema/SemaChecking.cpp:16690:16: required from here /usr/include/c++/12/type_traits:971:30: error: default member initializer for '{anonymous}::SequenceChecker::Usage::UsageExpr' required before the end of its enclosing class ``` The error seems correct, GCC just instantiates the `SmallDenseMap` early and detects it. Clang does not, but that's an acceptable implementation difference as far as the standard is concerned. Move constructor outside the class to avoid this problem. --- clang/lib/Sema/SemaChecking.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index da0570b7b0f1e..3168d38dd66c3 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -16677,7 +16677,7 @@ class SequenceChecker : public ConstEvaluatedExprVisitor { /// Have we issued a diagnostic for this object already? bool Diagnosed = false; - UsageInfo() = default; + UsageInfo(); }; using UsageInfoMap = llvm::SmallDenseMap; @@ -17436,6 +17436,8 @@ class SequenceChecker : public ConstEvaluatedExprVisitor { } }; +SequenceChecker::UsageInfo::UsageInfo() = default; + } // namespace void Sema::CheckUnsequencedOperations(const Expr *E) { From c17af94b9619997ce7c060ff112988e0ec218c10 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 3 Jan 2024 16:27:10 +0100 Subject: [PATCH 146/313] [ConstraintElim] Use SmallDenseMap (NFC) The number of variables in the constraint is usually very small. Use SmallDenseMap to avoid allocations. --- llvm/lib/Transforms/Scalar/ConstraintElimination.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 06c87bd6dc374..cc93c8617c05e 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -644,7 +644,7 @@ ConstraintInfo::getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, // First try to look up \p V in Value2Index and NewVariables. 
Otherwise add a // new entry to NewVariables. - DenseMap NewIndexMap; + SmallDenseMap NewIndexMap; auto GetOrAddIndex = [&Value2Index, &NewVariables, &NewIndexMap](Value *V) -> unsigned { auto V2I = Value2Index.find(V); @@ -668,7 +668,7 @@ ConstraintInfo::getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, IsSigned, IsEq, IsNe); // Collect variables that are known to be positive in all uses in the // constraint. - DenseMap KnownNonNegativeVariables; + SmallDenseMap KnownNonNegativeVariables; auto &R = Res.Coefficients; for (const auto &KV : VariablesA) { R[GetOrAddIndex(KV.Variable)] += KV.Coefficient; From b07bf16a6f57b5bfa487d2291dc246ada37b0dfc Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 3 Jan 2024 17:05:08 +0100 Subject: [PATCH 147/313] [AssumptionCache] Add affected values for separate_storage (#76806) Add the underlying object of both separate_storage arguments as affected values. This allows us to use assumptionsFor() in BasicAA, which will be more efficient if there are many assumes in the function. --- llvm/lib/Analysis/AssumptionCache.cpp | 13 +++++++-- llvm/lib/Analysis/BasicAliasAnalysis.cpp | 37 +++++++++++------------- 2 files changed, 27 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Analysis/AssumptionCache.cpp b/llvm/lib/Analysis/AssumptionCache.cpp index fb3a6f8de2d6a..1b7277df0e0cd 100644 --- a/llvm/lib/Analysis/AssumptionCache.cpp +++ b/llvm/lib/Analysis/AssumptionCache.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AssumeBundleQueries.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstrTypes.h" @@ -77,9 +78,15 @@ findAffectedValues(CallBase *CI, TargetTransformInfo *TTI, }; for (unsigned Idx = 0; Idx != CI->getNumOperandBundles(); Idx++) { - if (CI->getOperandBundleAt(Idx).Inputs.size() > ABA_WasOn && - CI->getOperandBundleAt(Idx).getTagName() != IgnoreBundleTag) - AddAffected(CI->getOperandBundleAt(Idx).Inputs[ABA_WasOn], Idx); + OperandBundleUse Bundle = CI->getOperandBundleAt(Idx); + if (Bundle.getTagName() == "separate_storage") { + assert(Bundle.Inputs.size() == 2 && + "separate_storage must have two args"); + AddAffected(getUnderlyingObject(Bundle.Inputs[0]), Idx); + AddAffected(getUnderlyingObject(Bundle.Inputs[1]), Idx); + } else if (Bundle.Inputs.size() > ABA_WasOn && + Bundle.getTagName() != IgnoreBundleTag) + AddAffected(Bundle.Inputs[ABA_WasOn], Idx); } Value *Cond = CI->getArgOperand(0), *A, *B; diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp index 3de147368f234..4ad70931fd4cf 100644 --- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -1544,28 +1544,25 @@ AliasResult BasicAAResult::aliasCheck(const Value *V1, LocationSize V1Size, return AliasResult::NoAlias; if (CtxI && EnableSeparateStorageAnalysis) { - for (auto &AssumeVH : AC.assumptions()) { - if (!AssumeVH) + for (AssumptionCache::ResultElem &Elem : AC.assumptionsFor(O1)) { + if (!Elem || Elem.Index == AssumptionCache::ExprResultIdx) continue; - AssumeInst *Assume = cast(AssumeVH); - - for (unsigned Idx = 0; Idx < Assume->getNumOperandBundles(); Idx++) { - OperandBundleUse OBU = Assume->getOperandBundleAt(Idx); - if (OBU.getTagName() == "separate_storage") { - assert(OBU.Inputs.size() == 2); - const Value *Hint1 = OBU.Inputs[0].get(); - const Value *Hint2 = OBU.Inputs[1].get(); - // This is often a no-op; instcombine rewrites 
this for us. No-op - // getUnderlyingObject calls are fast, though. - const Value *HintO1 = getUnderlyingObject(Hint1); - const Value *HintO2 = getUnderlyingObject(Hint2); - - if (((O1 == HintO1 && O2 == HintO2) || - (O1 == HintO2 && O2 == HintO1)) && - isValidAssumeForContext(Assume, CtxI, DT)) - return AliasResult::NoAlias; - } + AssumeInst *Assume = cast(Elem); + OperandBundleUse OBU = Assume->getOperandBundleAt(Elem.Index); + if (OBU.getTagName() == "separate_storage") { + assert(OBU.Inputs.size() == 2); + const Value *Hint1 = OBU.Inputs[0].get(); + const Value *Hint2 = OBU.Inputs[1].get(); + // This is often a no-op; instcombine rewrites this for us. No-op + // getUnderlyingObject calls are fast, though. + const Value *HintO1 = getUnderlyingObject(Hint1); + const Value *HintO2 = getUnderlyingObject(Hint2); + + if (((O1 == HintO1 && O2 == HintO2) || + (O1 == HintO2 && O2 == HintO1)) && + isValidAssumeForContext(Assume, CtxI, DT)) + return AliasResult::NoAlias; } } } From f557f05b8dea6d915dba23f310c25655a403735e Mon Sep 17 00:00:00 2001 From: Alex Zinenko Date: Wed, 3 Jan 2024 16:46:54 +0000 Subject: [PATCH 148/313] [mlir] update InferTypeOpInterface after c1eab57673ef3e The change in c1eab57673ef3eb2842c0fbe454d7878854cf54c fixed the behavior of `getDiscardableAttrDictionary` for ops that are not using properties to only return discardable attributes. `InferTypeOpInterface` was relying on the wrong behavior when constructing an adaptor and would assume that all attributes were discardable, which is not the case. --- mlir/lib/Interfaces/InferTypeOpInterface.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Interfaces/InferTypeOpInterface.cpp b/mlir/lib/Interfaces/InferTypeOpInterface.cpp index 3c50c4c37c6f5..ee4c0519b9f54 100644 --- a/mlir/lib/Interfaces/InferTypeOpInterface.cpp +++ b/mlir/lib/Interfaces/InferTypeOpInterface.cpp @@ -240,8 +240,9 @@ LogicalResult mlir::detail::verifyInferredResultTypes(Operation *op) { auto retTypeFn = cast(op); auto result = retTypeFn.refineReturnTypes( op->getContext(), op->getLoc(), op->getOperands(), - op->getDiscardableAttrDictionary(), op->getPropertiesStorage(), - op->getRegions(), inferredReturnTypes); + op->getPropertiesStorage() ? op->getDiscardableAttrDictionary() + : op->getAttrDictionary(), + op->getPropertiesStorage(), op->getRegions(), inferredReturnTypes); if (failed(result)) op->emitOpError() << "failed to infer returned types"; From a3977c92376ecec1838262eca68d0def14a4e14d Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Wed, 3 Jan 2024 13:52:50 -0300 Subject: [PATCH 149/313] [clangd] check for synthesized symbols when tracking include locations (#75128) This fixes https://github.com/llvm/llvm-project/issues/75115 In C mode with MSVC compatibility, when the `assert` macro is defined, as a workaround, `static_assert` is implicitly defined as well, if not already so, in order to work around a broken `assert.h` implementation. This workaround was implemented in https://github.com/llvm/llvm-project/commit/8da090381d567d0ec555840f6b2a651d2997e4b3 A synthesized symbol does not occur in source code, and therefore should not have valid source location, but this was not checked when inserting this into a `symbol -> include file` map. The invalid FileID value is used for empty key representation in the include file hash table, so it's not valid to insert it. 
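For illustration only (not part of this change), a self-contained sketch of why the guard matters: DenseMapInfo<clang::FileID> reserves the default-constructed (invalid) FileID as the hash table's empty key, so an invalid FileID must never be inserted as a real key:

```cpp
#include "clang/Basic/SourceLocation.h"
#include "llvm/ADT/DenseMap.h"

void recordFile(llvm::DenseMap<clang::FileID, bool> &Seen, clang::FileID FID) {
  // FileID() is the reserved empty key; inserting it would trip an assertion
  // inside DenseMap. Synthesized symbols have no real source location, so
  // they are simply skipped.
  if (FID.isValid())
    Seen[FID] = true;
}
```

Accordingly, the diff below only records an include location when the expansion FileID is valid, and switches the consumer from IncludeFiles.at(SID) to IncludeFiles.lookup(SID), which yields an invalid FileID instead of asserting when no entry was recorded.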
--- .../clangd/index/SymbolCollector.cpp | 8 ++++---- clang-tools-extra/clangd/test/GH75115.test | 15 +++++++++++++++ 2 files changed, 19 insertions(+), 4 deletions(-) create mode 100644 clang-tools-extra/clangd/test/GH75115.test diff --git a/clang-tools-extra/clangd/index/SymbolCollector.cpp b/clang-tools-extra/clangd/index/SymbolCollector.cpp index 7ef4b15febad2..bf838e53f2a21 100644 --- a/clang-tools-extra/clangd/index/SymbolCollector.cpp +++ b/clang-tools-extra/clangd/index/SymbolCollector.cpp @@ -821,7 +821,8 @@ void SymbolCollector::setIncludeLocation(const Symbol &S, SourceLocation DefLoc, // Use the expansion location to get the #include header since this is // where the symbol is exposed. - IncludeFiles[S.ID] = SM.getDecomposedExpansionLoc(DefLoc).first; + if (FileID FID = SM.getDecomposedExpansionLoc(DefLoc).first; FID.isValid()) + IncludeFiles[S.ID] = FID; // We update providers for a symbol with each occurence, as SymbolCollector // might run while parsing, rather than at the end of a translation unit. @@ -879,16 +880,15 @@ void SymbolCollector::finish() { const Symbol *S = Symbols.find(SID); if (!S) continue; - assert(IncludeFiles.contains(SID)); - const auto FID = IncludeFiles.at(SID); + FileID FID = IncludeFiles.lookup(SID); // Determine if the FID is #include'd or #import'ed. Symbol::IncludeDirective Directives = Symbol::Invalid; auto CollectDirectives = shouldCollectIncludePath(S->SymInfo.Kind); if ((CollectDirectives & Symbol::Include) != 0) Directives |= Symbol::Include; // Only allow #import for symbols from ObjC-like files. - if ((CollectDirectives & Symbol::Import) != 0) { + if ((CollectDirectives & Symbol::Import) != 0 && FID.isValid()) { auto [It, Inserted] = FileToContainsImportsOrObjC.try_emplace(FID); if (Inserted) It->second = FilesWithObjCConstructs.contains(FID) || diff --git a/clang-tools-extra/clangd/test/GH75115.test b/clang-tools-extra/clangd/test/GH75115.test new file mode 100644 index 0000000000000..dc86f5b9a6cee --- /dev/null +++ b/clang-tools-extra/clangd/test/GH75115.test @@ -0,0 +1,15 @@ +// RUN: rm -rf %t.dir && mkdir -p %t.dir +// RUN: echo '[{"directory": "%/t.dir", "command": "clang --target=x86_64-pc-windows-msvc -x c GH75115.test", "file": "GH75115.test"}]' > %t.dir/compile_commands.json +// RUN: clangd -enable-config=0 --compile-commands-dir=%t.dir -check=%s 2>&1 | FileCheck -strict-whitespace %s + +// CHECK: Building preamble... +// CHECK-NEXT: Built preamble +// CHECK-NEXT: Indexing headers... +// CHECK-NEXT: Building AST... +// CHECK-NEXT: Indexing AST... 
+// CHECK-NEXT: Building inlay hints +// CHECK-NEXT: semantic highlighting +// CHECK-NEXT: Testing features at each token +// CHECK-NEXT: All checks completed, 0 errors + +#define assert From 0faf46befa7c07e58034ef8c6a7cd4bd5715de0a Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Wed, 3 Jan 2024 08:55:38 -0800 Subject: [PATCH 150/313] [coroutines][DPValue] Update DILocation in DPValue for hoisted dbg.declare (#76765) Follow up #75402 to cover DPValue --- llvm/lib/Transforms/Coroutines/CoroFrame.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index 529f7309a1a27..89a1ad2243c84 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -2953,6 +2953,9 @@ void coro::salvageDebugInfo( std::optional InsertPt; if (auto *I = dyn_cast(Storage)) { InsertPt = I->getInsertionPointAfterDef(); + // Update DILocation only in O0 since it is easy to get out of sync in + // optimizations. See https://github.com/llvm/llvm-project/pull/75104 for + // an example. if (!OptimizeFrame && I->getDebugLoc()) DVI.setDebugLoc(I->getDebugLoc()); } else if (isa(Storage)) @@ -2988,9 +2991,14 @@ void coro::salvageDebugInfo( // dbg.declare does. if (DPV.getType() == DPValue::LocationType::Declare) { std::optional InsertPt; - if (auto *I = dyn_cast(Storage)) + if (auto *I = dyn_cast(Storage)) { InsertPt = I->getInsertionPointAfterDef(); - else if (isa(Storage)) + // Update DILocation only in O0 since it is easy to get out of sync in + // optimizations. See https://github.com/llvm/llvm-project/pull/75104 for + // an example. + if (!OptimizeFrame && I->getDebugLoc()) + DPV.setDebugLoc(I->getDebugLoc()); + } else if (isa(Storage)) InsertPt = F->getEntryBlock().begin(); if (InsertPt) { DPV.removeFromParent(); From cba217a9138aa8ea8d18111b648acde8f52ada8d Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Wed, 3 Jan 2024 17:56:07 +0100 Subject: [PATCH 151/313] Revert "Reapply "[Sema] Fix crash on invalid code with parenthesized aggregate initialization" (#76272)" This reverts commit 02347fc7191ff4d073f439dde6523add3f5496de. These changes break the libc++ CI again. I'll look at a better solution later. --- clang/lib/Sema/SemaInit.cpp | 8 ------ clang/test/SemaCXX/crash-GH76228.cpp | 28 ------------------- clang/test/SemaCXX/paren-list-agg-init.cpp | 2 +- .../transform_error.mandates.verify.cpp | 8 ++++++ .../transform_error.mandates.verify.cpp | 8 ++++++ 5 files changed, 17 insertions(+), 37 deletions(-) delete mode 100644 clang/test/SemaCXX/crash-GH76228.cpp diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index cc9db5ded1149..61d244f3bb979 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -5512,14 +5512,6 @@ static void TryOrBuildParenListInitialization( } else if (auto *RT = Entity.getType()->getAs()) { bool IsUnion = RT->isUnionType(); const CXXRecordDecl *RD = cast(RT->getDecl()); - if (RD->isInvalidDecl()) { - // Exit early to avoid confusion when processing members. - // We do the same for braced list initialization in - // `CheckStructUnionTypes`. 
- Sequence.SetFailed( - clang::InitializationSequence::FK_ParenthesizedListInitFailed); - return; - } if (!IsUnion) { for (const CXXBaseSpecifier &Base : RD->bases()) { diff --git a/clang/test/SemaCXX/crash-GH76228.cpp b/clang/test/SemaCXX/crash-GH76228.cpp deleted file mode 100644 index 33a9395823127..0000000000000 --- a/clang/test/SemaCXX/crash-GH76228.cpp +++ /dev/null @@ -1,28 +0,0 @@ -// RUN: %clang_cc1 -std=c++20 -verify %s -// Check we don't crash on incomplete members and bases when handling parenthesized initialization. -class incomplete; // expected-note@-0 3 {{forward declaration of 'incomplete'}} -struct foo { - int a; - incomplete b; - // expected-error@-1 {{incomplete type}} -}; -foo a1(0); - -struct one_int { - int a; -}; -struct bar : one_int, incomplete {}; -// expected-error@-1 {{incomplete type}} -bar a2(0); - -incomplete a3[3](1,2,3); -// expected-error@-1 {{incomplete type}} - -struct qux : foo { -}; -qux a4(0); - -struct fred { - foo a[3]; -}; -fred a5(0); diff --git a/clang/test/SemaCXX/paren-list-agg-init.cpp b/clang/test/SemaCXX/paren-list-agg-init.cpp index c1964a5a9eb00..f60b20e0d4656 100644 --- a/clang/test/SemaCXX/paren-list-agg-init.cpp +++ b/clang/test/SemaCXX/paren-list-agg-init.cpp @@ -289,7 +289,7 @@ int test() { // used to crash S a(0, 1); S b(0); - S c(0, 0, 1); + S c(0, 0, 1); // beforecxx20-warning {{aggregate initialization of type 'S' from a parenthesized list of values is a C++20 extension}} S d {0, 1}; S e {0}; diff --git a/libcxx/test/libcxx/utilities/expected/expected.expected/transform_error.mandates.verify.cpp b/libcxx/test/libcxx/utilities/expected/expected.expected/transform_error.mandates.verify.cpp index 3260a8cbc7f8e..965e82a7b4034 100644 --- a/libcxx/test/libcxx/utilities/expected/expected.expected/transform_error.mandates.verify.cpp +++ b/libcxx/test/libcxx/utilities/expected/expected.expected/transform_error.mandates.verify.cpp @@ -46,9 +46,11 @@ void test() { { std::expected e; e.transform_error(return_unexpected); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} + // expected-error-re@*:* {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} // expected-error-re@*:* {{static assertion failed {{.*}}[expected.object.general] A program that instantiates the definition of template expected for {{.*}} is ill-formed.}} e.transform_error(return_no_object); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} + // expected-error-re@*:* {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} // expected-error-re@*:* {{static assertion failed {{.*}}[expected.object.general] A program that instantiates the definition of template expected for {{.*}} is ill-formed.}} } @@ -56,21 +58,27 @@ void test() { { const std::expected e; e.transform_error(return_unexpected); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} + // expected-error-re@*:* 2 {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} e.transform_error(return_no_object); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} + // expected-error-re@*:* 2 {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} 
} // Test && overload { std::expected e; std::move(e).transform_error(return_unexpected); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} + // expected-error-re@*:* 2 {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} std::move(e).transform_error(return_no_object); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} + // expected-error-re@*:* 2 {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} } // Test const&& overload { const std::expected e; std::move(e).transform_error(return_unexpected); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} + // expected-error-re@*:* 2 {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} std::move(e).transform_error(return_no_object); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} + // expected-error-re@*:* 2 {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} } } // clang-format on diff --git a/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp b/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp index 21dc247687918..09aa1332e9800 100644 --- a/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp +++ b/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp @@ -46,9 +46,11 @@ void test() { { std::expected e; e.transform_error(return_unexpected); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} + // expected-error-re@*:* {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} // expected-error-re@*:* {{static assertion failed {{.*}}A program that instantiates expected with a E that is not a valid argument for unexpected is ill-formed}} e.transform_error(return_no_object); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} + // expected-error-re@*:* {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} // expected-error-re@*:* {{static assertion failed {{.*}}A program that instantiates expected with a E that is not a valid argument for unexpected is ill-formed}} } @@ -56,21 +58,27 @@ void test() { { const std::expected e; e.transform_error(return_unexpected); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} + // expected-error-re@*:* {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} e.transform_error(return_no_object); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} + // expected-error-re@*:* {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} } // Test && overload { std::expected e; std::move(e).transform_error(return_unexpected); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a 
valid template argument for unexpected}} + // expected-error-re@*:* {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} std::move(e).transform_error(return_no_object); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} + // expected-error-re@*:* {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} } // Test const&& overload { const std::expected e; std::move(e).transform_error(return_unexpected); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} + // expected-error-re@*:* {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} std::move(e).transform_error(return_no_object); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} + // expected-error-re@*:* {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} } } // clang-format on From a8f43974260ec244d78336d2530f8fc097753580 Mon Sep 17 00:00:00 2001 From: Yuxuan Chen Date: Wed, 3 Jan 2024 09:01:19 -0800 Subject: [PATCH 152/313] [Clang] Fix ICE where C++ Template Instantiation failed to handle attributed lambdas (#76523) This PR is proposing a fix for https://github.com/llvm/llvm-project/issues/76521. Clang used to assume that during template instantiation, Lambda expressions can only have `FunctionProtoTypeLoc`s. However, this is not true for certain attributes like `__attribute__((pcs("aapcs-vfp")))`, whose interpretation happens after template instantiation. This PR changes the transformation logic for lambdas. --- clang/docs/ReleaseNotes.rst | 3 + clang/lib/Sema/TreeTransform.h | 71 ++++++++++++++----- clang/test/SemaCXX/template-instantiation.cpp | 15 ++++ 3 files changed, 72 insertions(+), 17 deletions(-) create mode 100644 clang/test/SemaCXX/template-instantiation.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index e00fc5ba79169..a3107c4a69532 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -856,6 +856,9 @@ Bug Fixes to AST Handling - Fixed a bug where RecursiveASTVisitor fails to visit the initializer of a bitfield. `Issue 64916 `_ +- Fixed a bug where Template Instantiation failed to handle Lambda Expressions + with certain types of Attributes. 
+ (`#76521 `_) Miscellaneous Bug Fixes ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 7df5bf0cb7137..5e9b518457070 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -674,6 +674,10 @@ class TreeTransform { Qualifiers ThisTypeQuals, Fn TransformExceptionSpec); + template + QualType TransformAttributedType(TypeLocBuilder &TLB, AttributedTypeLoc TL, + Fn TransformModifiedType); + bool TransformExceptionSpec(SourceLocation Loc, FunctionProtoType::ExceptionSpecInfo &ESI, SmallVectorImpl &Exceptions, @@ -7050,12 +7054,12 @@ TreeTransform::TransformElaboratedType(TypeLocBuilder &TLB, return Result; } -template +template +template QualType TreeTransform::TransformAttributedType( - TypeLocBuilder &TLB, - AttributedTypeLoc TL) { + TypeLocBuilder &TLB, AttributedTypeLoc TL, Fn TransformModifiedTypeFn) { const AttributedType *oldType = TL.getTypePtr(); - QualType modifiedType = getDerived().TransformType(TLB, TL.getModifiedLoc()); + QualType modifiedType = TransformModifiedTypeFn(TLB, TL.getModifiedLoc()); if (modifiedType.isNull()) return QualType(); @@ -7099,6 +7103,15 @@ QualType TreeTransform::TransformAttributedType( return result; } +template +QualType TreeTransform::TransformAttributedType(TypeLocBuilder &TLB, + AttributedTypeLoc TL) { + return getDerived().TransformAttributedType( + TLB, TL, [&](TypeLocBuilder &TLB, TypeLoc ModifiedLoc) -> QualType { + return getDerived().TransformType(TLB, ModifiedLoc); + }); +} + template QualType TreeTransform::TransformBTFTagAttributedType( TypeLocBuilder &TLB, BTFTagAttributedTypeLoc TL) { @@ -13600,32 +13613,56 @@ TreeTransform::TransformLambdaExpr(LambdaExpr *E) { // transformed parameters. TypeSourceInfo *NewCallOpTSI = nullptr; { - TypeSourceInfo *OldCallOpTSI = E->getCallOperator()->getTypeSourceInfo(); - auto OldCallOpFPTL = - OldCallOpTSI->getTypeLoc().getAs(); + auto OldCallOpTypeLoc = + E->getCallOperator()->getTypeSourceInfo()->getTypeLoc(); + + auto TransformFunctionProtoTypeLoc = + [this](TypeLocBuilder &TLB, FunctionProtoTypeLoc FPTL) -> QualType { + SmallVector ExceptionStorage; + TreeTransform *This = this; // Work around gcc.gnu.org/PR56135. + return this->TransformFunctionProtoType( + TLB, FPTL, nullptr, Qualifiers(), + [&](FunctionProtoType::ExceptionSpecInfo &ESI, bool &Changed) { + return This->TransformExceptionSpec(FPTL.getBeginLoc(), ESI, + ExceptionStorage, Changed); + }); + }; + QualType NewCallOpType; TypeLocBuilder NewCallOpTLBuilder; - SmallVector ExceptionStorage; - TreeTransform *This = this; // Work around gcc.gnu.org/PR56135. 
- QualType NewCallOpType = TransformFunctionProtoType( - NewCallOpTLBuilder, OldCallOpFPTL, nullptr, Qualifiers(), - [&](FunctionProtoType::ExceptionSpecInfo &ESI, bool &Changed) { - return This->TransformExceptionSpec(OldCallOpFPTL.getBeginLoc(), ESI, - ExceptionStorage, Changed); - }); + + if (auto ATL = OldCallOpTypeLoc.getAs()) { + NewCallOpType = this->TransformAttributedType( + NewCallOpTLBuilder, ATL, + [&](TypeLocBuilder &TLB, TypeLoc TL) -> QualType { + return TransformFunctionProtoTypeLoc( + TLB, TL.castAs()); + }); + } else { + auto FPTL = OldCallOpTypeLoc.castAs(); + NewCallOpType = TransformFunctionProtoTypeLoc(NewCallOpTLBuilder, FPTL); + } + if (NewCallOpType.isNull()) return ExprError(); NewCallOpTSI = NewCallOpTLBuilder.getTypeSourceInfo(getSema().Context, NewCallOpType); } + ArrayRef Params; + if (auto ATL = NewCallOpTSI->getTypeLoc().getAs()) { + Params = ATL.getModifiedLoc().castAs().getParams(); + } else { + auto FPTL = NewCallOpTSI->getTypeLoc().castAs(); + Params = FPTL.getParams(); + } + getSema().CompleteLambdaCallOperator( NewCallOperator, E->getCallOperator()->getLocation(), E->getCallOperator()->getInnerLocStart(), E->getCallOperator()->getTrailingRequiresClause(), NewCallOpTSI, E->getCallOperator()->getConstexprKind(), - E->getCallOperator()->getStorageClass(), - NewCallOpTSI->getTypeLoc().castAs().getParams(), + E->getCallOperator()->getStorageClass(), Params, E->hasExplicitResultType()); getDerived().transformAttrs(E->getCallOperator(), NewCallOperator); diff --git a/clang/test/SemaCXX/template-instantiation.cpp b/clang/test/SemaCXX/template-instantiation.cpp new file mode 100644 index 0000000000000..e714e070a206f --- /dev/null +++ b/clang/test/SemaCXX/template-instantiation.cpp @@ -0,0 +1,15 @@ +// RUN: %clang_cc1 -verify -fsyntax-only -Wno-ignored-attributes %s +// expected-no-diagnostics + +namespace GH76521 { + +template +void foo() { + auto l = []() __attribute__((pcs("aapcs-vfp"))) {}; +} + +void bar() { + foo(); +} + +} From ece1359857c547a156ed743643bccbfd0f09bf2a Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Wed, 3 Jan 2024 16:54:45 +0000 Subject: [PATCH 153/313] Revert "[PowerPC] Add test after #75271 on PPC. NFC. (#75616)" This reverts commit 5cfc7b3342ce4de0bbe182b38baa8a71fc83f8f8. This depends on 0e46b49de43349f8cbb2a7d4c6badef6d16e31ae which is being reverted. 
--- ...-remat-with-undef-implicit-def-operand.mir | 28 ------------------- 1 file changed, 28 deletions(-) delete mode 100644 llvm/test/CodeGen/PowerPC/coalescer-remat-with-undef-implicit-def-operand.mir diff --git a/llvm/test/CodeGen/PowerPC/coalescer-remat-with-undef-implicit-def-operand.mir b/llvm/test/CodeGen/PowerPC/coalescer-remat-with-undef-implicit-def-operand.mir deleted file mode 100644 index 8e4e3be55600f..0000000000000 --- a/llvm/test/CodeGen/PowerPC/coalescer-remat-with-undef-implicit-def-operand.mir +++ /dev/null @@ -1,28 +0,0 @@ -# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-coalescing -run-pass=register-coalescer \ -# RUN: -o - %s | FileCheck %s ---- -name: _Z13testTransposeIfLj31ELj17EEvv -alignment: 16 -tracksRegLiveness: true -frameInfo: - maxAlignment: 128 -machineFunctionInfo: {} -body: | - ; CHECK-LABEL: name: _Z13testTransposeIfLj31ELj17EEvv - ; CHECK: undef %[[REG:[0-9]+]].sub_64:vsrc = IMPLICIT_DEF implicit-def %[[REG]] - bb.0: - liveins: $x2 - %2:vssrc = IMPLICIT_DEF - B %bb.2 - - bb.1: - %0:vsrc = SUBREG_TO_REG 1, killed %2, %subreg.sub_64 - %1:vsrc = XXPERMDI killed undef %0, killed %0, 0 - BLR8 implicit $lr8, implicit $rm - - bb.2: - successors: %bb.2(0x7c000000), %bb.1(0x04000000) - BDNZ8 %bb.2, implicit-def $ctr8, implicit $ctr8 - B %bb.1 - -... From c4146121e940b6b853148c780568dee38b97382f Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Wed, 3 Jan 2024 16:50:32 +0000 Subject: [PATCH 154/313] Revert "Reapply "RegisterCoalescer: Add implicit-def of super register when coalescing SUBREG_TO_REG"" This reverts commit 0e46b49de43349f8cbb2a7d4c6badef6d16e31ae. Causes crashes, see repro on https://github.com/llvm/llvm-project/commit/0e46b49de43349f8cbb2a7d4c6badef6d16e31ae. --- llvm/lib/CodeGen/RegisterCoalescer.cpp | 51 +-- .../AArch64/GlobalISel/arm64-pcsections.ll | 92 ++--- ...coalescer-breaks-subreg-to-reg-liveness.ll | 185 ---------- ...icit-def-regression-imp-operand-assert.mir | 4 +- ...subreg-to-reg-requires-subrange-update.mir | 47 --- .../CodeGen/X86/subreg-to-reg-coalescing.mir | 348 ------------------ 6 files changed, 57 insertions(+), 670 deletions(-) delete mode 100644 llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness.ll delete mode 100644 llvm/test/CodeGen/X86/coalescing-subreg-to-reg-requires-subrange-update.mir delete mode 100644 llvm/test/CodeGen/X86/subreg-to-reg-coalescing.mir diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index 3fbb93795075d..cbb1a74049fbd 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -305,11 +305,7 @@ namespace { /// number if it is not zero. If DstReg is a physical register and the /// existing subregister number of the def / use being updated is not zero, /// make sure to set it to the correct physical subregister. - /// - /// If \p IsSubregToReg, we are coalescing a DstReg = SUBREG_TO_REG - /// SrcReg. This introduces an implicit-def of DstReg on coalesced users. - void updateRegDefsUses(Register SrcReg, Register DstReg, unsigned SubIdx, - bool IsSubregToReg); + void updateRegDefsUses(Register SrcReg, Register DstReg, unsigned SubIdx); /// If the given machine operand reads only undefined lanes add an undef /// flag. 
@@ -1347,7 +1343,8 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, if (DstReg.isPhysical()) { Register NewDstReg = DstReg; - unsigned NewDstIdx = TRI->composeSubRegIndices(CP.getSrcIdx(), DefSubIdx); + unsigned NewDstIdx = TRI->composeSubRegIndices(CP.getSrcIdx(), + DefMI->getOperand(0).getSubReg()); if (NewDstIdx) NewDstReg = TRI->getSubReg(DstReg, NewDstIdx); @@ -1496,7 +1493,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, MRI->setRegClass(DstReg, NewRC); // Update machine operands and add flags. - updateRegDefsUses(DstReg, DstReg, DstIdx, false); + updateRegDefsUses(DstReg, DstReg, DstIdx); NewMI.getOperand(0).setSubReg(NewIdx); // updateRegDefUses can add an "undef" flag to the definition, since // it will replace DstReg with DstReg.DstIdx. If NewIdx is 0, make @@ -1816,7 +1813,7 @@ void RegisterCoalescer::addUndefFlag(const LiveInterval &Int, SlotIndex UseIdx, } void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg, - unsigned SubIdx, bool IsSubregToReg) { + unsigned SubIdx) { bool DstIsPhys = DstReg.isPhysical(); LiveInterval *DstInt = DstIsPhys ? nullptr : &LIS->getInterval(DstReg); @@ -1856,8 +1853,6 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg, if (DstInt && !Reads && SubIdx && !UseMI->isDebugInstr()) Reads = DstInt->liveAt(LIS->getInstructionIndex(*UseMI)); - bool FullDef = true; - // Replace SrcReg with DstReg in all UseMI operands. for (unsigned i = 0, e = Ops.size(); i != e; ++i) { MachineOperand &MO = UseMI->getOperand(Ops[i]); @@ -1865,13 +1860,9 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg, // Adjust flags in case of sub-register joins. We don't want to // turn a full def into a read-modify-write sub-register def and vice // versa. - if (SubIdx && MO.isDef()) { + if (SubIdx && MO.isDef()) MO.setIsUndef(!Reads); - if (!Reads) - FullDef = false; - } - // A subreg use of a partially undef (super) register may be a complete // undef use now and then has to be marked that way. if (MO.isUse() && !DstIsPhys) { @@ -1903,25 +1894,6 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg, MO.substVirtReg(DstReg, SubIdx, *TRI); } - if (IsSubregToReg && !FullDef) { - // If the coalesed instruction doesn't fully define the register, we need - // to preserve the original super register liveness for SUBREG_TO_REG. - // - // We pretended SUBREG_TO_REG was a regular copy for coalescing purposes, - // but it introduces liveness for other subregisters. Downstream users may - // have been relying on those bits, so we need to ensure their liveness is - // captured with a def of other lanes. - - // FIXME: Need to add new subrange if tracking subranges. We could also - // skip adding this if we knew the other lanes are dead, and only for - // other lanes. - - assert(!MRI->shouldTrackSubRegLiveness(DstReg) && - "this should update subranges"); - MachineInstrBuilder MIB(*MF, UseMI); - MIB.addReg(DstReg, RegState::ImplicitDefine); - } - LLVM_DEBUG({ dbgs() << "\t\tupdated: "; if (!UseMI->isDebugInstr()) @@ -2121,8 +2093,6 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) { }); } - const bool IsSubregToReg = CopyMI->isSubregToReg(); - ShrinkMask = LaneBitmask::getNone(); ShrinkMainRange = false; @@ -2190,12 +2160,9 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) { // Rewrite all SrcReg operands to DstReg. // Also update DstReg operands to include DstIdx if it is set. 
- if (CP.getDstIdx()) { - assert(!IsSubregToReg && "can this happen?"); - updateRegDefsUses(CP.getDstReg(), CP.getDstReg(), CP.getDstIdx(), false); - } - updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx(), - IsSubregToReg); + if (CP.getDstIdx()) + updateRegDefsUses(CP.getDstReg(), CP.getDstReg(), CP.getDstIdx()); + updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx()); // Shrink subregister ranges if necessary. if (ShrinkMask.any()) { diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll index 8529dd388ba0f..4c07081404c88 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll @@ -13,7 +13,7 @@ define i32 @val_compare_and_swap(ptr %p, i32 %cmp, i32 %new) { ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK-NEXT: liveins: $w1, $w2, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s32) from %ir.p) + ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s32) from %ir.p) ; CHECK-NEXT: $wzr = SUBSWrs renamable $w8, renamable $w1, 0, implicit-def $nzcv, pcsections !0 ; CHECK-NEXT: Bcc 1, %bb.3, implicit killed $nzcv, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -47,13 +47,13 @@ define i32 @val_compare_and_swap_from_load(ptr %p, i32 %cmp, ptr %pnew) { ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: liveins: $w1, $x0, $x2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w9 = LDRWui killed renamable $x2, 0, implicit-def renamable $x9, pcsections !0 :: (load (s32) from %ir.pnew) + ; CHECK-NEXT: renamable $w9 = LDRWui killed renamable $x2, 0, implicit-def $x9, pcsections !0 :: (load (s32) from %ir.pnew) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.cmpxchg.start: ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0, $x9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s32) from %ir.p) + ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s32) from %ir.p) ; CHECK-NEXT: $wzr = SUBSWrs renamable $w8, renamable $w1, 0, implicit-def $nzcv, pcsections !0 ; CHECK-NEXT: Bcc 1, %bb.3, implicit killed $nzcv, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -93,7 +93,7 @@ define i32 @val_compare_and_swap_rel(ptr %p, i32 %cmp, i32 %new) { ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK-NEXT: liveins: $w1, $w2, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s32) from %ir.p) + ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s32) from %ir.p) ; CHECK-NEXT: $wzr = SUBSWrs renamable $w8, renamable $w1, 0, implicit-def $nzcv, pcsections !0 ; CHECK-NEXT: Bcc 1, %bb.3, implicit killed $nzcv, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -249,7 +249,7 @@ define i32 @fetch_and_nand(ptr %p) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRW renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s32) from %ir.p) + ; CHECK-NEXT: renamable $w8 = LDXRW renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s32) from %ir.p) ; CHECK-NEXT: 
renamable $w9 = ANDWri renamable $w8, 2, pcsections !0 ; CHECK-NEXT: $w9 = ORNWrs $wzr, killed renamable $w9, 0, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STLXRW killed renamable $w9, renamable $x0, pcsections !0 :: (volatile store (s32) into %ir.p) @@ -302,7 +302,7 @@ define i32 @fetch_and_or(ptr %p) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w9, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s32) from %ir.p) + ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s32) from %ir.p) ; CHECK-NEXT: $w10 = ORRWrs renamable $w8, renamable $w9, 0, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w11 = STLXRW killed renamable $w10, renamable $x0, pcsections !0 :: (volatile store (s32) into %ir.p) ; CHECK-NEXT: CBNZW killed renamable $w11, %bb.1, pcsections !0 @@ -735,8 +735,8 @@ define i8 @atomicrmw_add_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) - ; CHECK-NEXT: $w9 = ADDWrs renamable $w8, renamable $w1, 0, implicit-def renamable $x9, pcsections !0 + ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: $w9 = ADDWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STLXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -761,7 +761,7 @@ define i8 @atomicrmw_xchg_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $x0, $x1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) ; CHECK-NEXT: early-clobber renamable $w9 = STXRB renamable $w1, renamable $x0, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w9, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -785,8 +785,8 @@ define i8 @atomicrmw_sub_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) - ; CHECK-NEXT: $w9 = SUBWrs renamable $w8, renamable $w1, 0, implicit-def renamable $x9, pcsections !0 + ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: $w9 = SUBWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -810,8 +810,8 @@ define i8 @atomicrmw_and_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; 
CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) - ; CHECK-NEXT: $w9 = ANDWrs renamable $w8, renamable $w1, 0, implicit-def renamable $x9, pcsections !0 + ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: $w9 = ANDWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STLXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -835,8 +835,8 @@ define i8 @atomicrmw_or_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) - ; CHECK-NEXT: $w9 = ORRWrs renamable $w8, renamable $w1, 0, implicit-def renamable $x9, pcsections !0 + ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: $w9 = ORRWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STLXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -860,8 +860,8 @@ define i8 @atomicrmw_xor_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) - ; CHECK-NEXT: $w9 = EORWrs renamable $w8, renamable $w1, 0, implicit-def renamable $x9, pcsections !0 + ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: $w9 = EORWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -885,10 +885,10 @@ define i8 @atomicrmw_min_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) ; CHECK-NEXT: renamable $w9 = SBFMWri renamable $w8, 0, 7, pcsections !0 ; CHECK-NEXT: dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 32, implicit-def $nzcv, pcsections !0 - ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 13, implicit killed $nzcv, implicit-def renamable $x9, pcsections !0 + ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 13, implicit killed $nzcv, implicit-def $x9, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, 
%bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -912,10 +912,10 @@ define i8 @atomicrmw_max_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) ; CHECK-NEXT: renamable $w9 = SBFMWri renamable $w8, 0, 7, pcsections !0 ; CHECK-NEXT: dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 32, implicit-def $nzcv, pcsections !0 - ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 12, implicit killed $nzcv, implicit-def renamable $x9, pcsections !0 + ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 12, implicit killed $nzcv, implicit-def $x9, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STLXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -940,10 +940,10 @@ define i8 @atomicrmw_umin_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w9, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) ; CHECK-NEXT: renamable $w10 = ANDWri renamable $w8, 7 ; CHECK-NEXT: $wzr = SUBSWrs renamable $w10, renamable $w9, 0, implicit-def $nzcv, pcsections !0 - ; CHECK-NEXT: renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 9, implicit killed $nzcv, implicit-def renamable $x10, pcsections !0 + ; CHECK-NEXT: renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 9, implicit killed $nzcv, implicit-def $x10, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w11 = STLXRB renamable $w10, renamable $x0, implicit killed $x10, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w11, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -968,10 +968,10 @@ define i8 @atomicrmw_umax_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w9, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) ; CHECK-NEXT: renamable $w10 = ANDWri renamable $w8, 7 ; CHECK-NEXT: $wzr = SUBSWrs renamable $w10, renamable $w9, 0, implicit-def $nzcv, pcsections !0 - ; CHECK-NEXT: renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 8, implicit killed $nzcv, implicit-def renamable $x10, pcsections !0 + ; CHECK-NEXT: renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 8, implicit killed $nzcv, implicit-def $x10, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w11 = STXRB renamable $w10, renamable $x0, implicit killed $x10, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w11, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -995,8 +995,8 @@ define i16 @atomicrmw_add_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; 
CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) - ; CHECK-NEXT: $w9 = ADDWrs renamable $w8, renamable $w1, 0, implicit-def renamable $x9, pcsections !0 + ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: $w9 = ADDWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STLXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -1021,7 +1021,7 @@ define i16 @atomicrmw_xchg_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $x0, $x1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) ; CHECK-NEXT: early-clobber renamable $w9 = STXRH renamable $w1, renamable $x0, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w9, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -1045,8 +1045,8 @@ define i16 @atomicrmw_sub_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) - ; CHECK-NEXT: $w9 = SUBWrs renamable $w8, renamable $w1, 0, implicit-def renamable $x9, pcsections !0 + ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: $w9 = SUBWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -1070,8 +1070,8 @@ define i16 @atomicrmw_and_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) - ; CHECK-NEXT: $w9 = ANDWrs renamable $w8, renamable $w1, 0, implicit-def renamable $x9, pcsections !0 + ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: $w9 = ANDWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STLXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -1095,8 +1095,8 @@ define i16 @atomicrmw_or_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) - ; CHECK-NEXT: $w9 = ORRWrs renamable $w8, 
renamable $w1, 0, implicit-def renamable $x9, pcsections !0 + ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: $w9 = ORRWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STLXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -1120,8 +1120,8 @@ define i16 @atomicrmw_xor_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) - ; CHECK-NEXT: $w9 = EORWrs renamable $w8, renamable $w1, 0, implicit-def renamable $x9, pcsections !0 + ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: $w9 = EORWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -1145,10 +1145,10 @@ define i16 @atomicrmw_min_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) ; CHECK-NEXT: renamable $w9 = SBFMWri renamable $w8, 0, 15, pcsections !0 ; CHECK-NEXT: dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 40, implicit-def $nzcv, pcsections !0 - ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 13, implicit killed $nzcv, implicit-def renamable $x9, pcsections !0 + ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 13, implicit killed $nzcv, implicit-def $x9, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -1172,10 +1172,10 @@ define i16 @atomicrmw_max_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) ; CHECK-NEXT: renamable $w9 = SBFMWri renamable $w8, 0, 15, pcsections !0 ; CHECK-NEXT: dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 40, implicit-def $nzcv, pcsections !0 - ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 12, implicit killed $nzcv, implicit-def renamable $x9, pcsections !0 + ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 12, implicit killed $nzcv, implicit-def $x9, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STLXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections 
!0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -1200,10 +1200,10 @@ define i16 @atomicrmw_umin_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w9, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) ; CHECK-NEXT: renamable $w10 = ANDWri renamable $w8, 15 ; CHECK-NEXT: $wzr = SUBSWrs renamable $w10, renamable $w9, 0, implicit-def $nzcv, pcsections !0 - ; CHECK-NEXT: renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 9, implicit killed $nzcv, implicit-def renamable $x10, pcsections !0 + ; CHECK-NEXT: renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 9, implicit killed $nzcv, implicit-def $x10, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w11 = STLXRH renamable $w10, renamable $x0, implicit killed $x10, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w11, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -1228,10 +1228,10 @@ define i16 @atomicrmw_umax_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w9, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) ; CHECK-NEXT: renamable $w10 = ANDWri renamable $w8, 15 ; CHECK-NEXT: $wzr = SUBSWrs renamable $w10, renamable $w9, 0, implicit-def $nzcv, pcsections !0 - ; CHECK-NEXT: renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 8, implicit killed $nzcv, implicit-def renamable $x10, pcsections !0 + ; CHECK-NEXT: renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 8, implicit killed $nzcv, implicit-def $x10, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w11 = STXRH renamable $w10, renamable $x0, implicit killed $x10, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w11, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -1257,7 +1257,7 @@ define { i8, i1 } @cmpxchg_i8(ptr %ptr, i8 %desired, i8 %new) { ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.4(0x04000000) ; CHECK-NEXT: liveins: $w1, $x2, $x8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w0 = LDXRB renamable $x8, implicit-def renamable $x0, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: renamable $w0 = LDXRB renamable $x8, implicit-def $x0, pcsections !0 :: (volatile load (s8) from %ir.ptr) ; CHECK-NEXT: renamable $w9 = ANDWri renamable $w0, 7, pcsections !0 ; CHECK-NEXT: dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 0, implicit-def $nzcv, pcsections !0 ; CHECK-NEXT: Bcc 1, %bb.4, implicit killed $nzcv, pcsections !0 @@ -1300,7 +1300,7 @@ define { i16, i1 } @cmpxchg_i16(ptr %ptr, i16 %desired, i16 %new) { ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.4(0x04000000) ; CHECK-NEXT: liveins: $w1, $x2, $x8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w0 = LDXRH renamable $x8, implicit-def renamable $x0, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: renamable $w0 = LDXRH renamable $x8, implicit-def $x0, pcsections !0 :: (volatile load (s16) from 
%ir.ptr) ; CHECK-NEXT: renamable $w9 = ANDWri renamable $w0, 15, pcsections !0 ; CHECK-NEXT: dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 8, implicit-def $nzcv, pcsections !0 ; CHECK-NEXT: Bcc 1, %bb.4, implicit killed $nzcv, pcsections !0 diff --git a/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness.ll b/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness.ll deleted file mode 100644 index a3c3fc70e9761..0000000000000 --- a/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness.ll +++ /dev/null @@ -1,185 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=x86_64-grtev4-linux-gnu < %s | FileCheck %s - -%struct.wibble = type { %struct.wombat } -%struct.wombat = type { %struct.ham, [3 x i8] } -%struct.ham = type { %struct.zot } -%struct.zot = type { %struct.blam } -%struct.blam = type { %struct.ham.0 } -%struct.ham.0 = type { %struct.bar } -%struct.bar = type { %struct.bar.1 } -%struct.bar.1 = type { %struct.baz, i8 } -%struct.baz = type { %struct.snork } -%struct.snork = type <{ %struct.spam, i8, [3 x i8] }> -%struct.spam = type { %struct.snork.2, %struct.snork.2 } -%struct.snork.2 = type { i32 } -%struct.snork.3 = type { %struct.baz, i8, [3 x i8] } - -define void @foo(ptr %arg, ptr %arg1, i40 %arg2, ptr %arg3, i32 %arg4) #0 { -; CHECK-LABEL: foo: -; CHECK: # %bb.0: # %bb -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movq %rsp, %rbp -; CHECK-NEXT: .cfi_def_cfa_register %rbp -; CHECK-NEXT: pushq %r15 -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: pushq %r13 -; CHECK-NEXT: pushq %r12 -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $24, %rsp -; CHECK-NEXT: .cfi_offset %rbx, -56 -; CHECK-NEXT: .cfi_offset %r12, -48 -; CHECK-NEXT: .cfi_offset %r13, -40 -; CHECK-NEXT: .cfi_offset %r14, -32 -; CHECK-NEXT: .cfi_offset %r15, -24 -; CHECK-NEXT: movl %r8d, %r14d -; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq %rsi, %r13 -; CHECK-NEXT: movq %rdi, %r15 -; CHECK-NEXT: incl %r14d -; CHECK-NEXT: xorl %ebx, %ebx -; CHECK-NEXT: # implicit-def: $r12 -; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: jmp .LBB0_3 -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB0_1: # %bb17 -; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: movq %r15, %r13 -; CHECK-NEXT: xorl %r15d, %r15d -; CHECK-NEXT: testq %rbx, %rbx -; CHECK-NEXT: sete %r15b -; CHECK-NEXT: xorl %edi, %edi -; CHECK-NEXT: callq _Znwm@PLT -; CHECK-NEXT: shll $4, %r15d -; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; CHECK-NEXT: movq %r12, %rcx -; CHECK-NEXT: shrq $32, %rcx -; CHECK-NEXT: movb %cl, 12(%rax) -; CHECK-NEXT: movl %r12d, 8(%rax) -; CHECK-NEXT: movq %r15, %rbx -; CHECK-NEXT: movq %r13, %r15 -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; CHECK-NEXT: decl %r14d -; CHECK-NEXT: je .LBB0_8 -; CHECK-NEXT: .LBB0_3: # %bb7 -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: callq widget@PLT -; CHECK-NEXT: cmpb $-5, (%r13) -; CHECK-NEXT: jae .LBB0_5 -; CHECK-NEXT: # %bb.4: # in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: movl %r12d, %r12d -; CHECK-NEXT: cmpq %r15, %rbx -; CHECK-NEXT: jbe .LBB0_1 -; CHECK-NEXT: jmp .LBB0_7 -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB0_5: # %bb12 -; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: movq 0, %rax -; CHECK-NEXT: 
movq 8, %rax -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; CHECK-NEXT: cmpq %r15, %rbx -; CHECK-NEXT: jbe .LBB0_1 -; CHECK-NEXT: .LBB0_7: # in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: xorl %ebx, %ebx -; CHECK-NEXT: decl %r14d -; CHECK-NEXT: jne .LBB0_3 -; CHECK-NEXT: .LBB0_8: # %bb21 -; CHECK-NEXT: cmpb $0, 12(%rax) -; CHECK-NEXT: jne .LBB0_10 -; CHECK-NEXT: # %bb.9: # %bb26 -; CHECK-NEXT: addq $24, %rsp -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: popq %r12 -; CHECK-NEXT: popq %r13 -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 -; CHECK-NEXT: popq %rbp -; CHECK-NEXT: .cfi_def_cfa %rsp, 8 -; CHECK-NEXT: retq -; CHECK-NEXT: .LBB0_10: # %bb25 -; CHECK-NEXT: .cfi_def_cfa %rbp, 16 -; CHECK-NEXT: movq %r15, %rdi -; CHECK-NEXT: callq pluto@PLT -bb: - br label %bb7 - -bb5: ; preds = %bb17, %bb14 - %phi = phi ptr [ %call19, %bb17 ], [ null, %bb14 ] - %phi6 = phi ptr [ %getelementptr, %bb17 ], [ null, %bb14 ] - %add = add i32 %phi9, 1 - %icmp = icmp eq i32 %phi9, %arg4 - br i1 %icmp, label %bb21, label %bb7 - -bb7: ; preds = %bb5, %bb - %phi8 = phi ptr [ null, %bb ], [ %phi6, %bb5 ] - %phi9 = phi i32 [ 0, %bb ], [ %add, %bb5 ] - %phi10 = phi i40 [ undef, %bb ], [ %phi15, %bb5 ] - %call = call ptr @widget() - %load = load i8, ptr %arg1, align 8 - %icmp11 = icmp ult i8 %load, -5 - %and = and i40 %phi10, 4294967295 - br i1 %icmp11, label %bb14, label %bb12 - -bb12: ; preds = %bb7 - %load13 = load volatile { i64, i64 }, ptr null, align 4294967296 - br label %bb14 - -bb14: ; preds = %bb12, %bb7 - %phi15 = phi i40 [ %and, %bb7 ], [ %arg2, %bb12 ] - %icmp16 = icmp ugt ptr %phi8, %arg - br i1 %icmp16, label %bb5, label %bb17 - -bb17: ; preds = %bb14 - %icmp18 = icmp eq ptr %phi8, null - %zext = zext i1 %icmp18 to i64 - %call19 = call ptr @_Znwm(i64 0) - %getelementptr = getelementptr %struct.wibble, ptr %arg3, i64 %zext - %getelementptr20 = getelementptr i8, ptr %call19, i64 8 - store i40 %phi15, ptr %getelementptr20, align 4 - br label %bb5 - -bb21: ; preds = %bb5 - %getelementptr22 = getelementptr %struct.snork.3, ptr %phi, i64 0, i32 1 - %load23 = load i8, ptr %getelementptr22, align 4 - %icmp24 = icmp eq i8 %load23, 0 - br i1 %icmp24, label %bb26, label %bb25 - -bb25: ; preds = %bb21 - call void @pluto(ptr %arg) - unreachable - -bb26: ; preds = %bb21 - ret void -} - -define void @eggs(ptr %arg, ptr %arg1) { -; CHECK-LABEL: eggs: -; CHECK: # %bb.0: # %bb -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: movq %rsi, %rdi -; CHECK-NEXT: movq %rax, %rsi -; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: xorl %r8d, %r8d -; CHECK-NEXT: callq foo@PLT -; CHECK-NEXT: popq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 8 -; CHECK-NEXT: retq -bb: - call void @foo(ptr %arg1, ptr %arg, i40 0, ptr null, i32 0) - ret void -} - -declare ptr @widget() - -declare void @pluto(ptr) - -declare ptr @_Znwm(i64) - -attributes #0 = { noinline "frame-pointer"="all" } diff --git a/llvm/test/CodeGen/X86/coalescer-implicit-def-regression-imp-operand-assert.mir b/llvm/test/CodeGen/X86/coalescer-implicit-def-regression-imp-operand-assert.mir index 190b14052d9b6..8241a1757af52 100644 --- a/llvm/test/CodeGen/X86/coalescer-implicit-def-regression-imp-operand-assert.mir +++ b/llvm/test/CodeGen/X86/coalescer-implicit-def-regression-imp-operand-assert.mir @@ -9,7 +9,7 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x2aaaaaab), %bb.2(0x55555555) ; CHECK-NEXT: liveins: $edi ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: 
undef [[MOV32r0_:%[0-9]+]].sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def [[MOV32r0_]] + ; CHECK-NEXT: undef [[MOV32r0_:%[0-9]+]].sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags ; CHECK-NEXT: JCC_1 %bb.2, 5, implicit killed undef $eflags ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: @@ -28,7 +28,7 @@ body: | ; CHECK-NEXT: JCC_1 %bb.5, 5, implicit killed undef $eflags ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: - ; CHECK-NEXT: dead $eax = MOV32r0 implicit-def dead $eflags, implicit-def $al, implicit-def $al + ; CHECK-NEXT: dead $eax = MOV32r0 implicit-def dead $eflags, implicit-def $al ; CHECK-NEXT: RET 0, killed undef $al ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: diff --git a/llvm/test/CodeGen/X86/coalescing-subreg-to-reg-requires-subrange-update.mir b/llvm/test/CodeGen/X86/coalescing-subreg-to-reg-requires-subrange-update.mir deleted file mode 100644 index fe53aef86e835..0000000000000 --- a/llvm/test/CodeGen/X86/coalescing-subreg-to-reg-requires-subrange-update.mir +++ /dev/null @@ -1,47 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 -# RUN: llc -mtriple=x86_64-- -run-pass=register-coalescer -enable-subreg-liveness -verify-coalescing -o - %s | FileCheck %s - - -# FIXME: Need to handle subrange updates when coalescing with subreg_to_reg -# This will fail if x86 enables subregister liveness. ---- -name: requires_new_subrange_coalesce_subreg_to_reg -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: requires_new_subrange_coalesce_subreg_to_reg - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; CHECK-NEXT: liveins: $eax - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %a.sub_32bit:gr64_with_sub_8bit = COPY $eax - ; CHECK-NEXT: %b:gr32 = IMPLICIT_DEF - ; CHECK-NEXT: %c:gr64 = INSERT_SUBREG %a, %b, %subreg.sub_32bit - ; CHECK-NEXT: JCC_1 %bb.2, 4, implicit undef $eflags - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: successors: %bb.2(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %a.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags - ; CHECK-NEXT: %c.sub_32bit:gr64 = COPY %a - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: %c.sub_32bit:gr64 = SUBREG_TO_REG %a, %b, %subreg.sub_32bit - ; CHECK-NEXT: RET 0, implicit %c - bb.0: - liveins: $eax - %init_eax:gr32 = COPY $eax - %a:gr64 = SUBREG_TO_REG 0, %init_eax, %subreg.sub_32bit - %b:gr32 = IMPLICIT_DEF - %c:gr64 = INSERT_SUBREG %a, %b, %subreg.sub_32bit - JCC_1 %bb.2, 4, implicit undef $eflags - - bb.1: - %imm0:gr32 = MOV32r0 implicit-def dead $eflags - %a = SUBREG_TO_REG 0, %imm0, %subreg.sub_32bit - %c.sub_32bit = COPY %a - - bb.2: - %c.sub_32bit = SUBREG_TO_REG %a, %b, %subreg.sub_32bit - RET 0, implicit %c - -... diff --git a/llvm/test/CodeGen/X86/subreg-to-reg-coalescing.mir b/llvm/test/CodeGen/X86/subreg-to-reg-coalescing.mir deleted file mode 100644 index 6121a0bcc5641..0000000000000 --- a/llvm/test/CodeGen/X86/subreg-to-reg-coalescing.mir +++ /dev/null @@ -1,348 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 -# RUN: llc -mtriple=x86_64-- -run-pass=register-coalescer -o - %s | FileCheck %s - -# We cannot lose the liveness of the high subregister of %1 when -# coalesced with %0, so introduce an implicit-def of the super -# register on the MOV. 
- ---- -name: coalesce_mov32r0_into_subreg_to_reg64 -tracksRegLiveness: true -body: | - bb.0: - ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_to_reg64 - ; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-NEXT: undef %1.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def %1 - ; CHECK-NEXT: dead $edi = MOV32r0 implicit-def dead $eflags, implicit-def $rdi - ; CHECK-NEXT: CALL64r %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax - ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-NEXT: RET 0 - ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - %0:gr32 = MOV32r0 implicit-def dead $eflags - %1:gr64 = SUBREG_TO_REG 0, killed %0, %subreg.sub_32bit - $rdi = COPY %1 - CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax - ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - RET 0 - -... - ---- -name: subreg_to_reg_folds_to_undef -tracksRegLiveness: true -body: | - bb.0: - liveins: $rax - - ; CHECK-LABEL: name: subreg_to_reg_folds_to_undef - ; CHECK: liveins: $rax - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64_with_sub_8bit = COPY $rax - ; CHECK-NEXT: undef %4.sub_32bit:gr64_with_sub_8bit = MOV32rr [[COPY]].sub_32bit, implicit-def %4 - ; CHECK-NEXT: RET 0, implicit %4 - %0:gr64 = COPY killed $rax - %1:gr32 = COPY killed %0.sub_32bit - %2:gr32 = MOV32rr killed %1 - %3:gr64 = SUBREG_TO_REG 0, killed %2, %subreg.sub_32bit - %4:gr64 = COPY killed %3 - RET 0, implicit %4 - -... - ---- -name: coalesce_mov32r0_subreg_def_into_subreg_to_reg64 -tracksRegLiveness: true -body: | - bb.0: - ; CHECK-LABEL: name: coalesce_mov32r0_subreg_def_into_subreg_to_reg64 - ; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-NEXT: undef %1.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags - ; CHECK-NEXT: dead $edi = MOV32r0 implicit-def dead $eflags, implicit-def $rdi - ; CHECK-NEXT: CALL64r %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax - ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-NEXT: RET 0 - ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - undef %0.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags - %1:gr64 = SUBREG_TO_REG 0, killed %0.sub_32bit, %subreg.sub_32bit - $rdi = COPY %1 - CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax - ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - RET 0 - -... 
- ---- -name: coalesce_mov32r0_into_subreg_def_with_super_def_to_reg64 -tracksRegLiveness: true -body: | - bb.0: - ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_def_with_super_def_to_reg64 - ; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-NEXT: undef %1.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def %1 - ; CHECK-NEXT: dead $edi = MOV32r0 implicit-def dead $eflags, implicit-def $rdi - ; CHECK-NEXT: CALL64r %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax - ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-NEXT: RET 0 - ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - undef %0.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def %0 - %1:gr64 = SUBREG_TO_REG 0, killed %0.sub_32bit, %subreg.sub_32bit - $rdi = COPY %1 - CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax - ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - RET 0 - -... - ---- -name: coalesce_mov32r0_into_subreg_to_reg64_already_defs_other_subreg -tracksRegLiveness: true -body: | - bb.0: - ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_to_reg64_already_defs_other_subreg - ; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-NEXT: undef %1.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def undef %1.sub_8bit, implicit-def %1 - ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, implicit %1 - ; CHECK-NEXT: CALL64r %1, csr_64, implicit $rsp, implicit $ssp, implicit undef $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax - ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-NEXT: RET 0 - ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - %0:gr32 = MOV32r0 implicit-def dead $eflags, implicit-def undef %0.sub_8bit - %1:gr64 = SUBREG_TO_REG 0, killed %0, %subreg.sub_32bit - INLINEASM &"", 0, implicit %1 - CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit undef $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax - ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - RET 0 - -... 
- - -# Reduced realistic case which was asserting after introducing new implicit-defs ---- -name: coalesce_needs_implicit_defs -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: coalesce_needs_implicit_defs - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $rdi - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rdi - ; CHECK-NEXT: undef %2.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def %2 - ; CHECK-NEXT: undef %3.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def %3 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %10.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags - ; CHECK-NEXT: TEST64rr %3, %3, implicit-def $eflags - ; CHECK-NEXT: %10.sub_8bit:gr64_with_sub_8bit = SETCCr 4, implicit killed $eflags - ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-NEXT: dead $edi = MOV32r0 implicit-def dead $eflags, implicit-def $rdi - ; CHECK-NEXT: CALL64r %2, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax - ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-NEXT: [[SHL64ri:%[0-9]+]]:gr64_with_sub_8bit = SHL64ri [[SHL64ri]], 4, implicit-def dead $eflags - ; CHECK-NEXT: [[ADD64rr:%[0-9]+]]:gr64_with_sub_8bit = ADD64rr [[ADD64rr]], [[COPY]], implicit-def dead $eflags - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64_with_sub_8bit = COPY [[ADD64rr]] - ; CHECK-NEXT: JMP_1 %bb.1 - bb.0: - liveins: $rdi - - %0:gr64 = COPY killed $rdi - %1:gr32 = MOV32r0 implicit-def dead $eflags - %2:gr64 = SUBREG_TO_REG 0, %1, %subreg.sub_32bit - %3:gr64 = COPY killed %2 - - bb.1: - %4:gr64 = COPY killed %3 - %5:gr32 = MOV32r0 implicit-def dead $eflags - TEST64rr killed %4, %4, implicit-def $eflags - %6:gr8 = SETCCr 4, implicit killed $eflags - %7:gr32 = COPY killed %5 - %7.sub_8bit:gr32 = COPY killed %6 - %8:gr64 = SUBREG_TO_REG 0, killed %7, %subreg.sub_32bit - ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - %9:gr64 = SUBREG_TO_REG 0, %1, %subreg.sub_32bit - $rdi = COPY %9 - CALL64r killed %9, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax - ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - %10:gr64 = COPY killed %8 - %10:gr64 = SHL64ri %10, 4, implicit-def dead $eflags - %11:gr64 = COPY killed %10 - %11:gr64 = ADD64rr %11, %0, implicit-def dead $eflags - %3:gr64 = COPY killed %11 - JMP_1 %bb.1 - -... 
- ---- -name: coalesce_mov32r0_into_subreg_to_reg64_physreg_def -tracksRegLiveness: true -body: | - bb.0: - ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_to_reg64_physreg_def - ; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-NEXT: dead $edi = MOV32r0 implicit-def dead $eflags, implicit-def $rdi - ; CHECK-NEXT: CALL64r killed $rdi, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax - ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-NEXT: RET 0 - ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - %0:gr32 = MOV32r0 implicit-def dead $eflags - $rdi = SUBREG_TO_REG 0, killed %0, %subreg.sub_32bit - CALL64r killed $rdi, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax - ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - RET 0 - -... - ---- -name: coalesce_mov32r0_into_subreg_to_reg64_physreg_use -tracksRegLiveness: true -body: | - bb.0: - liveins: $eax - ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_to_reg64_physreg_use - ; CHECK: liveins: $eax - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-NEXT: $eax = MOV32r0 implicit-def dead $eflags - ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, $eax, %subreg.sub_32bit - ; CHECK-NEXT: $rdi = COPY [[SUBREG_TO_REG]] - ; CHECK-NEXT: CALL64r [[SUBREG_TO_REG]], csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax - ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-NEXT: RET 0 - ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - $eax = MOV32r0 implicit-def dead $eflags - %1:gr64 = SUBREG_TO_REG 0, killed $eax, %subreg.sub_32bit - $rdi = COPY %1 - CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax - ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - RET 0 - -... 
- -# Coalesced instruction is a copy with other implicit operands ---- -name: coalesce_copy_into_subreg_to_reg64 -tracksRegLiveness: true -body: | - bb.0: - liveins: $eax - ; CHECK-LABEL: name: coalesce_copy_into_subreg_to_reg64 - ; CHECK: liveins: $eax - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-NEXT: undef %1.sub_32bit:gr64_with_sub_8bit = COPY $eax, implicit-def dead $eflags, implicit-def %1 - ; CHECK-NEXT: $rdi = COPY %1 - ; CHECK-NEXT: CALL64r %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax - ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-NEXT: RET 0 - ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - %0:gr32 = COPY $eax, implicit-def dead $eflags - %1:gr64 = SUBREG_TO_REG 0, killed %0, %subreg.sub_32bit - $rdi = COPY %1 - CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax - ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - RET 0 - -... - ---- -name: coalesce_mov32r0_into_subreg_to_reg64_multiple_redef_value -tracksRegLiveness: true -body: | - bb.0: - ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_to_reg64_multiple_redef_value - ; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-NEXT: undef %1.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def %1 - ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, implicit-def %1.sub_32bit, implicit %1.sub_32bit - ; CHECK-NEXT: $rdi = COPY %1 - ; CHECK-NEXT: CALL64r %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax - ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-NEXT: RET 0 - ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - %0:gr32 = MOV32r0 implicit-def dead $eflags - INLINEASM &"", 0, implicit-def %0, implicit %0 - %1:gr64 = SUBREG_TO_REG 0, killed %0, %subreg.sub_32bit - $rdi = COPY %1 - CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax - ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - RET 0 - -... 
- ---- -name: coalesce_mov32r0_into_subreg_to_reg64_def_is_block_liveout -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_to_reg64_def_is_block_liveout - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, implicit-def undef %1.sub_32bit, implicit-def %1 - ; CHECK-NEXT: JCC_1 %bb.1, 4, implicit undef $eflags - ; CHECK-NEXT: JMP_1 %bb.2 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: $rdi = COPY %1 - ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-NEXT: CALL64r %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax - ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-NEXT: RET 0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.2: - bb.0: - INLINEASM &"", 0, implicit-def %0:gr32 - JCC_1 %bb.1, 4, implicit undef $eflags - JMP_1 %bb.2 - - bb.1: - %1:gr64 = SUBREG_TO_REG 0, killed %0, %subreg.sub_32bit - $rdi = COPY %1 - ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax - ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - RET 0 - - bb.2: - -... - ---- -name: coalesce_mov32r0_into_subreg_to_reg64_def_is_phi_def -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_to_reg64_def_is_phi_def - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, implicit-def undef %1.sub_32bit, implicit-def %1 - ; CHECK-NEXT: JCC_1 %bb.1, 4, implicit undef $eflags - ; CHECK-NEXT: JMP_1 %bb.2 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $rdi = COPY %1 - ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-NEXT: CALL64r %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax - ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-NEXT: JMP_1 %bb.1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.2: - bb.0: - - INLINEASM &"", 0, implicit-def %0:gr32 - JCC_1 %bb.1, 4, implicit undef $eflags - JMP_1 %bb.2 - - bb.1: - %1:gr64 = SUBREG_TO_REG 0, %0, %subreg.sub_32bit - $rdi = COPY %1 - ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - CALL64r %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax - ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - JMP_1 %bb.1 - - bb.2: - -... 
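The tests above all revolve around one x86-64 fact: a 32-bit register write (such as MOV32r0) implicitly zeroes the upper 32 bits of the containing 64-bit register, and SUBREG_TO_REG is how MIR records that the remaining bits of the widened value are known. The following is only a rough C++ analogue of that value pattern, added here for orientation; the function name is a stand-in and the snippet is not part of any patch in this series.

```c++
#include <cstdint>

// MOV32r0 defines the low 32 bits; on x86-64 any 32-bit write also implicitly
// zeroes the upper 32 bits of the full register.
// SUBREG_TO_REG 0, ..., sub_32bit records exactly that guarantee: the bits not
// covered by the sub-register are known to be zero, so the whole 64-bit value
// is defined, and the coalescer must not lose that information when it
// rewrites such instructions.
uint64_t widen_zeroed_low_half() {
  uint32_t low = 0;                   // MOV32r0
  return static_cast<uint64_t>(low);  // SUBREG_TO_REG 0, low, sub_32bit
}
```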
From 3f9f8efc548d4a1dbd289bcf5fafe644d5f5c8f8 Mon Sep 17 00:00:00 2001 From: Hugo Melder Date: Wed, 3 Jan 2024 18:34:00 +0100 Subject: [PATCH 155/313] ObjcRuntime.h: Add mips64, aarch64, and riscv64 to non-legacy dispatch (#76694) This PR updates the list of architectures for which libobjc2 has fast-path objc_msgSend implementations. Related to: https://github.com/gnustep/libobjc2/pull/261 --- clang/include/clang/Basic/ObjCRuntime.h | 28 +++++++++------ clang/test/CodeGenObjC/messages.m | 3 +- clang/test/Driver/gnustep-dispatch-method.m | 38 +++++++++++++++++++++ 3 files changed, 58 insertions(+), 11 deletions(-) create mode 100644 clang/test/Driver/gnustep-dispatch-method.m diff --git a/clang/include/clang/Basic/ObjCRuntime.h b/clang/include/clang/Basic/ObjCRuntime.h index 500b2462f0077..f05debe6fea51 100644 --- a/clang/include/clang/Basic/ObjCRuntime.h +++ b/clang/include/clang/Basic/ObjCRuntime.h @@ -100,16 +100,24 @@ class ObjCRuntime { bool isLegacyDispatchDefaultForArch(llvm::Triple::ArchType Arch) { // The GNUstep runtime uses a newer dispatch method by default from // version 1.6 onwards - if (getKind() == GNUstep && getVersion() >= VersionTuple(1, 6)) { - if (Arch == llvm::Triple::arm || - Arch == llvm::Triple::x86 || - Arch == llvm::Triple::x86_64) - return false; - } - else if ((getKind() == MacOSX) && isNonFragile() && - (getVersion() >= VersionTuple(10, 0)) && - (getVersion() < VersionTuple(10, 6))) - return Arch != llvm::Triple::x86_64; + if (getKind() == GNUstep) { + switch (Arch) { + case llvm::Triple::arm: + case llvm::Triple::x86: + case llvm::Triple::x86_64: + return !(getVersion() >= VersionTuple(1, 6)); + case llvm::Triple::aarch64: + case llvm::Triple::mips64: + return !(getVersion() >= VersionTuple(1, 9)); + case llvm::Triple::riscv64: + return !(getVersion() >= VersionTuple(2, 2)); + default: + return true; + } + } else if ((getKind() == MacOSX) && isNonFragile() && + (getVersion() >= VersionTuple(10, 0)) && + (getVersion() < VersionTuple(10, 6))) + return Arch != llvm::Triple::x86_64; // Except for deployment target of 10.5 or less, // Mac runtimes use legacy dispatch everywhere now. 
return true; diff --git a/clang/test/CodeGenObjC/messages.m b/clang/test/CodeGenObjC/messages.m index f93d35a8d60c1..41f9d2fbc284c 100644 --- a/clang/test/CodeGenObjC/messages.m +++ b/clang/test/CodeGenObjC/messages.m @@ -1,7 +1,8 @@ // RUN: %clang_cc1 -fobjc-runtime=macosx-fragile-10.5 -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK-MAC // RUN: %clang_cc1 -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK-MAC-NF // RUN: %clang_cc1 -fobjc-runtime=gcc -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK-GNU -// RUN: %clang_cc1 -fobjc-runtime=gnustep -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-GNU-NF +// RUN: %clang_cc1 -fobjc-runtime=gnustep -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK-GNU-NF +// RUN: %clang_cc1 -fobjc-runtime=gnustep-2.2 -fobjc-dispatch-method=non-legacy -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK-MAC typedef struct { int x; diff --git a/clang/test/Driver/gnustep-dispatch-method.m b/clang/test/Driver/gnustep-dispatch-method.m new file mode 100644 index 0000000000000..6cd043fb27204 --- /dev/null +++ b/clang/test/Driver/gnustep-dispatch-method.m @@ -0,0 +1,38 @@ +// DEFINE: %{triple} = +// DEFINE: %{ver} = 1.6 +// DEFINE: %{prefix} = CHECK-MSGSEND +// DEFINE: %{check} = %clang --target=%{triple} -fobjc-runtime=gnustep-%{ver} -### -c %s 2>&1 | FileCheck -check-prefix=%{prefix} %s + +// REDEFINE: %{ver} = 1.6 +// REDEFINE: %{triple} = i386-unknown-freebsd +// RUN: %{check} +// REDEFINE: %{triple} = x86_64-unknown-freebsd +// RUN: %{check} +// REDEFINE: %{triple} = arm-unknown-freebsd +// RUN: %{check} +// REDEFINE: %{prefix} = CHECK-MSGLOOKUP +// REDEFINE: %{triple} = aarch64-unknown-freebsd +// RUN: %{check} +// REDEFINE: %{triple} = mips64-unknown-freebsd +// RUN: %{check} +// REDEFINE: %{triple} = riscv64-unknown-freebsd +// RUN: %{check} + +// REDEFINE: %{ver} = 1.9 +// REDEFINE: %{prefix} = CHECK-MSGSEND +// REDEFINE: %{triple} = aarch64-unknown-freebsd +// RUN: %{check} +// REDEFINE: %{triple} = mips64-unknown-freebsd +// RUN: %{check} +// REDEFINE: %{prefix} = CHECK-MSGLOOKUP +// REDEFINE: %{triple} = riscv64-unknown-freebsd +// RUN: %{check} + +// REDEFINE: %{ver} = 2.2 +// REDEFINE: %{prefix} = CHECK-MSGSEND +// REDEFINE: %{triple} = riscv64-unknown-freebsd +// RUN: %{check} + + +// CHECK-MSGSEND: "-cc1"{{.*}} "-fobjc-dispatch-method=non-legacy" +// CHECK-MSGLOOKUP-NOT: "-cc1"{{.*}} "-fobjc-dispatch-method=non-legacy" From 76cb0bb7a4540ca7f09b779cc9c3f8694a1ebbd4 Mon Sep 17 00:00:00 2001 From: Han-Chung Wang Date: Wed, 3 Jan 2024 09:34:52 -0800 Subject: [PATCH 156/313] [mlir][tensor] Add a pattern to simplify tensor.unpack to collpase shape (#76607) --- .../Dialect/Tensor/Transforms/Transforms.h | 1 - .../Transforms/PackAndUnpackPatterns.cpp | 44 +++++++++++- .../Dialect/Tensor/simplify-pack-unpack.mlir | 72 +++++++++++++++++++ 3 files changed, 115 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h index 35b519e790d1c..e8a09c4741043 100644 --- a/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h @@ -76,7 +76,6 @@ void populateDecomposeTensorConcatPatterns(RewritePatternSet &patterns); /// Populates `patterns` with patterns that simplify `tensor.pack` and /// `tensor.unpack` operations. -/// TODO: Add a pattern to convert tensor.unpack op to tensor.collapse_shape op. 
void populateSimplifyPackAndUnpackPatterns(RewritePatternSet &patterns); /// Populates `patterns` with patterns that fold operations like `tensor.pad` diff --git a/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp b/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp index e20450c95ffd5..cfd838e85c1b8 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp @@ -61,6 +61,47 @@ struct SimplifyPackToExpandShape : public OpRewritePattern { } }; +struct SimplifyUnPackToCollapseShape : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + Value insertCollapse(RewriterBase &rewriter, Location loc, Value operand, + Type newOperandType, ArrayAttr reassociation) const { + if (operand.getType() == newOperandType) + return operand; + return rewriter.create(loc, newOperandType, + operand, reassociation); + } + + LogicalResult matchAndRewrite(UnPackOp unpackOp, + PatternRewriter &rewriter) const override { + if (!unpackOp.getOuterDimsPerm().empty()) { + return rewriter.notifyMatchFailure(unpackOp, + "expects no outer_dims_perm"); + } + + RankedTensorType sourceType = unpackOp.getSourceType(); + RankedTensorType destType = unpackOp.getDestType(); + if (!sourceType.hasStaticShape() || !destType.hasStaticShape()) + return rewriter.notifyMatchFailure(unpackOp, "expects static shapes"); + + ArrayRef dimsPos = unpackOp.getInnerDimsPos(); + if (dimsPos.size() != 1 || (dimsPos[0] + 1 != destType.getRank())) { + return rewriter.notifyMatchFailure( + unpackOp, "expects unpacking at the innermost dimension"); + } + + auto reassociation = + getReassociationIndicesForReshape(sourceType, destType); + if (!reassociation) + return failure(); + Value collapsed = insertCollapse( + rewriter, unpackOp.getLoc(), unpackOp.getSource(), destType, + getReassociationIndicesAttribute(rewriter, *reassociation)); + rewriter.replaceOp(unpackOp, collapsed); + return success(); + } +}; + /// Fold a `pad` -> `pack` into `pack` if they have the same padding values and /// the pad op has zero low paddings, or if `pack` has no padding values. 
struct FoldPadWithPackOp : public OpRewritePattern { @@ -191,7 +232,8 @@ void populateFoldIntoPackAndUnpackPatterns(RewritePatternSet &patterns) { } void populateSimplifyPackAndUnpackPatterns(RewritePatternSet &patterns) { - patterns.add(patterns.getContext()); + patterns.add( + patterns.getContext()); } } // namespace tensor diff --git a/mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir b/mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir index bdfe18acd86c5..b78ab9bb3fd87 100644 --- a/mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir +++ b/mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir @@ -56,3 +56,75 @@ func.func @single_first_inner_dim_packing(%arg0: tensor<256x5xf32>) -> tensor<8x %0 = tensor.pack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<256x5xf32> -> tensor<8x5x32xf32> return %0 : tensor<8x5x32xf32> } + +// ----- + +// CHECK-LABEL: func.func @unpack_1d_to_collapse +// CHECK-SAME: %[[ARG0:.+]]: tensor<8x32xf32>) +// CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[ARG0]] {{\[}}[0, 1]] : tensor<8x32xf32> into tensor<256xf32> +// CHECK: return %[[COLLAPSED]] +func.func @unpack_1d_to_collapse(%arg0: tensor<8x32xf32>) -> tensor<256xf32> { + %empty = tensor.empty() : tensor<256xf32> + %0 = tensor.unpack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<8x32xf32> -> tensor<256xf32> + return %0 : tensor<256xf32> +} + +// ----- + +// CHECK-LABEL: func.func @unpack_to_partial_slice +// CHECK-NOT: tensor.collapse +// CHECK: tensor.unpack +func.func @unpack_to_partial_slice(%arg0: tensor<8x32xf32>) -> tensor<255xf32> { + %empty = tensor.empty() : tensor<255xf32> + %0 = tensor.unpack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<8x32xf32> -> tensor<255xf32> + return %0 : tensor<255xf32> +} + +// ----- + +// CHECK-LABEL: func.func @unpack_dynamic +// CHECK-NOT: tensor.collapse +// CHECK: tensor.unpack +func.func @unpack_dynamic(%arg0: tensor) -> tensor { + %c32 = arith.constant 32 : index + %c0 = arith.constant 0 : index + %d0 = tensor.dim %arg0, %c0 : tensor + %size = arith.muli %d0, %c32 : index + %empty = tensor.empty(%size) : tensor + %0 = tensor.unpack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor -> tensor + return %0 : tensor +} + +// ----- + +// CHECK-LABEL: func.func @single_last_inner_dim_unpacking( +// CHECK-SAME: %[[ARG0:.+]]: tensor<5x8x32xf32>) +// CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[ARG0]] {{\[}}[0], [1, 2]] : tensor<5x8x32xf32> into tensor<5x256xf32> +// CHECK: return %[[COLLAPSED]] : tensor<5x256xf32> +func.func @single_last_inner_dim_unpacking(%arg0: tensor<5x8x32xf32>) -> tensor<5x256xf32> { + %empty = tensor.empty() : tensor<5x256xf32> + %0 = tensor.unpack %arg0 inner_dims_pos = [1] inner_tiles = [32] into %empty : tensor<5x8x32xf32> -> tensor<5x256xf32> + return %0 : tensor<5x256xf32> +} + +// ----- + +// CHECK-LABEL: func.func @unpacking_with_outer_dims_perm( +// CHECK-NOT: tensor.collpase_shape +// CHECK: tensor.unpack +func.func @unpacking_with_outer_dims_perm(%arg0: tensor<8x5x32xf32>) -> tensor<5x256xf32> { + %empty = tensor.empty() : tensor<5x256xf32> + %0 = tensor.unpack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [1] inner_tiles = [32] into %empty : tensor<8x5x32xf32> -> tensor<5x256xf32> + return %0 : tensor<5x256xf32> +} + +// ----- + +// CHECK-LABEL: func.func @single_first_inner_dim_unpacking( +// CHECK-NOT: tensor.collapse_shape +// CHECK: tensor.unpack +func.func @single_first_inner_dim_unpacking(%arg0: tensor<8x5x32xf32>) -> tensor<256x5xf32> { + 
%empty = tensor.empty() : tensor<256x5xf32> + %0 = tensor.unpack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<8x5x32xf32> -> tensor<256x5xf32> + return %0 : tensor<256x5xf32> +} From 03e29a49d9827532499234e3e460e2b5b29a11a7 Mon Sep 17 00:00:00 2001 From: Puyan Lotfi <34139736+plotfi@users.noreply.github.com> Date: Wed, 3 Jan 2024 12:36:43 -0500 Subject: [PATCH 157/313] [mlir][Pass] Enable the option for reproducer generation without crashing (#75421) This PR adds API `makeReproducer` and cl::opt flag `--mlir-generate-reproducer=` in order to allow for mlir reproducer dumps even when the pipeline doesn't crash. This PR also decouples the code that handles generation of an MLIR reproducer from the crash recovery portion. The purpose is to allow for generating reproducers outside of the context of a compiler crash. This will be useful for frameworks and runtimes that use MLIR where it is needed to reproduce the pipeline behavior for reasons outside of diagnosing crashes. An example is for diagnosing performance issues using offline tools, where being able to dump the reproducer from a runtime compiler would be helpful. --- mlir/include/mlir/Pass/PassManager.h | 36 ++++---- .../include/mlir/Tools/mlir-opt/MlirOptMain.h | 6 ++ mlir/lib/Pass/Pass.cpp | 18 ++-- mlir/lib/Pass/PassCrashRecovery.cpp | 87 ++++++++++++------- mlir/lib/Pass/PassDetail.h | 5 +- mlir/lib/Tools/mlir-opt/MlirOptMain.cpp | 18 ++++ mlir/test/Pass/crashless-reproducer.mlir | 10 +++ 7 files changed, 127 insertions(+), 53 deletions(-) create mode 100644 mlir/test/Pass/crashless-reproducer.mlir diff --git a/mlir/include/mlir/Pass/PassManager.h b/mlir/include/mlir/Pass/PassManager.h index d5f1ea0fe0350..1b2e6a3bc82bb 100644 --- a/mlir/include/mlir/Pass/PassManager.h +++ b/mlir/include/mlir/Pass/PassManager.h @@ -207,6 +207,27 @@ enum class PassDisplayMode { Pipeline, }; +/// Streams on which to output crash reproducer. +struct ReproducerStream { + virtual ~ReproducerStream() = default; + + /// Description of the reproducer stream. + virtual StringRef description() = 0; + + /// Stream on which to output reproducer. + virtual raw_ostream &os() = 0; +}; + +/// Method type for constructing ReproducerStream. +using ReproducerStreamFactory = + std::function(std::string &error)>; + +std::string +makeReproducer(StringRef anchorName, + const llvm::iterator_range &passes, + Operation *op, StringRef outputFile, bool disableThreads = false, + bool verifyPasses = false); + /// The main pass manager and pipeline builder. class PassManager : public OpPassManager { public: @@ -243,21 +264,6 @@ class PassManager : public OpPassManager { void enableCrashReproducerGeneration(StringRef outputFile, bool genLocalReproducer = false); - /// Streams on which to output crash reproducer. - struct ReproducerStream { - virtual ~ReproducerStream() = default; - - /// Description of the reproducer stream. - virtual StringRef description() = 0; - - /// Stream on which to output reproducer. - virtual raw_ostream &os() = 0; - }; - - /// Method type for constructing ReproducerStream. - using ReproducerStreamFactory = - std::function(std::string &error)>; - /// Enable support for the pass manager to generate a reproducer on the event /// of a crash or a pass failure. `factory` is used to construct the streams /// to write the generated reproducer to. 
If `genLocalReproducer` is true, the diff --git a/mlir/include/mlir/Tools/mlir-opt/MlirOptMain.h b/mlir/include/mlir/Tools/mlir-opt/MlirOptMain.h index e255d9fa70b65..6e90fad1618d2 100644 --- a/mlir/include/mlir/Tools/mlir-opt/MlirOptMain.h +++ b/mlir/include/mlir/Tools/mlir-opt/MlirOptMain.h @@ -173,6 +173,9 @@ class MlirOptMainConfig { } bool shouldVerifyRoundtrip() const { return verifyRoundtripFlag; } + /// Reproducer file generation (no crash required). + StringRef getReproducerFilename() const { return generateReproducerFileFlag; } + protected: /// Allow operation with no registered dialects. /// This option is for convenience during testing only and discouraged in @@ -228,6 +231,9 @@ class MlirOptMainConfig { /// Verify that the input IR round-trips perfectly. bool verifyRoundtripFlag = false; + + /// The reproducer output filename (no crash required). + std::string generateReproducerFileFlag = ""; }; /// This defines the function type used to setup the pass manager. This can be diff --git a/mlir/lib/Pass/Pass.cpp b/mlir/lib/Pass/Pass.cpp index 810d6a357d52c..5ee0ae6525d17 100644 --- a/mlir/lib/Pass/Pass.cpp +++ b/mlir/lib/Pass/Pass.cpp @@ -382,16 +382,22 @@ StringRef OpPassManager::getOpAnchorName() const { /// Prints out the passes of the pass manager as the textual representation /// of pipelines. -void OpPassManager::printAsTextualPipeline(raw_ostream &os) const { - os << getOpAnchorName() << "("; +void printAsTextualPipeline( + raw_ostream &os, StringRef anchorName, + const llvm::iterator_range &passes) { + os << anchorName << "("; llvm::interleave( - impl->passes, - [&](const std::unique_ptr &pass) { - pass->printAsTextualPipeline(os); - }, + passes, [&](mlir::Pass &pass) { pass.printAsTextualPipeline(os); }, [&]() { os << ","; }); os << ")"; } +void OpPassManager::printAsTextualPipeline(raw_ostream &os) const { + StringRef anchorName = getOpAnchorName(); + ::printAsTextualPipeline( + os, anchorName, + {MutableArrayRef>{impl->passes}.begin(), + MutableArrayRef>{impl->passes}.end()}); +} void OpPassManager::dump() { llvm::errs() << "Pass Manager with " << impl->passes.size() << " passes:\n"; diff --git a/mlir/lib/Pass/PassCrashRecovery.cpp b/mlir/lib/Pass/PassCrashRecovery.cpp index df1a0762ae34a..3f3928ec8b61b 100644 --- a/mlir/lib/Pass/PassCrashRecovery.cpp +++ b/mlir/lib/Pass/PassCrashRecovery.cpp @@ -38,7 +38,7 @@ namespace detail { /// reproducers when a signal is raised, such as a segfault. struct RecoveryReproducerContext { RecoveryReproducerContext(std::string passPipelineStr, Operation *op, - PassManager::ReproducerStreamFactory &streamFactory, + ReproducerStreamFactory &streamFactory, bool verifyPasses); ~RecoveryReproducerContext(); @@ -67,7 +67,7 @@ struct RecoveryReproducerContext { /// The factory for the reproducer output stream to use when generating the /// reproducer. - PassManager::ReproducerStreamFactory &streamFactory; + ReproducerStreamFactory &streamFactory; /// Various pass manager and context flags. 
bool disableThreads; @@ -92,7 +92,7 @@ llvm::ManagedStatic> RecoveryReproducerContext::RecoveryReproducerContext( std::string passPipelineStr, Operation *op, - PassManager::ReproducerStreamFactory &streamFactory, bool verifyPasses) + ReproducerStreamFactory &streamFactory, bool verifyPasses) : pipelineElements(std::move(passPipelineStr)), preCrashOperation(op->clone()), streamFactory(streamFactory), disableThreads(!op->getContext()->isMultithreadingEnabled()), @@ -106,22 +106,24 @@ RecoveryReproducerContext::~RecoveryReproducerContext() { disable(); } -void RecoveryReproducerContext::generate(std::string &description) { +static void appendReproducer(std::string &description, Operation *op, + const ReproducerStreamFactory &factory, + const std::string &pipelineElements, + bool disableThreads, bool verifyPasses) { llvm::raw_string_ostream descOS(description); // Try to create a new output stream for this crash reproducer. std::string error; - std::unique_ptr stream = streamFactory(error); + std::unique_ptr stream = factory(error); if (!stream) { descOS << "failed to create output stream: " << error; return; } descOS << "reproducer generated at `" << stream->description() << "`"; - std::string pipeline = (preCrashOperation->getName().getStringRef() + "(" + - pipelineElements + ")") - .str(); - AsmState state(preCrashOperation); + std::string pipeline = + (op->getName().getStringRef() + "(" + pipelineElements + ")").str(); + AsmState state(op); state.attachResourcePrinter( "mlir_reproducer", [&](Operation *op, AsmResourceBuilder &builder) { builder.buildString("pipeline", pipeline); @@ -130,7 +132,12 @@ void RecoveryReproducerContext::generate(std::string &description) { }); // Output the .mlir module. - preCrashOperation->print(stream->os(), state); + op->print(stream->os(), state); +} + +void RecoveryReproducerContext::generate(std::string &description) { + appendReproducer(description, preCrashOperation, streamFactory, + pipelineElements, disableThreads, verifyPasses); } void RecoveryReproducerContext::disable() { @@ -175,12 +182,11 @@ void RecoveryReproducerContext::registerSignalHandler() { //===----------------------------------------------------------------------===// struct PassCrashReproducerGenerator::Impl { - Impl(PassManager::ReproducerStreamFactory &streamFactory, - bool localReproducer) + Impl(ReproducerStreamFactory &streamFactory, bool localReproducer) : streamFactory(streamFactory), localReproducer(localReproducer) {} /// The factory to use when generating a crash reproducer. - PassManager::ReproducerStreamFactory streamFactory; + ReproducerStreamFactory streamFactory; /// Flag indicating if reproducer generation should be localized to the /// failing pass. @@ -198,7 +204,7 @@ struct PassCrashReproducerGenerator::Impl { }; PassCrashReproducerGenerator::PassCrashReproducerGenerator( - PassManager::ReproducerStreamFactory &streamFactory, bool localReproducer) + ReproducerStreamFactory &streamFactory, bool localReproducer) : impl(std::make_unique(streamFactory, localReproducer)) {} PassCrashReproducerGenerator::~PassCrashReproducerGenerator() = default; @@ -382,9 +388,9 @@ struct CrashReproducerInstrumentation : public PassInstrumentation { //===----------------------------------------------------------------------===// namespace { -/// This class represents a default instance of PassManager::ReproducerStream +/// This class represents a default instance of mlir::ReproducerStream /// that is backed by a file. 
-struct FileReproducerStream : public PassManager::ReproducerStream { +struct FileReproducerStream : public mlir::ReproducerStream { FileReproducerStream(std::unique_ptr outputFile) : outputFile(std::move(outputFile)) {} ~FileReproducerStream() override { outputFile->keep(); } @@ -418,22 +424,45 @@ LogicalResult PassManager::runWithCrashRecovery(Operation *op, return passManagerResult; } -void PassManager::enableCrashReproducerGeneration(StringRef outputFile, - bool genLocalReproducer) { +static ReproducerStreamFactory +makeReproducerStreamFactory(StringRef outputFile) { // Capture the filename by value in case outputFile is out of scope when // invoked. std::string filename = outputFile.str(); - enableCrashReproducerGeneration( - [filename](std::string &error) -> std::unique_ptr { - std::unique_ptr outputFile = - mlir::openOutputFile(filename, &error); - if (!outputFile) { - error = "Failed to create reproducer stream: " + error; - return nullptr; - } - return std::make_unique(std::move(outputFile)); - }, - genLocalReproducer); + return [filename](std::string &error) -> std::unique_ptr { + std::unique_ptr outputFile = + mlir::openOutputFile(filename, &error); + if (!outputFile) { + error = "Failed to create reproducer stream: " + error; + return nullptr; + } + return std::make_unique(std::move(outputFile)); + }; +} + +void printAsTextualPipeline( + raw_ostream &os, StringRef anchorName, + const llvm::iterator_range &passes); + +std::string mlir::makeReproducer( + StringRef anchorName, + const llvm::iterator_range &passes, + Operation *op, StringRef outputFile, bool disableThreads, + bool verifyPasses) { + + std::string description; + std::string pipelineStr; + llvm::raw_string_ostream passOS(pipelineStr); + ::printAsTextualPipeline(passOS, anchorName, passes); + appendReproducer(description, op, makeReproducerStreamFactory(outputFile), + pipelineStr, disableThreads, verifyPasses); + return description; +} + +void PassManager::enableCrashReproducerGeneration(StringRef outputFile, + bool genLocalReproducer) { + enableCrashReproducerGeneration(makeReproducerStreamFactory(outputFile), + genLocalReproducer); } void PassManager::enableCrashReproducerGeneration( diff --git a/mlir/lib/Pass/PassDetail.h b/mlir/lib/Pass/PassDetail.h index 0e964b6d6d36b..5cc726295c9f1 100644 --- a/mlir/lib/Pass/PassDetail.h +++ b/mlir/lib/Pass/PassDetail.h @@ -98,9 +98,8 @@ class OpToOpPassAdaptor class PassCrashReproducerGenerator { public: - PassCrashReproducerGenerator( - PassManager::ReproducerStreamFactory &streamFactory, - bool localReproducer); + PassCrashReproducerGenerator(ReproducerStreamFactory &streamFactory, + bool localReproducer); ~PassCrashReproducerGenerator(); /// Initialize the generator in preparation for reproducer generation. The diff --git a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp index d7d47619ef4ac..5395aa2b502d7 100644 --- a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp +++ b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp @@ -151,6 +151,16 @@ struct MlirOptMainConfigCLOptions : public MlirOptMainConfig { static cl::list passPlugins( "load-pass-plugin", cl::desc("Load passes from plugin library")); + + static cl::opt + generateReproducerFile( + "mlir-generate-reproducer", + llvm::cl::desc( + "Generate an mlir reproducer at the provided filename" + " (no crash required)"), + cl::location(generateReproducerFileFlag), cl::init(""), + cl::value_desc("filename")); + /// Set the callback to load a pass plugin. 
passPlugins.setCallback([&](const std::string &pluginPath) { auto plugin = PassPlugin::load(pluginPath); @@ -384,6 +394,14 @@ performActions(raw_ostream &os, if (failed(pm.run(*op))) return failure(); + // Generate reproducers if requested + if (!config.getReproducerFilename().empty()) { + StringRef anchorName = pm.getAnyOpAnchorName(); + const auto &passes = pm.getPasses(); + makeReproducer(anchorName, passes, op.get(), + config.getReproducerFilename()); + } + // Print the output. TimingScope outputTiming = timing.nest("Output"); if (config.shouldEmitBytecode()) { diff --git a/mlir/test/Pass/crashless-reproducer.mlir b/mlir/test/Pass/crashless-reproducer.mlir new file mode 100644 index 0000000000000..d874d90b8f3db --- /dev/null +++ b/mlir/test/Pass/crashless-reproducer.mlir @@ -0,0 +1,10 @@ +// RUN: mlir-opt %s -pass-pipeline='builtin.module(builtin.module(test-module-pass))' --mlir-generate-reproducer=%t -verify-diagnostics +// RUN: cat %t | FileCheck -check-prefix=REPRO %s + +module @inner_mod1 { + module @foo {} +} + +// REPRO: module @inner_mod1 +// REPRO: module @foo { +// REPRO: pipeline: "builtin.module(any(builtin.module(test-module-pass)))" From fc0fdd1ae20062e4d77c1b7ffc5b06773c752815 Mon Sep 17 00:00:00 2001 From: Alex Zinenko Date: Wed, 3 Jan 2024 17:31:16 +0000 Subject: [PATCH 158/313] [mlir] fix AsmPrinter after c1eab57673ef3e The change in c1eab57673ef3eb2842c0fbe454d7878854cf54c fixed the behavior of `getDiscardableAttrDictionary` for ops that are not using properties to only return discardable attributes. AsmPrinter was relying on the wrong behavior when printing such ops in the generic form, assuming all attributes are discardable. --- mlir/lib/IR/AsmPrinter.cpp | 5 +++-- mlir/unittests/IR/OpPropertiesTest.cpp | 6 ++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/mlir/lib/IR/AsmPrinter.cpp b/mlir/lib/IR/AsmPrinter.cpp index 0cc6d11ddc576..8fe8c78efecf9 100644 --- a/mlir/lib/IR/AsmPrinter.cpp +++ b/mlir/lib/IR/AsmPrinter.cpp @@ -3542,8 +3542,9 @@ void OperationPrinter::printGenericOp(Operation *op, bool printOpName) { os << ')'; } - auto attrs = op->getDiscardableAttrs(); - printOptionalAttrDict(llvm::to_vector(attrs)); + printOptionalAttrDict(op->getPropertiesStorage() + ? llvm::to_vector(op->getDiscardableAttrs()) + : op->getAttrs()); // Print the type signature of the operation. os << " : "; diff --git a/mlir/unittests/IR/OpPropertiesTest.cpp b/mlir/unittests/IR/OpPropertiesTest.cpp index 6dcba16e656b3..bb1b741d1cc22 100644 --- a/mlir/unittests/IR/OpPropertiesTest.cpp +++ b/mlir/unittests/IR/OpPropertiesTest.cpp @@ -395,6 +395,12 @@ TEST(OpPropertiesTest, withoutPropertiesDiscardableAttrs) { EXPECT_EQ(op->getAttrs().size(), 2u); EXPECT_TRUE(op->getInherentAttr("inherent_attr") != std::nullopt); EXPECT_TRUE(op->getDiscardableAttr("other_attr") != Attribute()); + + std::string output; + llvm::raw_string_ostream os(output); + op->print(os); + EXPECT_TRUE(StringRef(os.str()).contains("inherent_attr = 42")); + EXPECT_TRUE(StringRef(os.str()).contains("other_attr = 56")); } } // namespace From 5930725c891b60f5fb94058c6c08a55a2e03d83e Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Wed, 3 Jan 2024 09:43:22 -0800 Subject: [PATCH 159/313] [mlir] Add config for PDL (#69927) Make it so that PDL in pattern rewrites can be optionally disabled. PDL is still enabled by default and not optional bazel. So this should be a NOP for most folks, while enabling other to disable. 
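As a rough illustration of what the new option enables (a sketch only, not code from this patch; `populateMyPDLPatterns` and `addDialectPatterns` are hypothetical downstream helpers), PDL-dependent pattern registration can be guarded on the generated `MLIR_ENABLE_PDL_IN_PATTERNMATCH` macro:

```c++
#include "mlir/Config/mlir-config.h"
#include "mlir/IR/PatternMatch.h"

// Hypothetical downstream helper that registers PDL-based patterns; it is only
// meaningful when PDL support is compiled in.
void populateMyPDLPatterns(mlir::RewritePatternSet &patterns);

void addDialectPatterns(mlir::RewritePatternSet &patterns) {
#if MLIR_ENABLE_PDL_IN_PATTERNMATCH
  // Reached only when the build enables PDL (the default).
  populateMyPDLPatterns(patterns);
#endif
  // Plain C++ rewrite patterns remain available in either configuration.
}
```

The option defaults to ON, so users who do nothing keep today's behavior.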
This is piped through mlir-tblgen invocation and that could be changed/avoided by splitting up the passes file instead. This only works with tests disabled. With tests enabled this still compiles but tests fail as there is no lit config to disable tests that depend on PDL rewrites yet. --- mlir/CMakeLists.txt | 5 +- mlir/examples/minimal-opt/README.md | 9 +- mlir/include/mlir/Config/mlir-config.h.cmake | 3 + .../Conversion/LLVMCommon/TypeConverter.h | 1 + .../mlir/Dialect/Vector/IR/VectorOps.h | 1 + mlir/include/mlir/IR/PDLPatternMatch.h.inc | 995 ++++++++++++++++++ mlir/include/mlir/IR/PatternMatch.h | 935 +--------------- .../mlir/Transforms/DialectConversion.h | 15 + .../Bufferization/TransformOps/CMakeLists.txt | 1 - mlir/lib/IR/CMakeLists.txt | 7 + mlir/lib/IR/PDL/CMakeLists.txt | 7 + mlir/lib/IR/PDL/PDLPatternMatch.cpp | 133 +++ mlir/lib/IR/PatternMatch.cpp | 119 +-- mlir/lib/Rewrite/ByteCode.h | 36 + mlir/lib/Rewrite/CMakeLists.txt | 32 +- mlir/lib/Rewrite/FrozenRewritePatternSet.cpp | 10 +- mlir/lib/Rewrite/PatternApplicator.cpp | 4 +- .../Transforms/Utils/DialectConversion.cpp | 3 + mlir/test/CMakeLists.txt | 12 +- mlir/test/lib/Transforms/CMakeLists.txt | 14 +- mlir/tools/mlir-lsp-server/CMakeLists.txt | 6 +- mlir/tools/mlir-opt/CMakeLists.txt | 8 +- mlir/tools/mlir-opt/mlir-opt.cpp | 7 +- .../llvm-project-overlay/mlir/BUILD.bazel | 4 + 24 files changed, 1297 insertions(+), 1070 deletions(-) create mode 100644 mlir/include/mlir/IR/PDLPatternMatch.h.inc create mode 100644 mlir/lib/IR/PDL/CMakeLists.txt create mode 100644 mlir/lib/IR/PDL/PDLPatternMatch.cpp diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt index dcc068e4097c5..3de8677aefe90 100644 --- a/mlir/CMakeLists.txt +++ b/mlir/CMakeLists.txt @@ -133,6 +133,8 @@ set(MLIR_ENABLE_NVPTXCOMPILER 0 CACHE BOOL "Statically link the nvptxlibrary instead of calling ptxas as a subprocess \ for compiling PTX to cubin") +set(MLIR_ENABLE_PDL_IN_PATTERNMATCH 1 CACHE BOOL "Enable PDL in PatternMatch") + option(MLIR_INCLUDE_TESTS "Generate build targets for the MLIR unit tests." ${LLVM_INCLUDE_TESTS}) @@ -178,10 +180,9 @@ include_directories( ${MLIR_INCLUDE_DIR}) # Adding tools/mlir-tblgen here as calling add_tablegen sets some variables like # MLIR_TABLEGEN_EXE in PARENT_SCOPE which gets lost if that folder is included # from another directory like tools -add_subdirectory(tools/mlir-tblgen) add_subdirectory(tools/mlir-linalg-ods-gen) add_subdirectory(tools/mlir-pdll) - +add_subdirectory(tools/mlir-tblgen) set(MLIR_TABLEGEN_EXE "${MLIR_TABLEGEN_EXE}" CACHE INTERNAL "") set(MLIR_TABLEGEN_TARGET "${MLIR_TABLEGEN_TARGET}" CACHE INTERNAL "") set(MLIR_LINALG_ODS_YAML_GEN_TABLEGEN_EXE "${MLIR_LINALG_ODS_YAML_GEN_TABLEGEN_EXE}" CACHE INTERNAL "") diff --git a/mlir/examples/minimal-opt/README.md b/mlir/examples/minimal-opt/README.md index b8a455f7a7966..1bc54b8367cc5 100644 --- a/mlir/examples/minimal-opt/README.md +++ b/mlir/examples/minimal-opt/README.md @@ -14,10 +14,10 @@ Below are some example measurements taken at the time of the LLVM 17 release, using clang-14 on a X86 Ubuntu and [bloaty](https://github.com/google/bloaty). 
| | Base | Os | Oz | Os LTO | Oz LTO | -| :-----------------------------: | ------ | ------ | ------ | ------ | ------ | -| `mlir-cat` | 1018kB | 836KB | 879KB | 697KB | 649KB | -| `mlir-minimal-opt` | 1.54MB | 1.25MB | 1.29MB | 1.10MB | 1.00MB | -| `mlir-minimal-opt-canonicalize` | 2.24MB | 1.81MB | 1.86MB | 1.62MB | 1.48MB | +| :------------------------------: | ------ | ------ | ------ | ------ | ------ | +| `mlir-cat` | 1024KB | 840KB | 885KB | 706KB | 657KB | +| `mlir-minimal-opt` | 1.62MB | 1.32MB | 1.36MB | 1.17MB | 1.07MB | +| `mlir-minimal-opt-canonicalize` | 1.83MB | 1.40MB | 1.45MB | 1.25MB | 1.14MB | Base configuration: @@ -32,6 +32,7 @@ cmake ../llvm/ -G Ninja \ -DCMAKE_CXX_COMPILER=clang++ \ -DLLVM_ENABLE_LLD=ON \ -DLLVM_ENABLE_BACKTRACES=OFF \ + -DMLIR_ENABLE_PDL_IN_PATTERNMATCH=OFF \ -DCMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO=-Wl,-icf=all ``` diff --git a/mlir/include/mlir/Config/mlir-config.h.cmake b/mlir/include/mlir/Config/mlir-config.h.cmake index efa77b2e5ce5d..e152a36c0ce0c 100644 --- a/mlir/include/mlir/Config/mlir-config.h.cmake +++ b/mlir/include/mlir/Config/mlir-config.h.cmake @@ -26,4 +26,7 @@ numeric seed that is passed to the random number generator. */ #cmakedefine MLIR_GREEDY_REWRITE_RANDOMIZER_SEED ${MLIR_GREEDY_REWRITE_RANDOMIZER_SEED} +/* If set, enables PDL usage. */ +#cmakedefine01 MLIR_ENABLE_PDL_IN_PATTERNMATCH + #endif diff --git a/mlir/include/mlir/Conversion/LLVMCommon/TypeConverter.h b/mlir/include/mlir/Conversion/LLVMCommon/TypeConverter.h index 74f9c977b7028..e228229302cff 100644 --- a/mlir/include/mlir/Conversion/LLVMCommon/TypeConverter.h +++ b/mlir/include/mlir/Conversion/LLVMCommon/TypeConverter.h @@ -15,6 +15,7 @@ #define MLIR_CONVERSION_LLVMCOMMON_TYPECONVERTER_H #include "mlir/Conversion/LLVMCommon/LoweringOptions.h" +#include "mlir/IR/BuiltinTypes.h" #include "mlir/Transforms/DialectConversion.h" namespace mlir { diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.h b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.h index a28b27e4e1581..4603953cb40fa 100644 --- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.h +++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.h @@ -29,6 +29,7 @@ #include "mlir/Interfaces/SideEffectInterfaces.h" #include "mlir/Interfaces/VectorInterfaces.h" #include "mlir/Interfaces/ViewLikeInterface.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringExtras.h" // Pull in all enum type definitions and utility function declarations. diff --git a/mlir/include/mlir/IR/PDLPatternMatch.h.inc b/mlir/include/mlir/IR/PDLPatternMatch.h.inc new file mode 100644 index 0000000000000..a215da8cb6431 --- /dev/null +++ b/mlir/include/mlir/IR/PDLPatternMatch.h.inc @@ -0,0 +1,995 @@ +//===- PDLPatternMatch.h - PDLPatternMatcher classes -------==---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_IR_PDLPATTERNMATCH_H +#define MLIR_IR_PDLPATTERNMATCH_H + +#include "mlir/Config/mlir-config.h" + +#if MLIR_ENABLE_PDL_IN_PATTERNMATCH +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinOps.h" + +namespace mlir { +//===----------------------------------------------------------------------===// +// PDL Patterns +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// PDLValue + +/// Storage type of byte-code interpreter values. These are passed to constraint +/// functions as arguments. +class PDLValue { +public: + /// The underlying kind of a PDL value. + enum class Kind { Attribute, Operation, Type, TypeRange, Value, ValueRange }; + + /// Construct a new PDL value. + PDLValue(const PDLValue &other) = default; + PDLValue(std::nullptr_t = nullptr) {} + PDLValue(Attribute value) + : value(value.getAsOpaquePointer()), kind(Kind::Attribute) {} + PDLValue(Operation *value) : value(value), kind(Kind::Operation) {} + PDLValue(Type value) : value(value.getAsOpaquePointer()), kind(Kind::Type) {} + PDLValue(TypeRange *value) : value(value), kind(Kind::TypeRange) {} + PDLValue(Value value) + : value(value.getAsOpaquePointer()), kind(Kind::Value) {} + PDLValue(ValueRange *value) : value(value), kind(Kind::ValueRange) {} + + /// Returns true if the type of the held value is `T`. + template + bool isa() const { + assert(value && "isa<> used on a null value"); + return kind == getKindOf(); + } + + /// Attempt to dynamically cast this value to type `T`, returns null if this + /// value is not an instance of `T`. + template ::value, T, std::optional>> + ResultT dyn_cast() const { + return isa() ? castImpl() : ResultT(); + } + + /// Cast this value to type `T`, asserts if this value is not an instance of + /// `T`. + template + T cast() const { + assert(isa() && "expected value to be of type `T`"); + return castImpl(); + } + + /// Get an opaque pointer to the value. + const void *getAsOpaquePointer() const { return value; } + + /// Return if this value is null or not. + explicit operator bool() const { return value; } + + /// Return the kind of this value. + Kind getKind() const { return kind; } + + /// Print this value to the provided output stream. + void print(raw_ostream &os) const; + + /// Print the specified value kind to an output stream. + static void print(raw_ostream &os, Kind kind); + +private: + /// Find the index of a given type in a range of other types. + template + struct index_of_t; + template + struct index_of_t : std::integral_constant {}; + template + struct index_of_t + : std::integral_constant::value> {}; + + /// Return the kind used for the given T. + template + static Kind getKindOf() { + return static_cast(index_of_t::value); + } + + /// The internal implementation of `cast`, that returns the underlying value + /// as the given type `T`. + template + std::enable_if_t::value, T> + castImpl() const { + return T::getFromOpaquePointer(value); + } + template + std::enable_if_t::value, T> + castImpl() const { + return *reinterpret_cast(const_cast(value)); + } + template + std::enable_if_t::value, T> castImpl() const { + return reinterpret_cast(const_cast(value)); + } + + /// The internal opaque representation of a PDLValue. + const void *value{nullptr}; + /// The kind of the opaque value. 
+ Kind kind{Kind::Attribute}; +}; + +inline raw_ostream &operator<<(raw_ostream &os, PDLValue value) { + value.print(os); + return os; +} + +inline raw_ostream &operator<<(raw_ostream &os, PDLValue::Kind kind) { + PDLValue::print(os, kind); + return os; +} + +//===----------------------------------------------------------------------===// +// PDLResultList + +/// The class represents a list of PDL results, returned by a native rewrite +/// method. It provides the mechanism with which to pass PDLValues back to the +/// PDL bytecode. +class PDLResultList { +public: + /// Push a new Attribute value onto the result list. + void push_back(Attribute value) { results.push_back(value); } + + /// Push a new Operation onto the result list. + void push_back(Operation *value) { results.push_back(value); } + + /// Push a new Type onto the result list. + void push_back(Type value) { results.push_back(value); } + + /// Push a new TypeRange onto the result list. + void push_back(TypeRange value) { + // The lifetime of a TypeRange can't be guaranteed, so we'll need to + // allocate a storage for it. + llvm::OwningArrayRef storage(value.size()); + llvm::copy(value, storage.begin()); + allocatedTypeRanges.emplace_back(std::move(storage)); + typeRanges.push_back(allocatedTypeRanges.back()); + results.push_back(&typeRanges.back()); + } + void push_back(ValueTypeRange value) { + typeRanges.push_back(value); + results.push_back(&typeRanges.back()); + } + void push_back(ValueTypeRange value) { + typeRanges.push_back(value); + results.push_back(&typeRanges.back()); + } + + /// Push a new Value onto the result list. + void push_back(Value value) { results.push_back(value); } + + /// Push a new ValueRange onto the result list. + void push_back(ValueRange value) { + // The lifetime of a ValueRange can't be guaranteed, so we'll need to + // allocate a storage for it. + llvm::OwningArrayRef storage(value.size()); + llvm::copy(value, storage.begin()); + allocatedValueRanges.emplace_back(std::move(storage)); + valueRanges.push_back(allocatedValueRanges.back()); + results.push_back(&valueRanges.back()); + } + void push_back(OperandRange value) { + valueRanges.push_back(value); + results.push_back(&valueRanges.back()); + } + void push_back(ResultRange value) { + valueRanges.push_back(value); + results.push_back(&valueRanges.back()); + } + +protected: + /// Create a new result list with the expected number of results. + PDLResultList(unsigned maxNumResults) { + // For now just reserve enough space for all of the results. We could do + // separate counts per range type, but it isn't really worth it unless there + // are a "large" number of results. + typeRanges.reserve(maxNumResults); + valueRanges.reserve(maxNumResults); + } + + /// The PDL results held by this list. + SmallVector results; + /// Memory used to store ranges held by the list. + SmallVector typeRanges; + SmallVector valueRanges; + /// Memory allocated to store ranges in the result list whose lifetime was + /// generated in the native function. + SmallVector> allocatedTypeRanges; + SmallVector> allocatedValueRanges; +}; + +//===----------------------------------------------------------------------===// +// PDLPatternConfig + +/// An individual configuration for a pattern, which can be accessed by native +/// functions via the PDLPatternConfigSet. This allows for injecting additional +/// configuration into PDL patterns that is specific to certain compilation +/// flows. 
+class PDLPatternConfig { +public: + virtual ~PDLPatternConfig() = default; + + /// Hooks that are invoked at the beginning and end of a rewrite of a matched + /// pattern. These can be used to setup any specific state necessary for the + /// rewrite. + virtual void notifyRewriteBegin(PatternRewriter &rewriter) {} + virtual void notifyRewriteEnd(PatternRewriter &rewriter) {} + + /// Return the TypeID that represents this configuration. + TypeID getTypeID() const { return id; } + +protected: + PDLPatternConfig(TypeID id) : id(id) {} + +private: + TypeID id; +}; + +/// This class provides a base class for users implementing a type of pattern +/// configuration. +template +class PDLPatternConfigBase : public PDLPatternConfig { +public: + /// Support LLVM style casting. + static bool classof(const PDLPatternConfig *config) { + return config->getTypeID() == getConfigID(); + } + + /// Return the type id used for this configuration. + static TypeID getConfigID() { return TypeID::get(); } + +protected: + PDLPatternConfigBase() : PDLPatternConfig(getConfigID()) {} +}; + +/// This class contains a set of configurations for a specific pattern. +/// Configurations are uniqued by TypeID, meaning that only one configuration of +/// each type is allowed. +class PDLPatternConfigSet { +public: + PDLPatternConfigSet() = default; + + /// Construct a set with the given configurations. + template + PDLPatternConfigSet(ConfigsT &&...configs) { + (addConfig(std::forward(configs)), ...); + } + + /// Get the configuration defined by the given type. Asserts that the + /// configuration of the provided type exists. + template + const T &get() const { + const T *config = tryGet(); + assert(config && "configuration not found"); + return *config; + } + + /// Get the configuration defined by the given type, returns nullptr if the + /// configuration does not exist. + template + const T *tryGet() const { + for (const auto &configIt : configs) + if (const T *config = dyn_cast(configIt.get())) + return config; + return nullptr; + } + + /// Notify the configurations within this set at the beginning or end of a + /// rewrite of a matched pattern. + void notifyRewriteBegin(PatternRewriter &rewriter) { + for (const auto &config : configs) + config->notifyRewriteBegin(rewriter); + } + void notifyRewriteEnd(PatternRewriter &rewriter) { + for (const auto &config : configs) + config->notifyRewriteEnd(rewriter); + } + +protected: + /// Add a configuration to the set. + template + void addConfig(T &&config) { + assert(!tryGet>() && "configuration already exists"); + configs.emplace_back( + std::make_unique>(std::forward(config))); + } + + /// The set of configurations for this pattern. This uses a vector instead of + /// a map with the expectation that the number of configurations per set is + /// small (<= 1). + SmallVector> configs; +}; + +//===----------------------------------------------------------------------===// +// PDLPatternModule + +/// A generic PDL pattern constraint function. This function applies a +/// constraint to a given set of opaque PDLValue entities. Returns success if +/// the constraint successfully held, failure otherwise. +using PDLConstraintFunction = + std::function)>; +/// A native PDL rewrite function. This function performs a rewrite on the +/// given set of values. Any results from this rewrite that should be passed +/// back to PDL should be added to the provided result list. This method is only +/// invoked when the corresponding match was successful. 
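A minimal sketch of a flow-specific configuration built on the classes above (illustrative only; MyFlowConfig is a hypothetical name):

  // Hypothetical configuration carried alongside a set of PDL patterns.
  struct MyFlowConfig : public PDLPatternConfigBase<MyFlowConfig> {
    void notifyRewriteBegin(PatternRewriter &rewriter) override {
      // Set up flow-specific state before the matched pattern is rewritten.
    }
    bool emitRemarks = false;
  };

  // Configurations are uniqued by TypeID, so a set holds at most one
  // MyFlowConfig instance.
  PDLPatternConfigSet configSet(MyFlowConfig{});
  const MyFlowConfig &cfg = configSet.get<MyFlowConfig>();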
Returns failure if an +/// invariant of the rewrite was broken (certain rewriters may recover from +/// partial pattern application). +using PDLRewriteFunction = std::function)>; + +namespace detail { +namespace pdl_function_builder { +/// A utility variable that always resolves to false. This is useful for static +/// asserts that are always false, but only should fire in certain templated +/// constructs. For example, if a templated function should never be called, the +/// function could be defined as: +/// +/// template +/// void foo() { +/// static_assert(always_false, "This function should never be called"); +/// } +/// +template +constexpr bool always_false = false; + +//===----------------------------------------------------------------------===// +// PDL Function Builder: Type Processing +//===----------------------------------------------------------------------===// + +/// This struct provides a convenient way to determine how to process a given +/// type as either a PDL parameter, or a result value. This allows for +/// supporting complex types in constraint and rewrite functions, without +/// requiring the user to hand-write the necessary glue code themselves. +/// Specializations of this class should implement the following methods to +/// enable support as a PDL argument or result type: +/// +/// static LogicalResult verifyAsArg( +/// function_ref errorFn, PDLValue pdlValue, +/// size_t argIdx); +/// +/// * This method verifies that the given PDLValue is valid for use as a +/// value of `T`. +/// +/// static T processAsArg(PDLValue pdlValue); +/// +/// * This method processes the given PDLValue as a value of `T`. +/// +/// static void processAsResult(PatternRewriter &, PDLResultList &results, +/// const T &value); +/// +/// * This method processes the given value of `T` as the result of a +/// function invocation. The method should package the value into an +/// appropriate form and append it to the given result list. +/// +/// If the type `T` is based on a higher order value, consider using +/// `ProcessPDLValueBasedOn` as a base class of the specialization to simplify +/// the implementation. +/// +template +struct ProcessPDLValue; + +/// This struct provides a simplified model for processing types that are based +/// on another type, e.g. APInt is based on the handling for IntegerAttr. This +/// allows for building the necessary processing functions on top of the base +/// value instead of a PDLValue. Derived users should implement the following +/// (which subsume the ProcessPDLValue variants): +/// +/// static LogicalResult verifyAsArg( +/// function_ref errorFn, +/// const BaseT &baseValue, size_t argIdx); +/// +/// * This method verifies that the given PDLValue is valid for use as a +/// value of `T`. +/// +/// static T processAsArg(BaseT baseValue); +/// +/// * This method processes the given base value as a value of `T`. +/// +template +struct ProcessPDLValueBasedOn { + static LogicalResult + verifyAsArg(function_ref errorFn, + PDLValue pdlValue, size_t argIdx) { + // Verify the base class before continuing. 
+ if (failed(ProcessPDLValue::verifyAsArg(errorFn, pdlValue, argIdx))) + return failure(); + return ProcessPDLValue::verifyAsArg( + errorFn, ProcessPDLValue::processAsArg(pdlValue), argIdx); + } + static T processAsArg(PDLValue pdlValue) { + return ProcessPDLValue::processAsArg( + ProcessPDLValue::processAsArg(pdlValue)); + } + + /// Explicitly add the expected parent API to ensure the parent class + /// implements the necessary API (and doesn't implicitly inherit it from + /// somewhere else). + static LogicalResult + verifyAsArg(function_ref errorFn, BaseT value, + size_t argIdx) { + return success(); + } + static T processAsArg(BaseT baseValue); +}; + +/// This struct provides a simplified model for processing types that have +/// "builtin" PDLValue support: +/// * Attribute, Operation *, Type, TypeRange, ValueRange +template +struct ProcessBuiltinPDLValue { + static LogicalResult + verifyAsArg(function_ref errorFn, + PDLValue pdlValue, size_t argIdx) { + if (pdlValue) + return success(); + return errorFn("expected a non-null value for argument " + Twine(argIdx) + + " of type: " + llvm::getTypeName()); + } + + static T processAsArg(PDLValue pdlValue) { return pdlValue.cast(); } + static void processAsResult(PatternRewriter &, PDLResultList &results, + T value) { + results.push_back(value); + } +}; + +/// This struct provides a simplified model for processing types that inherit +/// from builtin PDLValue types. For example, derived attributes like +/// IntegerAttr, derived types like IntegerType, derived operations like +/// ModuleOp, Interfaces, etc. +template +struct ProcessDerivedPDLValue : public ProcessPDLValueBasedOn { + static LogicalResult + verifyAsArg(function_ref errorFn, + BaseT baseValue, size_t argIdx) { + return TypeSwitch(baseValue) + .Case([&](T) { return success(); }) + .Default([&](BaseT) { + return errorFn("expected argument " + Twine(argIdx) + + " to be of type: " + llvm::getTypeName()); + }); + } + using ProcessPDLValueBasedOn::verifyAsArg; + + static T processAsArg(BaseT baseValue) { + return baseValue.template cast(); + } + using ProcessPDLValueBasedOn::processAsArg; + + static void processAsResult(PatternRewriter &, PDLResultList &results, + T value) { + results.push_back(value); + } +}; + +//===----------------------------------------------------------------------===// +// Attribute + +template <> +struct ProcessPDLValue : public ProcessBuiltinPDLValue {}; +template +struct ProcessPDLValue::value>> + : public ProcessDerivedPDLValue {}; + +/// Handling for various Attribute value types. 
+template <> +struct ProcessPDLValue + : public ProcessPDLValueBasedOn { + static StringRef processAsArg(StringAttr value) { return value.getValue(); } + using ProcessPDLValueBasedOn::processAsArg; + + static void processAsResult(PatternRewriter &rewriter, PDLResultList &results, + StringRef value) { + results.push_back(rewriter.getStringAttr(value)); + } +}; +template <> +struct ProcessPDLValue + : public ProcessPDLValueBasedOn { + template + static std::string processAsArg(T value) { + static_assert(always_false, + "`std::string` arguments require a string copy, use " + "`StringRef` for string-like arguments instead"); + return {}; + } + static void processAsResult(PatternRewriter &rewriter, PDLResultList &results, + StringRef value) { + results.push_back(rewriter.getStringAttr(value)); + } +}; + +//===----------------------------------------------------------------------===// +// Operation + +template <> +struct ProcessPDLValue + : public ProcessBuiltinPDLValue {}; +template +struct ProcessPDLValue::value>> + : public ProcessDerivedPDLValue { + static T processAsArg(Operation *value) { return cast(value); } +}; + +//===----------------------------------------------------------------------===// +// Type + +template <> +struct ProcessPDLValue : public ProcessBuiltinPDLValue {}; +template +struct ProcessPDLValue::value>> + : public ProcessDerivedPDLValue {}; + +//===----------------------------------------------------------------------===// +// TypeRange + +template <> +struct ProcessPDLValue : public ProcessBuiltinPDLValue {}; +template <> +struct ProcessPDLValue> { + static void processAsResult(PatternRewriter &, PDLResultList &results, + ValueTypeRange types) { + results.push_back(types); + } +}; +template <> +struct ProcessPDLValue> { + static void processAsResult(PatternRewriter &, PDLResultList &results, + ValueTypeRange types) { + results.push_back(types); + } +}; +template +struct ProcessPDLValue> { + static void processAsResult(PatternRewriter &, PDLResultList &results, + SmallVector values) { + results.push_back(TypeRange(values)); + } +}; + +//===----------------------------------------------------------------------===// +// Value + +template <> +struct ProcessPDLValue : public ProcessBuiltinPDLValue {}; + +//===----------------------------------------------------------------------===// +// ValueRange + +template <> +struct ProcessPDLValue : public ProcessBuiltinPDLValue { +}; +template <> +struct ProcessPDLValue { + static void processAsResult(PatternRewriter &, PDLResultList &results, + OperandRange values) { + results.push_back(values); + } +}; +template <> +struct ProcessPDLValue { + static void processAsResult(PatternRewriter &, PDLResultList &results, + ResultRange values) { + results.push_back(values); + } +}; +template +struct ProcessPDLValue> { + static void processAsResult(PatternRewriter &, PDLResultList &results, + SmallVector values) { + results.push_back(ValueRange(values)); + } +}; + +//===----------------------------------------------------------------------===// +// PDL Function Builder: Argument Handling +//===----------------------------------------------------------------------===// + +/// Validate the given PDLValues match the constraints defined by the argument +/// types of the given function. In the case of failure, a match failure +/// diagnostic is emitted. +/// FIXME: This should be completely removed in favor of `assertArgs`, but PDL +/// does not currently preserve Constraint application ordering. 
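As a hypothetical illustration of this extension point (not part of this patch), a downstream user could let native functions traffic in plain `bool` by routing through BoolAttr, mirroring the StringRef handling above; the specialization would live in mlir::detail::pdl_function_builder like the ones shown here:

  template <>
  struct ProcessPDLValue<bool> : public ProcessPDLValueBasedOn<bool, BoolAttr> {
    // BoolAttr -> bool for arguments; verification is inherited from the
    // derived-Attribute handling of BoolAttr.
    static bool processAsArg(BoolAttr value) { return value.getValue(); }
    using ProcessPDLValueBasedOn<bool, BoolAttr>::processAsArg;

    // bool -> BoolAttr when returned from a native rewrite.
    static void processAsResult(PatternRewriter &rewriter,
                                PDLResultList &results, bool value) {
      results.push_back(rewriter.getBoolAttr(value));
    }
  };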
+template +LogicalResult verifyAsArgs(PatternRewriter &rewriter, ArrayRef values, + std::index_sequence) { + using FnTraitsT = llvm::function_traits; + + auto errorFn = [&](const Twine &msg) { + return rewriter.notifyMatchFailure(rewriter.getUnknownLoc(), msg); + }; + return success( + (succeeded(ProcessPDLValue>:: + verifyAsArg(errorFn, values[I], I)) && + ...)); +} + +/// Assert that the given PDLValues match the constraints defined by the +/// arguments of the given function. In the case of failure, a fatal error +/// is emitted. +template +void assertArgs(PatternRewriter &rewriter, ArrayRef values, + std::index_sequence) { + // We only want to do verification in debug builds, same as with `assert`. +#if LLVM_ENABLE_ABI_BREAKING_CHECKS + using FnTraitsT = llvm::function_traits; + auto errorFn = [&](const Twine &msg) -> LogicalResult { + llvm::report_fatal_error(msg); + }; + (void)errorFn; + assert((succeeded(ProcessPDLValue>:: + verifyAsArg(errorFn, values[I], I)) && + ...)); +#endif + (void)values; +} + +//===----------------------------------------------------------------------===// +// PDL Function Builder: Results Handling +//===----------------------------------------------------------------------===// + +/// Store a single result within the result list. +template +static LogicalResult processResults(PatternRewriter &rewriter, + PDLResultList &results, T &&value) { + ProcessPDLValue::processAsResult(rewriter, results, + std::forward(value)); + return success(); +} + +/// Store a std::pair<> as individual results within the result list. +template +static LogicalResult processResults(PatternRewriter &rewriter, + PDLResultList &results, + std::pair &&pair) { + if (failed(processResults(rewriter, results, std::move(pair.first))) || + failed(processResults(rewriter, results, std::move(pair.second)))) + return failure(); + return success(); +} + +/// Store a std::tuple<> as individual results within the result list. +template +static LogicalResult processResults(PatternRewriter &rewriter, + PDLResultList &results, + std::tuple &&tuple) { + auto applyFn = [&](auto &&...args) { + return (succeeded(processResults(rewriter, results, std::move(args))) && + ...); + }; + return success(std::apply(applyFn, std::move(tuple))); +} + +/// Handle LogicalResult propagation. +inline LogicalResult processResults(PatternRewriter &rewriter, + PDLResultList &results, + LogicalResult &&result) { + return result; +} +template +static LogicalResult processResults(PatternRewriter &rewriter, + PDLResultList &results, + FailureOr &&result) { + if (failed(result)) + return failure(); + return processResults(rewriter, results, std::move(*result)); +} + +//===----------------------------------------------------------------------===// +// PDL Constraint Builder +//===----------------------------------------------------------------------===// + +/// Process the arguments of a native constraint and invoke it. +template > +typename FnTraitsT::result_t +processArgsAndInvokeConstraint(PDLFnT &fn, PatternRewriter &rewriter, + ArrayRef values, + std::index_sequence) { + return fn( + rewriter, + (ProcessPDLValue>::processAsArg( + values[I]))...); +} + +/// Build a constraint function from the given function `ConstraintFnT`. This +/// allows for enabling the user to define simpler, more direct constraint +/// functions without needing to handle the low-level PDL goop. +/// +/// If the constraint function is already in the correct form, we just forward +/// it directly. 
+template +std::enable_if_t< + std::is_convertible::value, + PDLConstraintFunction> +buildConstraintFn(ConstraintFnT &&constraintFn) { + return std::forward(constraintFn); +} +/// Otherwise, we generate a wrapper that will unpack the PDLValues in the form +/// we desire. +template +std::enable_if_t< + !std::is_convertible::value, + PDLConstraintFunction> +buildConstraintFn(ConstraintFnT &&constraintFn) { + return [constraintFn = std::forward(constraintFn)]( + PatternRewriter &rewriter, + ArrayRef values) -> LogicalResult { + auto argIndices = std::make_index_sequence< + llvm::function_traits::num_args - 1>(); + if (failed(verifyAsArgs(rewriter, values, argIndices))) + return failure(); + return processArgsAndInvokeConstraint(constraintFn, rewriter, values, + argIndices); + }; +} + +//===----------------------------------------------------------------------===// +// PDL Rewrite Builder +//===----------------------------------------------------------------------===// + +/// Process the arguments of a native rewrite and invoke it. +/// This overload handles the case of no return values. +template > +std::enable_if_t::value, + LogicalResult> +processArgsAndInvokeRewrite(PDLFnT &fn, PatternRewriter &rewriter, + PDLResultList &, ArrayRef values, + std::index_sequence) { + fn(rewriter, + (ProcessPDLValue>::processAsArg( + values[I]))...); + return success(); +} +/// This overload handles the case of return values, which need to be packaged +/// into the result list. +template > +std::enable_if_t::value, + LogicalResult> +processArgsAndInvokeRewrite(PDLFnT &fn, PatternRewriter &rewriter, + PDLResultList &results, ArrayRef values, + std::index_sequence) { + return processResults( + rewriter, results, + fn(rewriter, (ProcessPDLValue>:: + processAsArg(values[I]))...)); + (void)values; +} + +/// Build a rewrite function from the given function `RewriteFnT`. This +/// allows for enabling the user to define simpler, more direct rewrite +/// functions without needing to handle the low-level PDL goop. +/// +/// If the rewrite function is already in the correct form, we just forward +/// it directly. +template +std::enable_if_t::value, + PDLRewriteFunction> +buildRewriteFn(RewriteFnT &&rewriteFn) { + return std::forward(rewriteFn); +} +/// Otherwise, we generate a wrapper that will unpack the PDLValues in the form +/// we desire. +template +std::enable_if_t::value, + PDLRewriteFunction> +buildRewriteFn(RewriteFnT &&rewriteFn) { + return [rewriteFn = std::forward(rewriteFn)]( + PatternRewriter &rewriter, PDLResultList &results, + ArrayRef values) { + auto argIndices = + std::make_index_sequence::num_args - + 1>(); + assertArgs(rewriter, values, argIndices); + return processArgsAndInvokeRewrite(rewriteFn, rewriter, results, values, + argIndices); + }; +} + +} // namespace pdl_function_builder +} // namespace detail + +//===----------------------------------------------------------------------===// +// PDLPatternModule + +/// This class contains all of the necessary data for a set of PDL patterns, or +/// pattern rewrites specified in the form of the PDL dialect. This PDL module +/// contained by this pattern may contain any number of `pdl.pattern` +/// operations. +class PDLPatternModule { +public: + PDLPatternModule() = default; + + /// Construct a PDL pattern with the given module and configurations. 
+ PDLPatternModule(OwningOpRef module) + : pdlModule(std::move(module)) {} + template + PDLPatternModule(OwningOpRef module, ConfigsT &&...patternConfigs) + : PDLPatternModule(std::move(module)) { + auto configSet = std::make_unique( + std::forward(patternConfigs)...); + attachConfigToPatterns(*pdlModule, *configSet); + configs.emplace_back(std::move(configSet)); + } + + /// Merge the state in `other` into this pattern module. + void mergeIn(PDLPatternModule &&other); + + /// Return the internal PDL module of this pattern. + ModuleOp getModule() { return pdlModule.get(); } + + /// Return the MLIR context of this pattern. + MLIRContext *getContext() { return getModule()->getContext(); } + + //===--------------------------------------------------------------------===// + // Function Registry + + /// Register a constraint function with PDL. A constraint function may be + /// specified in one of two ways: + /// + /// * `LogicalResult (PatternRewriter &, ArrayRef)` + /// + /// In this overload the arguments of the constraint function are passed via + /// the low-level PDLValue form. + /// + /// * `LogicalResult (PatternRewriter &, ValueTs... values)` + /// + /// In this form the arguments of the constraint function are passed via the + /// expected high level C++ type. In this form, the framework will + /// automatically unwrap PDLValues and convert them to the expected ValueTs. + /// For example, if the constraint function accepts a `Operation *`, the + /// framework will automatically cast the input PDLValue. In the case of a + /// `StringRef`, the framework will automatically unwrap the argument as a + /// StringAttr and pass the underlying string value. To see the full list of + /// supported types, or to see how to add handling for custom types, view + /// the definition of `ProcessPDLValue` above. + void registerConstraintFunction(StringRef name, + PDLConstraintFunction constraintFn); + template + void registerConstraintFunction(StringRef name, + ConstraintFnT &&constraintFn) { + registerConstraintFunction(name, + detail::pdl_function_builder::buildConstraintFn( + std::forward(constraintFn))); + } + + /// Register a rewrite function with PDL. A rewrite function may be specified + /// in one of two ways: + /// + /// * `void (PatternRewriter &, PDLResultList &, ArrayRef)` + /// + /// In this overload the arguments of the constraint function are passed via + /// the low-level PDLValue form, and the results are manually appended to + /// the given result list. + /// + /// * `ResultT (PatternRewriter &, ValueTs... values)` + /// + /// In this form the arguments and result of the rewrite function are passed + /// via the expected high level C++ type. In this form, the framework will + /// automatically unwrap the PDLValues arguments and convert them to the + /// expected ValueTs. It will also automatically handle the processing and + /// packaging of the result value to the result list. For example, if the + /// rewrite function takes a `Operation *`, the framework will automatically + /// cast the input PDLValue. In the case of a `StringRef`, the framework + /// will automatically unwrap the argument as a StringAttr and pass the + /// underlying string value. In the reverse case, if the rewrite returns a + /// StringRef or std::string, it will automatically package this as a + /// StringAttr and append it to the result list. To see the full list of + /// supported types, or to see how to add handling for custom types, view + /// the definition of `ProcessPDLValue` above. 
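A usage sketch for this registry (illustrative; the function names and the pdlModule variable are hypothetical). The high-level forms let the framework unwrap PDLValues into the declared C++ parameter types and package returned values back into results:

  // pdlModule: an OwningOpRef<ModuleOp> holding parsed pdl.pattern ops (assumed).
  PDLPatternModule pdlPatterns(std::move(pdlModule));

  // High-level constraint: the Value argument is unwrapped automatically.
  pdlPatterns.registerConstraintFunction(
      "has_one_use", [](PatternRewriter &rewriter, Value value) {
        return success(value.hasOneUse());
      });

  // High-level rewrite: the returned IntegerAttr is appended to the results.
  pdlPatterns.registerRewriteFunction(
      "get_operand_count", [](PatternRewriter &rewriter, Operation *op) {
        return rewriter.getI64IntegerAttr(op->getNumOperands());
      });

In a pdl.pattern these are referenced by name through pdl.apply_native_constraint and pdl.apply_native_rewrite.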
+ void registerRewriteFunction(StringRef name, PDLRewriteFunction rewriteFn); + template + void registerRewriteFunction(StringRef name, RewriteFnT &&rewriteFn) { + registerRewriteFunction(name, detail::pdl_function_builder::buildRewriteFn( + std::forward(rewriteFn))); + } + + /// Return the set of the registered constraint functions. + const llvm::StringMap &getConstraintFunctions() const { + return constraintFunctions; + } + llvm::StringMap takeConstraintFunctions() { + return constraintFunctions; + } + /// Return the set of the registered rewrite functions. + const llvm::StringMap &getRewriteFunctions() const { + return rewriteFunctions; + } + llvm::StringMap takeRewriteFunctions() { + return rewriteFunctions; + } + + /// Return the set of the registered pattern configs. + SmallVector> takeConfigs() { + return std::move(configs); + } + DenseMap takeConfigMap() { + return std::move(configMap); + } + + /// Clear out the patterns and functions within this module. + void clear() { + pdlModule = nullptr; + constraintFunctions.clear(); + rewriteFunctions.clear(); + } + +private: + /// Attach the given pattern config set to the patterns defined within the + /// given module. + void attachConfigToPatterns(ModuleOp module, PDLPatternConfigSet &configSet); + + /// The module containing the `pdl.pattern` operations. + OwningOpRef pdlModule; + + /// The set of configuration sets referenced by patterns within `pdlModule`. + SmallVector> configs; + DenseMap configMap; + + /// The external functions referenced from within the PDL module. + llvm::StringMap constraintFunctions; + llvm::StringMap rewriteFunctions; +}; +} // namespace mlir + +#else + +namespace mlir { +// Stubs for when PDL in pattern rewrites is not enabled. + +class PDLValue { +public: + template + T dyn_cast() const { + return nullptr; + } +}; +class PDLResultList {}; +using PDLConstraintFunction = + std::function)>; +using PDLRewriteFunction = std::function)>; + +class PDLPatternModule { +public: + PDLPatternModule() = default; + + PDLPatternModule(OwningOpRef /*module*/) {} + MLIRContext *getContext() { + llvm_unreachable("Error: PDL for rewrites when PDL is not enabled"); + } + void mergeIn(PDLPatternModule &&other) {} + void clear() {} + template + void registerConstraintFunction(StringRef name, + ConstraintFnT &&constraintFn) {} + void registerRewriteFunction(StringRef name, PDLRewriteFunction rewriteFn) {} + template + void registerRewriteFunction(StringRef name, RewriteFnT &&rewriteFn) {} + const llvm::StringMap &getConstraintFunctions() const { + return constraintFunctions; + } + +private: + llvm::StringMap constraintFunctions; +}; + +} // namespace mlir +#endif + +#endif // MLIR_IR_PDLPATTERNMATCH_H diff --git a/mlir/include/mlir/IR/PatternMatch.h b/mlir/include/mlir/IR/PatternMatch.h index 6625ef553eba2..9b4fa65bff49e 100644 --- a/mlir/include/mlir/IR/PatternMatch.h +++ b/mlir/include/mlir/IR/PatternMatch.h @@ -735,932 +735,12 @@ class PatternRewriter : public RewriterBase { virtual bool canRecoverFromRewriteFailure() const { return false; } }; -//===----------------------------------------------------------------------===// -// PDL Patterns -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// PDLValue - -/// Storage type of byte-code interpreter values. These are passed to constraint -/// functions as arguments. -class PDLValue { -public: - /// The underlying kind of a PDL value. 
- enum class Kind { Attribute, Operation, Type, TypeRange, Value, ValueRange }; - - /// Construct a new PDL value. - PDLValue(const PDLValue &other) = default; - PDLValue(std::nullptr_t = nullptr) {} - PDLValue(Attribute value) - : value(value.getAsOpaquePointer()), kind(Kind::Attribute) {} - PDLValue(Operation *value) : value(value), kind(Kind::Operation) {} - PDLValue(Type value) : value(value.getAsOpaquePointer()), kind(Kind::Type) {} - PDLValue(TypeRange *value) : value(value), kind(Kind::TypeRange) {} - PDLValue(Value value) - : value(value.getAsOpaquePointer()), kind(Kind::Value) {} - PDLValue(ValueRange *value) : value(value), kind(Kind::ValueRange) {} - - /// Returns true if the type of the held value is `T`. - template - bool isa() const { - assert(value && "isa<> used on a null value"); - return kind == getKindOf(); - } - - /// Attempt to dynamically cast this value to type `T`, returns null if this - /// value is not an instance of `T`. - template ::value, T, std::optional>> - ResultT dyn_cast() const { - return isa() ? castImpl() : ResultT(); - } - - /// Cast this value to type `T`, asserts if this value is not an instance of - /// `T`. - template - T cast() const { - assert(isa() && "expected value to be of type `T`"); - return castImpl(); - } - - /// Get an opaque pointer to the value. - const void *getAsOpaquePointer() const { return value; } - - /// Return if this value is null or not. - explicit operator bool() const { return value; } - - /// Return the kind of this value. - Kind getKind() const { return kind; } - - /// Print this value to the provided output stream. - void print(raw_ostream &os) const; - - /// Print the specified value kind to an output stream. - static void print(raw_ostream &os, Kind kind); - -private: - /// Find the index of a given type in a range of other types. - template - struct index_of_t; - template - struct index_of_t : std::integral_constant {}; - template - struct index_of_t - : std::integral_constant::value> {}; - - /// Return the kind used for the given T. - template - static Kind getKindOf() { - return static_cast(index_of_t::value); - } - - /// The internal implementation of `cast`, that returns the underlying value - /// as the given type `T`. - template - std::enable_if_t::value, T> - castImpl() const { - return T::getFromOpaquePointer(value); - } - template - std::enable_if_t::value, T> - castImpl() const { - return *reinterpret_cast(const_cast(value)); - } - template - std::enable_if_t::value, T> castImpl() const { - return reinterpret_cast(const_cast(value)); - } - - /// The internal opaque representation of a PDLValue. - const void *value{nullptr}; - /// The kind of the opaque value. - Kind kind{Kind::Attribute}; -}; - -inline raw_ostream &operator<<(raw_ostream &os, PDLValue value) { - value.print(os); - return os; -} - -inline raw_ostream &operator<<(raw_ostream &os, PDLValue::Kind kind) { - PDLValue::print(os, kind); - return os; -} - -//===----------------------------------------------------------------------===// -// PDLResultList - -/// The class represents a list of PDL results, returned by a native rewrite -/// method. It provides the mechanism with which to pass PDLValues back to the -/// PDL bytecode. -class PDLResultList { -public: - /// Push a new Attribute value onto the result list. - void push_back(Attribute value) { results.push_back(value); } - - /// Push a new Operation onto the result list. - void push_back(Operation *value) { results.push_back(value); } - - /// Push a new Type onto the result list. 
- void push_back(Type value) { results.push_back(value); } - - /// Push a new TypeRange onto the result list. - void push_back(TypeRange value) { - // The lifetime of a TypeRange can't be guaranteed, so we'll need to - // allocate a storage for it. - llvm::OwningArrayRef storage(value.size()); - llvm::copy(value, storage.begin()); - allocatedTypeRanges.emplace_back(std::move(storage)); - typeRanges.push_back(allocatedTypeRanges.back()); - results.push_back(&typeRanges.back()); - } - void push_back(ValueTypeRange value) { - typeRanges.push_back(value); - results.push_back(&typeRanges.back()); - } - void push_back(ValueTypeRange value) { - typeRanges.push_back(value); - results.push_back(&typeRanges.back()); - } - - /// Push a new Value onto the result list. - void push_back(Value value) { results.push_back(value); } - - /// Push a new ValueRange onto the result list. - void push_back(ValueRange value) { - // The lifetime of a ValueRange can't be guaranteed, so we'll need to - // allocate a storage for it. - llvm::OwningArrayRef storage(value.size()); - llvm::copy(value, storage.begin()); - allocatedValueRanges.emplace_back(std::move(storage)); - valueRanges.push_back(allocatedValueRanges.back()); - results.push_back(&valueRanges.back()); - } - void push_back(OperandRange value) { - valueRanges.push_back(value); - results.push_back(&valueRanges.back()); - } - void push_back(ResultRange value) { - valueRanges.push_back(value); - results.push_back(&valueRanges.back()); - } - -protected: - /// Create a new result list with the expected number of results. - PDLResultList(unsigned maxNumResults) { - // For now just reserve enough space for all of the results. We could do - // separate counts per range type, but it isn't really worth it unless there - // are a "large" number of results. - typeRanges.reserve(maxNumResults); - valueRanges.reserve(maxNumResults); - } - - /// The PDL results held by this list. - SmallVector results; - /// Memory used to store ranges held by the list. - SmallVector typeRanges; - SmallVector valueRanges; - /// Memory allocated to store ranges in the result list whose lifetime was - /// generated in the native function. - SmallVector> allocatedTypeRanges; - SmallVector> allocatedValueRanges; -}; - -//===----------------------------------------------------------------------===// -// PDLPatternConfig - -/// An individual configuration for a pattern, which can be accessed by native -/// functions via the PDLPatternConfigSet. This allows for injecting additional -/// configuration into PDL patterns that is specific to certain compilation -/// flows. -class PDLPatternConfig { -public: - virtual ~PDLPatternConfig() = default; - - /// Hooks that are invoked at the beginning and end of a rewrite of a matched - /// pattern. These can be used to setup any specific state necessary for the - /// rewrite. - virtual void notifyRewriteBegin(PatternRewriter &rewriter) {} - virtual void notifyRewriteEnd(PatternRewriter &rewriter) {} - - /// Return the TypeID that represents this configuration. - TypeID getTypeID() const { return id; } - -protected: - PDLPatternConfig(TypeID id) : id(id) {} - -private: - TypeID id; -}; - -/// This class provides a base class for users implementing a type of pattern -/// configuration. -template -class PDLPatternConfigBase : public PDLPatternConfig { -public: - /// Support LLVM style casting. - static bool classof(const PDLPatternConfig *config) { - return config->getTypeID() == getConfigID(); - } - - /// Return the type id used for this configuration. 
- static TypeID getConfigID() { return TypeID::get(); } - -protected: - PDLPatternConfigBase() : PDLPatternConfig(getConfigID()) {} -}; - -/// This class contains a set of configurations for a specific pattern. -/// Configurations are uniqued by TypeID, meaning that only one configuration of -/// each type is allowed. -class PDLPatternConfigSet { -public: - PDLPatternConfigSet() = default; - - /// Construct a set with the given configurations. - template - PDLPatternConfigSet(ConfigsT &&...configs) { - (addConfig(std::forward(configs)), ...); - } - - /// Get the configuration defined by the given type. Asserts that the - /// configuration of the provided type exists. - template - const T &get() const { - const T *config = tryGet(); - assert(config && "configuration not found"); - return *config; - } - - /// Get the configuration defined by the given type, returns nullptr if the - /// configuration does not exist. - template - const T *tryGet() const { - for (const auto &configIt : configs) - if (const T *config = dyn_cast(configIt.get())) - return config; - return nullptr; - } - - /// Notify the configurations within this set at the beginning or end of a - /// rewrite of a matched pattern. - void notifyRewriteBegin(PatternRewriter &rewriter) { - for (const auto &config : configs) - config->notifyRewriteBegin(rewriter); - } - void notifyRewriteEnd(PatternRewriter &rewriter) { - for (const auto &config : configs) - config->notifyRewriteEnd(rewriter); - } - -protected: - /// Add a configuration to the set. - template - void addConfig(T &&config) { - assert(!tryGet>() && "configuration already exists"); - configs.emplace_back( - std::make_unique>(std::forward(config))); - } - - /// The set of configurations for this pattern. This uses a vector instead of - /// a map with the expectation that the number of configurations per set is - /// small (<= 1). - SmallVector> configs; -}; - -//===----------------------------------------------------------------------===// -// PDLPatternModule - -/// A generic PDL pattern constraint function. This function applies a -/// constraint to a given set of opaque PDLValue entities. Returns success if -/// the constraint successfully held, failure otherwise. -using PDLConstraintFunction = - std::function)>; -/// A native PDL rewrite function. This function performs a rewrite on the -/// given set of values. Any results from this rewrite that should be passed -/// back to PDL should be added to the provided result list. This method is only -/// invoked when the corresponding match was successful. Returns failure if an -/// invariant of the rewrite was broken (certain rewriters may recover from -/// partial pattern application). -using PDLRewriteFunction = std::function)>; - -namespace detail { -namespace pdl_function_builder { -/// A utility variable that always resolves to false. This is useful for static -/// asserts that are always false, but only should fire in certain templated -/// constructs. 
For example, if a templated function should never be called, the -/// function could be defined as: -/// -/// template -/// void foo() { -/// static_assert(always_false, "This function should never be called"); -/// } -/// -template -constexpr bool always_false = false; - -//===----------------------------------------------------------------------===// -// PDL Function Builder: Type Processing -//===----------------------------------------------------------------------===// - -/// This struct provides a convenient way to determine how to process a given -/// type as either a PDL parameter, or a result value. This allows for -/// supporting complex types in constraint and rewrite functions, without -/// requiring the user to hand-write the necessary glue code themselves. -/// Specializations of this class should implement the following methods to -/// enable support as a PDL argument or result type: -/// -/// static LogicalResult verifyAsArg( -/// function_ref errorFn, PDLValue pdlValue, -/// size_t argIdx); -/// -/// * This method verifies that the given PDLValue is valid for use as a -/// value of `T`. -/// -/// static T processAsArg(PDLValue pdlValue); -/// -/// * This method processes the given PDLValue as a value of `T`. -/// -/// static void processAsResult(PatternRewriter &, PDLResultList &results, -/// const T &value); -/// -/// * This method processes the given value of `T` as the result of a -/// function invocation. The method should package the value into an -/// appropriate form and append it to the given result list. -/// -/// If the type `T` is based on a higher order value, consider using -/// `ProcessPDLValueBasedOn` as a base class of the specialization to simplify -/// the implementation. -/// -template -struct ProcessPDLValue; - -/// This struct provides a simplified model for processing types that are based -/// on another type, e.g. APInt is based on the handling for IntegerAttr. This -/// allows for building the necessary processing functions on top of the base -/// value instead of a PDLValue. Derived users should implement the following -/// (which subsume the ProcessPDLValue variants): -/// -/// static LogicalResult verifyAsArg( -/// function_ref errorFn, -/// const BaseT &baseValue, size_t argIdx); -/// -/// * This method verifies that the given PDLValue is valid for use as a -/// value of `T`. -/// -/// static T processAsArg(BaseT baseValue); -/// -/// * This method processes the given base value as a value of `T`. -/// -template -struct ProcessPDLValueBasedOn { - static LogicalResult - verifyAsArg(function_ref errorFn, - PDLValue pdlValue, size_t argIdx) { - // Verify the base class before continuing. - if (failed(ProcessPDLValue::verifyAsArg(errorFn, pdlValue, argIdx))) - return failure(); - return ProcessPDLValue::verifyAsArg( - errorFn, ProcessPDLValue::processAsArg(pdlValue), argIdx); - } - static T processAsArg(PDLValue pdlValue) { - return ProcessPDLValue::processAsArg( - ProcessPDLValue::processAsArg(pdlValue)); - } - - /// Explicitly add the expected parent API to ensure the parent class - /// implements the necessary API (and doesn't implicitly inherit it from - /// somewhere else). 
- static LogicalResult - verifyAsArg(function_ref errorFn, BaseT value, - size_t argIdx) { - return success(); - } - static T processAsArg(BaseT baseValue); -}; - -/// This struct provides a simplified model for processing types that have -/// "builtin" PDLValue support: -/// * Attribute, Operation *, Type, TypeRange, ValueRange -template -struct ProcessBuiltinPDLValue { - static LogicalResult - verifyAsArg(function_ref errorFn, - PDLValue pdlValue, size_t argIdx) { - if (pdlValue) - return success(); - return errorFn("expected a non-null value for argument " + Twine(argIdx) + - " of type: " + llvm::getTypeName()); - } - - static T processAsArg(PDLValue pdlValue) { return pdlValue.cast(); } - static void processAsResult(PatternRewriter &, PDLResultList &results, - T value) { - results.push_back(value); - } -}; - -/// This struct provides a simplified model for processing types that inherit -/// from builtin PDLValue types. For example, derived attributes like -/// IntegerAttr, derived types like IntegerType, derived operations like -/// ModuleOp, Interfaces, etc. -template -struct ProcessDerivedPDLValue : public ProcessPDLValueBasedOn { - static LogicalResult - verifyAsArg(function_ref errorFn, - BaseT baseValue, size_t argIdx) { - return TypeSwitch(baseValue) - .Case([&](T) { return success(); }) - .Default([&](BaseT) { - return errorFn("expected argument " + Twine(argIdx) + - " to be of type: " + llvm::getTypeName()); - }); - } - using ProcessPDLValueBasedOn::verifyAsArg; - - static T processAsArg(BaseT baseValue) { - return baseValue.template cast(); - } - using ProcessPDLValueBasedOn::processAsArg; - - static void processAsResult(PatternRewriter &, PDLResultList &results, - T value) { - results.push_back(value); - } -}; - -//===----------------------------------------------------------------------===// -// Attribute - -template <> -struct ProcessPDLValue : public ProcessBuiltinPDLValue {}; -template -struct ProcessPDLValue::value>> - : public ProcessDerivedPDLValue {}; - -/// Handling for various Attribute value types. 
-template <> -struct ProcessPDLValue - : public ProcessPDLValueBasedOn { - static StringRef processAsArg(StringAttr value) { return value.getValue(); } - using ProcessPDLValueBasedOn::processAsArg; - - static void processAsResult(PatternRewriter &rewriter, PDLResultList &results, - StringRef value) { - results.push_back(rewriter.getStringAttr(value)); - } -}; -template <> -struct ProcessPDLValue - : public ProcessPDLValueBasedOn { - template - static std::string processAsArg(T value) { - static_assert(always_false, - "`std::string` arguments require a string copy, use " - "`StringRef` for string-like arguments instead"); - return {}; - } - static void processAsResult(PatternRewriter &rewriter, PDLResultList &results, - StringRef value) { - results.push_back(rewriter.getStringAttr(value)); - } -}; - -//===----------------------------------------------------------------------===// -// Operation - -template <> -struct ProcessPDLValue - : public ProcessBuiltinPDLValue {}; -template -struct ProcessPDLValue::value>> - : public ProcessDerivedPDLValue { - static T processAsArg(Operation *value) { return cast(value); } -}; - -//===----------------------------------------------------------------------===// -// Type - -template <> -struct ProcessPDLValue : public ProcessBuiltinPDLValue {}; -template -struct ProcessPDLValue::value>> - : public ProcessDerivedPDLValue {}; - -//===----------------------------------------------------------------------===// -// TypeRange - -template <> -struct ProcessPDLValue : public ProcessBuiltinPDLValue {}; -template <> -struct ProcessPDLValue> { - static void processAsResult(PatternRewriter &, PDLResultList &results, - ValueTypeRange types) { - results.push_back(types); - } -}; -template <> -struct ProcessPDLValue> { - static void processAsResult(PatternRewriter &, PDLResultList &results, - ValueTypeRange types) { - results.push_back(types); - } -}; -template -struct ProcessPDLValue> { - static void processAsResult(PatternRewriter &, PDLResultList &results, - SmallVector values) { - results.push_back(TypeRange(values)); - } -}; - -//===----------------------------------------------------------------------===// -// Value - -template <> -struct ProcessPDLValue : public ProcessBuiltinPDLValue {}; - -//===----------------------------------------------------------------------===// -// ValueRange - -template <> -struct ProcessPDLValue : public ProcessBuiltinPDLValue { -}; -template <> -struct ProcessPDLValue { - static void processAsResult(PatternRewriter &, PDLResultList &results, - OperandRange values) { - results.push_back(values); - } -}; -template <> -struct ProcessPDLValue { - static void processAsResult(PatternRewriter &, PDLResultList &results, - ResultRange values) { - results.push_back(values); - } -}; -template -struct ProcessPDLValue> { - static void processAsResult(PatternRewriter &, PDLResultList &results, - SmallVector values) { - results.push_back(ValueRange(values)); - } -}; - -//===----------------------------------------------------------------------===// -// PDL Function Builder: Argument Handling -//===----------------------------------------------------------------------===// - -/// Validate the given PDLValues match the constraints defined by the argument -/// types of the given function. In the case of failure, a match failure -/// diagnostic is emitted. -/// FIXME: This should be completely removed in favor of `assertArgs`, but PDL -/// does not currently preserve Constraint application ordering. 
-template -LogicalResult verifyAsArgs(PatternRewriter &rewriter, ArrayRef values, - std::index_sequence) { - using FnTraitsT = llvm::function_traits; - - auto errorFn = [&](const Twine &msg) { - return rewriter.notifyMatchFailure(rewriter.getUnknownLoc(), msg); - }; - return success( - (succeeded(ProcessPDLValue>:: - verifyAsArg(errorFn, values[I], I)) && - ...)); -} - -/// Assert that the given PDLValues match the constraints defined by the -/// arguments of the given function. In the case of failure, a fatal error -/// is emitted. -template -void assertArgs(PatternRewriter &rewriter, ArrayRef values, - std::index_sequence) { - // We only want to do verification in debug builds, same as with `assert`. -#if LLVM_ENABLE_ABI_BREAKING_CHECKS - using FnTraitsT = llvm::function_traits; - auto errorFn = [&](const Twine &msg) -> LogicalResult { - llvm::report_fatal_error(msg); - }; - (void)errorFn; - assert((succeeded(ProcessPDLValue>:: - verifyAsArg(errorFn, values[I], I)) && - ...)); -#endif - (void)values; -} - -//===----------------------------------------------------------------------===// -// PDL Function Builder: Results Handling -//===----------------------------------------------------------------------===// - -/// Store a single result within the result list. -template -static LogicalResult processResults(PatternRewriter &rewriter, - PDLResultList &results, T &&value) { - ProcessPDLValue::processAsResult(rewriter, results, - std::forward(value)); - return success(); -} - -/// Store a std::pair<> as individual results within the result list. -template -static LogicalResult processResults(PatternRewriter &rewriter, - PDLResultList &results, - std::pair &&pair) { - if (failed(processResults(rewriter, results, std::move(pair.first))) || - failed(processResults(rewriter, results, std::move(pair.second)))) - return failure(); - return success(); -} - -/// Store a std::tuple<> as individual results within the result list. -template -static LogicalResult processResults(PatternRewriter &rewriter, - PDLResultList &results, - std::tuple &&tuple) { - auto applyFn = [&](auto &&...args) { - return (succeeded(processResults(rewriter, results, std::move(args))) && - ...); - }; - return success(std::apply(applyFn, std::move(tuple))); -} - -/// Handle LogicalResult propagation. -inline LogicalResult processResults(PatternRewriter &rewriter, - PDLResultList &results, - LogicalResult &&result) { - return result; -} -template -static LogicalResult processResults(PatternRewriter &rewriter, - PDLResultList &results, - FailureOr &&result) { - if (failed(result)) - return failure(); - return processResults(rewriter, results, std::move(*result)); -} - -//===----------------------------------------------------------------------===// -// PDL Constraint Builder -//===----------------------------------------------------------------------===// - -/// Process the arguments of a native constraint and invoke it. -template > -typename FnTraitsT::result_t -processArgsAndInvokeConstraint(PDLFnT &fn, PatternRewriter &rewriter, - ArrayRef values, - std::index_sequence) { - return fn( - rewriter, - (ProcessPDLValue>::processAsArg( - values[I]))...); -} - -/// Build a constraint function from the given function `ConstraintFnT`. This -/// allows for enabling the user to define simpler, more direct constraint -/// functions without needing to handle the low-level PDL goop. -/// -/// If the constraint function is already in the correct form, we just forward -/// it directly. 
-template -std::enable_if_t< - std::is_convertible::value, - PDLConstraintFunction> -buildConstraintFn(ConstraintFnT &&constraintFn) { - return std::forward(constraintFn); -} -/// Otherwise, we generate a wrapper that will unpack the PDLValues in the form -/// we desire. -template -std::enable_if_t< - !std::is_convertible::value, - PDLConstraintFunction> -buildConstraintFn(ConstraintFnT &&constraintFn) { - return [constraintFn = std::forward(constraintFn)]( - PatternRewriter &rewriter, - ArrayRef values) -> LogicalResult { - auto argIndices = std::make_index_sequence< - llvm::function_traits::num_args - 1>(); - if (failed(verifyAsArgs(rewriter, values, argIndices))) - return failure(); - return processArgsAndInvokeConstraint(constraintFn, rewriter, values, - argIndices); - }; -} - -//===----------------------------------------------------------------------===// -// PDL Rewrite Builder -//===----------------------------------------------------------------------===// - -/// Process the arguments of a native rewrite and invoke it. -/// This overload handles the case of no return values. -template > -std::enable_if_t::value, - LogicalResult> -processArgsAndInvokeRewrite(PDLFnT &fn, PatternRewriter &rewriter, - PDLResultList &, ArrayRef values, - std::index_sequence) { - fn(rewriter, - (ProcessPDLValue>::processAsArg( - values[I]))...); - return success(); -} -/// This overload handles the case of return values, which need to be packaged -/// into the result list. -template > -std::enable_if_t::value, - LogicalResult> -processArgsAndInvokeRewrite(PDLFnT &fn, PatternRewriter &rewriter, - PDLResultList &results, ArrayRef values, - std::index_sequence) { - return processResults( - rewriter, results, - fn(rewriter, (ProcessPDLValue>:: - processAsArg(values[I]))...)); - (void)values; -} - -/// Build a rewrite function from the given function `RewriteFnT`. This -/// allows for enabling the user to define simpler, more direct rewrite -/// functions without needing to handle the low-level PDL goop. -/// -/// If the rewrite function is already in the correct form, we just forward -/// it directly. -template -std::enable_if_t::value, - PDLRewriteFunction> -buildRewriteFn(RewriteFnT &&rewriteFn) { - return std::forward(rewriteFn); -} -/// Otherwise, we generate a wrapper that will unpack the PDLValues in the form -/// we desire. -template -std::enable_if_t::value, - PDLRewriteFunction> -buildRewriteFn(RewriteFnT &&rewriteFn) { - return [rewriteFn = std::forward(rewriteFn)]( - PatternRewriter &rewriter, PDLResultList &results, - ArrayRef values) { - auto argIndices = - std::make_index_sequence::num_args - - 1>(); - assertArgs(rewriter, values, argIndices); - return processArgsAndInvokeRewrite(rewriteFn, rewriter, results, values, - argIndices); - }; -} - -} // namespace pdl_function_builder -} // namespace detail - -//===----------------------------------------------------------------------===// -// PDLPatternModule - -/// This class contains all of the necessary data for a set of PDL patterns, or -/// pattern rewrites specified in the form of the PDL dialect. This PDL module -/// contained by this pattern may contain any number of `pdl.pattern` -/// operations. -class PDLPatternModule { -public: - PDLPatternModule() = default; - - /// Construct a PDL pattern with the given module and configurations. 
- PDLPatternModule(OwningOpRef module) - : pdlModule(std::move(module)) {} - template - PDLPatternModule(OwningOpRef module, ConfigsT &&...patternConfigs) - : PDLPatternModule(std::move(module)) { - auto configSet = std::make_unique( - std::forward(patternConfigs)...); - attachConfigToPatterns(*pdlModule, *configSet); - configs.emplace_back(std::move(configSet)); - } - - /// Merge the state in `other` into this pattern module. - void mergeIn(PDLPatternModule &&other); - - /// Return the internal PDL module of this pattern. - ModuleOp getModule() { return pdlModule.get(); } - - //===--------------------------------------------------------------------===// - // Function Registry - - /// Register a constraint function with PDL. A constraint function may be - /// specified in one of two ways: - /// - /// * `LogicalResult (PatternRewriter &, ArrayRef)` - /// - /// In this overload the arguments of the constraint function are passed via - /// the low-level PDLValue form. - /// - /// * `LogicalResult (PatternRewriter &, ValueTs... values)` - /// - /// In this form the arguments of the constraint function are passed via the - /// expected high level C++ type. In this form, the framework will - /// automatically unwrap PDLValues and convert them to the expected ValueTs. - /// For example, if the constraint function accepts a `Operation *`, the - /// framework will automatically cast the input PDLValue. In the case of a - /// `StringRef`, the framework will automatically unwrap the argument as a - /// StringAttr and pass the underlying string value. To see the full list of - /// supported types, or to see how to add handling for custom types, view - /// the definition of `ProcessPDLValue` above. - void registerConstraintFunction(StringRef name, - PDLConstraintFunction constraintFn); - template - void registerConstraintFunction(StringRef name, - ConstraintFnT &&constraintFn) { - registerConstraintFunction(name, - detail::pdl_function_builder::buildConstraintFn( - std::forward(constraintFn))); - } - - /// Register a rewrite function with PDL. A rewrite function may be specified - /// in one of two ways: - /// - /// * `void (PatternRewriter &, PDLResultList &, ArrayRef)` - /// - /// In this overload the arguments of the constraint function are passed via - /// the low-level PDLValue form, and the results are manually appended to - /// the given result list. - /// - /// * `ResultT (PatternRewriter &, ValueTs... values)` - /// - /// In this form the arguments and result of the rewrite function are passed - /// via the expected high level C++ type. In this form, the framework will - /// automatically unwrap the PDLValues arguments and convert them to the - /// expected ValueTs. It will also automatically handle the processing and - /// packaging of the result value to the result list. For example, if the - /// rewrite function takes a `Operation *`, the framework will automatically - /// cast the input PDLValue. In the case of a `StringRef`, the framework - /// will automatically unwrap the argument as a StringAttr and pass the - /// underlying string value. In the reverse case, if the rewrite returns a - /// StringRef or std::string, it will automatically package this as a - /// StringAttr and append it to the result list. To see the full list of - /// supported types, or to see how to add handling for custom types, view - /// the definition of `ProcessPDLValue` above. 
- void registerRewriteFunction(StringRef name, PDLRewriteFunction rewriteFn); - template - void registerRewriteFunction(StringRef name, RewriteFnT &&rewriteFn) { - registerRewriteFunction(name, detail::pdl_function_builder::buildRewriteFn( - std::forward(rewriteFn))); - } - - /// Return the set of the registered constraint functions. - const llvm::StringMap &getConstraintFunctions() const { - return constraintFunctions; - } - llvm::StringMap takeConstraintFunctions() { - return constraintFunctions; - } - /// Return the set of the registered rewrite functions. - const llvm::StringMap &getRewriteFunctions() const { - return rewriteFunctions; - } - llvm::StringMap takeRewriteFunctions() { - return rewriteFunctions; - } - - /// Return the set of the registered pattern configs. - SmallVector> takeConfigs() { - return std::move(configs); - } - DenseMap takeConfigMap() { - return std::move(configMap); - } - - /// Clear out the patterns and functions within this module. - void clear() { - pdlModule = nullptr; - constraintFunctions.clear(); - rewriteFunctions.clear(); - } - -private: - /// Attach the given pattern config set to the patterns defined within the - /// given module. - void attachConfigToPatterns(ModuleOp module, PDLPatternConfigSet &configSet); - - /// The module containing the `pdl.pattern` operations. - OwningOpRef pdlModule; +} // namespace mlir - /// The set of configuration sets referenced by patterns within `pdlModule`. - SmallVector> configs; - DenseMap configMap; +// Optionally expose PDL pattern matching methods. +#include "PDLPatternMatch.h.inc" - /// The external functions referenced from within the PDL module. - llvm::StringMap constraintFunctions; - llvm::StringMap rewriteFunctions; -}; +namespace mlir { //===----------------------------------------------------------------------===// // RewritePatternSet @@ -1679,8 +759,7 @@ class RewritePatternSet { nativePatterns.emplace_back(std::move(pattern)); } RewritePatternSet(PDLPatternModule &&pattern) - : context(pattern.getModule()->getContext()), - pdlPatterns(std::move(pattern)) {} + : context(pattern.getContext()), pdlPatterns(std::move(pattern)) {} MLIRContext *getContext() const { return context; } @@ -1853,6 +932,7 @@ class RewritePatternSet { pattern->addDebugLabels(debugLabels); nativePatterns.emplace_back(std::move(pattern)); } + template std::enable_if_t::value> addImpl(ArrayRef debugLabels, Args &&...args) { @@ -1863,6 +943,9 @@ class RewritePatternSet { MLIRContext *const context; NativePatternListT nativePatterns; + + // Patterns expressed with PDL. This will compile to a stub class when PDL is + // not enabled. 
PDLPatternModule pdlPatterns; }; diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h index 6de981d35c8c3..c5725e9c85625 100644 --- a/mlir/include/mlir/Transforms/DialectConversion.h +++ b/mlir/include/mlir/Transforms/DialectConversion.h @@ -13,6 +13,7 @@ #ifndef MLIR_TRANSFORMS_DIALECTCONVERSION_H_ #define MLIR_TRANSFORMS_DIALECTCONVERSION_H_ +#include "mlir/Config/mlir-config.h" #include "mlir/Rewrite/FrozenRewritePatternSet.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/StringMap.h" @@ -1015,6 +1016,7 @@ class ConversionTarget { MLIRContext &ctx; }; +#if MLIR_ENABLE_PDL_IN_PATTERNMATCH //===----------------------------------------------------------------------===// // PDL Configuration //===----------------------------------------------------------------------===// @@ -1044,6 +1046,19 @@ class PDLConversionConfig final /// Register the dialect conversion PDL functions with the given pattern set. void registerConversionPDLFunctions(RewritePatternSet &patterns); +#else + +// Stubs for when PDL in rewriting is not enabled. + +inline void registerConversionPDLFunctions(RewritePatternSet &patterns) {} + +class PDLConversionConfig final { +public: + PDLConversionConfig(const TypeConverter * /*converter*/) {} +}; + +#endif // MLIR_ENABLE_PDL_IN_PATTERNMATCH + //===----------------------------------------------------------------------===// // Op Conversion Entry Points //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Bufferization/TransformOps/CMakeLists.txt b/mlir/lib/Dialect/Bufferization/TransformOps/CMakeLists.txt index 7f7b348b17ae6..be5eb73b91229 100644 --- a/mlir/lib/Dialect/Bufferization/TransformOps/CMakeLists.txt +++ b/mlir/lib/Dialect/Bufferization/TransformOps/CMakeLists.txt @@ -14,7 +14,6 @@ add_mlir_dialect_library(MLIRBufferizationTransformOps MLIRFunctionInterfaces MLIRLinalgDialect MLIRParser - MLIRPDLDialect MLIRSideEffectInterfaces MLIRTransformDialect ) diff --git a/mlir/lib/IR/CMakeLists.txt b/mlir/lib/IR/CMakeLists.txt index 21bba11b85117..a155b7c5ecade 100644 --- a/mlir/lib/IR/CMakeLists.txt +++ b/mlir/lib/IR/CMakeLists.txt @@ -61,3 +61,10 @@ add_mlir_library(MLIRIR LINK_LIBS PUBLIC MLIRSupport ) + +if(MLIR_ENABLE_PDL_IN_PATTERNMATCH) + add_subdirectory(PDL) + target_link_libraries(MLIRIR PUBLIC + MLIRIRPDLPatternMatch) +endif() + diff --git a/mlir/lib/IR/PDL/CMakeLists.txt b/mlir/lib/IR/PDL/CMakeLists.txt new file mode 100644 index 0000000000000..08b7fe36fac09 --- /dev/null +++ b/mlir/lib/IR/PDL/CMakeLists.txt @@ -0,0 +1,7 @@ +add_mlir_library(MLIRIRPDLPatternMatch + PDLPatternMatch.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/IR +) + diff --git a/mlir/lib/IR/PDL/PDLPatternMatch.cpp b/mlir/lib/IR/PDL/PDLPatternMatch.cpp new file mode 100644 index 0000000000000..da07cc462a5a1 --- /dev/null +++ b/mlir/lib/IR/PDL/PDLPatternMatch.cpp @@ -0,0 +1,133 @@ +//===- PDLPatternMatch.cpp - Base classes for PDL pattern match +//------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/IR/IRMapping.h" +#include "mlir/IR/Iterators.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/RegionKindInterface.h" + +using namespace mlir; + +//===----------------------------------------------------------------------===// +// PDLValue +//===----------------------------------------------------------------------===// + +void PDLValue::print(raw_ostream &os) const { + if (!value) { + os << ""; + return; + } + switch (kind) { + case Kind::Attribute: + os << cast(); + break; + case Kind::Operation: + os << *cast(); + break; + case Kind::Type: + os << cast(); + break; + case Kind::TypeRange: + llvm::interleaveComma(cast(), os); + break; + case Kind::Value: + os << cast(); + break; + case Kind::ValueRange: + llvm::interleaveComma(cast(), os); + break; + } +} + +void PDLValue::print(raw_ostream &os, Kind kind) { + switch (kind) { + case Kind::Attribute: + os << "Attribute"; + break; + case Kind::Operation: + os << "Operation"; + break; + case Kind::Type: + os << "Type"; + break; + case Kind::TypeRange: + os << "TypeRange"; + break; + case Kind::Value: + os << "Value"; + break; + case Kind::ValueRange: + os << "ValueRange"; + break; + } +} + +//===----------------------------------------------------------------------===// +// PDLPatternModule +//===----------------------------------------------------------------------===// + +void PDLPatternModule::mergeIn(PDLPatternModule &&other) { + // Ignore the other module if it has no patterns. + if (!other.pdlModule) + return; + + // Steal the functions and config of the other module. + for (auto &it : other.constraintFunctions) + registerConstraintFunction(it.first(), std::move(it.second)); + for (auto &it : other.rewriteFunctions) + registerRewriteFunction(it.first(), std::move(it.second)); + for (auto &it : other.configs) + configs.emplace_back(std::move(it)); + for (auto &it : other.configMap) + configMap.insert(it); + + // Steal the other state if we have no patterns. + if (!pdlModule) { + pdlModule = std::move(other.pdlModule); + return; + } + + // Merge the pattern operations from the other module into this one. + Block *block = pdlModule->getBody(); + block->getOperations().splice(block->end(), + other.pdlModule->getBody()->getOperations()); +} + +void PDLPatternModule::attachConfigToPatterns(ModuleOp module, + PDLPatternConfigSet &configSet) { + // Attach the configuration to the symbols within the module. We only add + // to symbols to avoid hardcoding any specific operation names here (given + // that we don't depend on any PDL dialect). We can't use + // cast here because patterns may be optional symbols. + module->walk([&](Operation *op) { + if (op->hasTrait()) + configMap[op] = &configSet; + }); +} + +//===----------------------------------------------------------------------===// +// Function Registry + +void PDLPatternModule::registerConstraintFunction( + StringRef name, PDLConstraintFunction constraintFn) { + // TODO: Is it possible to diagnose when `name` is already registered to + // a function that is not equivalent to `constraintFn`? + // Allow existing mappings in the case multiple patterns depend on the same + // constraint. 
+ constraintFunctions.try_emplace(name, std::move(constraintFn)); +} + +void PDLPatternModule::registerRewriteFunction(StringRef name, + PDLRewriteFunction rewriteFn) { + // TODO: Is it possible to diagnose when `name` is already registered to + // a function that is not equivalent to `rewriteFn`? + // Allow existing mappings in the case multiple patterns depend on the same + // rewrite. + rewriteFunctions.try_emplace(name, std::move(rewriteFn)); +} diff --git a/mlir/lib/IR/PatternMatch.cpp b/mlir/lib/IR/PatternMatch.cpp index 5e9b9b2a810a4..5e788cdb4897d 100644 --- a/mlir/lib/IR/PatternMatch.cpp +++ b/mlir/lib/IR/PatternMatch.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "mlir/IR/PatternMatch.h" +#include "mlir/Config/mlir-config.h" #include "mlir/IR/IRMapping.h" #include "mlir/IR/Iterators.h" #include "mlir/IR/RegionKindInterface.h" @@ -97,124 +98,6 @@ LogicalResult RewritePattern::match(Operation *op) const { /// Out-of-line vtable anchor. void RewritePattern::anchor() {} -//===----------------------------------------------------------------------===// -// PDLValue -//===----------------------------------------------------------------------===// - -void PDLValue::print(raw_ostream &os) const { - if (!value) { - os << ""; - return; - } - switch (kind) { - case Kind::Attribute: - os << cast(); - break; - case Kind::Operation: - os << *cast(); - break; - case Kind::Type: - os << cast(); - break; - case Kind::TypeRange: - llvm::interleaveComma(cast(), os); - break; - case Kind::Value: - os << cast(); - break; - case Kind::ValueRange: - llvm::interleaveComma(cast(), os); - break; - } -} - -void PDLValue::print(raw_ostream &os, Kind kind) { - switch (kind) { - case Kind::Attribute: - os << "Attribute"; - break; - case Kind::Operation: - os << "Operation"; - break; - case Kind::Type: - os << "Type"; - break; - case Kind::TypeRange: - os << "TypeRange"; - break; - case Kind::Value: - os << "Value"; - break; - case Kind::ValueRange: - os << "ValueRange"; - break; - } -} - -//===----------------------------------------------------------------------===// -// PDLPatternModule -//===----------------------------------------------------------------------===// - -void PDLPatternModule::mergeIn(PDLPatternModule &&other) { - // Ignore the other module if it has no patterns. - if (!other.pdlModule) - return; - - // Steal the functions and config of the other module. - for (auto &it : other.constraintFunctions) - registerConstraintFunction(it.first(), std::move(it.second)); - for (auto &it : other.rewriteFunctions) - registerRewriteFunction(it.first(), std::move(it.second)); - for (auto &it : other.configs) - configs.emplace_back(std::move(it)); - for (auto &it : other.configMap) - configMap.insert(it); - - // Steal the other state if we have no patterns. - if (!pdlModule) { - pdlModule = std::move(other.pdlModule); - return; - } - - // Merge the pattern operations from the other module into this one. - Block *block = pdlModule->getBody(); - block->getOperations().splice(block->end(), - other.pdlModule->getBody()->getOperations()); -} - -void PDLPatternModule::attachConfigToPatterns(ModuleOp module, - PDLPatternConfigSet &configSet) { - // Attach the configuration to the symbols within the module. We only add - // to symbols to avoid hardcoding any specific operation names here (given - // that we don't depend on any PDL dialect). We can't use - // cast here because patterns may be optional symbols. 
- module->walk([&](Operation *op) { - if (op->hasTrait()) - configMap[op] = &configSet; - }); -} - -//===----------------------------------------------------------------------===// -// Function Registry - -void PDLPatternModule::registerConstraintFunction( - StringRef name, PDLConstraintFunction constraintFn) { - // TODO: Is it possible to diagnose when `name` is already registered to - // a function that is not equivalent to `constraintFn`? - // Allow existing mappings in the case multiple patterns depend on the same - // constraint. - constraintFunctions.try_emplace(name, std::move(constraintFn)); -} - -void PDLPatternModule::registerRewriteFunction(StringRef name, - PDLRewriteFunction rewriteFn) { - // TODO: Is it possible to diagnose when `name` is already registered to - // a function that is not equivalent to `rewriteFn`? - // Allow existing mappings in the case multiple patterns depend on the same - // rewrite. - rewriteFunctions.try_emplace(name, std::move(rewriteFn)); -} - //===----------------------------------------------------------------------===// // RewriterBase //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Rewrite/ByteCode.h b/mlir/lib/Rewrite/ByteCode.h index 4d43fe636bd1f..4aceac7ed3a4c 100644 --- a/mlir/lib/Rewrite/ByteCode.h +++ b/mlir/lib/Rewrite/ByteCode.h @@ -16,6 +16,8 @@ #include "mlir/IR/PatternMatch.h" +#if MLIR_ENABLE_PDL_IN_PATTERNMATCH + namespace mlir { namespace pdl_interp { class RecordMatchOp; @@ -224,4 +226,38 @@ class PDLByteCode { } // namespace detail } // namespace mlir +#else + +namespace mlir::detail { + +class PDLByteCodeMutableState { +public: + void cleanupAfterMatchAndRewrite() {} + void updatePatternBenefit(unsigned patternIndex, PatternBenefit benefit) {} +}; + +class PDLByteCodePattern : public Pattern {}; + +class PDLByteCode { +public: + struct MatchResult { + const PDLByteCodePattern *pattern = nullptr; + PatternBenefit benefit; + }; + + void initializeMutableState(PDLByteCodeMutableState &state) const {} + void match(Operation *op, PatternRewriter &rewriter, + SmallVectorImpl &matches, + PDLByteCodeMutableState &state) const {} + LogicalResult rewrite(PatternRewriter &rewriter, const MatchResult &match, + PDLByteCodeMutableState &state) const { + return failure(); + } + ArrayRef getPatterns() const { return {}; } +}; + +} // namespace mlir::detail + +#endif // MLIR_ENABLE_PDL_IN_PATTERNMATCH + #endif // MLIR_REWRITE_BYTECODE_H_ diff --git a/mlir/lib/Rewrite/CMakeLists.txt b/mlir/lib/Rewrite/CMakeLists.txt index e0395be6cd6f5..a6c39406aa4b3 100644 --- a/mlir/lib/Rewrite/CMakeLists.txt +++ b/mlir/lib/Rewrite/CMakeLists.txt @@ -1,5 +1,6 @@ +set(LLVM_OPTIONAL_SOURCES ByteCode.cpp) + add_mlir_library(MLIRRewrite - ByteCode.cpp FrozenRewritePatternSet.cpp PatternApplicator.cpp @@ -11,8 +12,31 @@ add_mlir_library(MLIRRewrite LINK_LIBS PUBLIC MLIRIR - MLIRPDLDialect - MLIRPDLInterpDialect - MLIRPDLToPDLInterp MLIRSideEffectInterfaces ) + +if(MLIR_ENABLE_PDL_IN_PATTERNMATCH) + add_mlir_library(MLIRRewritePDL + ByteCode.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Rewrite + + DEPENDS + mlir-generic-headers + + LINK_LIBS PUBLIC + MLIRIR + MLIRPDLDialect + MLIRPDLInterpDialect + MLIRPDLToPDLInterp + MLIRSideEffectInterfaces + ) + + target_link_libraries(MLIRRewrite PUBLIC + MLIRPDLDialect + MLIRPDLInterpDialect + MLIRPDLToPDLInterp + MLIRRewritePDL) +endif() + diff --git a/mlir/lib/Rewrite/FrozenRewritePatternSet.cpp b/mlir/lib/Rewrite/FrozenRewritePatternSet.cpp index 
43840d1e8cec2..17fe02df9f66c 100644 --- a/mlir/lib/Rewrite/FrozenRewritePatternSet.cpp +++ b/mlir/lib/Rewrite/FrozenRewritePatternSet.cpp @@ -8,8 +8,6 @@ #include "mlir/Rewrite/FrozenRewritePatternSet.h" #include "ByteCode.h" -#include "mlir/Conversion/PDLToPDLInterp/PDLToPDLInterp.h" -#include "mlir/Dialect/PDL/IR/PDLOps.h" #include "mlir/Interfaces/SideEffectInterfaces.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" @@ -17,6 +15,11 @@ using namespace mlir; +// Include the PDL rewrite support. +#if MLIR_ENABLE_PDL_IN_PATTERNMATCH +#include "mlir/Conversion/PDLToPDLInterp/PDLToPDLInterp.h" +#include "mlir/Dialect/PDL/IR/PDLOps.h" + static LogicalResult convertPDLToPDLInterp(ModuleOp pdlModule, DenseMap &configMap) { @@ -48,6 +51,7 @@ convertPDLToPDLInterp(ModuleOp pdlModule, pdlModule.getBody()->walk(simplifyFn); return success(); } +#endif // MLIR_ENABLE_PDL_IN_PATTERNMATCH //===----------------------------------------------------------------------===// // FrozenRewritePatternSet @@ -121,6 +125,7 @@ FrozenRewritePatternSet::FrozenRewritePatternSet( impl->nativeAnyOpPatterns.push_back(std::move(pat)); } +#if MLIR_ENABLE_PDL_IN_PATTERNMATCH // Generate the bytecode for the PDL patterns if any were provided. PDLPatternModule &pdlPatterns = patterns.getPDLPatterns(); ModuleOp pdlModule = pdlPatterns.getModule(); @@ -137,6 +142,7 @@ FrozenRewritePatternSet::FrozenRewritePatternSet( pdlModule, pdlPatterns.takeConfigs(), configMap, pdlPatterns.takeConstraintFunctions(), pdlPatterns.takeRewriteFunctions()); +#endif // MLIR_ENABLE_PDL_IN_PATTERNMATCH } FrozenRewritePatternSet::~FrozenRewritePatternSet() = default; diff --git a/mlir/lib/Rewrite/PatternApplicator.cpp b/mlir/lib/Rewrite/PatternApplicator.cpp index 08d6ee618ac69..0064eb84aba84 100644 --- a/mlir/lib/Rewrite/PatternApplicator.cpp +++ b/mlir/lib/Rewrite/PatternApplicator.cpp @@ -152,7 +152,6 @@ LogicalResult PatternApplicator::matchAndRewrite( // Find the next pattern with the highest benefit. const Pattern *bestPattern = nullptr; unsigned *bestPatternIt = &opIt; - const PDLByteCode::MatchResult *pdlMatch = nullptr; /// Operation specific patterns. if (opIt < opE) @@ -164,6 +163,8 @@ LogicalResult PatternApplicator::matchAndRewrite( bestPatternIt = &anyIt; bestPattern = anyOpPatterns[anyIt]; } + + const PDLByteCode::MatchResult *pdlMatch = nullptr; /// PDL patterns. 
if (pdlIt < pdlE && (!bestPattern || bestPattern->getBenefit() < pdlMatches[pdlIt].benefit)) { @@ -171,6 +172,7 @@ LogicalResult PatternApplicator::matchAndRewrite( pdlMatch = &pdlMatches[pdlIt]; bestPattern = pdlMatch->pattern; } + if (!bestPattern) break; diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 4d2afe462b928..85433d088dcbf 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "mlir/Transforms/DialectConversion.h" +#include "mlir/Config/mlir-config.h" #include "mlir/IR/Block.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinOps.h" @@ -3312,6 +3313,7 @@ auto ConversionTarget::getOpInfo(OperationName op) const return std::nullopt; } +#if MLIR_ENABLE_PDL_IN_PATTERNMATCH //===----------------------------------------------------------------------===// // PDL Configuration //===----------------------------------------------------------------------===// @@ -3382,6 +3384,7 @@ void mlir::registerConversionPDLFunctions(RewritePatternSet &patterns) { return std::move(remappedTypes); }); } +#endif // MLIR_ENABLE_PDL_IN_PATTERNMATCH //===----------------------------------------------------------------------===// // Op Conversion Entry Points diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt index 3f312164cb1f3..7ec4c8f0963a2 100644 --- a/mlir/test/CMakeLists.txt +++ b/mlir/test/CMakeLists.txt @@ -97,16 +97,13 @@ set(MLIR_TEST_DEPENDS mlir-capi-ir-test mlir-capi-llvm-test mlir-capi-pass-test - mlir-capi-pdl-test mlir-capi-quant-test mlir-capi-sparse-tensor-test mlir-capi-transform-test mlir-capi-translation-test mlir-linalg-ods-yaml-gen mlir-lsp-server - mlir-pdll-lsp-server mlir-opt - mlir-pdll mlir-query mlir-reduce mlir-tblgen @@ -115,6 +112,12 @@ set(MLIR_TEST_DEPENDS tblgen-to-irdl ) +set(MLIR_TEST_DEPENDS ${MLIR_TEST_DEPENDS} + mlir-capi-pdl-test + mlir-pdll-lsp-server + mlir-pdll + ) + # The native target may not be enabled, in this case we won't # run tests that involves executing on the host: do not build # useless binaries. 
@@ -159,9 +162,10 @@ if(LLVM_BUILD_EXAMPLES) toyc-ch3 toyc-ch4 toyc-ch5 + ) + list(APPEND MLIR_TEST_DEPENDS transform-opt-ch2 transform-opt-ch3 - mlir-minimal-opt ) if(MLIR_ENABLE_EXECUTION_ENGINE) list(APPEND MLIR_TEST_DEPENDS diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt index e032ce7200fbf..2a3a8608db544 100644 --- a/mlir/test/lib/Transforms/CMakeLists.txt +++ b/mlir/test/lib/Transforms/CMakeLists.txt @@ -1,3 +1,8 @@ +set(LLVM_OPTIONAL_SOURCES + TestDialectConversion.cpp) +set(MLIRTestTransformsPDLDep) +set(MLIRTestTransformsPDLSrc) +if(MLIR_ENABLE_PDL_IN_PATTERNMATCH) add_mlir_pdll_library(MLIRTestDialectConversionPDLLPatternsIncGen TestDialectConversion.pdll TestDialectConversionPDLLPatterns.h.inc @@ -6,17 +11,22 @@ add_mlir_pdll_library(MLIRTestDialectConversionPDLLPatternsIncGen ${CMAKE_CURRENT_SOURCE_DIR}/../Dialect/Test ${CMAKE_CURRENT_BINARY_DIR}/../Dialect/Test ) + set(MLIRTestTransformsPDLSrc + TestDialectConversion.cpp) + set(MLIRTestTransformsPDLDep + MLIRTestDialectConversionPDLLPatternsIncGen) +endif() # Exclude tests from libMLIR.so add_mlir_library(MLIRTestTransforms TestCommutativityUtils.cpp TestConstantFold.cpp TestControlFlowSink.cpp - TestDialectConversion.cpp TestInlining.cpp TestIntRangeInference.cpp TestMakeIsolatedFromAbove.cpp TestTopologicalSort.cpp + ${MLIRTestTransformsPDLSrc} EXCLUDE_FROM_LIBMLIR @@ -24,7 +34,7 @@ add_mlir_library(MLIRTestTransforms ${MLIR_MAIN_INCLUDE_DIR}/mlir/Transforms DEPENDS - MLIRTestDialectConversionPDLLPatternsIncGen + ${MLIRTestTransformsPDLDep} LINK_LIBS PUBLIC MLIRAnalysis diff --git a/mlir/tools/mlir-lsp-server/CMakeLists.txt b/mlir/tools/mlir-lsp-server/CMakeLists.txt index e90ccf17af17f..9664f6b94844e 100644 --- a/mlir/tools/mlir-lsp-server/CMakeLists.txt +++ b/mlir/tools/mlir-lsp-server/CMakeLists.txt @@ -21,10 +21,12 @@ if(MLIR_INCLUDE_TESTS) MLIRTestIR MLIRTestPass MLIRTestReducer + ) + set(test_libs + ${test_libs} MLIRTestRewrite MLIRTestTransformDialect - MLIRTestTransforms - ) + MLIRTestTransforms) endif() set(LIBS diff --git a/mlir/tools/mlir-opt/CMakeLists.txt b/mlir/tools/mlir-opt/CMakeLists.txt index b6ada66d32188..15317a119c154 100644 --- a/mlir/tools/mlir-opt/CMakeLists.txt +++ b/mlir/tools/mlir-opt/CMakeLists.txt @@ -38,16 +38,18 @@ if(MLIR_INCLUDE_TESTS) MLIRTestIR MLIRTestOneToNTypeConversionPass MLIRTestPass - MLIRTestPDLL MLIRTestReducer - MLIRTestRewrite - MLIRTestTransformDialect MLIRTestTransforms MLIRTilingInterfaceTestPasses MLIRVectorTestPasses MLIRTestVectorToSPIRV MLIRLLVMTestPasses ) + set(test_libs ${test_libs} + MLIRTestPDLL + MLIRTestRewrite + MLIRTestTransformDialect + ) endif() set(LIBS diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp index f7a5b3183b50b..bf8f3b7aa21d1 100644 --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -85,7 +85,9 @@ void registerTestDataLayoutQuery(); void registerTestDeadCodeAnalysisPass(); void registerTestDecomposeCallGraphTypes(); void registerTestDiagnosticsPass(); +#if MLIR_ENABLE_PDL_IN_PATTERNMATCH void registerTestDialectConversionPasses(); +#endif void registerTestDominancePass(); void registerTestDynamicPipelinePass(); void registerTestEmulateNarrowTypePass(); @@ -147,8 +149,8 @@ void registerTestNvgpuLowerings(); namespace test { void registerTestDialect(DialectRegistry &); -void registerTestTransformDialectExtension(DialectRegistry &); void registerTestDynDialect(DialectRegistry &); +void registerTestTransformDialectExtension(DialectRegistry &); } // 
namespace test #ifdef MLIR_INCLUDE_TESTS @@ -260,6 +262,9 @@ void registerTestPasses() { mlir::test::registerTestVectorReductionToSPIRVDotProd(); mlir::test::registerTestNvgpuLowerings(); mlir::test::registerTestWrittenToPass(); +#if MLIR_ENABLE_PDL_IN_PATTERNMATCH + mlir::test::registerTestDialectConversionPasses(); +#endif } #endif diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 2a56b2d6f0373..2a72bf965e544 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -35,6 +35,7 @@ expand_template( substitutions = { "#cmakedefine01 MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS": "#define MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS 0", "#cmakedefine MLIR_GREEDY_REWRITE_RANDOMIZER_SEED ${MLIR_GREEDY_REWRITE_RANDOMIZER_SEED}": "/* #undef MLIR_GREEDY_REWRITE_RANDOMIZER_SEED */", + "#cmakedefine01 MLIR_ENABLE_PDL_IN_PATTERNMATCH": "#define MLIR_ENABLE_PDL_IN_PATTERNMATCH 1", }, template = "include/mlir/Config/mlir-config.h.cmake", ) @@ -318,11 +319,13 @@ cc_library( srcs = glob([ "lib/IR/*.cpp", "lib/IR/*.h", + "lib/IR/PDL/*.cpp", "lib/Bytecode/Reader/*.h", "lib/Bytecode/Writer/*.h", "lib/Bytecode/*.h", ]) + [ "lib/Bytecode/BytecodeOpInterface.cpp", + "include/mlir/IR/PDLPatternMatch.h.inc", ], hdrs = glob([ "include/mlir/IR/*.h", @@ -345,6 +348,7 @@ cc_library( ":BuiltinTypesIncGen", ":BytecodeOpInterfaceIncGen", ":CallOpInterfacesIncGen", + ":config", ":DataLayoutInterfacesIncGen", ":InferTypeOpInterfaceIncGen", ":OpAsmInterfaceIncGen", From b49e0ebedfefa2f3323081425016ca2ada902263 Mon Sep 17 00:00:00 2001 From: max Date: Wed, 3 Jan 2024 12:16:19 -0600 Subject: [PATCH 160/313] Revert "[mlir] Add config for PDL (#69927)" This reverts commit 5930725c891b60f5fb94058c6c08a55a2e03d83e. 
--- mlir/CMakeLists.txt | 5 +- mlir/examples/minimal-opt/README.md | 9 +- mlir/include/mlir/Config/mlir-config.h.cmake | 3 - .../Conversion/LLVMCommon/TypeConverter.h | 1 - .../mlir/Dialect/Vector/IR/VectorOps.h | 1 - mlir/include/mlir/IR/PDLPatternMatch.h.inc | 995 ------------------ mlir/include/mlir/IR/PatternMatch.h | 935 +++++++++++++++- .../mlir/Transforms/DialectConversion.h | 15 - .../Bufferization/TransformOps/CMakeLists.txt | 1 + mlir/lib/IR/CMakeLists.txt | 7 - mlir/lib/IR/PDL/CMakeLists.txt | 7 - mlir/lib/IR/PDL/PDLPatternMatch.cpp | 133 --- mlir/lib/IR/PatternMatch.cpp | 119 ++- mlir/lib/Rewrite/ByteCode.h | 36 - mlir/lib/Rewrite/CMakeLists.txt | 32 +- mlir/lib/Rewrite/FrozenRewritePatternSet.cpp | 10 +- mlir/lib/Rewrite/PatternApplicator.cpp | 4 +- .../Transforms/Utils/DialectConversion.cpp | 3 - mlir/test/CMakeLists.txt | 12 +- mlir/test/lib/Transforms/CMakeLists.txt | 14 +- mlir/tools/mlir-lsp-server/CMakeLists.txt | 6 +- mlir/tools/mlir-opt/CMakeLists.txt | 8 +- mlir/tools/mlir-opt/mlir-opt.cpp | 7 +- .../llvm-project-overlay/mlir/BUILD.bazel | 4 - 24 files changed, 1070 insertions(+), 1297 deletions(-) delete mode 100644 mlir/include/mlir/IR/PDLPatternMatch.h.inc delete mode 100644 mlir/lib/IR/PDL/CMakeLists.txt delete mode 100644 mlir/lib/IR/PDL/PDLPatternMatch.cpp diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt index 3de8677aefe90..dcc068e4097c5 100644 --- a/mlir/CMakeLists.txt +++ b/mlir/CMakeLists.txt @@ -133,8 +133,6 @@ set(MLIR_ENABLE_NVPTXCOMPILER 0 CACHE BOOL "Statically link the nvptxlibrary instead of calling ptxas as a subprocess \ for compiling PTX to cubin") -set(MLIR_ENABLE_PDL_IN_PATTERNMATCH 1 CACHE BOOL "Enable PDL in PatternMatch") - option(MLIR_INCLUDE_TESTS "Generate build targets for the MLIR unit tests." ${LLVM_INCLUDE_TESTS}) @@ -180,9 +178,10 @@ include_directories( ${MLIR_INCLUDE_DIR}) # Adding tools/mlir-tblgen here as calling add_tablegen sets some variables like # MLIR_TABLEGEN_EXE in PARENT_SCOPE which gets lost if that folder is included # from another directory like tools +add_subdirectory(tools/mlir-tblgen) add_subdirectory(tools/mlir-linalg-ods-gen) add_subdirectory(tools/mlir-pdll) -add_subdirectory(tools/mlir-tblgen) + set(MLIR_TABLEGEN_EXE "${MLIR_TABLEGEN_EXE}" CACHE INTERNAL "") set(MLIR_TABLEGEN_TARGET "${MLIR_TABLEGEN_TARGET}" CACHE INTERNAL "") set(MLIR_LINALG_ODS_YAML_GEN_TABLEGEN_EXE "${MLIR_LINALG_ODS_YAML_GEN_TABLEGEN_EXE}" CACHE INTERNAL "") diff --git a/mlir/examples/minimal-opt/README.md b/mlir/examples/minimal-opt/README.md index 1bc54b8367cc5..b8a455f7a7966 100644 --- a/mlir/examples/minimal-opt/README.md +++ b/mlir/examples/minimal-opt/README.md @@ -14,10 +14,10 @@ Below are some example measurements taken at the time of the LLVM 17 release, using clang-14 on a X86 Ubuntu and [bloaty](https://github.com/google/bloaty). 
| | Base | Os | Oz | Os LTO | Oz LTO | -| :------------------------------: | ------ | ------ | ------ | ------ | ------ | -| `mlir-cat` | 1024KB | 840KB | 885KB | 706KB | 657KB | -| `mlir-minimal-opt` | 1.62MB | 1.32MB | 1.36MB | 1.17MB | 1.07MB | -| `mlir-minimal-opt-canonicalize` | 1.83MB | 1.40MB | 1.45MB | 1.25MB | 1.14MB | +| :-----------------------------: | ------ | ------ | ------ | ------ | ------ | +| `mlir-cat` | 1018kB | 836KB | 879KB | 697KB | 649KB | +| `mlir-minimal-opt` | 1.54MB | 1.25MB | 1.29MB | 1.10MB | 1.00MB | +| `mlir-minimal-opt-canonicalize` | 2.24MB | 1.81MB | 1.86MB | 1.62MB | 1.48MB | Base configuration: @@ -32,7 +32,6 @@ cmake ../llvm/ -G Ninja \ -DCMAKE_CXX_COMPILER=clang++ \ -DLLVM_ENABLE_LLD=ON \ -DLLVM_ENABLE_BACKTRACES=OFF \ - -DMLIR_ENABLE_PDL_IN_PATTERNMATCH=OFF \ -DCMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO=-Wl,-icf=all ``` diff --git a/mlir/include/mlir/Config/mlir-config.h.cmake b/mlir/include/mlir/Config/mlir-config.h.cmake index e152a36c0ce0c..efa77b2e5ce5d 100644 --- a/mlir/include/mlir/Config/mlir-config.h.cmake +++ b/mlir/include/mlir/Config/mlir-config.h.cmake @@ -26,7 +26,4 @@ numeric seed that is passed to the random number generator. */ #cmakedefine MLIR_GREEDY_REWRITE_RANDOMIZER_SEED ${MLIR_GREEDY_REWRITE_RANDOMIZER_SEED} -/* If set, enables PDL usage. */ -#cmakedefine01 MLIR_ENABLE_PDL_IN_PATTERNMATCH - #endif diff --git a/mlir/include/mlir/Conversion/LLVMCommon/TypeConverter.h b/mlir/include/mlir/Conversion/LLVMCommon/TypeConverter.h index e228229302cff..74f9c977b7028 100644 --- a/mlir/include/mlir/Conversion/LLVMCommon/TypeConverter.h +++ b/mlir/include/mlir/Conversion/LLVMCommon/TypeConverter.h @@ -15,7 +15,6 @@ #define MLIR_CONVERSION_LLVMCOMMON_TYPECONVERTER_H #include "mlir/Conversion/LLVMCommon/LoweringOptions.h" -#include "mlir/IR/BuiltinTypes.h" #include "mlir/Transforms/DialectConversion.h" namespace mlir { diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.h b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.h index 4603953cb40fa..a28b27e4e1581 100644 --- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.h +++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.h @@ -29,7 +29,6 @@ #include "mlir/Interfaces/SideEffectInterfaces.h" #include "mlir/Interfaces/VectorInterfaces.h" #include "mlir/Interfaces/ViewLikeInterface.h" -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringExtras.h" // Pull in all enum type definitions and utility function declarations. diff --git a/mlir/include/mlir/IR/PDLPatternMatch.h.inc b/mlir/include/mlir/IR/PDLPatternMatch.h.inc deleted file mode 100644 index a215da8cb6431..0000000000000 --- a/mlir/include/mlir/IR/PDLPatternMatch.h.inc +++ /dev/null @@ -1,995 +0,0 @@ -//===- PDLPatternMatch.h - PDLPatternMatcher classes -------==---*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef MLIR_IR_PDLPATTERNMATCH_H -#define MLIR_IR_PDLPATTERNMATCH_H - -#include "mlir/Config/mlir-config.h" - -#if MLIR_ENABLE_PDL_IN_PATTERNMATCH -#include "mlir/IR/Builders.h" -#include "mlir/IR/BuiltinOps.h" - -namespace mlir { -//===----------------------------------------------------------------------===// -// PDL Patterns -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// PDLValue - -/// Storage type of byte-code interpreter values. These are passed to constraint -/// functions as arguments. -class PDLValue { -public: - /// The underlying kind of a PDL value. - enum class Kind { Attribute, Operation, Type, TypeRange, Value, ValueRange }; - - /// Construct a new PDL value. - PDLValue(const PDLValue &other) = default; - PDLValue(std::nullptr_t = nullptr) {} - PDLValue(Attribute value) - : value(value.getAsOpaquePointer()), kind(Kind::Attribute) {} - PDLValue(Operation *value) : value(value), kind(Kind::Operation) {} - PDLValue(Type value) : value(value.getAsOpaquePointer()), kind(Kind::Type) {} - PDLValue(TypeRange *value) : value(value), kind(Kind::TypeRange) {} - PDLValue(Value value) - : value(value.getAsOpaquePointer()), kind(Kind::Value) {} - PDLValue(ValueRange *value) : value(value), kind(Kind::ValueRange) {} - - /// Returns true if the type of the held value is `T`. - template - bool isa() const { - assert(value && "isa<> used on a null value"); - return kind == getKindOf(); - } - - /// Attempt to dynamically cast this value to type `T`, returns null if this - /// value is not an instance of `T`. - template ::value, T, std::optional>> - ResultT dyn_cast() const { - return isa() ? castImpl() : ResultT(); - } - - /// Cast this value to type `T`, asserts if this value is not an instance of - /// `T`. - template - T cast() const { - assert(isa() && "expected value to be of type `T`"); - return castImpl(); - } - - /// Get an opaque pointer to the value. - const void *getAsOpaquePointer() const { return value; } - - /// Return if this value is null or not. - explicit operator bool() const { return value; } - - /// Return the kind of this value. - Kind getKind() const { return kind; } - - /// Print this value to the provided output stream. - void print(raw_ostream &os) const; - - /// Print the specified value kind to an output stream. - static void print(raw_ostream &os, Kind kind); - -private: - /// Find the index of a given type in a range of other types. - template - struct index_of_t; - template - struct index_of_t : std::integral_constant {}; - template - struct index_of_t - : std::integral_constant::value> {}; - - /// Return the kind used for the given T. - template - static Kind getKindOf() { - return static_cast(index_of_t::value); - } - - /// The internal implementation of `cast`, that returns the underlying value - /// as the given type `T`. - template - std::enable_if_t::value, T> - castImpl() const { - return T::getFromOpaquePointer(value); - } - template - std::enable_if_t::value, T> - castImpl() const { - return *reinterpret_cast(const_cast(value)); - } - template - std::enable_if_t::value, T> castImpl() const { - return reinterpret_cast(const_cast(value)); - } - - /// The internal opaque representation of a PDLValue. - const void *value{nullptr}; - /// The kind of the opaque value. 
- Kind kind{Kind::Attribute}; -}; - -inline raw_ostream &operator<<(raw_ostream &os, PDLValue value) { - value.print(os); - return os; -} - -inline raw_ostream &operator<<(raw_ostream &os, PDLValue::Kind kind) { - PDLValue::print(os, kind); - return os; -} - -//===----------------------------------------------------------------------===// -// PDLResultList - -/// The class represents a list of PDL results, returned by a native rewrite -/// method. It provides the mechanism with which to pass PDLValues back to the -/// PDL bytecode. -class PDLResultList { -public: - /// Push a new Attribute value onto the result list. - void push_back(Attribute value) { results.push_back(value); } - - /// Push a new Operation onto the result list. - void push_back(Operation *value) { results.push_back(value); } - - /// Push a new Type onto the result list. - void push_back(Type value) { results.push_back(value); } - - /// Push a new TypeRange onto the result list. - void push_back(TypeRange value) { - // The lifetime of a TypeRange can't be guaranteed, so we'll need to - // allocate a storage for it. - llvm::OwningArrayRef storage(value.size()); - llvm::copy(value, storage.begin()); - allocatedTypeRanges.emplace_back(std::move(storage)); - typeRanges.push_back(allocatedTypeRanges.back()); - results.push_back(&typeRanges.back()); - } - void push_back(ValueTypeRange value) { - typeRanges.push_back(value); - results.push_back(&typeRanges.back()); - } - void push_back(ValueTypeRange value) { - typeRanges.push_back(value); - results.push_back(&typeRanges.back()); - } - - /// Push a new Value onto the result list. - void push_back(Value value) { results.push_back(value); } - - /// Push a new ValueRange onto the result list. - void push_back(ValueRange value) { - // The lifetime of a ValueRange can't be guaranteed, so we'll need to - // allocate a storage for it. - llvm::OwningArrayRef storage(value.size()); - llvm::copy(value, storage.begin()); - allocatedValueRanges.emplace_back(std::move(storage)); - valueRanges.push_back(allocatedValueRanges.back()); - results.push_back(&valueRanges.back()); - } - void push_back(OperandRange value) { - valueRanges.push_back(value); - results.push_back(&valueRanges.back()); - } - void push_back(ResultRange value) { - valueRanges.push_back(value); - results.push_back(&valueRanges.back()); - } - -protected: - /// Create a new result list with the expected number of results. - PDLResultList(unsigned maxNumResults) { - // For now just reserve enough space for all of the results. We could do - // separate counts per range type, but it isn't really worth it unless there - // are a "large" number of results. - typeRanges.reserve(maxNumResults); - valueRanges.reserve(maxNumResults); - } - - /// The PDL results held by this list. - SmallVector results; - /// Memory used to store ranges held by the list. - SmallVector typeRanges; - SmallVector valueRanges; - /// Memory allocated to store ranges in the result list whose lifetime was - /// generated in the native function. - SmallVector> allocatedTypeRanges; - SmallVector> allocatedValueRanges; -}; - -//===----------------------------------------------------------------------===// -// PDLPatternConfig - -/// An individual configuration for a pattern, which can be accessed by native -/// functions via the PDLPatternConfigSet. This allows for injecting additional -/// configuration into PDL patterns that is specific to certain compilation -/// flows. 
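A minimal sketch of such a configuration, built on the `PDLPatternConfigBase` CRTP helper declared just below; `DriverConfig` and its `aggressive` flag are invented names and not part of the patch. The lookup goes through `PDLPatternConfigSet`, which uniques configurations by TypeID.

```cpp
#include "mlir/IR/PatternMatch.h"

using namespace mlir;

// A user-defined configuration; at most one instance of this type can live in
// a given PDLPatternConfigSet.
class DriverConfig : public PDLPatternConfigBase<DriverConfig> {
public:
  explicit DriverConfig(bool aggressive) : aggressive(aggressive) {}
  bool aggressive;
};

// Querying the configuration from a set: get<> asserts if no DriverConfig was
// attached, while tryGet<> returns nullptr instead.
static bool isAggressive(const PDLPatternConfigSet &configs) {
  if (const DriverConfig *config = configs.tryGet<DriverConfig>())
    return config->aggressive;
  return false;
}
```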
-class PDLPatternConfig { -public: - virtual ~PDLPatternConfig() = default; - - /// Hooks that are invoked at the beginning and end of a rewrite of a matched - /// pattern. These can be used to setup any specific state necessary for the - /// rewrite. - virtual void notifyRewriteBegin(PatternRewriter &rewriter) {} - virtual void notifyRewriteEnd(PatternRewriter &rewriter) {} - - /// Return the TypeID that represents this configuration. - TypeID getTypeID() const { return id; } - -protected: - PDLPatternConfig(TypeID id) : id(id) {} - -private: - TypeID id; -}; - -/// This class provides a base class for users implementing a type of pattern -/// configuration. -template -class PDLPatternConfigBase : public PDLPatternConfig { -public: - /// Support LLVM style casting. - static bool classof(const PDLPatternConfig *config) { - return config->getTypeID() == getConfigID(); - } - - /// Return the type id used for this configuration. - static TypeID getConfigID() { return TypeID::get(); } - -protected: - PDLPatternConfigBase() : PDLPatternConfig(getConfigID()) {} -}; - -/// This class contains a set of configurations for a specific pattern. -/// Configurations are uniqued by TypeID, meaning that only one configuration of -/// each type is allowed. -class PDLPatternConfigSet { -public: - PDLPatternConfigSet() = default; - - /// Construct a set with the given configurations. - template - PDLPatternConfigSet(ConfigsT &&...configs) { - (addConfig(std::forward(configs)), ...); - } - - /// Get the configuration defined by the given type. Asserts that the - /// configuration of the provided type exists. - template - const T &get() const { - const T *config = tryGet(); - assert(config && "configuration not found"); - return *config; - } - - /// Get the configuration defined by the given type, returns nullptr if the - /// configuration does not exist. - template - const T *tryGet() const { - for (const auto &configIt : configs) - if (const T *config = dyn_cast(configIt.get())) - return config; - return nullptr; - } - - /// Notify the configurations within this set at the beginning or end of a - /// rewrite of a matched pattern. - void notifyRewriteBegin(PatternRewriter &rewriter) { - for (const auto &config : configs) - config->notifyRewriteBegin(rewriter); - } - void notifyRewriteEnd(PatternRewriter &rewriter) { - for (const auto &config : configs) - config->notifyRewriteEnd(rewriter); - } - -protected: - /// Add a configuration to the set. - template - void addConfig(T &&config) { - assert(!tryGet>() && "configuration already exists"); - configs.emplace_back( - std::make_unique>(std::forward(config))); - } - - /// The set of configurations for this pattern. This uses a vector instead of - /// a map with the expectation that the number of configurations per set is - /// small (<= 1). - SmallVector> configs; -}; - -//===----------------------------------------------------------------------===// -// PDLPatternModule - -/// A generic PDL pattern constraint function. This function applies a -/// constraint to a given set of opaque PDLValue entities. Returns success if -/// the constraint successfully held, failure otherwise. -using PDLConstraintFunction = - std::function)>; -/// A native PDL rewrite function. This function performs a rewrite on the -/// given set of values. Any results from this rewrite that should be passed -/// back to PDL should be added to the provided result list. This method is only -/// invoked when the corresponding match was successful. 
Returns failure if an -/// invariant of the rewrite was broken (certain rewriters may recover from -/// partial pattern application). -using PDLRewriteFunction = std::function)>; - -namespace detail { -namespace pdl_function_builder { -/// A utility variable that always resolves to false. This is useful for static -/// asserts that are always false, but only should fire in certain templated -/// constructs. For example, if a templated function should never be called, the -/// function could be defined as: -/// -/// template -/// void foo() { -/// static_assert(always_false, "This function should never be called"); -/// } -/// -template -constexpr bool always_false = false; - -//===----------------------------------------------------------------------===// -// PDL Function Builder: Type Processing -//===----------------------------------------------------------------------===// - -/// This struct provides a convenient way to determine how to process a given -/// type as either a PDL parameter, or a result value. This allows for -/// supporting complex types in constraint and rewrite functions, without -/// requiring the user to hand-write the necessary glue code themselves. -/// Specializations of this class should implement the following methods to -/// enable support as a PDL argument or result type: -/// -/// static LogicalResult verifyAsArg( -/// function_ref errorFn, PDLValue pdlValue, -/// size_t argIdx); -/// -/// * This method verifies that the given PDLValue is valid for use as a -/// value of `T`. -/// -/// static T processAsArg(PDLValue pdlValue); -/// -/// * This method processes the given PDLValue as a value of `T`. -/// -/// static void processAsResult(PatternRewriter &, PDLResultList &results, -/// const T &value); -/// -/// * This method processes the given value of `T` as the result of a -/// function invocation. The method should package the value into an -/// appropriate form and append it to the given result list. -/// -/// If the type `T` is based on a higher order value, consider using -/// `ProcessPDLValueBasedOn` as a base class of the specialization to simplify -/// the implementation. -/// -template -struct ProcessPDLValue; - -/// This struct provides a simplified model for processing types that are based -/// on another type, e.g. APInt is based on the handling for IntegerAttr. This -/// allows for building the necessary processing functions on top of the base -/// value instead of a PDLValue. Derived users should implement the following -/// (which subsume the ProcessPDLValue variants): -/// -/// static LogicalResult verifyAsArg( -/// function_ref errorFn, -/// const BaseT &baseValue, size_t argIdx); -/// -/// * This method verifies that the given PDLValue is valid for use as a -/// value of `T`. -/// -/// static T processAsArg(BaseT baseValue); -/// -/// * This method processes the given base value as a value of `T`. -/// -template -struct ProcessPDLValueBasedOn { - static LogicalResult - verifyAsArg(function_ref errorFn, - PDLValue pdlValue, size_t argIdx) { - // Verify the base class before continuing. 
- if (failed(ProcessPDLValue::verifyAsArg(errorFn, pdlValue, argIdx))) - return failure(); - return ProcessPDLValue::verifyAsArg( - errorFn, ProcessPDLValue::processAsArg(pdlValue), argIdx); - } - static T processAsArg(PDLValue pdlValue) { - return ProcessPDLValue::processAsArg( - ProcessPDLValue::processAsArg(pdlValue)); - } - - /// Explicitly add the expected parent API to ensure the parent class - /// implements the necessary API (and doesn't implicitly inherit it from - /// somewhere else). - static LogicalResult - verifyAsArg(function_ref errorFn, BaseT value, - size_t argIdx) { - return success(); - } - static T processAsArg(BaseT baseValue); -}; - -/// This struct provides a simplified model for processing types that have -/// "builtin" PDLValue support: -/// * Attribute, Operation *, Type, TypeRange, ValueRange -template -struct ProcessBuiltinPDLValue { - static LogicalResult - verifyAsArg(function_ref errorFn, - PDLValue pdlValue, size_t argIdx) { - if (pdlValue) - return success(); - return errorFn("expected a non-null value for argument " + Twine(argIdx) + - " of type: " + llvm::getTypeName()); - } - - static T processAsArg(PDLValue pdlValue) { return pdlValue.cast(); } - static void processAsResult(PatternRewriter &, PDLResultList &results, - T value) { - results.push_back(value); - } -}; - -/// This struct provides a simplified model for processing types that inherit -/// from builtin PDLValue types. For example, derived attributes like -/// IntegerAttr, derived types like IntegerType, derived operations like -/// ModuleOp, Interfaces, etc. -template -struct ProcessDerivedPDLValue : public ProcessPDLValueBasedOn { - static LogicalResult - verifyAsArg(function_ref errorFn, - BaseT baseValue, size_t argIdx) { - return TypeSwitch(baseValue) - .Case([&](T) { return success(); }) - .Default([&](BaseT) { - return errorFn("expected argument " + Twine(argIdx) + - " to be of type: " + llvm::getTypeName()); - }); - } - using ProcessPDLValueBasedOn::verifyAsArg; - - static T processAsArg(BaseT baseValue) { - return baseValue.template cast(); - } - using ProcessPDLValueBasedOn::processAsArg; - - static void processAsResult(PatternRewriter &, PDLResultList &results, - T value) { - results.push_back(value); - } -}; - -//===----------------------------------------------------------------------===// -// Attribute - -template <> -struct ProcessPDLValue : public ProcessBuiltinPDLValue {}; -template -struct ProcessPDLValue::value>> - : public ProcessDerivedPDLValue {}; - -/// Handling for various Attribute value types. 
-template <> -struct ProcessPDLValue - : public ProcessPDLValueBasedOn { - static StringRef processAsArg(StringAttr value) { return value.getValue(); } - using ProcessPDLValueBasedOn::processAsArg; - - static void processAsResult(PatternRewriter &rewriter, PDLResultList &results, - StringRef value) { - results.push_back(rewriter.getStringAttr(value)); - } -}; -template <> -struct ProcessPDLValue - : public ProcessPDLValueBasedOn { - template - static std::string processAsArg(T value) { - static_assert(always_false, - "`std::string` arguments require a string copy, use " - "`StringRef` for string-like arguments instead"); - return {}; - } - static void processAsResult(PatternRewriter &rewriter, PDLResultList &results, - StringRef value) { - results.push_back(rewriter.getStringAttr(value)); - } -}; - -//===----------------------------------------------------------------------===// -// Operation - -template <> -struct ProcessPDLValue - : public ProcessBuiltinPDLValue {}; -template -struct ProcessPDLValue::value>> - : public ProcessDerivedPDLValue { - static T processAsArg(Operation *value) { return cast(value); } -}; - -//===----------------------------------------------------------------------===// -// Type - -template <> -struct ProcessPDLValue : public ProcessBuiltinPDLValue {}; -template -struct ProcessPDLValue::value>> - : public ProcessDerivedPDLValue {}; - -//===----------------------------------------------------------------------===// -// TypeRange - -template <> -struct ProcessPDLValue : public ProcessBuiltinPDLValue {}; -template <> -struct ProcessPDLValue> { - static void processAsResult(PatternRewriter &, PDLResultList &results, - ValueTypeRange types) { - results.push_back(types); - } -}; -template <> -struct ProcessPDLValue> { - static void processAsResult(PatternRewriter &, PDLResultList &results, - ValueTypeRange types) { - results.push_back(types); - } -}; -template -struct ProcessPDLValue> { - static void processAsResult(PatternRewriter &, PDLResultList &results, - SmallVector values) { - results.push_back(TypeRange(values)); - } -}; - -//===----------------------------------------------------------------------===// -// Value - -template <> -struct ProcessPDLValue : public ProcessBuiltinPDLValue {}; - -//===----------------------------------------------------------------------===// -// ValueRange - -template <> -struct ProcessPDLValue : public ProcessBuiltinPDLValue { -}; -template <> -struct ProcessPDLValue { - static void processAsResult(PatternRewriter &, PDLResultList &results, - OperandRange values) { - results.push_back(values); - } -}; -template <> -struct ProcessPDLValue { - static void processAsResult(PatternRewriter &, PDLResultList &results, - ResultRange values) { - results.push_back(values); - } -}; -template -struct ProcessPDLValue> { - static void processAsResult(PatternRewriter &, PDLResultList &results, - SmallVector values) { - results.push_back(ValueRange(values)); - } -}; - -//===----------------------------------------------------------------------===// -// PDL Function Builder: Argument Handling -//===----------------------------------------------------------------------===// - -/// Validate the given PDLValues match the constraints defined by the argument -/// types of the given function. In the case of failure, a match failure -/// diagnostic is emitted. -/// FIXME: This should be completely removed in favor of `assertArgs`, but PDL -/// does not currently preserve Constraint application ordering. 
-template -LogicalResult verifyAsArgs(PatternRewriter &rewriter, ArrayRef values, - std::index_sequence) { - using FnTraitsT = llvm::function_traits; - - auto errorFn = [&](const Twine &msg) { - return rewriter.notifyMatchFailure(rewriter.getUnknownLoc(), msg); - }; - return success( - (succeeded(ProcessPDLValue>:: - verifyAsArg(errorFn, values[I], I)) && - ...)); -} - -/// Assert that the given PDLValues match the constraints defined by the -/// arguments of the given function. In the case of failure, a fatal error -/// is emitted. -template -void assertArgs(PatternRewriter &rewriter, ArrayRef values, - std::index_sequence) { - // We only want to do verification in debug builds, same as with `assert`. -#if LLVM_ENABLE_ABI_BREAKING_CHECKS - using FnTraitsT = llvm::function_traits; - auto errorFn = [&](const Twine &msg) -> LogicalResult { - llvm::report_fatal_error(msg); - }; - (void)errorFn; - assert((succeeded(ProcessPDLValue>:: - verifyAsArg(errorFn, values[I], I)) && - ...)); -#endif - (void)values; -} - -//===----------------------------------------------------------------------===// -// PDL Function Builder: Results Handling -//===----------------------------------------------------------------------===// - -/// Store a single result within the result list. -template -static LogicalResult processResults(PatternRewriter &rewriter, - PDLResultList &results, T &&value) { - ProcessPDLValue::processAsResult(rewriter, results, - std::forward(value)); - return success(); -} - -/// Store a std::pair<> as individual results within the result list. -template -static LogicalResult processResults(PatternRewriter &rewriter, - PDLResultList &results, - std::pair &&pair) { - if (failed(processResults(rewriter, results, std::move(pair.first))) || - failed(processResults(rewriter, results, std::move(pair.second)))) - return failure(); - return success(); -} - -/// Store a std::tuple<> as individual results within the result list. -template -static LogicalResult processResults(PatternRewriter &rewriter, - PDLResultList &results, - std::tuple &&tuple) { - auto applyFn = [&](auto &&...args) { - return (succeeded(processResults(rewriter, results, std::move(args))) && - ...); - }; - return success(std::apply(applyFn, std::move(tuple))); -} - -/// Handle LogicalResult propagation. -inline LogicalResult processResults(PatternRewriter &rewriter, - PDLResultList &results, - LogicalResult &&result) { - return result; -} -template -static LogicalResult processResults(PatternRewriter &rewriter, - PDLResultList &results, - FailureOr &&result) { - if (failed(result)) - return failure(); - return processResults(rewriter, results, std::move(*result)); -} - -//===----------------------------------------------------------------------===// -// PDL Constraint Builder -//===----------------------------------------------------------------------===// - -/// Process the arguments of a native constraint and invoke it. -template > -typename FnTraitsT::result_t -processArgsAndInvokeConstraint(PDLFnT &fn, PatternRewriter &rewriter, - ArrayRef values, - std::index_sequence) { - return fn( - rewriter, - (ProcessPDLValue>::processAsArg( - values[I]))...); -} - -/// Build a constraint function from the given function `ConstraintFnT`. This -/// allows for enabling the user to define simpler, more direct constraint -/// functions without needing to handle the low-level PDL goop. -/// -/// If the constraint function is already in the correct form, we just forward -/// it directly. 
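For contrast, a minimal sketch of the "already in the correct form" case mentioned above: a constraint written directly against `ArrayRef<PDLValue>` is forwarded by `buildConstraintFn` without any wrapping. `hasTwoOperands` is an invented name, and the sketch assumes the PDL pattern passes a single operation argument.

```cpp
#include "mlir/IR/PatternMatch.h"

using namespace mlir;

// Low-level form: the raw PDLValue arguments are inspected by hand.
static LogicalResult hasTwoOperands(PatternRewriter &rewriter,
                                    ArrayRef<PDLValue> values) {
  // Assumes values[0] holds the matched operation (illustrative convention).
  Operation *op = values.empty() ? nullptr : values[0].dyn_cast<Operation *>();
  return success(op && op->getNumOperands() == 2);
}
```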
-template -std::enable_if_t< - std::is_convertible::value, - PDLConstraintFunction> -buildConstraintFn(ConstraintFnT &&constraintFn) { - return std::forward(constraintFn); -} -/// Otherwise, we generate a wrapper that will unpack the PDLValues in the form -/// we desire. -template -std::enable_if_t< - !std::is_convertible::value, - PDLConstraintFunction> -buildConstraintFn(ConstraintFnT &&constraintFn) { - return [constraintFn = std::forward(constraintFn)]( - PatternRewriter &rewriter, - ArrayRef values) -> LogicalResult { - auto argIndices = std::make_index_sequence< - llvm::function_traits::num_args - 1>(); - if (failed(verifyAsArgs(rewriter, values, argIndices))) - return failure(); - return processArgsAndInvokeConstraint(constraintFn, rewriter, values, - argIndices); - }; -} - -//===----------------------------------------------------------------------===// -// PDL Rewrite Builder -//===----------------------------------------------------------------------===// - -/// Process the arguments of a native rewrite and invoke it. -/// This overload handles the case of no return values. -template > -std::enable_if_t::value, - LogicalResult> -processArgsAndInvokeRewrite(PDLFnT &fn, PatternRewriter &rewriter, - PDLResultList &, ArrayRef values, - std::index_sequence) { - fn(rewriter, - (ProcessPDLValue>::processAsArg( - values[I]))...); - return success(); -} -/// This overload handles the case of return values, which need to be packaged -/// into the result list. -template > -std::enable_if_t::value, - LogicalResult> -processArgsAndInvokeRewrite(PDLFnT &fn, PatternRewriter &rewriter, - PDLResultList &results, ArrayRef values, - std::index_sequence) { - return processResults( - rewriter, results, - fn(rewriter, (ProcessPDLValue>:: - processAsArg(values[I]))...)); - (void)values; -} - -/// Build a rewrite function from the given function `RewriteFnT`. This -/// allows for enabling the user to define simpler, more direct rewrite -/// functions without needing to handle the low-level PDL goop. -/// -/// If the rewrite function is already in the correct form, we just forward -/// it directly. -template -std::enable_if_t::value, - PDLRewriteFunction> -buildRewriteFn(RewriteFnT &&rewriteFn) { - return std::forward(rewriteFn); -} -/// Otherwise, we generate a wrapper that will unpack the PDLValues in the form -/// we desire. -template -std::enable_if_t::value, - PDLRewriteFunction> -buildRewriteFn(RewriteFnT &&rewriteFn) { - return [rewriteFn = std::forward(rewriteFn)]( - PatternRewriter &rewriter, PDLResultList &results, - ArrayRef values) { - auto argIndices = - std::make_index_sequence::num_args - - 1>(); - assertArgs(rewriter, values, argIndices); - return processArgsAndInvokeRewrite(rewriteFn, rewriter, results, values, - argIndices); - }; -} - -} // namespace pdl_function_builder -} // namespace detail - -//===----------------------------------------------------------------------===// -// PDLPatternModule - -/// This class contains all of the necessary data for a set of PDL patterns, or -/// pattern rewrites specified in the form of the PDL dialect. This PDL module -/// contained by this pattern may contain any number of `pdl.pattern` -/// operations. -class PDLPatternModule { -public: - PDLPatternModule() = default; - - /// Construct a PDL pattern with the given module and configurations. 
- PDLPatternModule(OwningOpRef module) - : pdlModule(std::move(module)) {} - template - PDLPatternModule(OwningOpRef module, ConfigsT &&...patternConfigs) - : PDLPatternModule(std::move(module)) { - auto configSet = std::make_unique( - std::forward(patternConfigs)...); - attachConfigToPatterns(*pdlModule, *configSet); - configs.emplace_back(std::move(configSet)); - } - - /// Merge the state in `other` into this pattern module. - void mergeIn(PDLPatternModule &&other); - - /// Return the internal PDL module of this pattern. - ModuleOp getModule() { return pdlModule.get(); } - - /// Return the MLIR context of this pattern. - MLIRContext *getContext() { return getModule()->getContext(); } - - //===--------------------------------------------------------------------===// - // Function Registry - - /// Register a constraint function with PDL. A constraint function may be - /// specified in one of two ways: - /// - /// * `LogicalResult (PatternRewriter &, ArrayRef)` - /// - /// In this overload the arguments of the constraint function are passed via - /// the low-level PDLValue form. - /// - /// * `LogicalResult (PatternRewriter &, ValueTs... values)` - /// - /// In this form the arguments of the constraint function are passed via the - /// expected high level C++ type. In this form, the framework will - /// automatically unwrap PDLValues and convert them to the expected ValueTs. - /// For example, if the constraint function accepts a `Operation *`, the - /// framework will automatically cast the input PDLValue. In the case of a - /// `StringRef`, the framework will automatically unwrap the argument as a - /// StringAttr and pass the underlying string value. To see the full list of - /// supported types, or to see how to add handling for custom types, view - /// the definition of `ProcessPDLValue` above. - void registerConstraintFunction(StringRef name, - PDLConstraintFunction constraintFn); - template - void registerConstraintFunction(StringRef name, - ConstraintFnT &&constraintFn) { - registerConstraintFunction(name, - detail::pdl_function_builder::buildConstraintFn( - std::forward(constraintFn))); - } - - /// Register a rewrite function with PDL. A rewrite function may be specified - /// in one of two ways: - /// - /// * `void (PatternRewriter &, PDLResultList &, ArrayRef)` - /// - /// In this overload the arguments of the constraint function are passed via - /// the low-level PDLValue form, and the results are manually appended to - /// the given result list. - /// - /// * `ResultT (PatternRewriter &, ValueTs... values)` - /// - /// In this form the arguments and result of the rewrite function are passed - /// via the expected high level C++ type. In this form, the framework will - /// automatically unwrap the PDLValues arguments and convert them to the - /// expected ValueTs. It will also automatically handle the processing and - /// packaging of the result value to the result list. For example, if the - /// rewrite function takes a `Operation *`, the framework will automatically - /// cast the input PDLValue. In the case of a `StringRef`, the framework - /// will automatically unwrap the argument as a StringAttr and pass the - /// underlying string value. In the reverse case, if the rewrite returns a - /// StringRef or std::string, it will automatically package this as a - /// StringAttr and append it to the result list. To see the full list of - /// supported types, or to see how to add handling for custom types, view - /// the definition of `ProcessPDLValue` above. 
- void registerRewriteFunction(StringRef name, PDLRewriteFunction rewriteFn); - template - void registerRewriteFunction(StringRef name, RewriteFnT &&rewriteFn) { - registerRewriteFunction(name, detail::pdl_function_builder::buildRewriteFn( - std::forward(rewriteFn))); - } - - /// Return the set of the registered constraint functions. - const llvm::StringMap &getConstraintFunctions() const { - return constraintFunctions; - } - llvm::StringMap takeConstraintFunctions() { - return constraintFunctions; - } - /// Return the set of the registered rewrite functions. - const llvm::StringMap &getRewriteFunctions() const { - return rewriteFunctions; - } - llvm::StringMap takeRewriteFunctions() { - return rewriteFunctions; - } - - /// Return the set of the registered pattern configs. - SmallVector> takeConfigs() { - return std::move(configs); - } - DenseMap takeConfigMap() { - return std::move(configMap); - } - - /// Clear out the patterns and functions within this module. - void clear() { - pdlModule = nullptr; - constraintFunctions.clear(); - rewriteFunctions.clear(); - } - -private: - /// Attach the given pattern config set to the patterns defined within the - /// given module. - void attachConfigToPatterns(ModuleOp module, PDLPatternConfigSet &configSet); - - /// The module containing the `pdl.pattern` operations. - OwningOpRef pdlModule; - - /// The set of configuration sets referenced by patterns within `pdlModule`. - SmallVector> configs; - DenseMap configMap; - - /// The external functions referenced from within the PDL module. - llvm::StringMap constraintFunctions; - llvm::StringMap rewriteFunctions; -}; -} // namespace mlir - -#else - -namespace mlir { -// Stubs for when PDL in pattern rewrites is not enabled. - -class PDLValue { -public: - template - T dyn_cast() const { - return nullptr; - } -}; -class PDLResultList {}; -using PDLConstraintFunction = - std::function)>; -using PDLRewriteFunction = std::function)>; - -class PDLPatternModule { -public: - PDLPatternModule() = default; - - PDLPatternModule(OwningOpRef /*module*/) {} - MLIRContext *getContext() { - llvm_unreachable("Error: PDL for rewrites when PDL is not enabled"); - } - void mergeIn(PDLPatternModule &&other) {} - void clear() {} - template - void registerConstraintFunction(StringRef name, - ConstraintFnT &&constraintFn) {} - void registerRewriteFunction(StringRef name, PDLRewriteFunction rewriteFn) {} - template - void registerRewriteFunction(StringRef name, RewriteFnT &&rewriteFn) {} - const llvm::StringMap &getConstraintFunctions() const { - return constraintFunctions; - } - -private: - llvm::StringMap constraintFunctions; -}; - -} // namespace mlir -#endif - -#endif // MLIR_IR_PDLPATTERNMATCH_H diff --git a/mlir/include/mlir/IR/PatternMatch.h b/mlir/include/mlir/IR/PatternMatch.h index 9b4fa65bff49e..6625ef553eba2 100644 --- a/mlir/include/mlir/IR/PatternMatch.h +++ b/mlir/include/mlir/IR/PatternMatch.h @@ -735,12 +735,932 @@ class PatternRewriter : public RewriterBase { virtual bool canRecoverFromRewriteFailure() const { return false; } }; -} // namespace mlir +//===----------------------------------------------------------------------===// +// PDL Patterns +//===----------------------------------------------------------------------===// -// Optionally expose PDL pattern matching methods. -#include "PDLPatternMatch.h.inc" +//===----------------------------------------------------------------------===// +// PDLValue -namespace mlir { +/// Storage type of byte-code interpreter values. 
These are passed to constraint +/// functions as arguments. +class PDLValue { +public: + /// The underlying kind of a PDL value. + enum class Kind { Attribute, Operation, Type, TypeRange, Value, ValueRange }; + + /// Construct a new PDL value. + PDLValue(const PDLValue &other) = default; + PDLValue(std::nullptr_t = nullptr) {} + PDLValue(Attribute value) + : value(value.getAsOpaquePointer()), kind(Kind::Attribute) {} + PDLValue(Operation *value) : value(value), kind(Kind::Operation) {} + PDLValue(Type value) : value(value.getAsOpaquePointer()), kind(Kind::Type) {} + PDLValue(TypeRange *value) : value(value), kind(Kind::TypeRange) {} + PDLValue(Value value) + : value(value.getAsOpaquePointer()), kind(Kind::Value) {} + PDLValue(ValueRange *value) : value(value), kind(Kind::ValueRange) {} + + /// Returns true if the type of the held value is `T`. + template + bool isa() const { + assert(value && "isa<> used on a null value"); + return kind == getKindOf(); + } + + /// Attempt to dynamically cast this value to type `T`, returns null if this + /// value is not an instance of `T`. + template ::value, T, std::optional>> + ResultT dyn_cast() const { + return isa() ? castImpl() : ResultT(); + } + + /// Cast this value to type `T`, asserts if this value is not an instance of + /// `T`. + template + T cast() const { + assert(isa() && "expected value to be of type `T`"); + return castImpl(); + } + + /// Get an opaque pointer to the value. + const void *getAsOpaquePointer() const { return value; } + + /// Return if this value is null or not. + explicit operator bool() const { return value; } + + /// Return the kind of this value. + Kind getKind() const { return kind; } + + /// Print this value to the provided output stream. + void print(raw_ostream &os) const; + + /// Print the specified value kind to an output stream. + static void print(raw_ostream &os, Kind kind); + +private: + /// Find the index of a given type in a range of other types. + template + struct index_of_t; + template + struct index_of_t : std::integral_constant {}; + template + struct index_of_t + : std::integral_constant::value> {}; + + /// Return the kind used for the given T. + template + static Kind getKindOf() { + return static_cast(index_of_t::value); + } + + /// The internal implementation of `cast`, that returns the underlying value + /// as the given type `T`. + template + std::enable_if_t::value, T> + castImpl() const { + return T::getFromOpaquePointer(value); + } + template + std::enable_if_t::value, T> + castImpl() const { + return *reinterpret_cast(const_cast(value)); + } + template + std::enable_if_t::value, T> castImpl() const { + return reinterpret_cast(const_cast(value)); + } + + /// The internal opaque representation of a PDLValue. + const void *value{nullptr}; + /// The kind of the opaque value. + Kind kind{Kind::Attribute}; +}; + +inline raw_ostream &operator<<(raw_ostream &os, PDLValue value) { + value.print(os); + return os; +} + +inline raw_ostream &operator<<(raw_ostream &os, PDLValue::Kind kind) { + PDLValue::print(os, kind); + return os; +} + +//===----------------------------------------------------------------------===// +// PDLResultList + +/// The class represents a list of PDL results, returned by a native rewrite +/// method. It provides the mechanism with which to pass PDLValues back to the +/// PDL bytecode. +class PDLResultList { +public: + /// Push a new Attribute value onto the result list. 
+ void push_back(Attribute value) { results.push_back(value); } + + /// Push a new Operation onto the result list. + void push_back(Operation *value) { results.push_back(value); } + + /// Push a new Type onto the result list. + void push_back(Type value) { results.push_back(value); } + + /// Push a new TypeRange onto the result list. + void push_back(TypeRange value) { + // The lifetime of a TypeRange can't be guaranteed, so we'll need to + // allocate a storage for it. + llvm::OwningArrayRef storage(value.size()); + llvm::copy(value, storage.begin()); + allocatedTypeRanges.emplace_back(std::move(storage)); + typeRanges.push_back(allocatedTypeRanges.back()); + results.push_back(&typeRanges.back()); + } + void push_back(ValueTypeRange value) { + typeRanges.push_back(value); + results.push_back(&typeRanges.back()); + } + void push_back(ValueTypeRange value) { + typeRanges.push_back(value); + results.push_back(&typeRanges.back()); + } + + /// Push a new Value onto the result list. + void push_back(Value value) { results.push_back(value); } + + /// Push a new ValueRange onto the result list. + void push_back(ValueRange value) { + // The lifetime of a ValueRange can't be guaranteed, so we'll need to + // allocate a storage for it. + llvm::OwningArrayRef storage(value.size()); + llvm::copy(value, storage.begin()); + allocatedValueRanges.emplace_back(std::move(storage)); + valueRanges.push_back(allocatedValueRanges.back()); + results.push_back(&valueRanges.back()); + } + void push_back(OperandRange value) { + valueRanges.push_back(value); + results.push_back(&valueRanges.back()); + } + void push_back(ResultRange value) { + valueRanges.push_back(value); + results.push_back(&valueRanges.back()); + } + +protected: + /// Create a new result list with the expected number of results. + PDLResultList(unsigned maxNumResults) { + // For now just reserve enough space for all of the results. We could do + // separate counts per range type, but it isn't really worth it unless there + // are a "large" number of results. + typeRanges.reserve(maxNumResults); + valueRanges.reserve(maxNumResults); + } + + /// The PDL results held by this list. + SmallVector results; + /// Memory used to store ranges held by the list. + SmallVector typeRanges; + SmallVector valueRanges; + /// Memory allocated to store ranges in the result list whose lifetime was + /// generated in the native function. + SmallVector> allocatedTypeRanges; + SmallVector> allocatedValueRanges; +}; + +//===----------------------------------------------------------------------===// +// PDLPatternConfig + +/// An individual configuration for a pattern, which can be accessed by native +/// functions via the PDLPatternConfigSet. This allows for injecting additional +/// configuration into PDL patterns that is specific to certain compilation +/// flows. +class PDLPatternConfig { +public: + virtual ~PDLPatternConfig() = default; + + /// Hooks that are invoked at the beginning and end of a rewrite of a matched + /// pattern. These can be used to setup any specific state necessary for the + /// rewrite. + virtual void notifyRewriteBegin(PatternRewriter &rewriter) {} + virtual void notifyRewriteEnd(PatternRewriter &rewriter) {} + + /// Return the TypeID that represents this configuration. + TypeID getTypeID() const { return id; } + +protected: + PDLPatternConfig(TypeID id) : id(id) {} + +private: + TypeID id; +}; + +/// This class provides a base class for users implementing a type of pattern +/// configuration. 
+template +class PDLPatternConfigBase : public PDLPatternConfig { +public: + /// Support LLVM style casting. + static bool classof(const PDLPatternConfig *config) { + return config->getTypeID() == getConfigID(); + } + + /// Return the type id used for this configuration. + static TypeID getConfigID() { return TypeID::get(); } + +protected: + PDLPatternConfigBase() : PDLPatternConfig(getConfigID()) {} +}; + +/// This class contains a set of configurations for a specific pattern. +/// Configurations are uniqued by TypeID, meaning that only one configuration of +/// each type is allowed. +class PDLPatternConfigSet { +public: + PDLPatternConfigSet() = default; + + /// Construct a set with the given configurations. + template + PDLPatternConfigSet(ConfigsT &&...configs) { + (addConfig(std::forward(configs)), ...); + } + + /// Get the configuration defined by the given type. Asserts that the + /// configuration of the provided type exists. + template + const T &get() const { + const T *config = tryGet(); + assert(config && "configuration not found"); + return *config; + } + + /// Get the configuration defined by the given type, returns nullptr if the + /// configuration does not exist. + template + const T *tryGet() const { + for (const auto &configIt : configs) + if (const T *config = dyn_cast(configIt.get())) + return config; + return nullptr; + } + + /// Notify the configurations within this set at the beginning or end of a + /// rewrite of a matched pattern. + void notifyRewriteBegin(PatternRewriter &rewriter) { + for (const auto &config : configs) + config->notifyRewriteBegin(rewriter); + } + void notifyRewriteEnd(PatternRewriter &rewriter) { + for (const auto &config : configs) + config->notifyRewriteEnd(rewriter); + } + +protected: + /// Add a configuration to the set. + template + void addConfig(T &&config) { + assert(!tryGet>() && "configuration already exists"); + configs.emplace_back( + std::make_unique>(std::forward(config))); + } + + /// The set of configurations for this pattern. This uses a vector instead of + /// a map with the expectation that the number of configurations per set is + /// small (<= 1). + SmallVector> configs; +}; + +//===----------------------------------------------------------------------===// +// PDLPatternModule + +/// A generic PDL pattern constraint function. This function applies a +/// constraint to a given set of opaque PDLValue entities. Returns success if +/// the constraint successfully held, failure otherwise. +using PDLConstraintFunction = + std::function)>; +/// A native PDL rewrite function. This function performs a rewrite on the +/// given set of values. Any results from this rewrite that should be passed +/// back to PDL should be added to the provided result list. This method is only +/// invoked when the corresponding match was successful. Returns failure if an +/// invariant of the rewrite was broken (certain rewriters may recover from +/// partial pattern application). +using PDLRewriteFunction = std::function)>; + +namespace detail { +namespace pdl_function_builder { +/// A utility variable that always resolves to false. This is useful for static +/// asserts that are always false, but only should fire in certain templated +/// constructs. 
For example, if a templated function should never be called, the +/// function could be defined as: +/// +/// template +/// void foo() { +/// static_assert(always_false, "This function should never be called"); +/// } +/// +template +constexpr bool always_false = false; + +//===----------------------------------------------------------------------===// +// PDL Function Builder: Type Processing +//===----------------------------------------------------------------------===// + +/// This struct provides a convenient way to determine how to process a given +/// type as either a PDL parameter, or a result value. This allows for +/// supporting complex types in constraint and rewrite functions, without +/// requiring the user to hand-write the necessary glue code themselves. +/// Specializations of this class should implement the following methods to +/// enable support as a PDL argument or result type: +/// +/// static LogicalResult verifyAsArg( +/// function_ref errorFn, PDLValue pdlValue, +/// size_t argIdx); +/// +/// * This method verifies that the given PDLValue is valid for use as a +/// value of `T`. +/// +/// static T processAsArg(PDLValue pdlValue); +/// +/// * This method processes the given PDLValue as a value of `T`. +/// +/// static void processAsResult(PatternRewriter &, PDLResultList &results, +/// const T &value); +/// +/// * This method processes the given value of `T` as the result of a +/// function invocation. The method should package the value into an +/// appropriate form and append it to the given result list. +/// +/// If the type `T` is based on a higher order value, consider using +/// `ProcessPDLValueBasedOn` as a base class of the specialization to simplify +/// the implementation. +/// +template +struct ProcessPDLValue; + +/// This struct provides a simplified model for processing types that are based +/// on another type, e.g. APInt is based on the handling for IntegerAttr. This +/// allows for building the necessary processing functions on top of the base +/// value instead of a PDLValue. Derived users should implement the following +/// (which subsume the ProcessPDLValue variants): +/// +/// static LogicalResult verifyAsArg( +/// function_ref errorFn, +/// const BaseT &baseValue, size_t argIdx); +/// +/// * This method verifies that the given PDLValue is valid for use as a +/// value of `T`. +/// +/// static T processAsArg(BaseT baseValue); +/// +/// * This method processes the given base value as a value of `T`. +/// +template +struct ProcessPDLValueBasedOn { + static LogicalResult + verifyAsArg(function_ref errorFn, + PDLValue pdlValue, size_t argIdx) { + // Verify the base class before continuing. + if (failed(ProcessPDLValue::verifyAsArg(errorFn, pdlValue, argIdx))) + return failure(); + return ProcessPDLValue::verifyAsArg( + errorFn, ProcessPDLValue::processAsArg(pdlValue), argIdx); + } + static T processAsArg(PDLValue pdlValue) { + return ProcessPDLValue::processAsArg( + ProcessPDLValue::processAsArg(pdlValue)); + } + + /// Explicitly add the expected parent API to ensure the parent class + /// implements the necessary API (and doesn't implicitly inherit it from + /// somewhere else). 
+ static LogicalResult + verifyAsArg(function_ref errorFn, BaseT value, + size_t argIdx) { + return success(); + } + static T processAsArg(BaseT baseValue); +}; + +/// This struct provides a simplified model for processing types that have +/// "builtin" PDLValue support: +/// * Attribute, Operation *, Type, TypeRange, ValueRange +template +struct ProcessBuiltinPDLValue { + static LogicalResult + verifyAsArg(function_ref errorFn, + PDLValue pdlValue, size_t argIdx) { + if (pdlValue) + return success(); + return errorFn("expected a non-null value for argument " + Twine(argIdx) + + " of type: " + llvm::getTypeName()); + } + + static T processAsArg(PDLValue pdlValue) { return pdlValue.cast(); } + static void processAsResult(PatternRewriter &, PDLResultList &results, + T value) { + results.push_back(value); + } +}; + +/// This struct provides a simplified model for processing types that inherit +/// from builtin PDLValue types. For example, derived attributes like +/// IntegerAttr, derived types like IntegerType, derived operations like +/// ModuleOp, Interfaces, etc. +template +struct ProcessDerivedPDLValue : public ProcessPDLValueBasedOn { + static LogicalResult + verifyAsArg(function_ref errorFn, + BaseT baseValue, size_t argIdx) { + return TypeSwitch(baseValue) + .Case([&](T) { return success(); }) + .Default([&](BaseT) { + return errorFn("expected argument " + Twine(argIdx) + + " to be of type: " + llvm::getTypeName()); + }); + } + using ProcessPDLValueBasedOn::verifyAsArg; + + static T processAsArg(BaseT baseValue) { + return baseValue.template cast(); + } + using ProcessPDLValueBasedOn::processAsArg; + + static void processAsResult(PatternRewriter &, PDLResultList &results, + T value) { + results.push_back(value); + } +}; + +//===----------------------------------------------------------------------===// +// Attribute + +template <> +struct ProcessPDLValue : public ProcessBuiltinPDLValue {}; +template +struct ProcessPDLValue::value>> + : public ProcessDerivedPDLValue {}; + +/// Handling for various Attribute value types. 
+template <> +struct ProcessPDLValue + : public ProcessPDLValueBasedOn { + static StringRef processAsArg(StringAttr value) { return value.getValue(); } + using ProcessPDLValueBasedOn::processAsArg; + + static void processAsResult(PatternRewriter &rewriter, PDLResultList &results, + StringRef value) { + results.push_back(rewriter.getStringAttr(value)); + } +}; +template <> +struct ProcessPDLValue + : public ProcessPDLValueBasedOn { + template + static std::string processAsArg(T value) { + static_assert(always_false, + "`std::string` arguments require a string copy, use " + "`StringRef` for string-like arguments instead"); + return {}; + } + static void processAsResult(PatternRewriter &rewriter, PDLResultList &results, + StringRef value) { + results.push_back(rewriter.getStringAttr(value)); + } +}; + +//===----------------------------------------------------------------------===// +// Operation + +template <> +struct ProcessPDLValue + : public ProcessBuiltinPDLValue {}; +template +struct ProcessPDLValue::value>> + : public ProcessDerivedPDLValue { + static T processAsArg(Operation *value) { return cast(value); } +}; + +//===----------------------------------------------------------------------===// +// Type + +template <> +struct ProcessPDLValue : public ProcessBuiltinPDLValue {}; +template +struct ProcessPDLValue::value>> + : public ProcessDerivedPDLValue {}; + +//===----------------------------------------------------------------------===// +// TypeRange + +template <> +struct ProcessPDLValue : public ProcessBuiltinPDLValue {}; +template <> +struct ProcessPDLValue> { + static void processAsResult(PatternRewriter &, PDLResultList &results, + ValueTypeRange types) { + results.push_back(types); + } +}; +template <> +struct ProcessPDLValue> { + static void processAsResult(PatternRewriter &, PDLResultList &results, + ValueTypeRange types) { + results.push_back(types); + } +}; +template +struct ProcessPDLValue> { + static void processAsResult(PatternRewriter &, PDLResultList &results, + SmallVector values) { + results.push_back(TypeRange(values)); + } +}; + +//===----------------------------------------------------------------------===// +// Value + +template <> +struct ProcessPDLValue : public ProcessBuiltinPDLValue {}; + +//===----------------------------------------------------------------------===// +// ValueRange + +template <> +struct ProcessPDLValue : public ProcessBuiltinPDLValue { +}; +template <> +struct ProcessPDLValue { + static void processAsResult(PatternRewriter &, PDLResultList &results, + OperandRange values) { + results.push_back(values); + } +}; +template <> +struct ProcessPDLValue { + static void processAsResult(PatternRewriter &, PDLResultList &results, + ResultRange values) { + results.push_back(values); + } +}; +template +struct ProcessPDLValue> { + static void processAsResult(PatternRewriter &, PDLResultList &results, + SmallVector values) { + results.push_back(ValueRange(values)); + } +}; + +//===----------------------------------------------------------------------===// +// PDL Function Builder: Argument Handling +//===----------------------------------------------------------------------===// + +/// Validate the given PDLValues match the constraints defined by the argument +/// types of the given function. In the case of failure, a match failure +/// diagnostic is emitted. +/// FIXME: This should be completely removed in favor of `assertArgs`, but PDL +/// does not currently preserve Constraint application ordering. 
+template +LogicalResult verifyAsArgs(PatternRewriter &rewriter, ArrayRef values, + std::index_sequence) { + using FnTraitsT = llvm::function_traits; + + auto errorFn = [&](const Twine &msg) { + return rewriter.notifyMatchFailure(rewriter.getUnknownLoc(), msg); + }; + return success( + (succeeded(ProcessPDLValue>:: + verifyAsArg(errorFn, values[I], I)) && + ...)); +} + +/// Assert that the given PDLValues match the constraints defined by the +/// arguments of the given function. In the case of failure, a fatal error +/// is emitted. +template +void assertArgs(PatternRewriter &rewriter, ArrayRef values, + std::index_sequence) { + // We only want to do verification in debug builds, same as with `assert`. +#if LLVM_ENABLE_ABI_BREAKING_CHECKS + using FnTraitsT = llvm::function_traits; + auto errorFn = [&](const Twine &msg) -> LogicalResult { + llvm::report_fatal_error(msg); + }; + (void)errorFn; + assert((succeeded(ProcessPDLValue>:: + verifyAsArg(errorFn, values[I], I)) && + ...)); +#endif + (void)values; +} + +//===----------------------------------------------------------------------===// +// PDL Function Builder: Results Handling +//===----------------------------------------------------------------------===// + +/// Store a single result within the result list. +template +static LogicalResult processResults(PatternRewriter &rewriter, + PDLResultList &results, T &&value) { + ProcessPDLValue::processAsResult(rewriter, results, + std::forward(value)); + return success(); +} + +/// Store a std::pair<> as individual results within the result list. +template +static LogicalResult processResults(PatternRewriter &rewriter, + PDLResultList &results, + std::pair &&pair) { + if (failed(processResults(rewriter, results, std::move(pair.first))) || + failed(processResults(rewriter, results, std::move(pair.second)))) + return failure(); + return success(); +} + +/// Store a std::tuple<> as individual results within the result list. +template +static LogicalResult processResults(PatternRewriter &rewriter, + PDLResultList &results, + std::tuple &&tuple) { + auto applyFn = [&](auto &&...args) { + return (succeeded(processResults(rewriter, results, std::move(args))) && + ...); + }; + return success(std::apply(applyFn, std::move(tuple))); +} + +/// Handle LogicalResult propagation. +inline LogicalResult processResults(PatternRewriter &rewriter, + PDLResultList &results, + LogicalResult &&result) { + return result; +} +template +static LogicalResult processResults(PatternRewriter &rewriter, + PDLResultList &results, + FailureOr &&result) { + if (failed(result)) + return failure(); + return processResults(rewriter, results, std::move(*result)); +} + +//===----------------------------------------------------------------------===// +// PDL Constraint Builder +//===----------------------------------------------------------------------===// + +/// Process the arguments of a native constraint and invoke it. +template > +typename FnTraitsT::result_t +processArgsAndInvokeConstraint(PDLFnT &fn, PatternRewriter &rewriter, + ArrayRef values, + std::index_sequence) { + return fn( + rewriter, + (ProcessPDLValue>::processAsArg( + values[I]))...); +} + +/// Build a constraint function from the given function `ConstraintFnT`. This +/// allows for enabling the user to define simpler, more direct constraint +/// functions without needing to handle the low-level PDL goop. +/// +/// If the constraint function is already in the correct form, we just forward +/// it directly. 
+template +std::enable_if_t< + std::is_convertible::value, + PDLConstraintFunction> +buildConstraintFn(ConstraintFnT &&constraintFn) { + return std::forward(constraintFn); +} +/// Otherwise, we generate a wrapper that will unpack the PDLValues in the form +/// we desire. +template +std::enable_if_t< + !std::is_convertible::value, + PDLConstraintFunction> +buildConstraintFn(ConstraintFnT &&constraintFn) { + return [constraintFn = std::forward(constraintFn)]( + PatternRewriter &rewriter, + ArrayRef values) -> LogicalResult { + auto argIndices = std::make_index_sequence< + llvm::function_traits::num_args - 1>(); + if (failed(verifyAsArgs(rewriter, values, argIndices))) + return failure(); + return processArgsAndInvokeConstraint(constraintFn, rewriter, values, + argIndices); + }; +} + +//===----------------------------------------------------------------------===// +// PDL Rewrite Builder +//===----------------------------------------------------------------------===// + +/// Process the arguments of a native rewrite and invoke it. +/// This overload handles the case of no return values. +template > +std::enable_if_t::value, + LogicalResult> +processArgsAndInvokeRewrite(PDLFnT &fn, PatternRewriter &rewriter, + PDLResultList &, ArrayRef values, + std::index_sequence) { + fn(rewriter, + (ProcessPDLValue>::processAsArg( + values[I]))...); + return success(); +} +/// This overload handles the case of return values, which need to be packaged +/// into the result list. +template > +std::enable_if_t::value, + LogicalResult> +processArgsAndInvokeRewrite(PDLFnT &fn, PatternRewriter &rewriter, + PDLResultList &results, ArrayRef values, + std::index_sequence) { + return processResults( + rewriter, results, + fn(rewriter, (ProcessPDLValue>:: + processAsArg(values[I]))...)); + (void)values; +} + +/// Build a rewrite function from the given function `RewriteFnT`. This +/// allows for enabling the user to define simpler, more direct rewrite +/// functions without needing to handle the low-level PDL goop. +/// +/// If the rewrite function is already in the correct form, we just forward +/// it directly. +template +std::enable_if_t::value, + PDLRewriteFunction> +buildRewriteFn(RewriteFnT &&rewriteFn) { + return std::forward(rewriteFn); +} +/// Otherwise, we generate a wrapper that will unpack the PDLValues in the form +/// we desire. +template +std::enable_if_t::value, + PDLRewriteFunction> +buildRewriteFn(RewriteFnT &&rewriteFn) { + return [rewriteFn = std::forward(rewriteFn)]( + PatternRewriter &rewriter, PDLResultList &results, + ArrayRef values) { + auto argIndices = + std::make_index_sequence::num_args - + 1>(); + assertArgs(rewriter, values, argIndices); + return processArgsAndInvokeRewrite(rewriteFn, rewriter, results, values, + argIndices); + }; +} + +} // namespace pdl_function_builder +} // namespace detail + +//===----------------------------------------------------------------------===// +// PDLPatternModule + +/// This class contains all of the necessary data for a set of PDL patterns, or +/// pattern rewrites specified in the form of the PDL dialect. This PDL module +/// contained by this pattern may contain any number of `pdl.pattern` +/// operations. +class PDLPatternModule { +public: + PDLPatternModule() = default; + + /// Construct a PDL pattern with the given module and configurations. 
+ PDLPatternModule(OwningOpRef module) + : pdlModule(std::move(module)) {} + template + PDLPatternModule(OwningOpRef module, ConfigsT &&...patternConfigs) + : PDLPatternModule(std::move(module)) { + auto configSet = std::make_unique( + std::forward(patternConfigs)...); + attachConfigToPatterns(*pdlModule, *configSet); + configs.emplace_back(std::move(configSet)); + } + + /// Merge the state in `other` into this pattern module. + void mergeIn(PDLPatternModule &&other); + + /// Return the internal PDL module of this pattern. + ModuleOp getModule() { return pdlModule.get(); } + + //===--------------------------------------------------------------------===// + // Function Registry + + /// Register a constraint function with PDL. A constraint function may be + /// specified in one of two ways: + /// + /// * `LogicalResult (PatternRewriter &, ArrayRef)` + /// + /// In this overload the arguments of the constraint function are passed via + /// the low-level PDLValue form. + /// + /// * `LogicalResult (PatternRewriter &, ValueTs... values)` + /// + /// In this form the arguments of the constraint function are passed via the + /// expected high level C++ type. In this form, the framework will + /// automatically unwrap PDLValues and convert them to the expected ValueTs. + /// For example, if the constraint function accepts a `Operation *`, the + /// framework will automatically cast the input PDLValue. In the case of a + /// `StringRef`, the framework will automatically unwrap the argument as a + /// StringAttr and pass the underlying string value. To see the full list of + /// supported types, or to see how to add handling for custom types, view + /// the definition of `ProcessPDLValue` above. + void registerConstraintFunction(StringRef name, + PDLConstraintFunction constraintFn); + template + void registerConstraintFunction(StringRef name, + ConstraintFnT &&constraintFn) { + registerConstraintFunction(name, + detail::pdl_function_builder::buildConstraintFn( + std::forward(constraintFn))); + } + + /// Register a rewrite function with PDL. A rewrite function may be specified + /// in one of two ways: + /// + /// * `void (PatternRewriter &, PDLResultList &, ArrayRef)` + /// + /// In this overload the arguments of the constraint function are passed via + /// the low-level PDLValue form, and the results are manually appended to + /// the given result list. + /// + /// * `ResultT (PatternRewriter &, ValueTs... values)` + /// + /// In this form the arguments and result of the rewrite function are passed + /// via the expected high level C++ type. In this form, the framework will + /// automatically unwrap the PDLValues arguments and convert them to the + /// expected ValueTs. It will also automatically handle the processing and + /// packaging of the result value to the result list. For example, if the + /// rewrite function takes a `Operation *`, the framework will automatically + /// cast the input PDLValue. In the case of a `StringRef`, the framework + /// will automatically unwrap the argument as a StringAttr and pass the + /// underlying string value. In the reverse case, if the rewrite returns a + /// StringRef or std::string, it will automatically package this as a + /// StringAttr and append it to the result list. To see the full list of + /// supported types, or to see how to add handling for custom types, view + /// the definition of `ProcessPDLValue` above. 
+ void registerRewriteFunction(StringRef name, PDLRewriteFunction rewriteFn); + template + void registerRewriteFunction(StringRef name, RewriteFnT &&rewriteFn) { + registerRewriteFunction(name, detail::pdl_function_builder::buildRewriteFn( + std::forward(rewriteFn))); + } + + /// Return the set of the registered constraint functions. + const llvm::StringMap &getConstraintFunctions() const { + return constraintFunctions; + } + llvm::StringMap takeConstraintFunctions() { + return constraintFunctions; + } + /// Return the set of the registered rewrite functions. + const llvm::StringMap &getRewriteFunctions() const { + return rewriteFunctions; + } + llvm::StringMap takeRewriteFunctions() { + return rewriteFunctions; + } + + /// Return the set of the registered pattern configs. + SmallVector> takeConfigs() { + return std::move(configs); + } + DenseMap takeConfigMap() { + return std::move(configMap); + } + + /// Clear out the patterns and functions within this module. + void clear() { + pdlModule = nullptr; + constraintFunctions.clear(); + rewriteFunctions.clear(); + } + +private: + /// Attach the given pattern config set to the patterns defined within the + /// given module. + void attachConfigToPatterns(ModuleOp module, PDLPatternConfigSet &configSet); + + /// The module containing the `pdl.pattern` operations. + OwningOpRef pdlModule; + + /// The set of configuration sets referenced by patterns within `pdlModule`. + SmallVector> configs; + DenseMap configMap; + + /// The external functions referenced from within the PDL module. + llvm::StringMap constraintFunctions; + llvm::StringMap rewriteFunctions; +}; //===----------------------------------------------------------------------===// // RewritePatternSet @@ -759,7 +1679,8 @@ class RewritePatternSet { nativePatterns.emplace_back(std::move(pattern)); } RewritePatternSet(PDLPatternModule &&pattern) - : context(pattern.getContext()), pdlPatterns(std::move(pattern)) {} + : context(pattern.getModule()->getContext()), + pdlPatterns(std::move(pattern)) {} MLIRContext *getContext() const { return context; } @@ -932,7 +1853,6 @@ class RewritePatternSet { pattern->addDebugLabels(debugLabels); nativePatterns.emplace_back(std::move(pattern)); } - template std::enable_if_t::value> addImpl(ArrayRef debugLabels, Args &&...args) { @@ -943,9 +1863,6 @@ class RewritePatternSet { MLIRContext *const context; NativePatternListT nativePatterns; - - // Patterns expressed with PDL. This will compile to a stub class when PDL is - // not enabled. PDLPatternModule pdlPatterns; }; diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h index c5725e9c85625..6de981d35c8c3 100644 --- a/mlir/include/mlir/Transforms/DialectConversion.h +++ b/mlir/include/mlir/Transforms/DialectConversion.h @@ -13,7 +13,6 @@ #ifndef MLIR_TRANSFORMS_DIALECTCONVERSION_H_ #define MLIR_TRANSFORMS_DIALECTCONVERSION_H_ -#include "mlir/Config/mlir-config.h" #include "mlir/Rewrite/FrozenRewritePatternSet.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/StringMap.h" @@ -1016,7 +1015,6 @@ class ConversionTarget { MLIRContext &ctx; }; -#if MLIR_ENABLE_PDL_IN_PATTERNMATCH //===----------------------------------------------------------------------===// // PDL Configuration //===----------------------------------------------------------------------===// @@ -1046,19 +1044,6 @@ class PDLConversionConfig final /// Register the dialect conversion PDL functions with the given pattern set. 
void registerConversionPDLFunctions(RewritePatternSet &patterns); -#else - -// Stubs for when PDL in rewriting is not enabled. - -inline void registerConversionPDLFunctions(RewritePatternSet &patterns) {} - -class PDLConversionConfig final { -public: - PDLConversionConfig(const TypeConverter * /*converter*/) {} -}; - -#endif // MLIR_ENABLE_PDL_IN_PATTERNMATCH - //===----------------------------------------------------------------------===// // Op Conversion Entry Points //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Bufferization/TransformOps/CMakeLists.txt b/mlir/lib/Dialect/Bufferization/TransformOps/CMakeLists.txt index be5eb73b91229..7f7b348b17ae6 100644 --- a/mlir/lib/Dialect/Bufferization/TransformOps/CMakeLists.txt +++ b/mlir/lib/Dialect/Bufferization/TransformOps/CMakeLists.txt @@ -14,6 +14,7 @@ add_mlir_dialect_library(MLIRBufferizationTransformOps MLIRFunctionInterfaces MLIRLinalgDialect MLIRParser + MLIRPDLDialect MLIRSideEffectInterfaces MLIRTransformDialect ) diff --git a/mlir/lib/IR/CMakeLists.txt b/mlir/lib/IR/CMakeLists.txt index a155b7c5ecade..21bba11b85117 100644 --- a/mlir/lib/IR/CMakeLists.txt +++ b/mlir/lib/IR/CMakeLists.txt @@ -61,10 +61,3 @@ add_mlir_library(MLIRIR LINK_LIBS PUBLIC MLIRSupport ) - -if(MLIR_ENABLE_PDL_IN_PATTERNMATCH) - add_subdirectory(PDL) - target_link_libraries(MLIRIR PUBLIC - MLIRIRPDLPatternMatch) -endif() - diff --git a/mlir/lib/IR/PDL/CMakeLists.txt b/mlir/lib/IR/PDL/CMakeLists.txt deleted file mode 100644 index 08b7fe36fac09..0000000000000 --- a/mlir/lib/IR/PDL/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -add_mlir_library(MLIRIRPDLPatternMatch - PDLPatternMatch.cpp - - ADDITIONAL_HEADER_DIRS - ${MLIR_MAIN_INCLUDE_DIR}/mlir/IR -) - diff --git a/mlir/lib/IR/PDL/PDLPatternMatch.cpp b/mlir/lib/IR/PDL/PDLPatternMatch.cpp deleted file mode 100644 index da07cc462a5a1..0000000000000 --- a/mlir/lib/IR/PDL/PDLPatternMatch.cpp +++ /dev/null @@ -1,133 +0,0 @@ -//===- PDLPatternMatch.cpp - Base classes for PDL pattern match -//------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "mlir/IR/IRMapping.h" -#include "mlir/IR/Iterators.h" -#include "mlir/IR/PatternMatch.h" -#include "mlir/IR/RegionKindInterface.h" - -using namespace mlir; - -//===----------------------------------------------------------------------===// -// PDLValue -//===----------------------------------------------------------------------===// - -void PDLValue::print(raw_ostream &os) const { - if (!value) { - os << ""; - return; - } - switch (kind) { - case Kind::Attribute: - os << cast(); - break; - case Kind::Operation: - os << *cast(); - break; - case Kind::Type: - os << cast(); - break; - case Kind::TypeRange: - llvm::interleaveComma(cast(), os); - break; - case Kind::Value: - os << cast(); - break; - case Kind::ValueRange: - llvm::interleaveComma(cast(), os); - break; - } -} - -void PDLValue::print(raw_ostream &os, Kind kind) { - switch (kind) { - case Kind::Attribute: - os << "Attribute"; - break; - case Kind::Operation: - os << "Operation"; - break; - case Kind::Type: - os << "Type"; - break; - case Kind::TypeRange: - os << "TypeRange"; - break; - case Kind::Value: - os << "Value"; - break; - case Kind::ValueRange: - os << "ValueRange"; - break; - } -} - -//===----------------------------------------------------------------------===// -// PDLPatternModule -//===----------------------------------------------------------------------===// - -void PDLPatternModule::mergeIn(PDLPatternModule &&other) { - // Ignore the other module if it has no patterns. - if (!other.pdlModule) - return; - - // Steal the functions and config of the other module. - for (auto &it : other.constraintFunctions) - registerConstraintFunction(it.first(), std::move(it.second)); - for (auto &it : other.rewriteFunctions) - registerRewriteFunction(it.first(), std::move(it.second)); - for (auto &it : other.configs) - configs.emplace_back(std::move(it)); - for (auto &it : other.configMap) - configMap.insert(it); - - // Steal the other state if we have no patterns. - if (!pdlModule) { - pdlModule = std::move(other.pdlModule); - return; - } - - // Merge the pattern operations from the other module into this one. - Block *block = pdlModule->getBody(); - block->getOperations().splice(block->end(), - other.pdlModule->getBody()->getOperations()); -} - -void PDLPatternModule::attachConfigToPatterns(ModuleOp module, - PDLPatternConfigSet &configSet) { - // Attach the configuration to the symbols within the module. We only add - // to symbols to avoid hardcoding any specific operation names here (given - // that we don't depend on any PDL dialect). We can't use - // cast here because patterns may be optional symbols. - module->walk([&](Operation *op) { - if (op->hasTrait()) - configMap[op] = &configSet; - }); -} - -//===----------------------------------------------------------------------===// -// Function Registry - -void PDLPatternModule::registerConstraintFunction( - StringRef name, PDLConstraintFunction constraintFn) { - // TODO: Is it possible to diagnose when `name` is already registered to - // a function that is not equivalent to `constraintFn`? - // Allow existing mappings in the case multiple patterns depend on the same - // constraint. 
- constraintFunctions.try_emplace(name, std::move(constraintFn)); -} - -void PDLPatternModule::registerRewriteFunction(StringRef name, - PDLRewriteFunction rewriteFn) { - // TODO: Is it possible to diagnose when `name` is already registered to - // a function that is not equivalent to `rewriteFn`? - // Allow existing mappings in the case multiple patterns depend on the same - // rewrite. - rewriteFunctions.try_emplace(name, std::move(rewriteFn)); -} diff --git a/mlir/lib/IR/PatternMatch.cpp b/mlir/lib/IR/PatternMatch.cpp index 5e788cdb4897d..5e9b9b2a810a4 100644 --- a/mlir/lib/IR/PatternMatch.cpp +++ b/mlir/lib/IR/PatternMatch.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "mlir/IR/PatternMatch.h" -#include "mlir/Config/mlir-config.h" #include "mlir/IR/IRMapping.h" #include "mlir/IR/Iterators.h" #include "mlir/IR/RegionKindInterface.h" @@ -98,6 +97,124 @@ LogicalResult RewritePattern::match(Operation *op) const { /// Out-of-line vtable anchor. void RewritePattern::anchor() {} +//===----------------------------------------------------------------------===// +// PDLValue +//===----------------------------------------------------------------------===// + +void PDLValue::print(raw_ostream &os) const { + if (!value) { + os << ""; + return; + } + switch (kind) { + case Kind::Attribute: + os << cast(); + break; + case Kind::Operation: + os << *cast(); + break; + case Kind::Type: + os << cast(); + break; + case Kind::TypeRange: + llvm::interleaveComma(cast(), os); + break; + case Kind::Value: + os << cast(); + break; + case Kind::ValueRange: + llvm::interleaveComma(cast(), os); + break; + } +} + +void PDLValue::print(raw_ostream &os, Kind kind) { + switch (kind) { + case Kind::Attribute: + os << "Attribute"; + break; + case Kind::Operation: + os << "Operation"; + break; + case Kind::Type: + os << "Type"; + break; + case Kind::TypeRange: + os << "TypeRange"; + break; + case Kind::Value: + os << "Value"; + break; + case Kind::ValueRange: + os << "ValueRange"; + break; + } +} + +//===----------------------------------------------------------------------===// +// PDLPatternModule +//===----------------------------------------------------------------------===// + +void PDLPatternModule::mergeIn(PDLPatternModule &&other) { + // Ignore the other module if it has no patterns. + if (!other.pdlModule) + return; + + // Steal the functions and config of the other module. + for (auto &it : other.constraintFunctions) + registerConstraintFunction(it.first(), std::move(it.second)); + for (auto &it : other.rewriteFunctions) + registerRewriteFunction(it.first(), std::move(it.second)); + for (auto &it : other.configs) + configs.emplace_back(std::move(it)); + for (auto &it : other.configMap) + configMap.insert(it); + + // Steal the other state if we have no patterns. + if (!pdlModule) { + pdlModule = std::move(other.pdlModule); + return; + } + + // Merge the pattern operations from the other module into this one. + Block *block = pdlModule->getBody(); + block->getOperations().splice(block->end(), + other.pdlModule->getBody()->getOperations()); +} + +void PDLPatternModule::attachConfigToPatterns(ModuleOp module, + PDLPatternConfigSet &configSet) { + // Attach the configuration to the symbols within the module. We only add + // to symbols to avoid hardcoding any specific operation names here (given + // that we don't depend on any PDL dialect). We can't use + // cast here because patterns may be optional symbols. 
+ module->walk([&](Operation *op) { + if (op->hasTrait()) + configMap[op] = &configSet; + }); +} + +//===----------------------------------------------------------------------===// +// Function Registry + +void PDLPatternModule::registerConstraintFunction( + StringRef name, PDLConstraintFunction constraintFn) { + // TODO: Is it possible to diagnose when `name` is already registered to + // a function that is not equivalent to `constraintFn`? + // Allow existing mappings in the case multiple patterns depend on the same + // constraint. + constraintFunctions.try_emplace(name, std::move(constraintFn)); +} + +void PDLPatternModule::registerRewriteFunction(StringRef name, + PDLRewriteFunction rewriteFn) { + // TODO: Is it possible to diagnose when `name` is already registered to + // a function that is not equivalent to `rewriteFn`? + // Allow existing mappings in the case multiple patterns depend on the same + // rewrite. + rewriteFunctions.try_emplace(name, std::move(rewriteFn)); +} + //===----------------------------------------------------------------------===// // RewriterBase //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Rewrite/ByteCode.h b/mlir/lib/Rewrite/ByteCode.h index 4aceac7ed3a4c..4d43fe636bd1f 100644 --- a/mlir/lib/Rewrite/ByteCode.h +++ b/mlir/lib/Rewrite/ByteCode.h @@ -16,8 +16,6 @@ #include "mlir/IR/PatternMatch.h" -#if MLIR_ENABLE_PDL_IN_PATTERNMATCH - namespace mlir { namespace pdl_interp { class RecordMatchOp; @@ -226,38 +224,4 @@ class PDLByteCode { } // namespace detail } // namespace mlir -#else - -namespace mlir::detail { - -class PDLByteCodeMutableState { -public: - void cleanupAfterMatchAndRewrite() {} - void updatePatternBenefit(unsigned patternIndex, PatternBenefit benefit) {} -}; - -class PDLByteCodePattern : public Pattern {}; - -class PDLByteCode { -public: - struct MatchResult { - const PDLByteCodePattern *pattern = nullptr; - PatternBenefit benefit; - }; - - void initializeMutableState(PDLByteCodeMutableState &state) const {} - void match(Operation *op, PatternRewriter &rewriter, - SmallVectorImpl &matches, - PDLByteCodeMutableState &state) const {} - LogicalResult rewrite(PatternRewriter &rewriter, const MatchResult &match, - PDLByteCodeMutableState &state) const { - return failure(); - } - ArrayRef getPatterns() const { return {}; } -}; - -} // namespace mlir::detail - -#endif // MLIR_ENABLE_PDL_IN_PATTERNMATCH - #endif // MLIR_REWRITE_BYTECODE_H_ diff --git a/mlir/lib/Rewrite/CMakeLists.txt b/mlir/lib/Rewrite/CMakeLists.txt index a6c39406aa4b3..e0395be6cd6f5 100644 --- a/mlir/lib/Rewrite/CMakeLists.txt +++ b/mlir/lib/Rewrite/CMakeLists.txt @@ -1,6 +1,5 @@ -set(LLVM_OPTIONAL_SOURCES ByteCode.cpp) - add_mlir_library(MLIRRewrite + ByteCode.cpp FrozenRewritePatternSet.cpp PatternApplicator.cpp @@ -12,31 +11,8 @@ add_mlir_library(MLIRRewrite LINK_LIBS PUBLIC MLIRIR + MLIRPDLDialect + MLIRPDLInterpDialect + MLIRPDLToPDLInterp MLIRSideEffectInterfaces ) - -if(MLIR_ENABLE_PDL_IN_PATTERNMATCH) - add_mlir_library(MLIRRewritePDL - ByteCode.cpp - - ADDITIONAL_HEADER_DIRS - ${MLIR_MAIN_INCLUDE_DIR}/mlir/Rewrite - - DEPENDS - mlir-generic-headers - - LINK_LIBS PUBLIC - MLIRIR - MLIRPDLDialect - MLIRPDLInterpDialect - MLIRPDLToPDLInterp - MLIRSideEffectInterfaces - ) - - target_link_libraries(MLIRRewrite PUBLIC - MLIRPDLDialect - MLIRPDLInterpDialect - MLIRPDLToPDLInterp - MLIRRewritePDL) -endif() - diff --git a/mlir/lib/Rewrite/FrozenRewritePatternSet.cpp b/mlir/lib/Rewrite/FrozenRewritePatternSet.cpp index 
17fe02df9f66c..43840d1e8cec2 100644 --- a/mlir/lib/Rewrite/FrozenRewritePatternSet.cpp +++ b/mlir/lib/Rewrite/FrozenRewritePatternSet.cpp @@ -8,6 +8,8 @@ #include "mlir/Rewrite/FrozenRewritePatternSet.h" #include "ByteCode.h" +#include "mlir/Conversion/PDLToPDLInterp/PDLToPDLInterp.h" +#include "mlir/Dialect/PDL/IR/PDLOps.h" #include "mlir/Interfaces/SideEffectInterfaces.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" @@ -15,11 +17,6 @@ using namespace mlir; -// Include the PDL rewrite support. -#if MLIR_ENABLE_PDL_IN_PATTERNMATCH -#include "mlir/Conversion/PDLToPDLInterp/PDLToPDLInterp.h" -#include "mlir/Dialect/PDL/IR/PDLOps.h" - static LogicalResult convertPDLToPDLInterp(ModuleOp pdlModule, DenseMap &configMap) { @@ -51,7 +48,6 @@ convertPDLToPDLInterp(ModuleOp pdlModule, pdlModule.getBody()->walk(simplifyFn); return success(); } -#endif // MLIR_ENABLE_PDL_IN_PATTERNMATCH //===----------------------------------------------------------------------===// // FrozenRewritePatternSet @@ -125,7 +121,6 @@ FrozenRewritePatternSet::FrozenRewritePatternSet( impl->nativeAnyOpPatterns.push_back(std::move(pat)); } -#if MLIR_ENABLE_PDL_IN_PATTERNMATCH // Generate the bytecode for the PDL patterns if any were provided. PDLPatternModule &pdlPatterns = patterns.getPDLPatterns(); ModuleOp pdlModule = pdlPatterns.getModule(); @@ -142,7 +137,6 @@ FrozenRewritePatternSet::FrozenRewritePatternSet( pdlModule, pdlPatterns.takeConfigs(), configMap, pdlPatterns.takeConstraintFunctions(), pdlPatterns.takeRewriteFunctions()); -#endif // MLIR_ENABLE_PDL_IN_PATTERNMATCH } FrozenRewritePatternSet::~FrozenRewritePatternSet() = default; diff --git a/mlir/lib/Rewrite/PatternApplicator.cpp b/mlir/lib/Rewrite/PatternApplicator.cpp index 0064eb84aba84..08d6ee618ac69 100644 --- a/mlir/lib/Rewrite/PatternApplicator.cpp +++ b/mlir/lib/Rewrite/PatternApplicator.cpp @@ -152,6 +152,7 @@ LogicalResult PatternApplicator::matchAndRewrite( // Find the next pattern with the highest benefit. const Pattern *bestPattern = nullptr; unsigned *bestPatternIt = &opIt; + const PDLByteCode::MatchResult *pdlMatch = nullptr; /// Operation specific patterns. if (opIt < opE) @@ -163,8 +164,6 @@ LogicalResult PatternApplicator::matchAndRewrite( bestPatternIt = &anyIt; bestPattern = anyOpPatterns[anyIt]; } - - const PDLByteCode::MatchResult *pdlMatch = nullptr; /// PDL patterns. 
if (pdlIt < pdlE && (!bestPattern || bestPattern->getBenefit() < pdlMatches[pdlIt].benefit)) { @@ -172,7 +171,6 @@ LogicalResult PatternApplicator::matchAndRewrite( pdlMatch = &pdlMatches[pdlIt]; bestPattern = pdlMatch->pattern; } - if (!bestPattern) break; diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 85433d088dcbf..4d2afe462b928 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "mlir/Transforms/DialectConversion.h" -#include "mlir/Config/mlir-config.h" #include "mlir/IR/Block.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinOps.h" @@ -3313,7 +3312,6 @@ auto ConversionTarget::getOpInfo(OperationName op) const return std::nullopt; } -#if MLIR_ENABLE_PDL_IN_PATTERNMATCH //===----------------------------------------------------------------------===// // PDL Configuration //===----------------------------------------------------------------------===// @@ -3384,7 +3382,6 @@ void mlir::registerConversionPDLFunctions(RewritePatternSet &patterns) { return std::move(remappedTypes); }); } -#endif // MLIR_ENABLE_PDL_IN_PATTERNMATCH //===----------------------------------------------------------------------===// // Op Conversion Entry Points diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt index 7ec4c8f0963a2..3f312164cb1f3 100644 --- a/mlir/test/CMakeLists.txt +++ b/mlir/test/CMakeLists.txt @@ -97,13 +97,16 @@ set(MLIR_TEST_DEPENDS mlir-capi-ir-test mlir-capi-llvm-test mlir-capi-pass-test + mlir-capi-pdl-test mlir-capi-quant-test mlir-capi-sparse-tensor-test mlir-capi-transform-test mlir-capi-translation-test mlir-linalg-ods-yaml-gen mlir-lsp-server + mlir-pdll-lsp-server mlir-opt + mlir-pdll mlir-query mlir-reduce mlir-tblgen @@ -112,12 +115,6 @@ set(MLIR_TEST_DEPENDS tblgen-to-irdl ) -set(MLIR_TEST_DEPENDS ${MLIR_TEST_DEPENDS} - mlir-capi-pdl-test - mlir-pdll-lsp-server - mlir-pdll - ) - # The native target may not be enabled, in this case we won't # run tests that involves executing on the host: do not build # useless binaries. 
@@ -162,10 +159,9 @@ if(LLVM_BUILD_EXAMPLES) toyc-ch3 toyc-ch4 toyc-ch5 - ) - list(APPEND MLIR_TEST_DEPENDS transform-opt-ch2 transform-opt-ch3 + mlir-minimal-opt ) if(MLIR_ENABLE_EXECUTION_ENGINE) list(APPEND MLIR_TEST_DEPENDS diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt index 2a3a8608db544..e032ce7200fbf 100644 --- a/mlir/test/lib/Transforms/CMakeLists.txt +++ b/mlir/test/lib/Transforms/CMakeLists.txt @@ -1,8 +1,3 @@ -set(LLVM_OPTIONAL_SOURCES - TestDialectConversion.cpp) -set(MLIRTestTransformsPDLDep) -set(MLIRTestTransformsPDLSrc) -if(MLIR_ENABLE_PDL_IN_PATTERNMATCH) add_mlir_pdll_library(MLIRTestDialectConversionPDLLPatternsIncGen TestDialectConversion.pdll TestDialectConversionPDLLPatterns.h.inc @@ -11,22 +6,17 @@ add_mlir_pdll_library(MLIRTestDialectConversionPDLLPatternsIncGen ${CMAKE_CURRENT_SOURCE_DIR}/../Dialect/Test ${CMAKE_CURRENT_BINARY_DIR}/../Dialect/Test ) - set(MLIRTestTransformsPDLSrc - TestDialectConversion.cpp) - set(MLIRTestTransformsPDLDep - MLIRTestDialectConversionPDLLPatternsIncGen) -endif() # Exclude tests from libMLIR.so add_mlir_library(MLIRTestTransforms TestCommutativityUtils.cpp TestConstantFold.cpp TestControlFlowSink.cpp + TestDialectConversion.cpp TestInlining.cpp TestIntRangeInference.cpp TestMakeIsolatedFromAbove.cpp TestTopologicalSort.cpp - ${MLIRTestTransformsPDLSrc} EXCLUDE_FROM_LIBMLIR @@ -34,7 +24,7 @@ add_mlir_library(MLIRTestTransforms ${MLIR_MAIN_INCLUDE_DIR}/mlir/Transforms DEPENDS - ${MLIRTestTransformsPDLDep} + MLIRTestDialectConversionPDLLPatternsIncGen LINK_LIBS PUBLIC MLIRAnalysis diff --git a/mlir/tools/mlir-lsp-server/CMakeLists.txt b/mlir/tools/mlir-lsp-server/CMakeLists.txt index 9664f6b94844e..e90ccf17af17f 100644 --- a/mlir/tools/mlir-lsp-server/CMakeLists.txt +++ b/mlir/tools/mlir-lsp-server/CMakeLists.txt @@ -21,12 +21,10 @@ if(MLIR_INCLUDE_TESTS) MLIRTestIR MLIRTestPass MLIRTestReducer - ) - set(test_libs - ${test_libs} MLIRTestRewrite MLIRTestTransformDialect - MLIRTestTransforms) + MLIRTestTransforms + ) endif() set(LIBS diff --git a/mlir/tools/mlir-opt/CMakeLists.txt b/mlir/tools/mlir-opt/CMakeLists.txt index 15317a119c154..b6ada66d32188 100644 --- a/mlir/tools/mlir-opt/CMakeLists.txt +++ b/mlir/tools/mlir-opt/CMakeLists.txt @@ -38,18 +38,16 @@ if(MLIR_INCLUDE_TESTS) MLIRTestIR MLIRTestOneToNTypeConversionPass MLIRTestPass + MLIRTestPDLL MLIRTestReducer + MLIRTestRewrite + MLIRTestTransformDialect MLIRTestTransforms MLIRTilingInterfaceTestPasses MLIRVectorTestPasses MLIRTestVectorToSPIRV MLIRLLVMTestPasses ) - set(test_libs ${test_libs} - MLIRTestPDLL - MLIRTestRewrite - MLIRTestTransformDialect - ) endif() set(LIBS diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp index bf8f3b7aa21d1..f7a5b3183b50b 100644 --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -85,9 +85,7 @@ void registerTestDataLayoutQuery(); void registerTestDeadCodeAnalysisPass(); void registerTestDecomposeCallGraphTypes(); void registerTestDiagnosticsPass(); -#if MLIR_ENABLE_PDL_IN_PATTERNMATCH void registerTestDialectConversionPasses(); -#endif void registerTestDominancePass(); void registerTestDynamicPipelinePass(); void registerTestEmulateNarrowTypePass(); @@ -149,8 +147,8 @@ void registerTestNvgpuLowerings(); namespace test { void registerTestDialect(DialectRegistry &); -void registerTestDynDialect(DialectRegistry &); void registerTestTransformDialectExtension(DialectRegistry &); +void registerTestDynDialect(DialectRegistry &); } // namespace test 
#ifdef MLIR_INCLUDE_TESTS @@ -262,9 +260,6 @@ void registerTestPasses() { mlir::test::registerTestVectorReductionToSPIRVDotProd(); mlir::test::registerTestNvgpuLowerings(); mlir::test::registerTestWrittenToPass(); -#if MLIR_ENABLE_PDL_IN_PATTERNMATCH - mlir::test::registerTestDialectConversionPasses(); -#endif } #endif diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 2a72bf965e544..2a56b2d6f0373 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -35,7 +35,6 @@ expand_template( substitutions = { "#cmakedefine01 MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS": "#define MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS 0", "#cmakedefine MLIR_GREEDY_REWRITE_RANDOMIZER_SEED ${MLIR_GREEDY_REWRITE_RANDOMIZER_SEED}": "/* #undef MLIR_GREEDY_REWRITE_RANDOMIZER_SEED */", - "#cmakedefine01 MLIR_ENABLE_PDL_IN_PATTERNMATCH": "#define MLIR_ENABLE_PDL_IN_PATTERNMATCH 1", }, template = "include/mlir/Config/mlir-config.h.cmake", ) @@ -319,13 +318,11 @@ cc_library( srcs = glob([ "lib/IR/*.cpp", "lib/IR/*.h", - "lib/IR/PDL/*.cpp", "lib/Bytecode/Reader/*.h", "lib/Bytecode/Writer/*.h", "lib/Bytecode/*.h", ]) + [ "lib/Bytecode/BytecodeOpInterface.cpp", - "include/mlir/IR/PDLPatternMatch.h.inc", ], hdrs = glob([ "include/mlir/IR/*.h", @@ -348,7 +345,6 @@ cc_library( ":BuiltinTypesIncGen", ":BytecodeOpInterfaceIncGen", ":CallOpInterfacesIncGen", - ":config", ":DataLayoutInterfacesIncGen", ":InferTypeOpInterfaceIncGen", ":OpAsmInterfaceIncGen", From e8df7e2c1adad376647ba6776e39b019003370a6 Mon Sep 17 00:00:00 2001 From: Hassnaa Hamdi Date: Wed, 3 Jan 2024 18:27:04 +0000 Subject: [PATCH 161/313] [LLVM][AArch64][tblgen]: Make operand suffix case-insinsitive (#76808) Ex: `fdot v26.8H, v22.16B, v9.2B[0]` should be equivalent to `fdot v26.8h, v22.16b, v9.2b[0]` --- llvm/lib/Target/AArch64/AArch64InstrFormats.td | 1 + llvm/test/MC/AArch64/FP8/dot.s | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index cb63d8726744d..10ad5b1f8f258 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -12586,6 +12586,7 @@ def : TokenAlias<".4S", ".4s">; def : TokenAlias<".2D", ".2d">; def : TokenAlias<".1Q", ".1q">; def : TokenAlias<".2H", ".2h">; +def : TokenAlias<".2B", ".2b">; def : TokenAlias<".B", ".b">; def : TokenAlias<".H", ".h">; def : TokenAlias<".S", ".s">; diff --git a/llvm/test/MC/AArch64/FP8/dot.s b/llvm/test/MC/AArch64/FP8/dot.s index e755430745c34..cfcc1137231a9 100644 --- a/llvm/test/MC/AArch64/FP8/dot.s +++ b/llvm/test/MC/AArch64/FP8/dot.s @@ -44,6 +44,12 @@ fdot v31.4h, v31.8b, v15.2b[0] // CHECK-ERROR: instruction requires: fp8dot2 // CHECK-UNKNOWN: 0f4f03ff +fdot v26.8H, v22.16B, v9.2B[0] +// CHECK-INST: fdot v26.8h, v22.16b, v9.2b[0] +// CHECK-ENCODING: [0xda,0x02,0x49,0x4f] +// CHECK-ERROR: instruction requires: fp8dot2 +// CHECK-UNKNOWN: 4f4902da + fdot v0.8h, v0.16b, v15.2b[7] // CHECK-INST: fdot v0.8h, v0.16b, v15.2b[7] // CHECK-ENCODING: [0x00,0x08,0x7f,0x4f] From ee94e548a1e82c4d519fe2a4d96dfa08b697195c Mon Sep 17 00:00:00 2001 From: Craig Hesling Date: Wed, 3 Jan 2024 13:35:39 -0500 Subject: [PATCH 162/313] [GitHub] Fix minor typos in .github/workflows (#76592) Fix one spelling typo and remove second newline from end of files. 
--- .github/workflows/README.md | 2 +- .github/workflows/docs.yml | 1 - .github/workflows/libcxx-build-and-test.yaml | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 2781d9cc0e63d..ce34d2337e9c5 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -1 +1 @@ -Github action workflows should be stored in this directrory. +Github action workflows should be stored in this directory. diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 9c695e714edd6..bf559964ff97b 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -167,4 +167,3 @@ jobs: run: | cmake -B flang-build -GNinja -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_PROJECTS="clang;mlir;flang" -DLLVM_ENABLE_SPHINX=ON -DSPHINX_WARNINGS_AS_ERRORS=OFF ./llvm TZ=UTC ninja -C flang-build docs-flang-html docs-flang-man - diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml index 25e8c8c1ef21a..3fd49541fd7cb 100644 --- a/.github/workflows/libcxx-build-and-test.yaml +++ b/.github/workflows/libcxx-build-and-test.yaml @@ -207,4 +207,3 @@ jobs: **/CMakeError.log **/CMakeOutput.log **/crash_diagnostics/* - From 7c963fde16d8ba340d6a6ed044b9c775f9bfab48 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 3 Jan 2024 10:36:01 -0800 Subject: [PATCH 163/313] [SLP]Use revectorized value for extracts from buildvector, beeing vectorized. If the insertelement instruction is vectorized, and the extractelement instruction from such insertelement also vectorized as part of the same tree, need to extract from the corresponding for insertelement vectorized value rather than original insertelement instruction. --- .../Transforms/Vectorize/SLPVectorizer.cpp | 2 ++ .../X86/gather_extract_from_vectorbuild.ll | 27 +++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 3049915260646..bd89ec09671a9 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -11139,6 +11139,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { case Instruction::ExtractElement: { Value *V = E->getSingleOperand(0); + if (const TreeEntry *TE = getTreeEntry(V)) + V = TE->VectorizedValue; setInsertPointAfterBundle(E); V = FinalShuffle(V, E, VecTy, IsSigned); E->VectorizedValue = V; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather_extract_from_vectorbuild.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather_extract_from_vectorbuild.ll index 3bccfac8566de..b762c3a4f1858 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/gather_extract_from_vectorbuild.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gather_extract_from_vectorbuild.ll @@ -31,3 +31,30 @@ loop: %i4 = extractelement <2 x float> %ins1, i64 0 br label %loop } + +define void @test1() { +; CHECK-LABEL: define void @test1() { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[TMP2:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x float> zeroinitializer, [[TMP0]] +; CHECK-NEXT: [[TMP2]] = select <2 x i1> zeroinitializer, <2 x float> [[TMP1]], <2 x float> zeroinitializer +; CHECK-NEXT: br label [[LOOP]] +; +entry: + br label %loop + +loop: + %ph0 = phi float [ 0.000000e+00, %entry ], [ %i4, %loop ] + %ph1 = phi float [ 0.000000e+00, 
%entry ], [ %i5, %loop ] + %i = fadd float 0.000000e+00, %ph0 + %i1 = fadd float 0.000000e+00, %ph1 + %i2 = select i1 false, float %i, float 0.000000e+00 + %i3 = select i1 false, float %i1, float 0.000000e+00 + %ins0 = insertelement <2 x float> zeroinitializer, float %i2, i64 0 + %ins1 = insertelement <2 x float> %ins0, float %i3, i64 1 + %i4 = extractelement <2 x float> %ins1, i64 0 + %i5 = extractelement <2 x float> %ins1, i64 1 + br label %loop +} From 7fbc1de9896029636dd572a692ee90ba88285943 Mon Sep 17 00:00:00 2001 From: Krystian Stasiowski Date: Wed, 3 Jan 2024 14:07:46 -0500 Subject: [PATCH 164/313] [Clang][Sema] Diagnose unexpanded packs in the template argument lists of function template specializations (#76677) This diagnoses unexpanded packs in the _unqualified-id_ of a function template specialization's _declarator-id_. For example: ```cpp template struct A { template void f(); template<> void f(); // error: explicit specialization contains unexpanded parameter pack 'Ts' }; ``` I moved the handling of template-id's so it happens right after we determine whether we are declaring a function template/function template specialization so diagnostics are issued in lexical order. --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/Sema/SemaDecl.cpp | 89 +++++++++---------- .../CXX/temp/temp.decls/temp.variadic/p5.cpp | 12 +++ 3 files changed, 56 insertions(+), 46 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index a3107c4a69532..778ce0e0e52d0 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -518,6 +518,7 @@ Improvements to Clang's diagnostics - Clang now diagnoses definitions of friend function specializations, e.g. ``friend void f<>(int) {}``. - Clang now diagnoses narrowing conversions involving const references. (`#63151: `_). +- Clang now diagnoses unexpanded packs within the template argument lists of function template specializations. Improvements to Clang's time-trace diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 2de631941325f..8e46c4984d93d 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -9900,15 +9900,15 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC, // Match up the template parameter lists with the scope specifier, then // determine whether we have a template or a template specialization. bool Invalid = false; + TemplateIdAnnotation *TemplateId = + D.getName().getKind() == UnqualifiedIdKind::IK_TemplateId + ? D.getName().TemplateId + : nullptr; TemplateParameterList *TemplateParams = MatchTemplateParametersToScopeSpecifier( D.getDeclSpec().getBeginLoc(), D.getIdentifierLoc(), - D.getCXXScopeSpec(), - D.getName().getKind() == UnqualifiedIdKind::IK_TemplateId - ? D.getName().TemplateId - : nullptr, - TemplateParamLists, isFriend, isMemberSpecialization, - Invalid); + D.getCXXScopeSpec(), TemplateId, TemplateParamLists, isFriend, + isMemberSpecialization, Invalid); if (TemplateParams) { // Check that we can declare a template here. if (CheckTemplateDeclScope(S, TemplateParams)) @@ -9921,6 +9921,11 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC, if (Name.getNameKind() == DeclarationName::CXXDestructorName) { Diag(NewFD->getLocation(), diag::err_destructor_template); NewFD->setInvalidDecl(); + // Function template with explicit template arguments. 
+ } else if (TemplateId) { + Diag(D.getIdentifierLoc(), diag::err_function_template_partial_spec) + << SourceRange(TemplateId->LAngleLoc, TemplateId->RAngleLoc); + NewFD->setInvalidDecl(); } // If we're adding a template to a dependent context, we may need to @@ -9973,6 +9978,11 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC, << FixItHint::CreateRemoval(RemoveRange) << FixItHint::CreateInsertion(InsertLoc, "<>"); Invalid = true; + + // Recover by faking up an empty template argument list. + HasExplicitTemplateArgs = true; + TemplateArgs.setLAngleLoc(InsertLoc); + TemplateArgs.setRAngleLoc(InsertLoc); } } } else { @@ -9986,6 +9996,33 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC, if (TemplateParamLists.size() > 0) // For source fidelity, store all the template param lists. NewFD->setTemplateParameterListsInfo(Context, TemplateParamLists); + + // "friend void foo<>(int);" is an implicit specialization decl. + if (isFriend && TemplateId) + isFunctionTemplateSpecialization = true; + } + + // If this is a function template specialization and the unqualified-id of + // the declarator-id is a template-id, convert the template argument list + // into our AST format and check for unexpanded packs. + if (isFunctionTemplateSpecialization && TemplateId) { + HasExplicitTemplateArgs = true; + + TemplateArgs.setLAngleLoc(TemplateId->LAngleLoc); + TemplateArgs.setRAngleLoc(TemplateId->RAngleLoc); + ASTTemplateArgsPtr TemplateArgsPtr(TemplateId->getTemplateArgs(), + TemplateId->NumArgs); + translateTemplateArguments(TemplateArgsPtr, TemplateArgs); + + // FIXME: Should we check for unexpanded packs if this was an (invalid) + // declaration of a function template partial specialization? Should we + // consider the unexpanded pack context to be a partial specialization? + for (const TemplateArgumentLoc &ArgLoc : TemplateArgs.arguments()) { + if (DiagnoseUnexpandedParameterPack( + ArgLoc, isFriend ? UPPC_FriendDeclaration + : UPPC_ExplicitSpecialization)) + NewFD->setInvalidDecl(); + } } if (Invalid) { @@ -10438,46 +10475,6 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC, diag::ext_operator_new_delete_declared_inline) << NewFD->getDeclName(); - // If the declarator is a template-id, translate the parser's template - // argument list into our AST format. - if (D.getName().getKind() == UnqualifiedIdKind::IK_TemplateId) { - TemplateIdAnnotation *TemplateId = D.getName().TemplateId; - TemplateArgs.setLAngleLoc(TemplateId->LAngleLoc); - TemplateArgs.setRAngleLoc(TemplateId->RAngleLoc); - ASTTemplateArgsPtr TemplateArgsPtr(TemplateId->getTemplateArgs(), - TemplateId->NumArgs); - translateTemplateArguments(TemplateArgsPtr, - TemplateArgs); - - HasExplicitTemplateArgs = true; - - if (NewFD->isInvalidDecl()) { - HasExplicitTemplateArgs = false; - } else if (FunctionTemplate) { - // Function template with explicit template arguments. - Diag(D.getIdentifierLoc(), diag::err_function_template_partial_spec) - << SourceRange(TemplateId->LAngleLoc, TemplateId->RAngleLoc); - - HasExplicitTemplateArgs = false; - } else if (isFriend) { - // "friend void foo<>(int);" is an implicit specialization decl. 
- isFunctionTemplateSpecialization = true; - } else { - assert(isFunctionTemplateSpecialization && - "should have a 'template<>' for this decl"); - } - } else if (isFriend && isFunctionTemplateSpecialization) { - // This combination is only possible in a recovery case; the user - // wrote something like: - // template <> friend void foo(int); - // which we're recovering from as if the user had written: - // friend void foo<>(int); - // Go ahead and fake up a template id. - HasExplicitTemplateArgs = true; - TemplateArgs.setLAngleLoc(D.getIdentifierLoc()); - TemplateArgs.setRAngleLoc(D.getIdentifierLoc()); - } - // We do not add HD attributes to specializations here because // they may have different constexpr-ness compared to their // templates and, after maybeAddCUDAHostDeviceAttrs() is applied, diff --git a/clang/test/CXX/temp/temp.decls/temp.variadic/p5.cpp b/clang/test/CXX/temp/temp.decls/temp.variadic/p5.cpp index 30ce6b40e1fb5..3c500c2c4dc4a 100644 --- a/clang/test/CXX/temp/temp.decls/temp.variadic/p5.cpp +++ b/clang/test/CXX/temp/temp.decls/temp.variadic/p5.cpp @@ -376,6 +376,11 @@ namespace Specializations { template struct PrimaryClass; // expected-error{{partial specialization contains unexpanded parameter pack 'Ts'}} + template + void PrimaryFunction(); + template + void PrimaryFunction(); // expected-error{{function template partial specialization is not allowed}} + #if __cplusplus >= 201402L template constexpr int PrimaryVar = 0; @@ -392,6 +397,13 @@ namespace Specializations { template struct InnerClass; // expected-error{{partial specialization contains unexpanded parameter pack 'Ts'}} + template + void InnerFunction(); + template<> + void InnerFunction(); // expected-error{{explicit specialization contains unexpanded parameter pack 'Ts'}} + + friend void PrimaryFunction(); // expected-error{{friend declaration contains unexpanded parameter pack 'Ts'}} + #if __cplusplus >= 201402L template constexpr static int InnerVar = 0; From 47a1704ac94c8aeb1aa7e0fc438ff99d36b632c6 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 3 Jan 2024 11:54:29 -0800 Subject: [PATCH 165/313] [SelectionDAG][X86] Use disjoint flag in SelectionDAG::isADDLike. (#76847) Keep the haveNoCommonBitsSet check because we haven't started inferring the flag yet. I've added tests for two transforms, but these are not the only transforms that use isADDLike. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2 +- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 3 +- .../CodeGen/X86/addsub-constant-folding.ll | 36 +++++++++++++++++++ llvm/test/CodeGen/X86/or-lea.ll | 20 +++++++++++ 4 files changed, 59 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index eafa95ce7fcf7..bb44ac1fba486 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -7987,7 +7987,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) { // If OR can be rewritten into ADD, try combines based on ADD. 
if ((!LegalOperations || TLI.isOperationLegal(ISD::ADD, VT)) && - DAG.haveNoCommonBitsSet(N0, N1)) + DAG.isADDLike(SDValue(N, 0))) if (SDValue Combined = visitADDLike(N)) return Combined; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 75c10a74cdc44..4151964adc7db 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5088,7 +5088,8 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, bool SelectionDAG::isADDLike(SDValue Op) const { unsigned Opcode = Op.getOpcode(); if (Opcode == ISD::OR) - return haveNoCommonBitsSet(Op.getOperand(0), Op.getOperand(1)); + return Op->getFlags().hasDisjoint() || + haveNoCommonBitsSet(Op.getOperand(0), Op.getOperand(1)); if (Opcode == ISD::XOR) return isMinSignedConstant(Op.getOperand(1)); return false; diff --git a/llvm/test/CodeGen/X86/addsub-constant-folding.ll b/llvm/test/CodeGen/X86/addsub-constant-folding.ll index de215c80dcd1a..4dbaae5c1a74a 100644 --- a/llvm/test/CodeGen/X86/addsub-constant-folding.ll +++ b/llvm/test/CodeGen/X86/addsub-constant-folding.ll @@ -1141,3 +1141,39 @@ define <4 x i32> @vec_const_sub_const_sub_nonsplat(<4 x i32> %arg) { %t1 = sub <4 x i32> , %t0 ret <4 x i32> %t1 } + +; (x|c1)+c2 where (x|c1) is addlike +define i32 @add_const_disjoint_or_const(i32 %arg) { +; X86-LABEL: add_const_disjoint_or_const: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl $10, %eax +; X86-NEXT: retl +; +; X64-LABEL: add_const_disjoint_or_const: +; X64: # %bb.0: +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: leal 10(%rdi), %eax +; X64-NEXT: retq + %t0 = or disjoint i32 %arg, 8 + %t1 = add i32 %t0, 2 + ret i32 %t1 +} + +; (x+c1)|c2 where the outer or is addlike +define i32 @disjoint_or_const_add_const(i32 %arg) { +; X86-LABEL: disjoint_or_const_add_const: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl $10, %eax +; X86-NEXT: retl +; +; X64-LABEL: disjoint_or_const_add_const: +; X64: # %bb.0: +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: leal 10(%rdi), %eax +; X64-NEXT: retq + %t0 = add i32 %arg, 8 + %t1 = or disjoint i32 %t0, 2 + ret i32 %t1 +} diff --git a/llvm/test/CodeGen/X86/or-lea.ll b/llvm/test/CodeGen/X86/or-lea.ll index ab9b917803248..616ab99437892 100644 --- a/llvm/test/CodeGen/X86/or-lea.ll +++ b/llvm/test/CodeGen/X86/or-lea.ll @@ -825,3 +825,23 @@ entry: %or = or i64 %sub, 549755813889 ; 0x8000000001 ret i64 %or } + +define i32 @or_shift1_disjoint(i32 %x, i32 %y) { +; X86-LABEL: or_shift1_disjoint: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl %eax, %eax +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: retl +; +; X64-LABEL: or_shift1_disjoint: +; X64: # %bb.0: +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: leal (%rsi,%rdi,2), %eax +; X64-NEXT: retq + %shl = shl i32 %x, 1 + %or = or disjoint i32 %y, %shl + ret i32 %or +} + From f64d1c810a2b8d89c3760cefb957da499c087404 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 3 Jan 2024 12:04:43 -0800 Subject: [PATCH 166/313] [RISCV] Add test cases for folding disjoint Or into a scalar load address. NFC After 47a1704ac94c8aeb1aa7e0fc438ff99d36b632c6 we are able to reassociate a disjoint Or used as a GEP index to get the constant closer to a load to fold it. This is show by the first test. 
We are not able to do this if the GEP created a shift left to scale the index as the second test shows. To make this work, we need to preserve the disjoint flag when pulling the Or through the shift. --- llvm/test/CodeGen/RISCV/mem.ll | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/mem.ll b/llvm/test/CodeGen/RISCV/mem.ll index 3718ce80142d4..8f65973d4fde9 100644 --- a/llvm/test/CodeGen/RISCV/mem.ll +++ b/llvm/test/CodeGen/RISCV/mem.ll @@ -338,3 +338,29 @@ bb: } declare void @snork(ptr) + +define i8 @disjoint_or_lb(ptr %a, i32 %off) nounwind { +; RV32I-LABEL: disjoint_or_lb: +; RV32I: # %bb.0: +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lbu a0, 3(a0) +; RV32I-NEXT: ret + %b = or disjoint i32 %off, 3 + %1 = getelementptr i8, ptr %a, i32 %b + %2 = load i8, ptr %1 + ret i8 %2 +} + +define i32 @disjoint_or_lw(ptr %a, i32 %off) nounwind { +; RV32I-LABEL: disjoint_or_lw: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: ori a1, a1, 12 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: ret + %b = or disjoint i32 %off, 3 + %1 = getelementptr i32, ptr %a, i32 %b + %2 = load i32, ptr %1 + ret i32 %2 +} From ddd6acd7a8e9296544bce49af7178f3eeb318e61 Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Wed, 3 Jan 2024 14:28:15 -0600 Subject: [PATCH 167/313] [mlir][GPU] Expand LLVM function attribute copies (#76755) Expand the copying of attributes on GPU kernel arguments during LLVM lowering. Support copying attributes from values that are already LLVM pointers. Support copying attributes, like `noundef`, that aren't specific to (the pointer parts of) arguments. --- .../Conversion/GPUCommon/GPUOpsLowering.cpp | 68 ++++++++++++------- .../GPUCommon/memref-arg-attrs.mlir | 25 +++++++ 2 files changed, 67 insertions(+), 26 deletions(-) diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp index e79a02f931af2..6a005e67ca95b 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp @@ -26,9 +26,8 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor, SmallVector workgroupBuffers; workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions()); - for (const auto &en : llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) { - BlockArgument attribution = en.value(); - + for (const auto [idx, attribution] : + llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) { auto type = dyn_cast(attribution.getType()); assert(type && type.hasStaticShape() && "unexpected type in attribution"); @@ -37,12 +36,12 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor, auto elementType = cast(typeConverter->convertType(type.getElementType())); auto arrayType = LLVM::LLVMArrayType::get(elementType, numElements); - std::string name = std::string( - llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), en.index())); + std::string name = + std::string(llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), idx)); uint64_t alignment = 0; if (auto alignAttr = dyn_cast_or_null(gpuFuncOp.getWorkgroupAttributionAttr( - en.index(), LLVM::LLVMDialect::getAlignAttrName()))) + idx, LLVM::LLVMDialect::getAlignAttrName()))) alignment = alignAttr.getInt(); auto globalOp = rewriter.create( gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false, @@ -105,8 +104,7 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor, 
rewriter.setInsertionPointToStart(&gpuFuncOp.front()); unsigned numProperArguments = gpuFuncOp.getNumArguments(); - for (const auto &en : llvm::enumerate(workgroupBuffers)) { - LLVM::GlobalOp global = en.value(); + for (const auto [idx, global] : llvm::enumerate(workgroupBuffers)) { auto ptrType = LLVM::LLVMPointerType::get(rewriter.getContext(), global.getAddrSpace()); Value address = rewriter.create( @@ -119,18 +117,18 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor, // existing memref infrastructure. This may use more registers than // otherwise necessary given that memref sizes are fixed, but we can try // and canonicalize that away later. - Value attribution = gpuFuncOp.getWorkgroupAttributions()[en.index()]; + Value attribution = gpuFuncOp.getWorkgroupAttributions()[idx]; auto type = cast(attribution.getType()); auto descr = MemRefDescriptor::fromStaticShape( rewriter, loc, *getTypeConverter(), type, memory); - signatureConversion.remapInput(numProperArguments + en.index(), descr); + signatureConversion.remapInput(numProperArguments + idx, descr); } // Rewrite private memory attributions to alloca'ed buffers. unsigned numWorkgroupAttributions = gpuFuncOp.getNumWorkgroupAttributions(); auto int64Ty = IntegerType::get(rewriter.getContext(), 64); - for (const auto &en : llvm::enumerate(gpuFuncOp.getPrivateAttributions())) { - Value attribution = en.value(); + for (const auto [idx, attribution] : + llvm::enumerate(gpuFuncOp.getPrivateAttributions())) { auto type = cast(attribution.getType()); assert(type && type.hasStaticShape() && "unexpected type in attribution"); @@ -145,14 +143,14 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor, uint64_t alignment = 0; if (auto alignAttr = dyn_cast_or_null(gpuFuncOp.getPrivateAttributionAttr( - en.index(), LLVM::LLVMDialect::getAlignAttrName()))) + idx, LLVM::LLVMDialect::getAlignAttrName()))) alignment = alignAttr.getInt(); Value allocated = rewriter.create( gpuFuncOp.getLoc(), ptrType, elementType, numElements, alignment); auto descr = MemRefDescriptor::fromStaticShape( rewriter, loc, *getTypeConverter(), type, allocated); signatureConversion.remapInput( - numProperArguments + numWorkgroupAttributions + en.index(), descr); + numProperArguments + numWorkgroupAttributions + idx, descr); } } @@ -169,15 +167,16 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor, if (getTypeConverter()->getOptions().useBarePtrCallConv) { OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPointToStart(&llvmFuncOp.getBody().front()); - for (const auto &en : llvm::enumerate(gpuFuncOp.getArgumentTypes())) { - auto memrefTy = dyn_cast(en.value()); + for (const auto [idx, argTy] : + llvm::enumerate(gpuFuncOp.getArgumentTypes())) { + auto memrefTy = dyn_cast(argTy); if (!memrefTy) continue; assert(memrefTy.hasStaticShape() && "Bare pointer convertion used with dynamically-shaped memrefs"); // Use a placeholder when replacing uses of the memref argument to prevent // circular replacements. - auto remapping = signatureConversion.getInputMapping(en.index()); + auto remapping = signatureConversion.getInputMapping(idx); assert(remapping && remapping->size == 1 && "Type converter should produce 1-to-1 mapping for bare memrefs"); BlockArgument newArg = @@ -193,19 +192,23 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor, // Get memref type from function arguments and set the noalias to // pointer arguments. 
- for (const auto &en : llvm::enumerate(gpuFuncOp.getArgumentTypes())) { - auto memrefTy = en.value().dyn_cast(); - NamedAttrList argAttr = argAttrs - ? argAttrs[en.index()].cast() - : NamedAttrList(); - + for (const auto [idx, argTy] : + llvm::enumerate(gpuFuncOp.getArgumentTypes())) { + auto remapping = signatureConversion.getInputMapping(idx); + NamedAttrList argAttr = + argAttrs ? argAttrs[idx].cast() : NamedAttrList(); + auto copyAttribute = [&](StringRef attrName) { + Attribute attr = argAttr.erase(attrName); + if (!attr) + return; + for (size_t i = 0, e = remapping->size; i < e; ++i) + llvmFuncOp.setArgAttr(remapping->inputNo + i, attrName, attr); + }; auto copyPointerAttribute = [&](StringRef attrName) { Attribute attr = argAttr.erase(attrName); - // This is a proxy for the bare pointer calling convention. if (!attr) return; - auto remapping = signatureConversion.getInputMapping(en.index()); if (remapping->size > 1 && attrName == LLVM::LLVMDialect::getNoAliasAttrName()) { emitWarning(llvmFuncOp.getLoc(), @@ -224,10 +227,23 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor, if (argAttr.empty()) continue; - if (memrefTy) { + copyAttribute(LLVM::LLVMDialect::getReturnedAttrName()); + copyAttribute(LLVM::LLVMDialect::getNoUndefAttrName()); + copyAttribute(LLVM::LLVMDialect::getInRegAttrName()); + bool lowersToPointer = false; + for (size_t i = 0, e = remapping->size; i < e; ++i) { + lowersToPointer |= isa( + llvmFuncOp.getArgument(remapping->inputNo + i).getType()); + } + + if (lowersToPointer) { copyPointerAttribute(LLVM::LLVMDialect::getNoAliasAttrName()); + copyPointerAttribute(LLVM::LLVMDialect::getNoCaptureAttrName()); + copyPointerAttribute(LLVM::LLVMDialect::getNoFreeAttrName()); + copyPointerAttribute(LLVM::LLVMDialect::getAlignAttrName()); copyPointerAttribute(LLVM::LLVMDialect::getReadonlyAttrName()); copyPointerAttribute(LLVM::LLVMDialect::getWriteOnlyAttrName()); + copyPointerAttribute(LLVM::LLVMDialect::getReadnoneAttrName()); copyPointerAttribute(LLVM::LLVMDialect::getNonNullAttrName()); copyPointerAttribute(LLVM::LLVMDialect::getDereferenceableAttrName()); copyPointerAttribute( diff --git a/mlir/test/Conversion/GPUCommon/memref-arg-attrs.mlir b/mlir/test/Conversion/GPUCommon/memref-arg-attrs.mlir index 33374984eb7c9..e7c742067b4eb 100644 --- a/mlir/test/Conversion/GPUCommon/memref-arg-attrs.mlir +++ b/mlir/test/Conversion/GPUCommon/memref-arg-attrs.mlir @@ -24,6 +24,17 @@ gpu.module @kernel { // ROCDL-SAME: !llvm.ptr {llvm.writeonly} // NVVM-SAME: !llvm.ptr {llvm.writeonly} +// ----- + +gpu.module @kernel { + gpu.func @test_func_readonly_ptr(%arg0 : !llvm.ptr {llvm.readonly} ) { + gpu.return + } +} + +// CHECK-LABEL: llvm.func @test_func_readonly_ptr +// ROCDL-SAME: !llvm.ptr {llvm.readonly} +// NVVM-SAME: !llvm.ptr {llvm.readonly} // ----- @@ -62,3 +73,17 @@ gpu.module @kernel { // CHECK-LABEL: llvm.func @test_func_dereferenceable_or_null // ROCDL-SAME: !llvm.ptr {llvm.dereferenceable_or_null = 4 : i64} // NVVM-SAME: !llvm.ptr {llvm.dereferenceable_or_null = 4 : i64} + +// ----- + +gpu.module @kernel { + gpu.func @test_func_noundef(%arg0 : memref {llvm.noundef} ) { + gpu.return + } +} + +// CHECK-LABEL: llvm.func @test_func_noundef +// ROCDL-SAME: !llvm.ptr {llvm.noundef} +// ROCDL-SAME: i64 {llvm.noundef} +// NVVM-SAME: !llvm.ptr {llvm.noundef} +// NVVM-SAME: i64 {llvm.noundef} From 92e211ab33417dc061bfbce910fd2c2419e11efa Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Wed, 3 Jan 2024 12:51:40 -0800 Subject: [PATCH 168/313] 
[BasicAA] Enable separate storage hints by default (#76864) As requested in https://github.com/llvm/llvm-project/pull/76770#pullrequestreview-1801649466 A few months of experimentation in a large codebase did not reveal any significant build speed regressions, and b07bf16 speeds up hint lookup even further. Co-authored-by: David Goldblatt --- llvm/lib/Analysis/BasicAliasAnalysis.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp index 4ad70931fd4cf..97f60d28e4991 100644 --- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -69,7 +69,7 @@ static cl::opt EnableRecPhiAnalysis("basic-aa-recphi", cl::Hidden, cl::init(true)); static cl::opt EnableSeparateStorageAnalysis("basic-aa-separate-storage", - cl::Hidden, cl::init(false)); + cl::Hidden, cl::init(true)); /// SearchLimitReached / SearchTimes shows how often the limit of /// to decompose GEPs is reached. It will affect the precision From 6dda74cc51dd33e266101bccd3647658e5ab3c35 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 3 Jan 2024 20:54:08 +0000 Subject: [PATCH 169/313] [VPlan] Use createSelect in adjustRecipesForReductions (NFCI). Simplify the code and rename Result->NewExitingVPV as suggested by @ayalz in https://github.com/llvm/llvm-project/pull/70253. --- .../Vectorize/LoopVectorizationPlanner.h | 11 +++++-- .../Transforms/Vectorize/LoopVectorize.cpp | 29 +++++++++---------- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 577ce8000de27..150ab301e7fd2 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -167,9 +167,14 @@ class VPBuilder { } VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, - DebugLoc DL, const Twine &Name = "") { - return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}, DL, - Name); + DebugLoc DL, const Twine &Name = "", + std::optional FMFs = std::nullopt) { + auto *Select = + FMFs ? new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal}, + *FMFs, DL, Name) + : new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal}, + DL, Name); + return tryInsertInstruction(Select); } /// Create a new ICmp VPInstruction with predicate \p Pred and operands \p A diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 8e135d80f4f24..f5f04615eedee 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9141,7 +9141,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( continue; const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); - auto *Result = PhiR->getBackedgeValue()->getDefiningRecipe(); + auto *NewExitingVPV = PhiR->getBackedgeValue(); // If tail is folded by masking, introduce selects between the phi // and the live-out instruction of each reduction, at the beginning of the // dedicated latch block. 
@@ -9151,21 +9151,20 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( VPValue *Red = PhiR->getBackedgeValue(); assert(Red->getDefiningRecipe()->getParent() != LatchVPBB && "reduction recipe must be defined before latch"); - FastMathFlags FMFs = RdxDesc.getFastMathFlags(); Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType(); - Result = + std::optional FMFs = PhiTy->isFloatingPointTy() - ? new VPInstruction(Instruction::Select, {Cond, Red, PhiR}, FMFs) - : new VPInstruction(Instruction::Select, {Cond, Red, PhiR}); - Result->insertBefore(&*Builder.getInsertPoint()); - Red->replaceUsesWithIf( - Result->getVPSingleValue(), - [](VPUser &U, unsigned) { return isa(&U); }); + ? std::make_optional(RdxDesc.getFastMathFlags()) + : std::nullopt; + NewExitingVPV = Builder.createSelect(Cond, Red, PhiR, {}, "", FMFs); + Red->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) { + return isa(&U); + }); if (PreferPredicatedReductionSelect || TTI.preferPredicatedReductionSelect( PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy, TargetTransformInfo::ReductionFlags())) - PhiR->setOperand(1, Result->getVPSingleValue()); + PhiR->setOperand(1, NewExitingVPV); } // If the vector reduction can be performed in a smaller type, we truncate // then extend the loop exit value to enable InstCombine to evaluate the @@ -9174,17 +9173,17 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); Type *RdxTy = RdxDesc.getRecurrenceType(); - auto *Trunc = new VPWidenCastRecipe(Instruction::Trunc, - Result->getVPSingleValue(), RdxTy); + auto *Trunc = + new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy); auto *Extnd = RdxDesc.isSigned() ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy) : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy); - Trunc->insertAfter(Result); + Trunc->insertAfter(NewExitingVPV->getDefiningRecipe()); Extnd->insertAfter(Trunc); - Result->getVPSingleValue()->replaceAllUsesWith(Extnd); - Trunc->setOperand(0, Result->getVPSingleValue()); + NewExitingVPV->replaceAllUsesWith(Extnd); + Trunc->setOperand(0, NewExitingVPV); } } From a24c58140fbd73d5b98f8fa88cf9a9dbf0613a41 Mon Sep 17 00:00:00 2001 From: Muhammad Omair Javaid Date: Thu, 4 Jan 2024 02:01:16 +0500 Subject: [PATCH 170/313] Revert "[mlir] Consider mlir-linalg-ods-gen as a tablegen tool in build (#75093)" This reverts commit 9191ac0bdb07643eefcc161c88b66d4e7058db9c. 
Breaks build on following buildbot: https://lab.llvm.org/buildbot/#/builders/177/builds/27432 --- llvm/cmake/modules/TableGen.cmake | 12 +++---- mlir/CMakeLists.txt | 2 -- mlir/cmake/modules/AddMLIR.cmake | 25 ------------- mlir/cmake/modules/CMakeLists.txt | 2 -- mlir/cmake/modules/MLIRConfig.cmake.in | 1 - .../mlir/Dialect/Linalg/IR/CMakeLists.txt | 35 ++++++++++++++++++- mlir/lib/Dialect/Linalg/IR/CMakeLists.txt | 1 - mlir/tools/mlir-linalg-ods-gen/CMakeLists.txt | 28 +++++++-------- 8 files changed, 52 insertions(+), 54 deletions(-) diff --git a/llvm/cmake/modules/TableGen.cmake b/llvm/cmake/modules/TableGen.cmake index 359d5217a22cb..1d18fdde2bb98 100644 --- a/llvm/cmake/modules/TableGen.cmake +++ b/llvm/cmake/modules/TableGen.cmake @@ -165,18 +165,14 @@ function(add_public_tablegen_target target) endfunction() macro(add_tablegen target project) - cmake_parse_arguments(ADD_TABLEGEN "SKIP_COMPONENT_LINK" "DESTINATION;EXPORT" "" ${ARGN}) + cmake_parse_arguments(ADD_TABLEGEN "" "DESTINATION;EXPORT" "" ${ARGN}) - if(NOT ADD_TABLEGEN_SKIP_COMPONENT_LINK) - set(${target}_OLD_LLVM_LINK_COMPONENTS ${LLVM_LINK_COMPONENTS}) - set(LLVM_LINK_COMPONENTS ${LLVM_LINK_COMPONENTS} TableGen) - endif() + set(${target}_OLD_LLVM_LINK_COMPONENTS ${LLVM_LINK_COMPONENTS}) + set(LLVM_LINK_COMPONENTS ${LLVM_LINK_COMPONENTS} TableGen) add_llvm_executable(${target} DISABLE_LLVM_LINK_LLVM_DYLIB ${ADD_TABLEGEN_UNPARSED_ARGUMENTS}) - if(NOT ADD_TABLEGEN_SKIP_COMPONENT_LINK) - set(LLVM_LINK_COMPONENTS ${${target}_OLD_LLVM_LINK_COMPONENTS}) - endif() + set(LLVM_LINK_COMPONENTS ${${target}_OLD_LLVM_LINK_COMPONENTS}) set(${project}_TABLEGEN_DEFAULT "${target}") if (LLVM_NATIVE_TOOL_DIR) diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt index dcc068e4097c5..16ff950089734 100644 --- a/mlir/CMakeLists.txt +++ b/mlir/CMakeLists.txt @@ -184,8 +184,6 @@ add_subdirectory(tools/mlir-pdll) set(MLIR_TABLEGEN_EXE "${MLIR_TABLEGEN_EXE}" CACHE INTERNAL "") set(MLIR_TABLEGEN_TARGET "${MLIR_TABLEGEN_TARGET}" CACHE INTERNAL "") -set(MLIR_LINALG_ODS_YAML_GEN_TABLEGEN_EXE "${MLIR_LINALG_ODS_YAML_GEN_TABLEGEN_EXE}" CACHE INTERNAL "") -set(MLIR_LINALG_ODS_YAML_GEN_TABLEGEN_TARGET "${MLIR_LINALG_ODS_YAML_GEN_TABLEGEN_TARGET}" CACHE INTERNAL "") set(MLIR_PDLL_TABLEGEN_EXE "${MLIR_PDLL_TABLEGEN_EXE}" CACHE INTERNAL "") set(MLIR_PDLL_TABLEGEN_TARGET "${MLIR_PDLL_TABLEGEN_TARGET}" CACHE INTERNAL "") diff --git a/mlir/cmake/modules/AddMLIR.cmake b/mlir/cmake/modules/AddMLIR.cmake index 971ea7eceff4c..1d2ed748bc2f1 100644 --- a/mlir/cmake/modules/AddMLIR.cmake +++ b/mlir/cmake/modules/AddMLIR.cmake @@ -136,31 +136,6 @@ function(add_mlir_pdll_library target inputFile ofn) add_public_tablegen_target(${target}) endfunction() -# Declare a function to generate ODS with mlir-linalg-ods-yaml-gen -function(add_linalg_ods_yaml_gen yaml_ast_file output_file) - set(YAML_AST_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/${yaml_ast_file}) - set(GEN_ODS_FILE ${CMAKE_CURRENT_BINARY_DIR}/${output_file}.yamlgen.td) - set(GEN_CPP_FILE ${CMAKE_CURRENT_BINARY_DIR}/${output_file}.yamlgen.cpp.inc) - set_source_files_properties( - ${GEN_ODS_FILE} - PROPERTIES GENERATED TRUE) - set_source_files_properties( - ${GEN_CPP_FILE} - PROPERTIES GENERATED TRUE) - add_custom_command( - OUTPUT ${GEN_ODS_FILE} ${GEN_CPP_FILE} - COMMAND ${MLIR_LINALG_ODS_YAML_GEN_TABLEGEN_EXE} ${YAML_AST_SOURCE} -o-ods-decl=${GEN_ODS_FILE} -o-impl=${GEN_CPP_FILE} - MAIN_DEPENDENCY - ${YAML_AST_SOURCE} - DEPENDS - ${MLIR_LINALG_ODS_YAML_GEN_TABLEGEN_EXE} - ${MLIR_LINALG_ODS_YAML_GEN_TABLEGEN_TARGET} - 
${LLVM_TARGET_DEPENDS}) - - set(TABLEGEN_OUTPUT ${TABLEGEN_OUTPUT} ${GEN_ODS_FILE} ${GEN_CPP_FILE} - PARENT_SCOPE) -endfunction() - # Declare a dialect in the include directory function(add_mlir_dialect dialect dialect_namespace) set(LLVM_TARGET_DEFINITIONS ${dialect}.td) diff --git a/mlir/cmake/modules/CMakeLists.txt b/mlir/cmake/modules/CMakeLists.txt index 2b72beff89385..8d2904ef46dfe 100644 --- a/mlir/cmake/modules/CMakeLists.txt +++ b/mlir/cmake/modules/CMakeLists.txt @@ -38,7 +38,6 @@ set(MLIR_CONFIG_INCLUDE_DIRS ) # Refer to the best host mlir-tbgen, which might be a host-optimized version set(MLIR_CONFIG_TABLEGEN_EXE "${MLIR_TABLEGEN_EXE}") -set(MLIR_CONFIG_LINALG_ODS_YAML_GEN_TABLEGEN_EXE "${MLIR_LINALG_ODS_YAML_GEN_TABLEGEN_EXE}") set(MLIR_CONFIG_PDLL_TABLEGEN_EXE "${MLIR_PDLL_TABLEGEN_EXE}") configure_file( @@ -77,7 +76,6 @@ set(MLIR_CONFIG_INCLUDE_DIRS # Ensure that we are using the installed mlir-tblgen. This might not be MLIR_TABLEGEN_EXE # if we're building with a host-optimized mlir-tblgen (with LLVM_OPTIMIZED_TABLEGEN). set(MLIR_CONFIG_TABLEGEN_EXE mlir-tblgen) -set(MLIR_CONFIG_LINALG_ODS_YAML_GEN_TABLEGEN_EXE mlir-linalg-ods-yaml-gen) set(MLIR_CONFIG_PDLL_TABLEGEN_EXE mlir-pdll) configure_file( diff --git a/mlir/cmake/modules/MLIRConfig.cmake.in b/mlir/cmake/modules/MLIRConfig.cmake.in index 0922b9accea5d..d4da3cd98cce9 100644 --- a/mlir/cmake/modules/MLIRConfig.cmake.in +++ b/mlir/cmake/modules/MLIRConfig.cmake.in @@ -10,7 +10,6 @@ set(MLIR_EXPORTED_TARGETS "@MLIR_EXPORTS@") set(MLIR_CMAKE_DIR "@MLIR_CONFIG_CMAKE_DIR@") set(MLIR_INCLUDE_DIRS "@MLIR_CONFIG_INCLUDE_DIRS@") set(MLIR_TABLEGEN_EXE "@MLIR_CONFIG_TABLEGEN_EXE@") -set(MLIR_LINALG_ODS_YAML_GEN_TABLEGEN_EXE "@MLIR_CONFIG_LINALG_ODS_YAML_GEN_TABLEGEN_EXE@") set(MLIR_PDLL_TABLEGEN_EXE "@MLIR_CONFIG_PDLL_TABLEGEN_EXE@") set(MLIR_INSTALL_AGGREGATE_OBJECTS "@MLIR_INSTALL_AGGREGATE_OBJECTS@") set(MLIR_ENABLE_BINDINGS_PYTHON "@MLIR_ENABLE_BINDINGS_PYTHON@") diff --git a/mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt index c30665d4d118b..f5d48b2ebcefe 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt @@ -1,5 +1,38 @@ +# Declare a function to generate ODS with mlir-linalg-ods-yaml-gen +function(add_linalg_ods_yaml_gen yaml_ast_file output_file) + set(YAML_AST_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/${yaml_ast_file}) + set(GEN_ODS_FILE ${CMAKE_CURRENT_BINARY_DIR}/${output_file}.yamlgen.td) + set(GEN_CPP_FILE ${CMAKE_CURRENT_BINARY_DIR}/${output_file}.yamlgen.cpp.inc) + set_source_files_properties( + ${GEN_ODS_FILE} + PROPERTIES GENERATED TRUE) + set_source_files_properties( + ${GEN_CPP_FILE} + PROPERTIES GENERATED TRUE) + add_custom_command( + OUTPUT ${GEN_ODS_FILE} ${GEN_CPP_FILE} + COMMAND ${MLIR_LINALG_ODS_YAML_GEN_EXE} ${YAML_AST_SOURCE} -o-ods-decl=${GEN_ODS_FILE} -o-impl=${GEN_CPP_FILE} + MAIN_DEPENDENCY + ${YAML_AST_SOURCE} + DEPENDS + ${MLIR_LINALG_ODS_YAML_GEN_EXE} + ${MLIR_LINALG_ODS_YAML_GEN_TARGET}) + add_custom_target( + MLIR${output_file}YamlIncGen + DEPENDS + ${MLIR_LINALG_ODS_YAML_GEN_EXE} + ${MLIR_LINALG_ODS_YAML_GEN_TARGET} + ${GEN_ODS_FILE} ${GEN_CPP_FILE}) + list(APPEND LLVM_TARGET_DEPENDS ${GEN_ODS_FILE}) + set(LLVM_TARGET_DEPENDS ${LLVM_TARGET_DEPENDS} PARENT_SCOPE) +endfunction() + +# NOTE: LLVM_TARGET_DEPENDS gets picked up by tablegen targets to add file +# level dependencies. 
This is gross but CMake requires depending on both +# targets and generated files, and it must be done when the custom target is +# declared (there is no way to add after the fact). +set(LLVM_TARGET_DEPENDS "") add_linalg_ods_yaml_gen(LinalgNamedStructuredOps.yaml LinalgNamedStructuredOps) -add_public_tablegen_target(MLIRLinalgNamedStructuredOpsYamlIncGen) # Provide a short name for all external dependency that needs to # include Linalg in ODS diff --git a/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt b/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt index 65b65014b23cf..f0ac1899bb02a 100644 --- a/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt +++ b/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt @@ -9,7 +9,6 @@ add_mlir_dialect_library(MLIRLinalgDialect DEPENDS MLIRLinalgInterfacesIncGen - MLIRLinalgNamedStructuredOpsYamlIncGen MLIRLinalgOpsAttributesIncGen MLIRLinalgOpsEnumsIncGen MLIRLinalgOpsIncGen diff --git a/mlir/tools/mlir-linalg-ods-gen/CMakeLists.txt b/mlir/tools/mlir-linalg-ods-gen/CMakeLists.txt index 2b2024da6409a..787a0bb35d7b1 100644 --- a/mlir/tools/mlir-linalg-ods-gen/CMakeLists.txt +++ b/mlir/tools/mlir-linalg-ods-gen/CMakeLists.txt @@ -3,26 +3,26 @@ set(LLVM_LINK_COMPONENTS Support ) -set(LIBS +# New mlir-linalg-ods-yaml-gen. +add_mlir_tool(mlir-linalg-ods-yaml-gen + mlir-linalg-ods-yaml-gen.cpp +) +llvm_update_compile_flags(mlir-linalg-ods-yaml-gen) +target_link_libraries(mlir-linalg-ods-yaml-gen PRIVATE MLIRIR MLIRSupport MLIRParser -) + ) -# New mlir-linalg-ods-yaml-gen. -add_tablegen(mlir-linalg-ods-yaml-gen MLIR_LINALG_ODS_YAML_GEN - DESTINATION "${MLIR_TOOLS_INSTALL_DIR}" - EXPORT MLIR - SKIP_COMPONENT_LINK - mlir-linalg-ods-yaml-gen.cpp +setup_host_tool(mlir-linalg-ods-yaml-gen MLIR_LINALG_ODS_YAML_GEN MLIR_LINALG_ODS_YAML_GEN_EXE MLIR_LINALG_ODS_YAML_GEN_TARGET) - DEPENDS - ${LIBS} -) -set_target_properties(mlir-linalg-ods-yaml-gen PROPERTIES FOLDER "Tablegenning") -target_link_libraries(mlir-linalg-ods-yaml-gen PRIVATE ${LIBS}) +if(NOT ${MLIR_LINALG_ODS_YAML_GEN_EXE} STREQUAL "mlir-linalg-ods-yaml-gen") + add_custom_target(mlir-linalg-ods-yaml-gen-host DEPENDS ${MLIR_LINALG_ODS_YAML_GEN_EXE}) -mlir_check_all_link_libraries(mlir-linalg-ods-yaml-gen) + if(NOT LLVM_BUILD_UTILS) + set_target_properties(mlir-linalg-ods-yaml-gen PROPERTIES EXCLUDE_FROM_ALL ON) + endif() +endif() configure_file( update_core_linalg_named_ops.sh.in From bdcd7c0ba032873be92bce96e02ebb82a0675616 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 3 Jan 2024 13:14:13 -0800 Subject: [PATCH 171/313] [DAGCombiner][RISCV] Preserve disjoint flag in folding (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2) (#76860) Since we are shifting both inputs to the original Or by the same amount and inserting zeros in the LSBs, the result should still be disjoint. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 +++++- llvm/test/CodeGen/RISCV/mem.ll | 3 +-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index bb44ac1fba486..464e1becc0b8c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -10055,7 +10055,11 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N1), VT, {N01, N1})) { SDValue Shl0 = DAG.getNode(ISD::SHL, SDLoc(N0), VT, N0.getOperand(0), N1); AddToWorklist(Shl0.getNode()); - return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Shl0, Shl1); + SDNodeFlags Flags; + // Preserve the disjoint flag for Or. 
+ if (N0.getOpcode() == ISD::OR && N0->getFlags().hasDisjoint()) + Flags.setDisjoint(true); + return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Shl0, Shl1, Flags); } } diff --git a/llvm/test/CodeGen/RISCV/mem.ll b/llvm/test/CodeGen/RISCV/mem.ll index 8f65973d4fde9..7c98d4ae1b3f2 100644 --- a/llvm/test/CodeGen/RISCV/mem.ll +++ b/llvm/test/CodeGen/RISCV/mem.ll @@ -355,9 +355,8 @@ define i32 @disjoint_or_lw(ptr %a, i32 %off) nounwind { ; RV32I-LABEL: disjoint_or_lw: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 2 -; RV32I-NEXT: ori a1, a1, 12 ; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a0, 12(a0) ; RV32I-NEXT: ret %b = or disjoint i32 %off, 3 %1 = getelementptr i32, ptr %a, i32 %b From 39298b09ec99ccc300530529f254256c7178f479 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Wed, 3 Jan 2024 21:33:11 +0000 Subject: [PATCH 172/313] [mlir][docs] Capitalize "Transform" in "transform dialect" (#76840) A mix of "Transform dialect" and "transform dialect" is used ATM. This patch capitalizes the outstanding instances of "transform". --- mlir/docs/Dialects/Transform.md | 8 ++++---- mlir/docs/Tutorials/transform/Ch1.md | 6 +++--- mlir/docs/Tutorials/transform/Ch2.md | 6 +++--- mlir/docs/Tutorials/transform/Ch3.md | 2 +- mlir/docs/Tutorials/transform/ChH.md | 6 +++--- mlir/docs/Tutorials/transform/_index.md | 10 +++++----- 6 files changed, 19 insertions(+), 19 deletions(-) diff --git a/mlir/docs/Dialects/Transform.md b/mlir/docs/Dialects/Transform.md index 8fa7038ca4fec..67b4ee53c1b57 100644 --- a/mlir/docs/Dialects/Transform.md +++ b/mlir/docs/Dialects/Transform.md @@ -19,7 +19,7 @@ operations, and then applying loop unrolling to the inner loops produced by the previous transformations. As such, it is not intended as a replacement for the pass infrastructure, nor for the pattern rewriting infrastructure. In the most common case, the transform IR will be processed and applied to the payload IR by -a pass. Transformations expressed by the transform dialect may be implemented +a pass. Transformations expressed by the Transform dialect may be implemented using the pattern infrastructure or any other relevant MLIR component. The following IR gives a rough idea of what the operations in this dialect @@ -271,7 +271,7 @@ operation lists. ## Handle Invalidation -The execution model of the transform dialect allows a payload IR operation to be +The execution model of the Transform dialect allows a payload IR operation to be associated with _multiple_ handles as well as nested payload IR operations to be associated with different handles. Similarly, a payload IR value may be associated with multiple transform IR value handles. When a transform IR @@ -373,13 +373,13 @@ to specify which transformations the pass should run. The transform dialect provides a uniform, extensible mechanism for controlling transformations in such cases. -The transform dialect is supposed to be consumed by an "interpreter" pass +The Transform dialect is supposed to be consumed by an "interpreter" pass that drives the application of transformations. To ensure extensibility and composability, this pass is not expected to actually perform the transformations specified by the ops. Instead, the transformations are implemented by the transform ops themselves via `TransformOpInterface`. The pass serves as the entry point, handles the flow of transform operations and -takes care of bookkeeping. As such, the transform dialect does not provide +takes care of bookkeeping. 
As such, the Transform dialect does not provide the interpreter pass. Instead, it provides a set of utilities that can be used by clients to define their own interpreter passes or as part of a more complex pass. For example, the mapping between values in the transform IR diff --git a/mlir/docs/Tutorials/transform/Ch1.md b/mlir/docs/Tutorials/transform/Ch1.md index 95b37eb6ca413..0df25a5fbbdca 100644 --- a/mlir/docs/Tutorials/transform/Ch1.md +++ b/mlir/docs/Tutorials/transform/Ch1.md @@ -79,7 +79,7 @@ transform.sequence failures(propagate) { ## Transform Dialect Interpreter -Since we don’t want to recompile the compiler every time we change a transformation, we can use a transform dialect interpreter pass to apply this transformation sequence to the payload IR. As we will see in the next chapter, it is possible to define custom passes or even integrate the transform interpreter into a larger pass. For now, we can use the existing test pass: +Since we don’t want to recompile the compiler every time we change a transformation, we can use a Transform dialect interpreter pass to apply this transformation sequence to the payload IR. As we will see in the next chapter, it is possible to define custom passes or even integrate the transform interpreter into a larger pass. For now, we can use the existing test pass: ```sh @@ -168,7 +168,7 @@ Besides producing new handles, the tiling transform operation _consumes_ the ope ## Handle Invalidation and Expensive Checks Mode -Undefined behavior is difficult to grapple with when it does happen, so the transform dialect interpreter provides a set of additional expensive checks that detect most undefined behavior in the transform IR. For example, if we wanted to use the `%arg1` handle after it is consumed, it would cause undefined behavior that manifests as an assertion in the debug build, and likely as a segmentation fault in the release mode. +Undefined behavior is difficult to grapple with when it does happen, so the Transform dialect interpreter provides a set of additional expensive checks that detect most undefined behavior in the transform IR. For example, if we wanted to use the `%arg1` handle after it is consumed, it would cause undefined behavior that manifests as an assertion in the debug build, and likely as a segmentation fault in the release mode. ```mlir transform.sequence failures(propagate) { @@ -379,7 +379,7 @@ Finally, we would like to replace the call to the outlined function with a call ## Tracking IR Modifications -The transform dialect automatically tracks all IR changes that are made as part +The Transform dialect automatically tracks all IR changes that are made as part of transform ops. (Implementations must use the provided rewriter to modify IR.) If a payload op is erased, it is automatically removed from all handles that it is currently associated with. If a payload op is replaced, the transform dialect diff --git a/mlir/docs/Tutorials/transform/Ch2.md b/mlir/docs/Tutorials/transform/Ch2.md index 8d5076e3ef404..ac6d7d42523e4 100644 --- a/mlir/docs/Tutorials/transform/Ch2.md +++ b/mlir/docs/Tutorials/transform/Ch2.md @@ -10,7 +10,7 @@ The Transform dialect uses the dialect extension mechanism to allow additional o // In MyExtension.cpp. #include "mlir/Dialect/Transform/IR/TransformDialect.h" -// Define a new transform dialect extension. This uses the CRTP idiom to identify +// Define a new Transform dialect extension. This uses the CRTP idiom to identify // extensions. 
class MyExtension : public ::mlir::transform::TransformDialectExtension { public: @@ -200,7 +200,7 @@ must be modified with the provided rewriter. ```c++ // In MyExtension.cpp -// Implementation of our transform dialect operation. +// Implementation of our Transform dialect operation. // This operation returns a tri-state result that can be one of: // - success when the transformation succeeded; // - definite failure when the transformation failed in such a way that @@ -277,7 +277,7 @@ void registerMyExtension(::mlir::DialectRegistry ®istry) { } ``` -After registering the extension, it becomes possible to use our new operation in the transform dialect interpreter. The upstream testing pass can be used as is. +After registering the extension, it becomes possible to use our new operation in the Transform dialect interpreter. The upstream testing pass can be used as is. ```mlir transform.sequence failures(propagate) { diff --git a/mlir/docs/Tutorials/transform/Ch3.md b/mlir/docs/Tutorials/transform/Ch3.md index 4e9d1e61e5e12..84251df383d83 100644 --- a/mlir/docs/Tutorials/transform/Ch3.md +++ b/mlir/docs/Tutorials/transform/Ch3.md @@ -138,7 +138,7 @@ void MyExtension::init() { } ``` -This type is now directly available in the transform dialect and can be used in operations. +This type is now directly available in the Transform dialect and can be used in operations. ```mlir diff --git a/mlir/docs/Tutorials/transform/ChH.md b/mlir/docs/Tutorials/transform/ChH.md index 7c12728ac3246..f4dae5c1b99bd 100644 --- a/mlir/docs/Tutorials/transform/ChH.md +++ b/mlir/docs/Tutorials/transform/ChH.md @@ -1,7 +1,7 @@ # Chapter H: Reproducing Halide Schedule This chapter demonstrates how a schedule from the [Halide -DSL](http://halide-lang.org) can be implemented using transform dialect for +DSL](http://halide-lang.org) can be implemented using Transform dialect for structured ops. Note that the IR below is pseudo-code with types removed for brevity. It may @@ -408,7 +408,7 @@ identical_ to the code with the full schedule. Therefore, we will only unroll the corresponding loops corresponding to `xi` and `ci` dimensions that actually get unrolled by Halide. -As tiling in the transform dialect produces handles to the loops materialized by +As tiling in the Transform dialect produces handles to the loops materialized by tiling, unrolling those loops is just a matter of chaining the corresponding transformation. Note that the inner loop must be unrolled first as unrolling the outer loop will invalidate the handles to the inner loop. @@ -499,7 +499,7 @@ bufferization is directly available as a transform operation. One-shot bufferization itself does not produce buffer deallocations, which may lead to leaks. So we have to run the buffer deallocation pass pipeline to avoid -them. Note that the transform dialect seamlessly runs named passes and pass +them. Note that the Transform dialect seamlessly runs named passes and pass pipelines: if desired, one could replace complex `--pass-pipeline expressions` with operations. Note that we apply the pipeline to functions rather than entire module to avoid running it on the transform IR that is contained in the module. diff --git a/mlir/docs/Tutorials/transform/_index.md b/mlir/docs/Tutorials/transform/_index.md index 3afb9c51da8b9..b508a5d1d535f 100644 --- a/mlir/docs/Tutorials/transform/_index.md +++ b/mlir/docs/Tutorials/transform/_index.md @@ -8,15 +8,15 @@ scheduling languages). This tutorial presents the concepts of the MLIR transform dialect and related infrastructure. 
It will be accompanied by a practical demonstration of three use scenarios: -- Composing transform dialect operations available in (upstream) MLIR to perform +- Composing Transform dialect operations available in (upstream) MLIR to perform a sequence of optimizing transformations that results in efficient code for an MLIR linear algebra operation. -- Defining new transform dialect operations and adapting existing transformation - code to work with the transform dialect infrastructure. -- Setting up and using the transform dialect infrastructure in a downstream +- Defining new Transform dialect operations and adapting existing transformation + code to work with the Transform dialect infrastructure. +- Setting up and using the Transform dialect infrastructure in a downstream out-of-tree project with custom dialects, transformations and passes. -After following the tutorial, one will be able to apply the transform dialect in +After following the tutorial, one will be able to apply the Transform dialect in their work and extend it when necessary. Basic familiarity with MLIR is a prerequisite. See [Toy tutorial](../Toy) for introduction to MLIR. From 155d5849da2b2bfa2da918923d8f148a96c03e72 Mon Sep 17 00:00:00 2001 From: Ahmed Bougacha Date: Wed, 3 Jan 2024 13:17:11 -0800 Subject: [PATCH 173/313] [AArch64] Avoid jump tables in swiftasync clobber-live-reg test. NFC. The upstream test relies on jump-tables, which are lowered in dramatically different ways with later arm64e/ptrauth patches. Concretely, it's failing for at least two reasons: - ptrauth removes x16/x17 from tcGPR64 to prevent indirect tail-calls from using either register as the callee, conflicting with their usage as scratch for the tail-call LR auth checking sequence. In the 1/2_available_regs_left tests, this causes the MI scheduler to move the load up across some of the inlineasm register clobbers. - ptrauth adds an x16/x17-using pseudo for jump-table dispatch, which looks somewhat different from the regular jump-table dispatch codegen by itself, but also prevents compression currently. They seem like sensible changes. But they mean the tests aren't really testing what they're intented to, because there's always an implicit x16/x17 clobber when using jump-tables. This updates the test in a way that should work identically regardless of ptrauth support, with one exception, #1 above, which merely reorders the load/inlineasm w.r.t. eachother. I verified the tests still fail the live-reg assertions when applicable. --- ...re-swift-async-context-clobber-live-reg.ll | 486 +++++------------- 1 file changed, 139 insertions(+), 347 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/store-swift-async-context-clobber-live-reg.ll b/llvm/test/CodeGen/AArch64/store-swift-async-context-clobber-live-reg.ll index a202bfb6bca42..1f7584b57e4aa 100644 --- a/llvm/test/CodeGen/AArch64/store-swift-async-context-clobber-live-reg.ll +++ b/llvm/test/CodeGen/AArch64/store-swift-async-context-clobber-live-reg.ll @@ -1,13 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -o - -mtriple=arm64e-apple-macosx -aarch64-min-jump-table-entries=2 %s | FileCheck %s +; RUN: llc -o - -mtriple=arm64e-apple-macosx %s | FileCheck %s target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +; x16 is not available, shrink-wrapping cannot happen because +; StoreSwiftAsyncContext needs it. 
define swifttailcc void @test_async_with_jumptable_x16_clobbered(ptr %src, ptr swiftasync %as) #0 { ; CHECK-LABEL: test_async_with_jumptable_x16_clobbered: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: orr x29, x29, #0x1000000000000000 -; CHECK-NEXT: str x19, [sp, #-32]! ; 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #32 ; CHECK-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill ; CHECK-NEXT: add x16, sp, #8 ; CHECK-NEXT: movk x16, #49946, lsl #48 @@ -18,83 +20,52 @@ define swifttailcc void @test_async_with_jumptable_x16_clobbered(ptr %src, ptr s ; CHECK-NEXT: .cfi_def_cfa w29, 16 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: .cfi_offset w19, -32 +; CHECK-NEXT: mov x20, x22 +; CHECK-NEXT: mov x22, x0 ; CHECK-NEXT: ; InlineAsm Start ; CHECK-NEXT: ; InlineAsm End ; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: mov x20, x22 -; CHECK-NEXT: mov x22, x0 -; CHECK-NEXT: Lloh0: -; CHECK-NEXT: adrp x9, LJTI0_0@PAGE -; CHECK-NEXT: Lloh1: -; CHECK-NEXT: add x9, x9, LJTI0_0@PAGEOFF -; CHECK-NEXT: Ltmp0: -; CHECK-NEXT: adr x10, Ltmp0 -; CHECK-NEXT: ldrsw x11, [x9, x8, lsl #2] -; CHECK-NEXT: add x10, x10, x11 -; CHECK-NEXT: mov x19, x20 -; CHECK-NEXT: br x10 -; CHECK-NEXT: LBB0_1: ; %then.2 -; CHECK-NEXT: mov x19, #0 ; =0x0 -; CHECK-NEXT: b LBB0_3 -; CHECK-NEXT: LBB0_2: ; %then.3 -; CHECK-NEXT: mov x19, x22 -; CHECK-NEXT: LBB0_3: ; %exit +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: cbnz x8, LBB0_2 +; CHECK-NEXT: ; %bb.1: ; %then.1 +; CHECK-NEXT: str xzr, [x22] +; CHECK-NEXT: mov x0, x22 +; CHECK-NEXT: LBB0_2: ; %exit ; CHECK-NEXT: ; InlineAsm Start ; CHECK-NEXT: ; InlineAsm End ; CHECK-NEXT: bl _foo -; CHECK-NEXT: mov x2, x0 -; CHECK-NEXT: mov x0, x19 -; CHECK-NEXT: mov x1, x20 +; CHECK-NEXT: mov x1, x0 +; CHECK-NEXT: mov x0, x20 ; CHECK-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp], #32 ; 8-byte Folded Reload ; CHECK-NEXT: and x29, x29, #0xefffffffffffffff -; CHECK-NEXT: br x2 -; CHECK-NEXT: .loh AdrpAdd Lloh0, Lloh1 -; CHECK-NEXT: .cfi_endproc -; CHECK-NEXT: .section __TEXT,__const -; CHECK-NEXT: .p2align 2, 0x0 -; CHECK-NEXT: LJTI0_0: -; CHECK-NEXT: .long LBB0_3-Ltmp0 -; CHECK-NEXT: .long LBB0_1-Ltmp0 -; CHECK-NEXT: .long LBB0_1-Ltmp0 -; CHECK-NEXT: .long LBB0_2-Ltmp0 +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: br x1 entry: %x16 = tail call i64 asm "", "={x16}"() %l = load i64, ptr %src, align 8 - switch i64 %l, label %dead [ - i64 0, label %exit - i64 1, label %then.1 - i64 2, label %then.2 - i64 3, label %then.3 - ] + %c = icmp eq i64 %l, 0 + br i1 %c, label %then.1, label %exit then.1: + store i64 0, ptr %src br label %exit -then.2: - br label %exit - -then.3: - br label %exit - -dead: ; preds = %entryresume.5 - unreachable - exit: - %p = phi ptr [ %src, %then.3 ], [ null, %then.2 ], [ %as, %entry ], [ null, %then.1 ] + %p = phi ptr [ %src, %then.1 ], [ %as, %entry ] tail call void asm sideeffect "", "{x16}"(i64 %x16) - %r = call i64 @foo() + %r = call i64 @foo(ptr %p) %fn = inttoptr i64 %r to ptr - musttail call swifttailcc void %fn(ptr swiftasync %src, ptr %p, ptr %as) + musttail call swifttailcc void %fn(ptr swiftasync %src, ptr %as) ret void } +; x17 is not available, shrink-wrapping cannot happen because +; StoreSwiftAsyncContext needs it. define swifttailcc void @test_async_with_jumptable_x17_clobbered(ptr %src, ptr swiftasync %as) #0 { ; CHECK-LABEL: test_async_with_jumptable_x17_clobbered: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: orr x29, x29, #0x1000000000000000 -; CHECK-NEXT: str x19, [sp, #-32]! 
; 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #32 ; CHECK-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill ; CHECK-NEXT: add x16, sp, #8 ; CHECK-NEXT: movk x16, #49946, lsl #48 @@ -105,86 +76,61 @@ define swifttailcc void @test_async_with_jumptable_x17_clobbered(ptr %src, ptr s ; CHECK-NEXT: .cfi_def_cfa w29, 16 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: .cfi_offset w19, -32 +; CHECK-NEXT: mov x20, x22 +; CHECK-NEXT: mov x22, x0 ; CHECK-NEXT: ; InlineAsm Start ; CHECK-NEXT: ; InlineAsm End ; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: mov x20, x22 -; CHECK-NEXT: mov x22, x0 -; CHECK-NEXT: Lloh2: -; CHECK-NEXT: adrp x9, LJTI1_0@PAGE -; CHECK-NEXT: Lloh3: -; CHECK-NEXT: add x9, x9, LJTI1_0@PAGEOFF -; CHECK-NEXT: Ltmp1: -; CHECK-NEXT: adr x10, Ltmp1 -; CHECK-NEXT: ldrsw x11, [x9, x8, lsl #2] -; CHECK-NEXT: add x10, x10, x11 -; CHECK-NEXT: mov x19, x20 -; CHECK-NEXT: br x10 -; CHECK-NEXT: LBB1_1: ; %then.2 -; CHECK-NEXT: mov x19, #0 ; =0x0 -; CHECK-NEXT: b LBB1_3 -; CHECK-NEXT: LBB1_2: ; %then.3 -; CHECK-NEXT: mov x19, x22 -; CHECK-NEXT: LBB1_3: ; %exit +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: cbnz x8, LBB1_2 +; CHECK-NEXT: ; %bb.1: ; %then.1 +; CHECK-NEXT: str xzr, [x22] +; CHECK-NEXT: mov x0, x22 +; CHECK-NEXT: LBB1_2: ; %exit ; CHECK-NEXT: ; InlineAsm Start ; CHECK-NEXT: ; InlineAsm End ; CHECK-NEXT: bl _foo -; CHECK-NEXT: mov x2, x0 -; CHECK-NEXT: mov x0, x19 -; CHECK-NEXT: mov x1, x20 +; CHECK-NEXT: mov x1, x0 +; CHECK-NEXT: mov x0, x20 ; CHECK-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp], #32 ; 8-byte Folded Reload ; CHECK-NEXT: and x29, x29, #0xefffffffffffffff -; CHECK-NEXT: br x2 -; CHECK-NEXT: .loh AdrpAdd Lloh2, Lloh3 -; CHECK-NEXT: .cfi_endproc -; CHECK-NEXT: .section __TEXT,__const -; CHECK-NEXT: .p2align 2, 0x0 -; CHECK-NEXT: LJTI1_0: -; CHECK-NEXT: .long LBB1_3-Ltmp1 -; CHECK-NEXT: .long LBB1_1-Ltmp1 -; CHECK-NEXT: .long LBB1_1-Ltmp1 -; CHECK-NEXT: .long LBB1_2-Ltmp1 +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: br x1 entry: %x17 = tail call i64 asm "", "={x17}"() %l = load i64, ptr %src, align 8 - switch i64 %l, label %dead [ - i64 0, label %exit - i64 1, label %then.1 - i64 2, label %then.2 - i64 3, label %then.3 - ] + %c = icmp eq i64 %l, 0 + br i1 %c, label %then.1, label %exit then.1: + store i64 0, ptr %src br label %exit -then.2: - br label %exit - -then.3: - br label %exit - -dead: ; preds = %entryresume.5 - unreachable - exit: - %p = phi ptr [ %src, %then.3 ], [ null, %then.2 ], [ %as, %entry ], [ null, %then.1 ] + %p = phi ptr [ %src, %then.1 ], [ %as, %entry ] tail call void asm sideeffect "", "{x17}"(i64 %x17) - %r = call i64 @foo() + %r = call i64 @foo(ptr %p) %fn = inttoptr i64 %r to ptr - musttail call swifttailcc void %fn(ptr swiftasync %src, ptr %p, ptr %as) + musttail call swifttailcc void %fn(ptr swiftasync %src, ptr %as) ret void } define swifttailcc void @test_async_with_jumptable_x1_clobbered(ptr %src, ptr swiftasync %as) #0 { ; CHECK-LABEL: test_async_with_jumptable_x1_clobbered: ; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: mov x20, x22 +; CHECK-NEXT: mov x22, x0 ; CHECK-NEXT: ; InlineAsm Start ; CHECK-NEXT: ; InlineAsm End ; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: cbnz x8, LBB2_2 +; CHECK-NEXT: ; %bb.1: ; %then.1 +; CHECK-NEXT: str xzr, [x22] +; CHECK-NEXT: mov x0, x22 +; CHECK-NEXT: LBB2_2: ; %exit ; CHECK-NEXT: orr x29, x29, #0x1000000000000000 -; CHECK-NEXT: str x19, [sp, #-32]! 
; 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #32 ; CHECK-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill ; CHECK-NEXT: add x16, sp, #8 ; CHECK-NEXT: movk x16, #49946, lsl #48 @@ -195,85 +141,52 @@ define swifttailcc void @test_async_with_jumptable_x1_clobbered(ptr %src, ptr sw ; CHECK-NEXT: .cfi_def_cfa w29, 16 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: .cfi_offset w19, -32 -; CHECK-NEXT: mov x20, x22 -; CHECK-NEXT: mov x22, x0 -; CHECK-NEXT: Lloh4: -; CHECK-NEXT: adrp x9, LJTI2_0@PAGE -; CHECK-NEXT: Lloh5: -; CHECK-NEXT: add x9, x9, LJTI2_0@PAGEOFF -; CHECK-NEXT: Ltmp2: -; CHECK-NEXT: adr x10, Ltmp2 -; CHECK-NEXT: ldrsw x11, [x9, x8, lsl #2] -; CHECK-NEXT: add x10, x10, x11 -; CHECK-NEXT: mov x19, x20 -; CHECK-NEXT: br x10 -; CHECK-NEXT: LBB2_1: ; %then.2 -; CHECK-NEXT: mov x19, #0 ; =0x0 -; CHECK-NEXT: b LBB2_3 -; CHECK-NEXT: LBB2_2: ; %then.3 -; CHECK-NEXT: mov x19, x22 -; CHECK-NEXT: LBB2_3: ; %exit ; CHECK-NEXT: ; InlineAsm Start ; CHECK-NEXT: ; InlineAsm End ; CHECK-NEXT: bl _foo -; CHECK-NEXT: mov x2, x0 -; CHECK-NEXT: mov x0, x19 -; CHECK-NEXT: mov x1, x20 +; CHECK-NEXT: mov x1, x0 +; CHECK-NEXT: mov x0, x20 ; CHECK-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp], #32 ; 8-byte Folded Reload ; CHECK-NEXT: and x29, x29, #0xefffffffffffffff -; CHECK-NEXT: br x2 -; CHECK-NEXT: .loh AdrpAdd Lloh4, Lloh5 -; CHECK-NEXT: .cfi_endproc -; CHECK-NEXT: .section __TEXT,__const -; CHECK-NEXT: .p2align 2, 0x0 -; CHECK-NEXT: LJTI2_0: -; CHECK-NEXT: .long LBB2_3-Ltmp2 -; CHECK-NEXT: .long LBB2_1-Ltmp2 -; CHECK-NEXT: .long LBB2_1-Ltmp2 -; CHECK-NEXT: .long LBB2_2-Ltmp2 +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: br x1 entry: %x1 = tail call i64 asm "", "={x1}"() %l = load i64, ptr %src, align 8 - switch i64 %l, label %dead [ - i64 0, label %exit - i64 1, label %then.1 - i64 2, label %then.2 - i64 3, label %then.3 - ] + %c = icmp eq i64 %l, 0 + br i1 %c, label %then.1, label %exit then.1: + store i64 0, ptr %src br label %exit -then.2: - br label %exit - -then.3: - br label %exit - -dead: ; preds = %entryresume.5 - unreachable - exit: - %p = phi ptr [ %src, %then.3 ], [ null, %then.2 ], [ %as, %entry ], [ null, %then.1 ] + %p = phi ptr [ %src, %then.1 ], [ %as, %entry ] tail call void asm sideeffect "", "{x1}"(i64 %x1) - %r = call i64 @foo() + %r = call i64 @foo(ptr %p) %fn = inttoptr i64 %r to ptr - musttail call swifttailcc void %fn(ptr swiftasync %src, ptr %p, ptr %as) + musttail call swifttailcc void %fn(ptr swiftasync %src, ptr %as) ret void } define swifttailcc void @test_async_with_jumptable_x1_x9_clobbered(ptr %src, ptr swiftasync %as) #0 { ; CHECK-LABEL: test_async_with_jumptable_x1_x9_clobbered: ; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: mov x20, x22 +; CHECK-NEXT: mov x22, x0 ; CHECK-NEXT: ; InlineAsm Start ; CHECK-NEXT: ; InlineAsm End ; CHECK-NEXT: ; InlineAsm Start ; CHECK-NEXT: ; InlineAsm End ; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: cbnz x8, LBB3_2 +; CHECK-NEXT: ; %bb.1: ; %then.1 +; CHECK-NEXT: str xzr, [x22] +; CHECK-NEXT: mov x0, x22 +; CHECK-NEXT: LBB3_2: ; %exit ; CHECK-NEXT: orr x29, x29, #0x1000000000000000 -; CHECK-NEXT: str x19, [sp, #-32]! 
; 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #32 ; CHECK-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill ; CHECK-NEXT: add x16, sp, #8 ; CHECK-NEXT: movk x16, #49946, lsl #48 @@ -284,76 +197,35 @@ define swifttailcc void @test_async_with_jumptable_x1_x9_clobbered(ptr %src, ptr ; CHECK-NEXT: .cfi_def_cfa w29, 16 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: .cfi_offset w19, -32 -; CHECK-NEXT: mov x20, x22 -; CHECK-NEXT: mov x22, x0 -; CHECK-NEXT: Lloh6: -; CHECK-NEXT: adrp x10, LJTI3_0@PAGE -; CHECK-NEXT: Lloh7: -; CHECK-NEXT: add x10, x10, LJTI3_0@PAGEOFF -; CHECK-NEXT: Ltmp3: -; CHECK-NEXT: adr x11, Ltmp3 -; CHECK-NEXT: ldrsw x12, [x10, x8, lsl #2] -; CHECK-NEXT: add x11, x11, x12 -; CHECK-NEXT: mov x19, x20 -; CHECK-NEXT: br x11 -; CHECK-NEXT: LBB3_1: ; %then.2 -; CHECK-NEXT: mov x19, #0 ; =0x0 -; CHECK-NEXT: b LBB3_3 -; CHECK-NEXT: LBB3_2: ; %then.3 -; CHECK-NEXT: mov x19, x22 -; CHECK-NEXT: LBB3_3: ; %exit ; CHECK-NEXT: ; InlineAsm Start ; CHECK-NEXT: ; InlineAsm End ; CHECK-NEXT: ; InlineAsm Start ; CHECK-NEXT: ; InlineAsm End ; CHECK-NEXT: bl _foo -; CHECK-NEXT: mov x2, x0 -; CHECK-NEXT: mov x0, x19 -; CHECK-NEXT: mov x1, x20 +; CHECK-NEXT: mov x1, x0 +; CHECK-NEXT: mov x0, x20 ; CHECK-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp], #32 ; 8-byte Folded Reload ; CHECK-NEXT: and x29, x29, #0xefffffffffffffff -; CHECK-NEXT: br x2 -; CHECK-NEXT: .loh AdrpAdd Lloh6, Lloh7 -; CHECK-NEXT: .cfi_endproc -; CHECK-NEXT: .section __TEXT,__const -; CHECK-NEXT: .p2align 2, 0x0 -; CHECK-NEXT: LJTI3_0: -; CHECK-NEXT: .long LBB3_3-Ltmp3 -; CHECK-NEXT: .long LBB3_1-Ltmp3 -; CHECK-NEXT: .long LBB3_1-Ltmp3 -; CHECK-NEXT: .long LBB3_2-Ltmp3 +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: br x1 entry: %x1 = tail call i64 asm "", "={x1}"() %x9 = tail call i64 asm "", "={x9}"() %l = load i64, ptr %src, align 8 - switch i64 %l, label %dead [ - i64 0, label %exit - i64 1, label %then.1 - i64 2, label %then.2 - i64 3, label %then.3 - ] + %c = icmp eq i64 %l, 0 + br i1 %c, label %then.1, label %exit then.1: + store i64 0, ptr %src br label %exit -then.2: - br label %exit - -then.3: - br label %exit - -dead: ; preds = %entryresume.5 - unreachable - exit: - %p = phi ptr [ %src, %then.3 ], [ null, %then.2 ], [ %as, %entry ], [ null, %then.1 ] + %p = phi ptr [ %src, %then.1 ], [ %as, %entry ] tail call void asm sideeffect "", "{x1}"(i64 %x1) tail call void asm sideeffect "", "{x9}"(i64 %x9) - %r = call i64 @foo() + %r = call i64 @foo(ptr %p) %fn = inttoptr i64 %r to ptr - musttail call swifttailcc void %fn(ptr swiftasync %src, ptr %p, ptr %as) + musttail call swifttailcc void %fn(ptr swiftasync %src, ptr %as) ret void } @@ -361,23 +233,8 @@ exit: define swifttailcc void @test_async_with_jumptable_2_available_regs_left(ptr %src, ptr swiftasync %as) #0 { ; CHECK-LABEL: test_async_with_jumptable_2_available_regs_left: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: orr x29, x29, #0x1000000000000000 -; CHECK-NEXT: str x19, [sp, #-32]! 
; 8-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill -; CHECK-NEXT: add x16, sp, #8 -; CHECK-NEXT: movk x16, #49946, lsl #48 -; CHECK-NEXT: mov x17, x22 -; CHECK-NEXT: pacdb x17, x16 -; CHECK-NEXT: str x17, [sp, #8] -; CHECK-NEXT: add x29, sp, #16 -; CHECK-NEXT: .cfi_def_cfa w29, 16 -; CHECK-NEXT: .cfi_offset w30, -8 -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: .cfi_offset w19, -32 -; CHECK-NEXT: ; InlineAsm Start -; CHECK-NEXT: ; InlineAsm End -; CHECK-NEXT: ; InlineAsm Start -; CHECK-NEXT: ; InlineAsm End +; CHECK-NEXT: mov x20, x22 +; CHECK-NEXT: mov x22, x0 ; CHECK-NEXT: ; InlineAsm Start ; CHECK-NEXT: ; InlineAsm End ; CHECK-NEXT: ; InlineAsm Start @@ -404,27 +261,27 @@ define swifttailcc void @test_async_with_jumptable_2_available_regs_left(ptr %sr ; CHECK-NEXT: ; InlineAsm End ; CHECK-NEXT: ; InlineAsm Start ; CHECK-NEXT: ; InlineAsm End -; CHECK-NEXT: ldr x10, [x0] -; CHECK-NEXT: mov x20, x22 -; CHECK-NEXT: mov x22, x0 -; CHECK-NEXT: Lloh8: -; CHECK-NEXT: adrp x17, LJTI4_0@PAGE -; CHECK-NEXT: Lloh9: -; CHECK-NEXT: add x17, x17, LJTI4_0@PAGEOFF -; CHECK-NEXT: Ltmp4: -; CHECK-NEXT: adr x0, Ltmp4 -; CHECK-NEXT: ldrsw x19, [x17, x10, lsl #2] -; CHECK-NEXT: add x0, x0, x19 -; CHECK-NEXT: mov x19, x20 -; CHECK-NEXT: br x0 -; CHECK-NEXT: LBB4_1: ; %then.2 -; CHECK-NEXT: mov x19, #0 ; =0x0 -; CHECK-NEXT: b LBB4_3 -; CHECK-NEXT: LBB4_2: ; %then.3 -; CHECK-NEXT: mov x19, x22 -; CHECK-NEXT: LBB4_3: ; %exit ; CHECK-NEXT: ; InlineAsm Start ; CHECK-NEXT: ; InlineAsm End +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: ldr x10, [x22] +; CHECK-NEXT: cbnz x10, LBB4_2 +; CHECK-NEXT: ; %bb.1: ; %then.1 +; CHECK-NEXT: str xzr, [x22] +; CHECK-NEXT: mov x0, x22 +; CHECK-NEXT: LBB4_2: ; %exit +; CHECK-NEXT: orr x29, x29, #0x1000000000000000 +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-NEXT: add x16, sp, #8 +; CHECK-NEXT: movk x16, #49946, lsl #48 +; CHECK-NEXT: mov x17, x22 +; CHECK-NEXT: pacdb x17, x16 +; CHECK-NEXT: str x17, [sp, #8] +; CHECK-NEXT: add x29, sp, #16 +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: ; InlineAsm Start ; CHECK-NEXT: ; InlineAsm End ; CHECK-NEXT: ; InlineAsm Start @@ -454,22 +311,12 @@ define swifttailcc void @test_async_with_jumptable_2_available_regs_left(ptr %sr ; CHECK-NEXT: ; InlineAsm Start ; CHECK-NEXT: ; InlineAsm End ; CHECK-NEXT: bl _foo -; CHECK-NEXT: mov x2, x0 -; CHECK-NEXT: mov x0, x19 -; CHECK-NEXT: mov x1, x20 +; CHECK-NEXT: mov x1, x0 +; CHECK-NEXT: mov x0, x20 ; CHECK-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp], #32 ; 8-byte Folded Reload ; CHECK-NEXT: and x29, x29, #0xefffffffffffffff -; CHECK-NEXT: br x2 -; CHECK-NEXT: .loh AdrpAdd Lloh8, Lloh9 -; CHECK-NEXT: .cfi_endproc -; CHECK-NEXT: .section __TEXT,__const -; CHECK-NEXT: .p2align 2, 0x0 -; CHECK-NEXT: LJTI4_0: -; CHECK-NEXT: .long LBB4_3-Ltmp4 -; CHECK-NEXT: .long LBB4_1-Ltmp4 -; CHECK-NEXT: .long LBB4_1-Ltmp4 -; CHECK-NEXT: .long LBB4_2-Ltmp4 +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: br x1 entry: %x1 = tail call i64 asm "", "={x1}"() %x2 = tail call i64 asm "", "={x2}"() @@ -485,29 +332,16 @@ entry: %x13 = tail call i64 asm "", "={x13}"() %x14 = tail call i64 asm "", "={x14}"() %x15 = tail call i64 asm "", "={x15}"() - %x16 = tail call i64 asm "", "={x16}"() %l = load i64, ptr %src, align 8 - switch i64 %l, label %dead [ - i64 0, label %exit - i64 1, label %then.1 - i64 2, label %then.2 - i64 3, label 
%then.3 - ] + %c = icmp eq i64 %l, 0 + br i1 %c, label %then.1, label %exit then.1: + store i64 0, ptr %src br label %exit -then.2: - br label %exit - -then.3: - br label %exit - -dead: ; preds = %entryresume.5 - unreachable - exit: - %p = phi ptr [ %src, %then.3 ], [ null, %then.2 ], [ %as, %entry ], [ null, %then.1 ] + %p = phi ptr [ %src, %then.1 ], [ %as, %entry ] tail call void asm sideeffect "", "{x1}"(i64 %x1) tail call void asm sideeffect "", "{x2}"(i64 %x2) tail call void asm sideeffect "", "{x3}"(i64 %x3) @@ -522,37 +356,32 @@ exit: tail call void asm sideeffect "", "{x13}"(i64 %x13) tail call void asm sideeffect "", "{x14}"(i64 %x14) tail call void asm sideeffect "", "{x15}"(i64 %x15) - tail call void asm sideeffect "", "{x16}"(i64 %x16) - %r = call i64 @foo() + %r = call i64 @foo(ptr %p) %fn = inttoptr i64 %r to ptr - musttail call swifttailcc void %fn(ptr swiftasync %src, ptr %p, ptr %as) + musttail call swifttailcc void %fn(ptr swiftasync %src, ptr %as) ret void } + ; There is only 1 available scratch registers left, shrink-wrapping cannot ; happen because StoreSwiftAsyncContext needs 2 free scratch registers. define swifttailcc void @test_async_with_jumptable_1_available_reg_left(ptr %src, ptr swiftasync %as) #0 { ; CHECK-LABEL: test_async_with_jumptable_1_available_reg_left: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: orr x29, x29, #0x1000000000000000 -; CHECK-NEXT: sub sp, sp, #48 -; CHECK-NEXT: stp x21, x19, [sp, #8] ; 16-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #32] ; 16-byte Folded Spill -; CHECK-NEXT: add x16, sp, #24 +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-NEXT: add x16, sp, #8 ; CHECK-NEXT: movk x16, #49946, lsl #48 ; CHECK-NEXT: mov x17, x22 ; CHECK-NEXT: pacdb x17, x16 -; CHECK-NEXT: str x17, [sp, #24] -; CHECK-NEXT: add x29, sp, #32 +; CHECK-NEXT: str x17, [sp, #8] +; CHECK-NEXT: add x29, sp, #16 ; CHECK-NEXT: .cfi_def_cfa w29, 16 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: .cfi_offset w19, -32 -; CHECK-NEXT: .cfi_offset w21, -40 -; CHECK-NEXT: ; InlineAsm Start -; CHECK-NEXT: ; InlineAsm End -; CHECK-NEXT: ; InlineAsm Start -; CHECK-NEXT: ; InlineAsm End +; CHECK-NEXT: mov x20, x22 +; CHECK-NEXT: mov x22, x0 ; CHECK-NEXT: ; InlineAsm Start ; CHECK-NEXT: ; InlineAsm End ; CHECK-NEXT: ; InlineAsm Start @@ -581,27 +410,15 @@ define swifttailcc void @test_async_with_jumptable_1_available_reg_left(ptr %src ; CHECK-NEXT: ; InlineAsm End ; CHECK-NEXT: ; InlineAsm Start ; CHECK-NEXT: ; InlineAsm End -; CHECK-NEXT: ldr x10, [x0] -; CHECK-NEXT: mov x20, x22 -; CHECK-NEXT: mov x22, x0 -; CHECK-NEXT: Lloh10: -; CHECK-NEXT: adrp x0, LJTI5_0@PAGE -; CHECK-NEXT: Lloh11: -; CHECK-NEXT: add x0, x0, LJTI5_0@PAGEOFF -; CHECK-NEXT: Ltmp5: -; CHECK-NEXT: adr x21, Ltmp5 -; CHECK-NEXT: ldrsw x19, [x0, x10, lsl #2] -; CHECK-NEXT: add x21, x21, x19 -; CHECK-NEXT: mov x19, x20 -; CHECK-NEXT: br x21 -; CHECK-NEXT: LBB5_1: ; %then.2 -; CHECK-NEXT: mov x19, #0 ; =0x0 -; CHECK-NEXT: b LBB5_3 -; CHECK-NEXT: LBB5_2: ; %then.3 -; CHECK-NEXT: mov x19, x22 -; CHECK-NEXT: LBB5_3: ; %exit ; CHECK-NEXT: ; InlineAsm Start ; CHECK-NEXT: ; InlineAsm End +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: ldr x10, [x22] +; CHECK-NEXT: cbnz x10, LBB5_2 +; CHECK-NEXT: ; %bb.1: ; %then.1 +; CHECK-NEXT: str xzr, [x22] +; CHECK-NEXT: mov x0, x22 +; CHECK-NEXT: LBB5_2: ; %exit ; CHECK-NEXT: ; InlineAsm Start ; CHECK-NEXT: ; InlineAsm End ; CHECK-NEXT: ; InlineAsm Start @@ -633,23 +450,12 @@ define swifttailcc 
void @test_async_with_jumptable_1_available_reg_left(ptr %src ; CHECK-NEXT: ; InlineAsm Start ; CHECK-NEXT: ; InlineAsm End ; CHECK-NEXT: bl _foo -; CHECK-NEXT: mov x2, x0 -; CHECK-NEXT: mov x0, x19 -; CHECK-NEXT: mov x1, x20 -; CHECK-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload -; CHECK-NEXT: ldp x21, x19, [sp, #8] ; 16-byte Folded Reload +; CHECK-NEXT: mov x1, x0 +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload ; CHECK-NEXT: and x29, x29, #0xefffffffffffffff -; CHECK-NEXT: add sp, sp, #48 -; CHECK-NEXT: br x2 -; CHECK-NEXT: .loh AdrpAdd Lloh10, Lloh11 -; CHECK-NEXT: .cfi_endproc -; CHECK-NEXT: .section __TEXT,__const -; CHECK-NEXT: .p2align 2, 0x0 -; CHECK-NEXT: LJTI5_0: -; CHECK-NEXT: .long LBB5_3-Ltmp5 -; CHECK-NEXT: .long LBB5_1-Ltmp5 -; CHECK-NEXT: .long LBB5_1-Ltmp5 -; CHECK-NEXT: .long LBB5_2-Ltmp5 +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: br x1 entry: %x1 = tail call i64 asm "", "={x1}"() %x2 = tail call i64 asm "", "={x2}"() @@ -666,29 +472,16 @@ entry: %x14 = tail call i64 asm "", "={x14}"() %x15 = tail call i64 asm "", "={x15}"() %x16 = tail call i64 asm "", "={x16}"() - %x17 = tail call i64 asm "", "={x17}"() %l = load i64, ptr %src, align 8 - switch i64 %l, label %dead [ - i64 0, label %exit - i64 1, label %then.1 - i64 2, label %then.2 - i64 3, label %then.3 - ] + %c = icmp eq i64 %l, 0 + br i1 %c, label %then.1, label %exit then.1: + store i64 0, ptr %src br label %exit -then.2: - br label %exit - -then.3: - br label %exit - -dead: ; preds = %entryresume.5 - unreachable - exit: - %p = phi ptr [ %src, %then.3 ], [ null, %then.2 ], [ %as, %entry ], [ null, %then.1 ] + %p = phi ptr [ %src, %then.1 ], [ %as, %entry ] tail call void asm sideeffect "", "{x1}"(i64 %x1) tail call void asm sideeffect "", "{x2}"(i64 %x2) tail call void asm sideeffect "", "{x3}"(i64 %x3) @@ -704,13 +497,12 @@ exit: tail call void asm sideeffect "", "{x14}"(i64 %x14) tail call void asm sideeffect "", "{x15}"(i64 %x15) tail call void asm sideeffect "", "{x16}"(i64 %x16) - tail call void asm sideeffect "", "{x17}"(i64 %x17) - %r = call i64 @foo() + %r = call i64 @foo(ptr %p) %fn = inttoptr i64 %r to ptr - musttail call swifttailcc void %fn(ptr swiftasync %src, ptr %p, ptr %as) + musttail call swifttailcc void %fn(ptr swiftasync %src, ptr %as) ret void } -declare i64 @foo() +declare i64 @foo(ptr) attributes #0 = { "frame-pointer"="non-leaf" } From 09de5e5c6d8c25fe76840c69056afca72275dd6b Mon Sep 17 00:00:00 2001 From: Yuxuan Chen Date: Wed, 3 Jan 2024 14:43:10 -0800 Subject: [PATCH 174/313] [Clang] Fix failing CI with different test case attribute & host triple (#76863) As in title, https://github.com/llvm/llvm-project/commit/a8f43974260ec244d78336d2530f8fc097753580 broke CI due to the calling convention not available on certain targets. This patch uses a simpler calling convention and enables the test only when the attribute exists. It's verified that this test crashes the compiler before a8f43974260ec244d78336d2530f8fc097753580 so it's the same effect as the previous test. Disabling the test on platforms that don't have the calling convention is fine because it's guarding against a frontend bug. 
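As a side note (illustrative only, not part of this patch, which simply pins
an x86_64 triple in the RUN line): feature-testing the attribute from the
source itself is not a full substitute, because __has_attribute only reports
whether Clang recognizes the attribute name, not whether the current target
implements the calling convention. A hypothetical guard along those lines,
with LAMBDA_CC being a made-up macro name, would look like this:

    // Hypothetical guard, for illustration only: __has_attribute(preserve_most)
    // says the attribute name is known to the compiler, not that the target
    // supports the calling convention, so pinning the triple remains the
    // robust way to keep the regression test deterministic.
    #if defined(__has_attribute)
    #  if __has_attribute(preserve_most)
    #    define LAMBDA_CC __attribute__((preserve_most))
    #  endif
    #endif
    #ifndef LAMBDA_CC
    #  define LAMBDA_CC
    #endif

    template <typename T>
    void foo() {
      auto l = []() LAMBDA_CC {};
      l();
    }

    void bar() { foo<int>(); }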
--- clang/test/SemaCXX/template-instantiation.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/test/SemaCXX/template-instantiation.cpp b/clang/test/SemaCXX/template-instantiation.cpp index e714e070a206f..8543af0d5428d 100644 --- a/clang/test/SemaCXX/template-instantiation.cpp +++ b/clang/test/SemaCXX/template-instantiation.cpp @@ -1,11 +1,11 @@ -// RUN: %clang_cc1 -verify -fsyntax-only -Wno-ignored-attributes %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -verify -fsyntax-only %s // expected-no-diagnostics namespace GH76521 { template void foo() { - auto l = []() __attribute__((pcs("aapcs-vfp"))) {}; + auto l = []() __attribute__((preserve_most)) {}; } void bar() { From 49029f926d359075d59ad4aec2d01a21d9514b02 Mon Sep 17 00:00:00 2001 From: Alex Langford Date: Wed, 3 Jan 2024 15:02:37 -0800 Subject: [PATCH 175/313] [lldb] Fix breakpoint resolver serialization bug (#76766) BreakpointResolverAddress optionally can include the module name related to the address that gets resolved. Currently this will never work because it sets the name to itself (which is empty). --- .../Breakpoint/BreakpointResolverAddress.cpp | 12 +++---- .../serialize/TestBreakpointSerialization.py | 36 +++++++++++++++++++ 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/lldb/source/Breakpoint/BreakpointResolverAddress.cpp b/lldb/source/Breakpoint/BreakpointResolverAddress.cpp index a0c628a8e299c..ee4cbd50f9eee 100644 --- a/lldb/source/Breakpoint/BreakpointResolverAddress.cpp +++ b/lldb/source/Breakpoint/BreakpointResolverAddress.cpp @@ -65,13 +65,11 @@ BreakpointResolverAddress::SerializeToStructuredData() { new StructuredData::Dictionary()); SectionSP section_sp = m_addr.GetSection(); if (section_sp) { - ModuleSP module_sp = section_sp->GetModule(); - ConstString module_name; - if (module_sp) - module_name.SetCString(module_name.GetCString()); - - options_dict_sp->AddStringItem(GetKey(OptionNames::ModuleName), - module_name.GetCString()); + if (ModuleSP module_sp = section_sp->GetModule()) { + const FileSpec &module_fspec = module_sp->GetFileSpec(); + options_dict_sp->AddStringItem(GetKey(OptionNames::ModuleName), + module_fspec.GetPath().c_str()); + } options_dict_sp->AddIntegerItem(GetKey(OptionNames::AddressOffset), m_addr.GetOffset()); } else { diff --git a/lldb/test/API/functionalities/breakpoint/serialize/TestBreakpointSerialization.py b/lldb/test/API/functionalities/breakpoint/serialize/TestBreakpointSerialization.py index be9b4e587b296..b6cc3d9989a69 100644 --- a/lldb/test/API/functionalities/breakpoint/serialize/TestBreakpointSerialization.py +++ b/lldb/test/API/functionalities/breakpoint/serialize/TestBreakpointSerialization.py @@ -49,6 +49,42 @@ def test_scripted_extra_args(self): self.setup_targets_and_cleanup() self.do_check_extra_args() + def test_resolver_serialization(self): + """Test that breakpoint resolvers contain the expected information""" + self.build() + self.setup_targets_and_cleanup() + + exe_path = self.getBuildArtifact("a.out") + exe_module = self.orig_target.module[exe_path] + self.assertTrue( + exe_module.IsValid(), "Failed to find the executable module in target" + ) + sym_ctx_list = exe_module.FindFunctions("main") + self.assertTrue(sym_ctx_list.GetSize() == 1, "Unable to find function 'main'") + sym_ctx = sym_ctx_list.GetContextAtIndex(0) + self.assertTrue( + sym_ctx.IsValid(), "SBSymbolContext representing function 'main' is invalid" + ) + main_func = sym_ctx.GetFunction() + self.assertTrue( + main_func.IsValid(), "SBFunction 
representing 'main' is invalid" + ) + main_addr = main_func.GetStartAddress() + + bkpt = self.orig_target.BreakpointCreateBySBAddress(main_addr) + self.assertTrue( + bkpt.IsValid(), "Could not place breakpoint on 'main' by address" + ) + stream = lldb.SBStream() + sd = bkpt.SerializeToStructuredData() + sd.GetAsJSON(stream) + serialized_data = json.loads(stream.GetData()) + + self.assertIn( + exe_path, + serialized_data["Breakpoint"]["BKPTResolver"]["Options"]["ModuleName"], + ) + def test_structured_data_serialization(self): target = self.dbg.GetDummyTarget() self.assertTrue(target.IsValid(), VALID_TARGET) From 49b492048af2b2093aaed899c0bbd6d740aad83c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= Date: Thu, 4 Jan 2024 00:10:15 +0100 Subject: [PATCH 176/313] AMDGPU: Fix packed 16-bit inline constants (#76522) Consistently treat packed 16-bit operands as 32-bit values, because that's really what they are. The attempt to treat them differently was ultimately incorrect and lead to miscompiles, e.g. when using non-splat constants such as (1, 0) as operands. Recognize 32-bit float constants for i/u16 instructions. This is a bit odd conceptually, but it matches HW behavior and SP3. Remove isFoldableLiteralV216; there was too much magic in the dependency between it and its use in SIFoldOperands. Instead, we now simply rely on checking whether a constant is an inline constant, and trying a bunch of permutations of the low and high halves. This is more obviously correct and leads to some new cases where inline constants are used as shown by tests. Move the logic for switching packed add vs. sub into SIFoldOperands. This has two benefits: all logic that optimizes for inline constants in packed math is now in one place; and it applies to both SelectionDAG and GISel paths. Disable the use of opsel with v_dot* instructions on gfx11. They are documented to ignore opsel on src0 and src1. It may be interesting to re-enable to use of opsel on src2 as a future optimization. A similar "proper" fix of what inline constants mean could potentially be applied to unpacked 16-bit ops. However, it's less clear what the benefit would be, and there are surely places where we'd have to carefully audit whether values are properly sign- or zero-extended. It is best to keep such a change separate. 
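To make the folding strategy concrete, here is a rough standalone sketch of
the "try permutations of the low and high halves" idea. The names
(isInlinableV2I16, Fold, OpSel, tryFoldPacked) are made up for illustration,
and the inline-constant test is reduced to the sign-extended integer range;
the real SIFoldOperands/AMDGPUBaseInfo code additionally tries sign-extending
the low half, the Lo << 16 form, the float encodings, and the add<->sub
rewrite:

    // Simplified sketch, not the in-tree implementation.
    #include <cstdint>
    #include <optional>

    // Packed v2i16 inline constants include the sign-extended 32-bit
    // integers -16..64 (the real rules also accept certain float patterns).
    static bool isInlinableV2I16(uint32_t Imm) {
      int32_t S = static_cast<int32_t>(Imm);
      return S >= -16 && S <= 64;
    }

    enum class OpSel { BothHalves, BroadcastLo, Swapped };

    struct Fold {
      uint32_t Imm; // value encoded as the inline operand
      OpSel Sel;    // op_sel/op_sel_hi pattern needed to recover the original
    };

    static std::optional<Fold> tryFoldPacked(uint32_t Imm) {
      if (isInlinableV2I16(Imm))
        return Fold{Imm, OpSel::BothHalves};  // literal is usable as-is

      uint16_t Lo = static_cast<uint16_t>(Imm);
      uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
      if (Lo == Hi && isInlinableV2I16(Lo))
        return Fold{Lo, OpSel::BroadcastLo};  // splat of an inline value

      uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;
      if (isInlinableV2I16(Swapped))
        return Fold{Swapped, OpSel::Swapped}; // inline after swapping halves

      return std::nullopt;                    // has to stay a 32-bit literal
    }

The important property is that op_sel is only used as a fallback for
rearranging halves; when the 32-bit value is already inlinable it is used
directly, which avoids unintuitive op_sel patterns in the generated code.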
Fixes: Corruption in FSR 2.0 (latent bug exposed by an LLPC change) --- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 20 +- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 14 +- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 21 +- .../Disassembler/AMDGPUDisassembler.cpp | 6 + llvm/lib/Target/AMDGPU/GCNSubtarget.h | 2 +- .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 125 +- .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 6 +- .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 19 +- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 148 ++- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 12 +- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 17 - llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 4 +- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 106 +- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 11 +- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 9 - .../CodeGen/AMDGPU/GlobalISel/add.v2i16.ll | 67 +- .../CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll | 4 +- llvm/test/CodeGen/AMDGPU/add.v2i16.ll | 73 +- .../CodeGen/AMDGPU/calling-conventions.ll | 4 +- llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll | 40 +- llvm/test/CodeGen/AMDGPU/fma.f16.ll | 8 +- llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll | 145 ++- llvm/test/CodeGen/AMDGPU/fptosi.f16.ll | 4 +- llvm/test/CodeGen/AMDGPU/immv216.ll | 24 +- .../CodeGen/AMDGPU/integer-mad-patterns.ll | 813 +++++------- .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 29 +- .../test/CodeGen/AMDGPU/pk_max_f16_literal.ll | 4 +- .../AMDGPU/reassoc-mul-add-1-to-mad.ll | 16 +- .../CodeGen/AMDGPU/shrink-add-sub-constant.ll | 1093 ++++++----------- llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll | 16 +- llvm/test/CodeGen/AMDGPU/sub.v2i16.ll | 13 +- llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll | 46 +- llvm/test/MC/AMDGPU/gfx11_asm_vop3p.s | 59 +- llvm/test/MC/AMDGPU/gfx12_asm_vop3p.s | 57 +- llvm/test/MC/AMDGPU/literalv216.s | 20 +- .../AMDGPU/gfx10_vop3p_literalv216.txt | 2 +- .../Disassembler/AMDGPU/gfx11_dasm_vop3p.txt | 56 +- .../Disassembler/AMDGPU/gfx12_dasm_vop3p.txt | 56 +- .../MC/Disassembler/AMDGPU/gfx9_vop3p.txt | 120 +- 39 files changed, 1541 insertions(+), 1748 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index bffea82ab8f49..48ee0d942867e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -317,26 +317,16 @@ void AMDGPUDAGToDAGISel::PreprocessISelDAG() { } } -bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N, - bool Negated) const { +bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const { if (N->isUndef()) return true; const SIInstrInfo *TII = Subtarget->getInstrInfo(); - if (Negated) { - if (const ConstantSDNode *C = dyn_cast(N)) - return TII->isInlineConstant(-C->getAPIntValue()); + if (const ConstantSDNode *C = dyn_cast(N)) + return TII->isInlineConstant(C->getAPIntValue()); - if (const ConstantFPSDNode *C = dyn_cast(N)) - return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt()); - - } else { - if (const ConstantSDNode *C = dyn_cast(N)) - return TII->isInlineConstant(C->getAPIntValue()); - - if (const ConstantFPSDNode *C = dyn_cast(N)) - return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt()); - } + if (const ConstantFPSDNode *C = dyn_cast(N)) + return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt()); return false; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 374108af08cd5..df4a211d42a09 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ 
b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -50,15 +50,13 @@ static inline bool getConstantValue(SDValue N, uint32_t &Out) { } // TODO: Handle undef as zero -static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG, - bool Negate = false) { +static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2); uint32_t LHSVal, RHSVal; if (getConstantValue(N->getOperand(0), LHSVal) && getConstantValue(N->getOperand(1), RHSVal)) { SDLoc SL(N); - uint32_t K = Negate ? (-LHSVal & 0xffff) | (-RHSVal << 16) - : (LHSVal & 0xffff) | (RHSVal << 16); + uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16); return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0), DAG.getTargetConstant(K, SL, MVT::i32)); } @@ -66,9 +64,6 @@ static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG, return nullptr; } -static inline SDNode *packNegConstantV2I16(const SDNode *N, SelectionDAG &DAG) { - return packConstantV2I16(N, DAG, true); -} } // namespace /// AMDGPU specific code to select AMDGPU machine instructions for @@ -110,10 +105,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { private: std::pair foldFrameIndex(SDValue N) const; - bool isInlineImmediate(const SDNode *N, bool Negated = false) const; - bool isNegInlineImmediate(const SDNode *N) const { - return isInlineImmediate(N, true); - } + bool isInlineImmediate(const SDNode *N) const; bool isInlineImmediate16(int64_t Imm) const { return AMDGPU::isInlinableLiteral16(Imm, Subtarget->hasInv2PiInlineImm()); diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 5f2b7c0b4b4ef..b7f0438601159 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1865,6 +1865,9 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) { case AMDGPU::OPERAND_REG_IMM_V2FP32: case AMDGPU::OPERAND_REG_INLINE_C_V2INT32: case AMDGPU::OPERAND_REG_IMM_V2INT32: + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: case AMDGPU::OPERAND_KIMM32: case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: return &APFloat::IEEEsingle(); @@ -1879,13 +1882,10 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) { case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED: case AMDGPU::OPERAND_REG_INLINE_C_INT16: case AMDGPU::OPERAND_REG_INLINE_C_FP16: - case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_INLINE_AC_INT16: case AMDGPU::OPERAND_REG_INLINE_AC_FP16: - case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: - case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_KIMM16: return &APFloat::IEEEhalf(); @@ -2033,9 +2033,14 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const { // We allow fp literals with f16x2 operands assuming that the specified // literal goes into the lower half and the upper half is zero. We also // require that the literal may be losslessly converted to f16. - MVT ExpectedType = (type == MVT::v2f16)? MVT::f16 : - (type == MVT::v2i16)? MVT::i16 : - (type == MVT::v2f32)? MVT::f32 : type; + // + // For i16x2 operands, we assume that the specified literal is encoded as a + // single-precision float. This is pretty odd, but it matches SP3 and what + // happens in hardware. 
+ MVT ExpectedType = (type == MVT::v2f16) ? MVT::f16 + : (type == MVT::v2i16) ? MVT::f32 + : (type == MVT::v2f32) ? MVT::f32 + : type; APFloat FPLiteral(APFloat::IEEEdouble(), APInt(64, Imm.Val)); return canLosslesslyConvertToFPType(FPLiteral, ExpectedType); @@ -3401,12 +3406,12 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst, if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 || OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16 || OperandType == AMDGPU::OPERAND_REG_IMM_V2INT16) - return AMDGPU::isInlinableIntLiteralV216(Val); + return AMDGPU::isInlinableLiteralV2I16(Val); if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2FP16 || OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2FP16 || OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16) - return AMDGPU::isInlinableLiteralV216(Val, hasInv2PiInlineImm()); + return AMDGPU::isInlinableLiteralV2F16(Val); return AMDGPU::isInlinableLiteral16(Val, hasInv2PiInlineImm()); } diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 67be7b0fd6421..9dff3f6c2efd0 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -182,6 +182,9 @@ static DecodeStatus decodeSplitBarrier(MCInst &Inst, unsigned Val, DECODE_SrcOp(decodeOperand_##RegClass##_Imm##ImmWidth, 9, OpWidth, Imm, \ false, ImmWidth) +#define DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(Name, OpWidth, ImmWidth) \ + DECODE_SrcOp(decodeOperand_##Name, 9, OpWidth, Imm, false, ImmWidth) + // Decoder for Src(9-bit encoding) AGPR or immediate. Set Imm{9} to 1 (set acc) // and decode using 'enum10' from decodeSrcOp. #define DECODE_OPERAND_SRC_REG_OR_IMM_A9(RegClass, OpWidth, ImmWidth) \ @@ -262,6 +265,9 @@ DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 64) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_512, OPW512, 32) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_1024, OPW1024, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(VS_32_ImmV2I16, OPW32, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(VS_32_ImmV2F16, OPW32, 16) + DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_64, OPW64, 64) DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_128, OPW128, 32) DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_256, OPW256, 64) diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 91a7093032695..b85eb76aca266 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1096,7 +1096,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasDstSelForwardingHazard() const { return GFX940Insts; } // Cannot use op_sel with v_dot instructions. - bool hasDOTOpSelHazard() const { return GFX940Insts; } + bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; } // Does not have HW interlocs for VALU writing and then reading SGPRs. 
bool hasVDecCoExecHazard() const { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index ef1b85f58d0ee..6c7977e22599c 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -460,56 +460,84 @@ void AMDGPUInstPrinter::printImmediateInt16(uint32_t Imm, } } -void AMDGPUInstPrinter::printImmediate16(uint32_t Imm, - const MCSubtargetInfo &STI, - raw_ostream &O) { - int16_t SImm = static_cast(Imm); - if (isInlinableIntLiteral(SImm)) { - O << SImm; - return; - } - +// This must accept a 32-bit immediate value to correctly handle packed 16-bit +// operations. +static bool printImmediateFloat16(uint32_t Imm, const MCSubtargetInfo &STI, + raw_ostream &O) { if (Imm == 0x3C00) - O<< "1.0"; + O << "1.0"; else if (Imm == 0xBC00) - O<< "-1.0"; + O << "-1.0"; else if (Imm == 0x3800) - O<< "0.5"; + O << "0.5"; else if (Imm == 0xB800) - O<< "-0.5"; + O << "-0.5"; else if (Imm == 0x4000) - O<< "2.0"; + O << "2.0"; else if (Imm == 0xC000) - O<< "-2.0"; + O << "-2.0"; else if (Imm == 0x4400) - O<< "4.0"; + O << "4.0"; else if (Imm == 0xC400) - O<< "-4.0"; - else if (Imm == 0x3118 && - STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) { + O << "-4.0"; + else if (Imm == 0x3118 && STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) O << "0.15915494"; - } else { - uint64_t Imm16 = static_cast(Imm); - O << formatHex(Imm16); - } -} + else + return false; -void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, - const MCSubtargetInfo &STI, - raw_ostream &O) { - uint16_t Lo16 = static_cast(Imm); - printImmediate16(Lo16, STI, O); + return true; } -void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, +void AMDGPUInstPrinter::printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O) { + int16_t SImm = static_cast(Imm); + if (isInlinableIntLiteral(SImm)) { + O << SImm; + return; + } + + uint16_t HImm = static_cast(Imm); + if (printImmediateFloat16(HImm, STI, O)) + return; + + uint64_t Imm16 = static_cast(Imm); + O << formatHex(Imm16); +} + +void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, uint8_t OpType, + const MCSubtargetInfo &STI, + raw_ostream &O) { int32_t SImm = static_cast(Imm); - if (SImm >= -16 && SImm <= 64) { + if (isInlinableIntLiteral(SImm)) { O << SImm; return; } + switch (OpType) { + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: + if (printImmediateFloat32(Imm, STI, O)) + return; + break; + case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: + if (isUInt<16>(Imm) && + printImmediateFloat16(static_cast(Imm), STI, O)) + return; + break; + default: + llvm_unreachable("bad operand type"); + } + + O << formatHex(static_cast(Imm)); +} + +bool AMDGPUInstPrinter::printImmediateFloat32(uint32_t Imm, + const MCSubtargetInfo &STI, + raw_ostream &O) { if (Imm == llvm::bit_cast(0.0f)) O << "0.0"; else if (Imm == llvm::bit_cast(1.0f)) @@ -532,7 +560,24 @@ void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) O << "0.15915494"; else - O << formatHex(static_cast(Imm)); + return false; + + return true; +} + +void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, + const MCSubtargetInfo &STI, + raw_ostream &O) { + int32_t SImm = static_cast(Imm); + if (isInlinableIntLiteral(SImm)) { + O << SImm; + return; + } + + if 
(printImmediateFloat32(Imm, STI, O)) + return; + + O << formatHex(static_cast(Imm)); } void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, @@ -755,25 +800,11 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, break; case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2FP16: - if (!isUInt<16>(Op.getImm()) && - STI.hasFeature(AMDGPU::FeatureVOP3Literal)) { - printImmediate32(Op.getImm(), STI, O); - break; - } - - // Deal with 16-bit FP inline immediates not working. - if (OpTy == AMDGPU::OPERAND_REG_IMM_V2FP16) { - printImmediate16(static_cast(Op.getImm()), STI, O); - break; - } - [[fallthrough]]; case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: - printImmediateInt16(static_cast(Op.getImm()), STI, O); - break; case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: - printImmediateV216(Op.getImm(), STI, O); + printImmediateV216(Op.getImm(), OpTy, STI, O); break; case MCOI::OPERAND_UNKNOWN: case MCOI::OPERAND_PCREL: diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index f2f985fa5b1a8..e3958f88277da 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -88,8 +88,10 @@ class AMDGPUInstPrinter : public MCInstPrinter { raw_ostream &O); void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O); - void printImmediateV216(uint32_t Imm, const MCSubtargetInfo &STI, - raw_ostream &O); + void printImmediateV216(uint32_t Imm, uint8_t OpType, + const MCSubtargetInfo &STI, raw_ostream &O); + bool printImmediateFloat32(uint32_t Imm, const MCSubtargetInfo &STI, + raw_ostream &O); void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O); void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index b403d69d9ff13..de1abaf29c56b 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -284,22 +284,15 @@ AMDGPUMCCodeEmitter::getLitEncoding(const MCOperand &MO, // which does not have f16 support? 
return getLit16Encoding(static_cast(Imm), STI); case AMDGPU::OPERAND_REG_IMM_V2INT16: - case AMDGPU::OPERAND_REG_IMM_V2FP16: { - if (!isUInt<16>(Imm) && STI.hasFeature(AMDGPU::FeatureVOP3Literal)) - return getLit32Encoding(static_cast(Imm), STI); - if (OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16) - return getLit16Encoding(static_cast(Imm), STI); - [[fallthrough]]; - } case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: - return getLit16IntEncoding(static_cast(Imm), STI); + return AMDGPU::getInlineEncodingV2I16(static_cast(Imm)) + .value_or(255); + case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: - case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { - uint16_t Lo16 = static_cast(Imm); - uint32_t Encoding = getLit16Encoding(Lo16, STI); - return Encoding; - } + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: + return AMDGPU::getInlineEncodingV2F16(static_cast(Imm)) + .value_or(255); case AMDGPU::OPERAND_KIMM32: case AMDGPU::OPERAND_KIMM16: return MO.getImm(); diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 709de612d81d4..aa7639a0f1866 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -208,9 +208,7 @@ bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const { assert(Old.isReg() && Fold.isImm()); if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) || - (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)) || - isUInt<16>(Fold.ImmToFold) || - !AMDGPU::isFoldableLiteralV216(Fold.ImmToFold, ST->hasInv2PiInlineImm())) + (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT))) return false; unsigned Opcode = MI->getOpcode(); @@ -234,42 +232,123 @@ bool SIFoldOperands::tryFoldImmWithOpSel(FoldCandidate &Fold) const { MachineOperand &Old = MI->getOperand(Fold.UseOpNo); unsigned Opcode = MI->getOpcode(); int OpNo = MI->getOperandNo(&Old); + uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType; + + // If the literal can be inlined as-is, apply it and short-circuit the + // tests below. The main motivation for this is to avoid unintuitive + // uses of opsel. + if (AMDGPU::isInlinableLiteralV216(Fold.ImmToFold, OpType)) { + Old.ChangeToImmediate(Fold.ImmToFold); + return true; + } - // Set op_sel/op_sel_hi on this operand or bail out if op_sel is - // already set. + // Refer to op_sel/op_sel_hi and check if we can change the immediate and + // op_sel in a way that allows an inline constant. 
int ModIdx = -1; - if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) + unsigned SrcIdx = ~0; + if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) { ModIdx = AMDGPU::OpName::src0_modifiers; - else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) + SrcIdx = 0; + } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) { ModIdx = AMDGPU::OpName::src1_modifiers; - else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) + SrcIdx = 1; + } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) { ModIdx = AMDGPU::OpName::src2_modifiers; + SrcIdx = 2; + } assert(ModIdx != -1); ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx); MachineOperand &Mod = MI->getOperand(ModIdx); - unsigned Val = Mod.getImm(); - if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1)) + unsigned ModVal = Mod.getImm(); + + uint16_t ImmLo = static_cast( + Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0)); + uint16_t ImmHi = static_cast( + Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0)); + uint32_t Imm = (static_cast(ImmHi) << 16) | ImmLo; + unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1); + + // Helper function that attempts to inline the given value with a newly + // chosen opsel pattern. + auto tryFoldToInline = [&](uint32_t Imm) -> bool { + if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) { + Mod.setImm(NewModVal | SISrcMods::OP_SEL_1); + Old.ChangeToImmediate(Imm); + return true; + } + + // Try to shuffle the halves around and leverage opsel to get an inline + // constant. + uint16_t Lo = static_cast(Imm); + uint16_t Hi = static_cast(Imm >> 16); + if (Lo == Hi) { + if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) { + Mod.setImm(NewModVal); + Old.ChangeToImmediate(Lo); + return true; + } + + if (static_cast(Lo) < 0) { + int32_t SExt = static_cast(Lo); + if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) { + Mod.setImm(NewModVal); + Old.ChangeToImmediate(SExt); + return true; + } + } + + // This check is only useful for integer instructions + if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16 || + OpType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16) { + if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) { + Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1); + Old.ChangeToImmediate(static_cast(Lo) << 16); + return true; + } + } + } else { + uint32_t Swapped = (static_cast(Lo) << 16) | Hi; + if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) { + Mod.setImm(NewModVal | SISrcMods::OP_SEL_0); + Old.ChangeToImmediate(Swapped); + return true; + } + } + return false; + }; - // Only apply the following transformation if that operand requires - // a packed immediate. - // If upper part is all zero we do not need op_sel_hi. - if (!(Fold.ImmToFold & 0xffff)) { - MachineOperand New = - MachineOperand::CreateImm((Fold.ImmToFold >> 16) & 0xffff); - if (!TII->isOperandLegal(*MI, OpNo, &New)) - return false; - Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0); - Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); - Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff); + if (tryFoldToInline(Imm)) return true; + + // Replace integer addition by subtraction and vice versa if it allows + // folding the immediate to an inline constant. + // + // We should only ever get here for SrcIdx == 1 due to canonicalization + // earlier in the pipeline, but we double-check here to be safe / fully + // general. 
+ bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16; + bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16; + if (SrcIdx == 1 && (IsUAdd || IsUSub)) { + unsigned ClampIdx = + AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp); + bool Clamp = MI->getOperand(ClampIdx).getImm() != 0; + + if (!Clamp) { + uint16_t NegLo = -static_cast(Imm); + uint16_t NegHi = -static_cast(Imm >> 16); + uint32_t NegImm = (static_cast(NegHi) << 16) | NegLo; + + if (tryFoldToInline(NegImm)) { + unsigned NegOpcode = + IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16; + MI->setDesc(TII->get(NegOpcode)); + return true; + } + } } - MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold & 0xffff); - if (!TII->isOperandLegal(*MI, OpNo, &New)) - return false; - Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); - Old.ChangeToImmediate(Fold.ImmToFold & 0xffff); - return true; + + return false; } bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const { @@ -277,8 +356,19 @@ bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const { MachineOperand &Old = MI->getOperand(Fold.UseOpNo); assert(Old.isReg()); - if (Fold.isImm() && canUseImmWithOpSel(Fold)) - return tryFoldImmWithOpSel(Fold); + if (Fold.isImm() && canUseImmWithOpSel(Fold)) { + if (tryFoldImmWithOpSel(Fold)) + return true; + + // We can't represent the candidate as an inline constant. Try as a literal + // with the original opsel, checking constant bus limitations. + MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold); + int OpNo = MI->getOperandNo(&Old); + if (!TII->isOperandLegal(*MI, OpNo, &New)) + return false; + Old.ChangeToImmediate(Fold.ImmToFold); + return true; + } if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) { MachineBasicBlock *MBB = MI->getParent(); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 396d22c7ec18d..67992929ab356 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4153,15 +4153,15 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: - return (isInt<16>(Imm) || isUInt<16>(Imm)) && - AMDGPU::isInlinableIntLiteral((int16_t)Imm); + return AMDGPU::isInlinableLiteralV2I16(Imm); + case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: + return AMDGPU::isInlinableLiteralV2F16(Imm); case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED: case AMDGPU::OPERAND_REG_INLINE_C_FP16: - case AMDGPU::OPERAND_REG_INLINE_AC_FP16: - case AMDGPU::OPERAND_REG_IMM_V2FP16: - case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: - case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { + case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { if (isInt<16>(Imm) || isUInt<16>(Imm)) { // A few special case instructions have 16-bit operands on subtargets // where 16-bit instructions are not legal. 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 50724fd3c1cf1..f07b8fa0ea4cd 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -860,23 +860,6 @@ def ShiftAmt32Imm : ImmLeaf ; -def getNegV2I16Imm : SDNodeXForm; - -def NegSubInlineConstV216 : PatLeaf<(build_vector), [{ - assert(N->getNumOperands() == 2); - assert(N->getOperand(0).getValueType().getSizeInBits() == 16); - SDValue Src0 = N->getOperand(0); - SDValue Src1 = N->getOperand(1); - if (Src0 == Src1) - return isNegInlineImmediate(Src0.getNode()); - - return (isNullConstantOrUndef(Src0) && isNegInlineImmediate(Src1.getNode())) || - (isNullConstantOrUndef(Src1) && isNegInlineImmediate(Src0.getNode())); -}], getNegV2I16Imm>; - - def fp16_zeros_high_16bits : PatLeaf<(f16 VGPR_32:$src), [{ return fp16SrcZerosHighBits(N->getOpcode()); }]>;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index c94b894c5841f..1d197dc08ac2a 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1152,11 +1152,11 @@ class RegOrF32 class RegOrV2B16 : RegOrImmOperand ; + !subst("_v2b16", "V2B16", NAME), "_ImmV2I16">; class RegOrV2F16 : RegOrImmOperand ; + !subst("_v2f16", "V2F16", NAME), "_ImmV2F16">; class RegOrF64 : RegOrImmOperand (Literal) || isUInt<16>(Literal)) { - int16_t Trunc = static_cast<int16_t>(Literal); - return AMDGPU::isInlinableLiteral16(Trunc, HasInv2Pi); +std::optional<unsigned> getInlineEncodingV216(bool IsFloat, uint32_t Literal) { + // Unfortunately, the Instruction Set Architecture Reference Guide is + // misleading about how the inline operands work for (packed) 16-bit + // instructions. In a nutshell, the actual HW behavior is: + // + // - integer encodings (-16 .. 64) are always produced as sign-extended + // 32-bit values + // - float encodings are produced as: + // - for F16 instructions: corresponding half-precision float values in + // the LSBs, 0 in the MSBs + // - for UI16 instructions: corresponding single-precision float value + int32_t Signed = static_cast<int32_t>(Literal); + if (Signed >= 0 && Signed <= 64) + return 128 + Signed; + + if (Signed >= -16 && Signed <= -1) + return 192 + std::abs(Signed); + + if (IsFloat) { + // clang-format off + switch (Literal) { + case 0x3800: return 240; // 0.5 + case 0xB800: return 241; // -0.5 + case 0x3C00: return 242; // 1.0 + case 0xBC00: return 243; // -1.0 + case 0x4000: return 244; // 2.0 + case 0xC000: return 245; // -2.0 + case 0x4400: return 246; // 4.0 + case 0xC400: return 247; // -4.0 + case 0x3118: return 248; // 1.0 / (2.0 * pi) + default: break; + } + // clang-format on + } else { + // clang-format off + switch (Literal) { + case 0x3F000000: return 240; // 0.5 + case 0xBF000000: return 241; // -0.5 + case 0x3F800000: return 242; // 1.0 + case 0xBF800000: return 243; // -1.0 + case 0x40000000: return 244; // 2.0 + case 0xC0000000: return 245; // -2.0 + case 0x40800000: return 246; // 4.0 + case 0xC0800000: return 247; // -4.0 + case 0x3E22F983: return 248; // 1.0 / (2.0 * pi) + default: break; + } + // clang-format on } - if (!(Literal & 0xffff)) - return AMDGPU::isInlinableLiteral16(Literal >> 16, HasInv2Pi); - int16_t Lo16 = static_cast<int16_t>(Literal); - int16_t Hi16 = static_cast<int16_t>(Literal >> 16); - return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi); + return {}; } -bool isInlinableIntLiteralV216(int32_t Literal) { - int16_t Lo16 = static_cast<int16_t>(Literal); - if (isInt<16>(Literal) || isUInt<16>(Literal)) - return isInlinableIntLiteral(Lo16); +// Encoding of the literal as an inline constant for a V_PK_*_IU16 instruction +// or nullopt. +std::optional<unsigned> getInlineEncodingV2I16(uint32_t Literal) { + return getInlineEncodingV216(false, Literal); +} - int16_t Hi16 = static_cast<int16_t>(Literal >> 16); - if (!(Literal & 0xffff)) - return isInlinableIntLiteral(Hi16); - return Lo16 == Hi16 && isInlinableIntLiteral(Lo16); +// Encoding of the literal as an inline constant for a V_PK_*_F16 instruction +// or nullopt. +std::optional<unsigned> getInlineEncodingV2F16(uint32_t Literal) { + return getInlineEncodingV216(true, Literal); } -bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi, uint8_t OpType) { +// Whether the given literal can be inlined for a V_PK_* instruction. +bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType) { switch (OpType) { + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: + return getInlineEncodingV216(false, Literal).has_value(); case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: - return isInlinableLiteralV216(Literal, HasInv2Pi); + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: + return getInlineEncodingV216(true, Literal).has_value(); default: - return isInlinableIntLiteralV216(Literal); + llvm_unreachable("bad packed operand type"); } } -bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi) { - assert(HasInv2Pi); - - int16_t Lo16 = static_cast<int16_t>(Literal); - if (isInt<16>(Literal) || isUInt<16>(Literal)) - return true; +// Whether the given literal can be inlined for a V_PK_*_IU16 instruction.
+bool isInlinableLiteralV2I16(uint32_t Literal) { + return getInlineEncodingV2I16(Literal).has_value(); +} - int16_t Hi16 = static_cast<int16_t>(Literal >> 16); - if (!(Literal & 0xffff)) - return true; - return Lo16 == Hi16; +// Whether the given literal can be inlined for a V_PK_*_F16 instruction. +bool isInlinableLiteralV2F16(uint32_t Literal) { + return getInlineEncodingV2F16(Literal).has_value(); } bool isValid32BitLiteral(uint64_t Val, bool IsFP64) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 3c9f330cbcded..50c741760d714 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1291,16 +1291,19 @@ LLVM_READNONE bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi); LLVM_READNONE -bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi); +std::optional<unsigned> getInlineEncodingV2I16(uint32_t Literal); LLVM_READNONE -bool isInlinableIntLiteralV216(int32_t Literal); +std::optional<unsigned> getInlineEncodingV2F16(uint32_t Literal); LLVM_READNONE -bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi, uint8_t OpType); +bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType); LLVM_READNONE -bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi); +bool isInlinableLiteralV2I16(uint32_t Literal); + +LLVM_READNONE +bool isInlinableLiteralV2F16(uint32_t Literal); LLVM_READNONE bool isValid32BitLiteral(uint64_t Val, bool IsFP64);
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 7f52501b5d903..985b77be1d881 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -125,15 +125,6 @@ defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3P_Profile add x, -c canonicalization since c is more likely -// an inline immediate than -c. -// The constant will be emitted as a mov, and folded later. -// TODO: We could directly encode the immediate now -def : GCNPat< - (add (v2i16 (VOP3PMods v2i16:$src0, i32:$src0_modifiers)), NegSubInlineConstV216:$src1), - (V_PK_SUB_U16 $src0_modifiers, $src0, SRCMODS.OP_SEL_1, NegSubInlineConstV216:$src1) ->; - // Integer operations with clamp bit set.
class VOP3PSatPat : GCNPat< (pat (v2i16 (VOP3PMods v2i16:$src0, i32:$src0_modifiers)), diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll index e4cababfe1c91..496ee9f2dbb2b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll @@ -172,8 +172,7 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_splat(<2 x i16> %a) { ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_splat: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffc0ffc0 -; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 64 op_sel_hi:[1,0] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_splat: @@ -188,7 +187,7 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_splat(<2 x i16> %a) { ; GFX10-LABEL: v_add_v2i16_neg_inline_imm_splat: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_add_u16 v0, 0xffc0, v0 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_sub_u16 v0, v0, 64 op_sel_hi:[1,0] ; GFX10-NEXT: s_setpc_b64 s[30:31] %add = add <2 x i16> %a, ret <2 x i16> %add @@ -609,3 +608,65 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x ha %cast = bitcast <2 x i16> %add to i32 ret i32 %cast } + +define <2 x i16> @add_inline_imm_neg1_0(<2 x i16> %x) { +; GFX7-LABEL: add_inline_imm_neg1_0: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: add_inline_imm_neg1_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: add_inline_imm_neg1_0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX8-NEXT: v_add_u16_e32 v0, -1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: add_inline_imm_neg1_0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_sub_u16 v0, v0, 1 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %y = add <2 x i16> %x, + ret <2 x i16> %y +} + +define <2 x i16> @add_inline_imm_1_0(<2 x i16> %x) { +; GFX7-LABEL: add_inline_imm_1_0: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: add_inline_imm_1_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v0, v0, 1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: add_inline_imm_1_0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX8-NEXT: v_add_u16_e32 v0, 1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: add_inline_imm_1_0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_add_u16 v0, v0, 1 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %y = add <2 x i16> %x, + ret <2 x i16> %y +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll index aa7aa6b21a562..5613501736ff0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll @@ -156,13 
+156,13 @@ define <2 x i16> @v_sub_v2i16_neg_inline_imm_splat(<2 x i16> %a) { ; GFX10-LABEL: v_sub_v2i16_neg_inline_imm_splat: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0xffc0 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0xffc0ffc0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_sub_v2i16_neg_inline_imm_splat: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_sub_i16 v0, v0, 0xffc0 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_sub_i16 v0, v0, 0xffc0ffc0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %sub = sub <2 x i16> %a, ret <2 x i16> %sub diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll index b90d68a83b30a..7cf58a214245d 100644 --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -437,7 +437,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, -1 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -449,7 +449,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_add_u16 v0, v0, -1 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -460,7 +460,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_sub_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, -1 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -566,8 +566,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s2, 1.0 -; GFX9-NEXT: v_pk_add_u16 v0, v0, s2 +; GFX9-NEXT: v_pk_add_u16 v0, v0, 1.0 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -579,7 +578,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_add_u16 v0, 0x3f80, v0 op_sel:[1,0] op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_add_u16 v0, v0, 1.0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -590,7 +589,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_u16 v0, 0x3f80, v0 op_sel:[1,0] op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 1.0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -990,6 +989,66 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ret void } +define <2 x i16> @add_inline_imm_neg1_0(<2 x i16> %x) { +; VI-LABEL: add_inline_imm_neg1_0: +; VI: ; %bb.0: 
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; VI-NEXT: v_add_u16_e32 v0, -1, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: add_inline_imm_neg1_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: add_inline_imm_neg1_0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_sub_u16 v0, v0, 1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_inline_imm_neg1_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_sub_u16 v0, v0, 1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %y = add <2 x i16> %x, + ret <2 x i16> %y +} + +define <2 x i16> @add_inline_imm_1_0(<2 x i16> %x) { +; VI-LABEL: add_inline_imm_1_0: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; VI-NEXT: v_add_u16_e32 v0, 1, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: add_inline_imm_1_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v0, v0, 1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: add_inline_imm_1_0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_add_u16 v0, v0, 1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_inline_imm_1_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v0, v0, 1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %y = add <2 x i16> %x, + ret <2 x i16> %y +} + declare i32 @llvm.amdgcn.workitem.id.x() #0 attributes #0 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index cb89841b58f97..d63ebdeb50a1f 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -431,7 +431,7 @@ define amdgpu_ps void @ps_mesa_v2i16(<2 x i16> %arg0) { ; ; GFX11-LABEL: ps_mesa_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -468,7 +468,7 @@ define amdgpu_ps void @ps_mesa_inreg_v2i16(<2 x i16> inreg %arg0) { ; ; GFX11-LABEL: ps_mesa_inreg_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_pk_sub_u16 v0, s0, -1 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 1 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll index 329f0a2068cb0..dfc8361c60bbf 100644 --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -597,7 +597,7 @@ define <2 x i16> @chain_hi_to_lo_group_other_dep(ptr addrspace(3) %ptr) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_read_u16_d16_hi v1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] +; GCN-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0] ; GCN-NEXT: ds_read_u16_d16 v1, v0 offset:2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, v1 @@ -608,7 +608,7 @@ define <2 x i16> @chain_hi_to_lo_group_other_dep(ptr 
addrspace(3) %ptr) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_u16_d16_hi v1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0] ; GFX10-NEXT: ds_read_u16_d16 v1, v0 offset:2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v1 @@ -619,7 +619,7 @@ define <2 x i16> @chain_hi_to_lo_group_other_dep(ptr addrspace(3) %ptr) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_u16_d16_hi v1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0] ; GFX11-NEXT: ds_load_u16_d16 v1, v0 offset:2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, v1 @@ -643,7 +643,7 @@ define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(ptr addrspace(3) %p ; GFX900-NEXT: ds_read_u16_d16_hi v0, v0 ; GFX900-NEXT: s_mov_b32 s4, 0xffff ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; GFX900-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] ; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -654,7 +654,7 @@ define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(ptr addrspace(3) %p ; FLATSCR-NEXT: ds_read_u16_d16_hi v0, v0 ; FLATSCR-NEXT: s_mov_b32 s0, 0xffff ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; FLATSCR-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; FLATSCR-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] ; FLATSCR-NEXT: v_bfi_b32 v0, s0, v1, v0 ; FLATSCR-NEXT: s_setpc_b64 s[30:31] ; @@ -664,7 +664,7 @@ define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(ptr addrspace(3) %p ; GFX10-NEXT: ds_read_u16 v1, v0 offset:2 ; GFX10-NEXT: ds_read_u16_d16_hi v0, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -674,7 +674,7 @@ define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(ptr addrspace(3) %p ; GFX11-NEXT: ds_load_u16 v1, v0 offset:2 ; GFX11-NEXT: ds_load_u16_d16_hi v0, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -694,7 +694,7 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] +; GFX900-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0] ; GFX900-NEXT: buffer_load_short_d16 v1, v0, s[0:3], 0 offen offset:2 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v0, v1 @@ -705,7 +705,7 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) { ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FLATSCR-NEXT: scratch_load_short_d16_hi v1, v0, off ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] +; FLATSCR-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0] ; FLATSCR-NEXT: scratch_load_short_d16 v1, v0, off offset:2 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_mov_b32_e32 v0, v1 @@ -716,7 +716,7 @@ define <2 x i16> 
@chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) { ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) -; GFX10_DEFAULT-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] +; GFX10_DEFAULT-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0] ; GFX10_DEFAULT-NEXT: buffer_load_short_d16 v1, v0, s[0:3], 0 offen offset:2 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) ; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v0, v1 @@ -727,7 +727,7 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) { ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v1, v0, off ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) -; FLATSCR_GFX10-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] +; FLATSCR_GFX10-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0] ; FLATSCR_GFX10-NEXT: scratch_load_short_d16 v1, v0, off offset:2 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) ; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1 @@ -738,7 +738,7 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: scratch_load_d16_hi_b16 v1, v0, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0] ; GFX11-NEXT: scratch_load_d16_b16 v1, v0, off offset:2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, v1 @@ -762,7 +762,7 @@ define <2 x i16> @chain_hi_to_lo_global_other_dep(ptr addrspace(1) %ptr) { ; GFX900-NEXT: global_load_short_d16_hi v0, v[0:1], off glc ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_mov_b32 s4, 0xffff -; GFX900-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; GFX900-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] ; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -774,7 +774,7 @@ define <2 x i16> @chain_hi_to_lo_global_other_dep(ptr addrspace(1) %ptr) { ; FLATSCR-NEXT: global_load_short_d16_hi v0, v[0:1], off glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_mov_b32 s0, 0xffff -; FLATSCR-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; FLATSCR-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] ; FLATSCR-NEXT: v_bfi_b32 v0, s0, v2, v0 ; FLATSCR-NEXT: s_setpc_b64 s[30:31] ; @@ -785,7 +785,7 @@ define <2 x i16> @chain_hi_to_lo_global_other_dep(ptr addrspace(1) %ptr) { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_short_d16_hi v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -796,7 +796,7 @@ define <2 x i16> @chain_hi_to_lo_global_other_dep(ptr addrspace(1) %ptr) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_load_d16_hi_b16 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -820,7 +820,7 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_mov_b32 s4, 0xffff ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; GFX900-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] ; 
GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -833,7 +833,7 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) { ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_mov_b32 s0, 0xffff ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; FLATSCR-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; FLATSCR-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] ; FLATSCR-NEXT: v_bfi_b32 v0, s0, v2, v0 ; FLATSCR-NEXT: s_setpc_b64 s[30:31] ; @@ -846,7 +846,7 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: flat_load_short_d16_hi v0, v[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -857,7 +857,7 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll index 7894f6bc6797d..e12de1debacd6 100644 --- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll @@ -255,8 +255,8 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) { ; GFX10-GISEL: ; %bb.0: ; %bb ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x211e211e -; GFX10-GISEL-NEXT: v_pk_mul_f16 v2, 0x291e, v0 op_sel_hi:[0,1] -; GFX10-GISEL-NEXT: v_pk_fma_f16 v0, 0x291e, v0, v1 op_sel_hi:[0,1,1] +; GFX10-GISEL-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0 +; GFX10-GISEL-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2 ; GFX10-GISEL-NEXT: v_cmp_gt_f16_e64 s4, 0, v0 @@ -288,9 +288,9 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) { ; GFX11-GISEL: ; %bb.0: ; %bb ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x211e211e -; GFX11-GISEL-NEXT: v_pk_mul_f16 v2, 0x291e, v0 op_sel_hi:[0,1] +; GFX11-GISEL-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_fma_f16 v0, 0x291e, v0, v1 op_sel_hi:[0,1,1] +; GFX11-GISEL-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1 ; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX11-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) diff --git a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll index 0ff5ea6ee7b1f..3e658c6f38532 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll @@ -77,11 +77,29 @@ define <2 x half> @v_mul_42_v2f16(<2 x half> %x) { ; GFX9-GISEL-NEXT: v_pk_mul_f16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX1011-LABEL: v_mul_42_v2f16: -; GFX1011: ; %bb.0: -; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1011-NEXT: v_pk_mul_f16 v0, 0x5140, v0 op_sel_hi:[0,1] -; GFX1011-NEXT: s_setpc_b64 s[30:31] +; GFX10-SDAG-LABEL: v_mul_42_v2f16: +; 
GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_pk_mul_f16 v0, 0x5140, v0 op_sel_hi:[0,1] +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_mul_42_v2f16: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_pk_mul_f16 v0, 0x51405140, v0 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_mul_42_v2f16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_pk_mul_f16 v0, 0x5140, v0 op_sel_hi:[0,1] +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_mul_42_v2f16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_pk_mul_f16 v0, 0x51405140, v0 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul <2 x half> %x, ret <2 x half> %mul } @@ -3192,11 +3210,29 @@ define <2 x half> @v_mul_16_v2f16(<2 x half> %x) { ; GFX9-GISEL-NEXT: v_pk_mul_f16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX1011-LABEL: v_mul_16_v2f16: -; GFX1011: ; %bb.0: -; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1011-NEXT: v_pk_mul_f16 v0, 0x4c00, v0 op_sel_hi:[0,1] -; GFX1011-NEXT: s_setpc_b64 s[30:31] +; GFX10-SDAG-LABEL: v_mul_16_v2f16: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_pk_mul_f16 v0, 0x4c00, v0 op_sel_hi:[0,1] +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_mul_16_v2f16: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_pk_mul_f16 v0, 0x4c004c00, v0 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_mul_16_v2f16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_pk_mul_f16 v0, 0x4c00, v0 op_sel_hi:[0,1] +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_mul_16_v2f16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_pk_mul_f16 v0, 0x4c004c00, v0 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul <2 x half> %x, ret <2 x half> %mul } @@ -3216,11 +3252,29 @@ define <2 x half> @v_mul_neg16_v2f16(<2 x half> %x) { ; GFX9-GISEL-NEXT: v_pk_mul_f16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX1011-LABEL: v_mul_neg16_v2f16: -; GFX1011: ; %bb.0: -; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1011-NEXT: v_pk_mul_f16 v0, 0xcc00, v0 op_sel_hi:[0,1] -; GFX1011-NEXT: s_setpc_b64 s[30:31] +; GFX10-SDAG-LABEL: v_mul_neg16_v2f16: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_pk_mul_f16 v0, 0xcc00, v0 op_sel_hi:[0,1] +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_mul_neg16_v2f16: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_pk_mul_f16 v0, 0xcc00cc00, v0 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_mul_neg16_v2f16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_pk_mul_f16 v0, 0xcc00, v0 op_sel_hi:[0,1] +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_mul_neg16_v2f16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_pk_mul_f16 v0, 0xcc00cc00, v0 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul <2 x half> %x, ret <2 x 
half> %mul } @@ -3242,12 +3296,33 @@ define <2 x half> @v_mul_fabs_16_v2f16(<2 x half> %x) { ; GFX9-GISEL-NEXT: v_pk_mul_f16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX1011-LABEL: v_mul_fabs_16_v2f16: -; GFX1011: ; %bb.0: -; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1011-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX1011-NEXT: v_pk_mul_f16 v0, 0x4c00, v0 op_sel_hi:[0,1] -; GFX1011-NEXT: s_setpc_b64 s[30:31] +; GFX10-SDAG-LABEL: v_mul_fabs_16_v2f16: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX10-SDAG-NEXT: v_pk_mul_f16 v0, 0x4c00, v0 op_sel_hi:[0,1] +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_mul_fabs_16_v2f16: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX10-GISEL-NEXT: v_pk_mul_f16 v0, 0x4c004c00, v0 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_mul_fabs_16_v2f16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX11-SDAG-NEXT: v_pk_mul_f16 v0, 0x4c00, v0 op_sel_hi:[0,1] +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_mul_fabs_16_v2f16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX11-GISEL-NEXT: v_pk_mul_f16 v0, 0x4c004c00, v0 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %x.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x) %mul = fmul <2 x half> %x.fabs, ret <2 x half> %mul @@ -3268,11 +3343,29 @@ define <2 x half> @v_fma_mul_add_32_v2f16(<2 x half> %x, <2 x half> %y) { ; GFX9-GISEL-NEXT: v_pk_fma_f16 v0, v0, v2, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX1011-LABEL: v_fma_mul_add_32_v2f16: -; GFX1011: ; %bb.0: -; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1011-NEXT: v_pk_fma_f16 v0, 0x5000, v0, v1 op_sel_hi:[0,1,1] -; GFX1011-NEXT: s_setpc_b64 s[30:31] +; GFX10-SDAG-LABEL: v_fma_mul_add_32_v2f16: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_pk_fma_f16 v0, 0x5000, v0, v1 op_sel_hi:[0,1,1] +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_fma_mul_add_32_v2f16: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_pk_fma_f16 v0, 0x50005000, v0, v1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_fma_mul_add_32_v2f16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_pk_fma_f16 v0, 0x5000, v0, v1 op_sel_hi:[0,1,1] +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_fma_mul_add_32_v2f16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_pk_fma_f16 v0, 0x50005000, v0, v1 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul contract <2 x half> %x, %fma = fadd contract <2 x half> %mul, %y ret <2 x half> %fma diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll index 3afcc7d0d3283..afb3a0273819c 100644 --- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll @@ -480,7 +480,7 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) { ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) -; VI-NEXT: v_cmp_eq_f16_e64 s[4:5], 0xbc00, s4 +; VI-NEXT: v_cmp_eq_f16_e64 s[4:5], -1.0, s4 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -492,7 +492,7 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) { ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0xbc00, s2 +; GFX11-NEXT: v_cmp_eq_f16_e64 s2, -1.0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 diff --git a/llvm/test/CodeGen/AMDGPU/immv216.ll b/llvm/test/CodeGen/AMDGPU/immv216.ll index 8c33004a2316e..b66ca71a32749 100644 --- a/llvm/test/CodeGen/AMDGPU/immv216.ll +++ b/llvm/test/CodeGen/AMDGPU/immv216.ll @@ -580,7 +580,7 @@ define amdgpu_kernel void @add_inline_imm_64_v2f16(ptr addrspace(1) %out, <2 x h ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x38003800 ; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] -; GFX10: v_pk_mul_lo_u16 v0, 0x3800, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x38,0x00,0x00] +; GFX10: v_pk_mul_lo_u16 v0, 0x38003800, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x38,0x00,0x38] define <2 x i16> @mul_inline_imm_0.5_v2i16(<2 x i16> %x) { %y = mul <2 x i16> %x, bitcast (<2 x half> to <2 x i16>) ret <2 x i16> %y @@ -590,7 +590,7 @@ define <2 x i16> @mul_inline_imm_0.5_v2i16(<2 x i16> %x) { ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xb800b800 ; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] -; GFX10: v_pk_mul_lo_u16 v0, 0xb800, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xb8,0x00,0x00] +; GFX10: v_pk_mul_lo_u16 v0, 0xb800b800, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xb8,0x00,0xb8] define <2 x i16> @mul_inline_imm_neg_0.5_v2i16(<2 x i16> %x) { %y = mul <2 x i16> %x, bitcast (<2 x half> to <2 x i16>) ret <2 x i16> %y @@ -600,7 +600,7 @@ define <2 x i16> @mul_inline_imm_neg_0.5_v2i16(<2 x i16> %x) { ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x3c003c00 ; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] -; GFX10: v_pk_mul_lo_u16 v0, 0x3c00, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x3c,0x00,0x00] +; GFX10: v_pk_mul_lo_u16 v0, 0x3c003c00, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x3c,0x00,0x3c] define <2 x i16> @mul_inline_imm_1.0_v2i16(<2 x i16> %x) { %y = mul <2 x i16> %x, bitcast (<2 x half> to <2 x i16>) ret <2 x i16> %y @@ -610,27 +610,25 @@ define <2 x i16> @mul_inline_imm_1.0_v2i16(<2 x i16> %x) { ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xbc00bc00 ; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] -; GFX10: v_pk_mul_lo_u16 v0, 0xbc00, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xbc,0x00,0x00] +; GFX10: v_pk_mul_lo_u16 v0, 0xbc00bc00, v0 ; encoding: 
[0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xbc,0x00,0xbc] define <2 x i16> @mul_inline_imm_neg_1.0_v2i16(<2 x i16> %x) { %y = mul <2 x i16> %x, bitcast (<2 x half> to <2 x i16>) ret <2 x i16> %y } ; GCN-LABEL: {{^}}shl_inline_imm_2.0_v2i16: -; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x40004000 -; GFX9: v_pk_lshlrev_b16 v0, v0, [[K]] +; GFX9: v_pk_lshlrev_b16 v0, v0, 2.0 op_sel:[0,1] -; GFX10: v_pk_lshlrev_b16 v0, v0, 0x4000 op_sel_hi:[1,0] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x40,0x00,0x00] +; GFX10: v_pk_lshlrev_b16 v0, v0, 2.0 op_sel:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xe9,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}}] define <2 x i16> @shl_inline_imm_2.0_v2i16(<2 x i16> %x) { %y = shl <2 x i16> bitcast (<2 x half> to <2 x i16>), %x ret <2 x i16> %y } ; GCN-LABEL: {{^}}shl_inline_imm_neg_2.0_v2i16: -; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xc000c000 -; GFX9: v_pk_lshlrev_b16 v0, v0, [[K]] +; GFX9: v_pk_lshlrev_b16 v0, v0, -2.0 op_sel:[0,1] -; GFX10: v_pk_lshlrev_b16 v0, v0, 0xc000 op_sel_hi:[1,0] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xc0,0x00,0x00] +; GFX10: v_pk_lshlrev_b16 v0, v0, -2.0 op_sel:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xeb,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}}] define <2 x i16> @shl_inline_imm_neg_2.0_v2i16(<2 x i16> %x) { %y = shl <2 x i16> bitcast (<2 x half> to <2 x i16>), %x ret <2 x i16> %y @@ -640,7 +638,7 @@ define <2 x i16> @shl_inline_imm_neg_2.0_v2i16(<2 x i16> %x) { ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004400 ; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] -; GFX10: v_pk_mul_lo_u16 v0, 0x4400, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x44,0x00,0x00] +; GFX10: v_pk_mul_lo_u16 v0, 0x44004400, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x44,0x00,0x44] define <2 x i16> @mul_inline_imm_4.0_v2i16(<2 x i16> %x) { %y = mul <2 x i16> %x, bitcast (<2 x half> to <2 x i16>) ret <2 x i16> %y @@ -651,7 +649,7 @@ define <2 x i16> @mul_inline_imm_4.0_v2i16(<2 x i16> %x) { ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xc400c400 ; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] -; GFX10: v_pk_mul_lo_u16 v0, 0xc400, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xc4,0x00,0x00] +; GFX10: v_pk_mul_lo_u16 v0, 0xc400c400, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xc4,0x00,0xc4] define <2 x i16> @mul_inline_imm_neg_4.0_v2i16(<2 x i16> %x) { %y = mul <2 x i16> %x, bitcast (<2 x half> to <2 x i16>) ret <2 x i16> %y @@ -661,7 +659,7 @@ define <2 x i16> @mul_inline_imm_neg_4.0_v2i16(<2 x i16> %x) { ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x31183118 ; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] -; GFX10: v_pk_mul_lo_u16 v0, 0x3118, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x18,0x31,0x00,0x00] +; GFX10: v_pk_mul_lo_u16 v0, 0x31183118, v0 ; encoding: 
[0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x18,0x31,0x18,0x31] define <2 x i16> @mul_inline_imm_inv2pi_v2i16(<2 x i16> %x) { %y = mul <2 x i16> %x, bitcast (<2 x half> to <2 x i16>) ret <2 x i16> %y diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index e2a3749c7c471..8874240fae8dc 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -473,89 +473,47 @@ define <2 x i16> @clpeak_imad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-SDAG-LABEL: clpeak_imad_pat_v2i16: -; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: clpeak_imad_pat_v2i16: -; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-GISEL-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: clpeak_imad_pat_v2i16: -; GFX10-SDAG: ; %bb.0: ; %entry -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: clpeak_imad_pat_v2i16: -; GFX10-GISEL: ; %bb.0: ; %entry -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX10-GISEL-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: clpeak_imad_pat_v2i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX9-NEXT: v_pk_add_u16 v0, v2, v0 +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX9-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX9-NEXT: v_pk_mul_lo_u16 v0, 
v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: clpeak_imad_pat_v2i16: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: clpeak_imad_pat_v2i16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX10-NEXT: v_pk_add_u16 v0, v2, v0 +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX10-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: clpeak_imad_pat_v2i16: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: clpeak_imad_pat_v2i16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX11-NEXT: v_pk_add_u16 v0, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX11-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %y18 = add <2 x i16> %x, %add = mul <2 x i16> %y18, %y @@ -733,18 +691,18 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX9-SDAG-LABEL: clpeak_imad_pat_v3i16: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 +; GFX9-SDAG-NEXT: 
v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v4, v0, v2 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v5, v1, v3 ; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v5, v1 ; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v4, v0 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v2, v4, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_sub_u16 v3, v5, -1 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v4, v1, -1 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v5, v0, -1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v3, v5, 1 +; GFX9-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 +; GFX9-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5 @@ -775,18 +733,18 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX10-SDAG-LABEL: clpeak_imad_pat_v3i16: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 +; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v4, v1, v3 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v5, v0, v2 ; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v4, v1 ; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v5, v0 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v2, v5, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_sub_u16 v3, v4, -1 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v4, v1, -1 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v5, v0, -1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v2, v5, 1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v3, v4, 1 +; GFX10-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 +; GFX10-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5 @@ -817,8 +775,8 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX11-SDAG-LABEL: clpeak_imad_pat_v3i16: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 +; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v4, v1, v3 ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v5, v0, v2 @@ -828,11 +786,11 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v2, v5, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v3, v4, -1 +; GFX11-SDAG-NEXT: v_pk_add_u16 v2, v5, 1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v3, v4, 1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v4, v1, -1 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v5, v0, -1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 +; GFX11-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 @@ -1130,18 +1088,18 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX9-SDAG-LABEL: clpeak_imad_pat_v4i16: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v4, v0, v2 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v5, v1, v3 ; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v5, v1 ; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v4, v0 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v2, v4, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_sub_u16 v3, v5, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_sub_u16 v4, v1, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_sub_u16 v5, v0, -1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v3, v5, 1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5 @@ -1172,18 +1130,18 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX10-SDAG-LABEL: clpeak_imad_pat_v4i16: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v4, v1, v3 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v5, v0, v2 ; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v4, v1 ; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v5, v0 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v2, v5, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_sub_u16 v3, v4, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_sub_u16 v4, v1, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_sub_u16 v5, v0, -1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v2, v5, 1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v3, v4, 1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5 @@ -1214,8 +1172,8 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX11-SDAG-LABEL: clpeak_imad_pat_v4i16: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v4, v1, v3 ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v5, v0, v2 @@ -1225,11 +1183,11 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX11-SDAG-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v2, v5, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v3, v4, -1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v2, v5, 1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v3, v4, 1 op_sel_hi:[1,0] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v4, v1, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v5, v0, -1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 @@ -1555,89 +1513,47 @@ define <2 x i16> @clpeak_umad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-SDAG-LABEL: clpeak_umad_pat_v2i16: -; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: clpeak_umad_pat_v2i16: -; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-GISEL-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: clpeak_umad_pat_v2i16: -; GFX10-SDAG: ; %bb.0: ; %entry -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: clpeak_umad_pat_v2i16: -; GFX10-GISEL: ; %bb.0: ; %entry -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX10-GISEL-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: clpeak_umad_pat_v2i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX9-NEXT: v_pk_add_u16 v0, v2, v0 +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX9-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: clpeak_umad_pat_v2i16: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: clpeak_umad_pat_v2i16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX10-NEXT: v_pk_add_u16 v0, v2, v0 +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX10-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: clpeak_umad_pat_v2i16: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: clpeak_umad_pat_v2i16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX11-NEXT: v_pk_add_u16 v0, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX11-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %y18 = add <2 x i16> %x, %add = mul <2 x i16> %y18, %y @@ -1815,18 
+1731,18 @@ define <3 x i16> @clpeak_umad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX9-SDAG-LABEL: clpeak_umad_pat_v3i16: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 +; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v4, v0, v2 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v5, v1, v3 ; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v5, v1 ; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v4, v0 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v2, v4, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_sub_u16 v3, v5, -1 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v4, v1, -1 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v5, v0, -1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v3, v5, 1 +; GFX9-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 +; GFX9-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5 @@ -1857,18 +1773,18 @@ define <3 x i16> @clpeak_umad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX10-SDAG-LABEL: clpeak_umad_pat_v3i16: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 +; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v4, v1, v3 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v5, v0, v2 ; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v4, v1 ; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v5, v0 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v2, v5, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_sub_u16 v3, v4, -1 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v4, v1, -1 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v5, v0, -1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v2, v5, 1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v3, v4, 1 +; GFX10-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 +; GFX10-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5 @@ -1899,8 +1815,8 @@ define <3 x i16> @clpeak_umad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX11-SDAG-LABEL: clpeak_umad_pat_v3i16: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 +; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v4, v1, v3 ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v5, v0, v2 @@ -1910,11 +1826,11 @@ define <3 x i16> @clpeak_umad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v2, v5, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v3, v4, -1 +; GFX11-SDAG-NEXT: v_pk_add_u16 v2, v5, 1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: 
v_pk_add_u16 v3, v4, 1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v4, v1, -1 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v5, v0, -1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 +; GFX11-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 @@ -2212,18 +2128,18 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX9-SDAG-LABEL: clpeak_umad_pat_v4i16: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v4, v0, v2 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v5, v1, v3 ; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v5, v1 ; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v4, v0 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v2, v4, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_sub_u16 v3, v5, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_sub_u16 v4, v1, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_sub_u16 v5, v0, -1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v3, v5, 1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5 @@ -2254,18 +2170,18 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX10-SDAG-LABEL: clpeak_umad_pat_v4i16: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v4, v1, v3 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v5, v0, v2 ; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v4, v1 ; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v5, v0 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v2, v5, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_sub_u16 v3, v4, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_sub_u16 v4, v1, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_sub_u16 v5, v0, -1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v2, v5, 1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v3, v4, 1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5 @@ -2296,8 +2212,8 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX11-SDAG-LABEL: clpeak_umad_pat_v4i16: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: 
v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v4, v1, v3 ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v5, v0, v2 @@ -2307,11 +2223,11 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v2, v5, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v3, v4, -1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v2, v5, 1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v3, v4, 1 op_sel_hi:[1,0] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v4, v1, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v5, v0, -1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 @@ -7277,143 +7193,74 @@ define <2 x i16> @clpeak_imad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) { ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-SDAG-LABEL: clpeak_imad_pat_v2i16_x2: -; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: clpeak_imad_pat_v2i16_x2: -; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-GISEL-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: clpeak_imad_pat_v2i16_x2: -; GFX10-SDAG: ; %bb.0: ; %entry -; GFX10-SDAG-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v2, v2, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v0, v2 -; GFX10-SDAG-NEXT: v_pk_add_u16 v2, v1, v2 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v2, v0 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: clpeak_imad_pat_v2i16_x2: -; GFX10-GISEL: ; %bb.0: ; %entry -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX10-GISEL-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX10-GISEL-NEXT: v_pk_add_u16 v2, v2, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v1, v0, v2 -; GFX10-GISEL-NEXT: v_pk_add_u16 v2, v1, v2 -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v2, v0 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: clpeak_imad_pat_v2i16_x2: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX9-NEXT: v_pk_add_u16 v0, v2, v0 +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX9-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX9-NEXT: v_pk_add_u16 v1, v2, v1 +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v1, v0 +; GFX9-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX9-NEXT: v_pk_add_u16 v1, v2, v1 +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v1, v0 +; GFX9-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: clpeak_imad_pat_v2i16_x2: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v2, v2, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v0, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_add_u16 v2, v1, v2 -; GFX11-SDAG-NEXT: 
v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v2, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: clpeak_imad_pat_v2i16_x2: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX10-NEXT: v_pk_add_u16 v0, v2, v0 +; GFX10-NEXT: v_pk_add_u16 v2, v2, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX10-NEXT: v_pk_mul_lo_u16 v1, v0, v2 +; GFX10-NEXT: v_pk_add_u16 v2, v1, v2 +; GFX10-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v2, v0 +; GFX10-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX10-NEXT: v_pk_add_u16 v1, v2, v1 +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v1, v0 +; GFX10-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: clpeak_imad_pat_v2i16_x2: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v2, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v0, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v1, v2 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v2, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: clpeak_imad_pat_v2i16_x2: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX11-NEXT: v_pk_add_u16 v0, v2, v0 +; 
GFX11-NEXT: v_pk_add_u16 v2, v2, 1 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX11-NEXT: v_pk_mul_lo_u16 v1, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_add_u16 v2, v1, v2 +; GFX11-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX11-NEXT: v_pk_add_u16 v1, v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v1, v0 +; GFX11-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %y38 = add <2 x i16> %x, %add = mul <2 x i16> %y38, %y @@ -7654,143 +7501,74 @@ define <2 x i16> @clpeak_umad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) { ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-SDAG-LABEL: clpeak_umad_pat_v2i16_x2: -; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: clpeak_umad_pat_v2i16_x2: -; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-GISEL-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: clpeak_umad_pat_v2i16_x2: -; GFX10-SDAG: ; %bb.0: ; %entry -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v2, v0 -; 
GFX10-SDAG-NEXT: v_pk_sub_u16 v2, v2, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v0, v2 -; GFX10-SDAG-NEXT: v_pk_add_u16 v2, v1, v2 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v2, v0 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: clpeak_umad_pat_v2i16_x2: -; GFX10-GISEL: ; %bb.0: ; %entry -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX10-GISEL-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX10-GISEL-NEXT: v_pk_add_u16 v2, v2, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v1, v0, v2 -; GFX10-GISEL-NEXT: v_pk_add_u16 v2, v1, v2 -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v2, v0 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: clpeak_umad_pat_v2i16_x2: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX9-NEXT: v_pk_add_u16 v0, v2, v0 +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX9-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX9-NEXT: v_pk_add_u16 v1, v2, v1 +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v1, v0 +; GFX9-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX9-NEXT: v_pk_add_u16 v1, v2, v1 +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v1, v0 +; GFX9-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: clpeak_umad_pat_v2i16_x2: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v2, v2, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v0, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_add_u16 v2, v1, v2 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v2, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: 
v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: clpeak_umad_pat_v2i16_x2: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX10-NEXT: v_pk_add_u16 v0, v2, v0 +; GFX10-NEXT: v_pk_add_u16 v2, v2, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX10-NEXT: v_pk_mul_lo_u16 v1, v0, v2 +; GFX10-NEXT: v_pk_add_u16 v2, v1, v2 +; GFX10-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v2, v0 +; GFX10-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX10-NEXT: v_pk_add_u16 v1, v2, v1 +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v1, v0 +; GFX10-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: clpeak_umad_pat_v2i16_x2: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v2, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v0, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v1, v2 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v2, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: clpeak_umad_pat_v2i16_x2: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX11-NEXT: v_pk_add_u16 v0, v2, v0 +; GFX11-NEXT: v_pk_add_u16 v2, v2, 1 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX11-NEXT: 
v_pk_mul_lo_u16 v1, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_add_u16 v2, v1, v2 +; GFX11-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX11-NEXT: v_pk_add_u16 v1, v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v1, v0 +; GFX11-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %y38 = add <2 x i16> %x, %add = mul <2 x i16> %y38, %y @@ -8373,6 +8151,24 @@ define i64 @mul_u24_add64(i32 %x, i32 %y, i64 %z) { ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v4, v2 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: mul_u24_add64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3] +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: mul_u24_add64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mul_u32_u24_e32 v4, v0, v1 +; GFX11-GISEL-NEXT: v_mul_hi_u32_u24_e32 v1, v0, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v4, v2 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = call i64 @llvm.amdgcn.mul.u24.i64(i32 %x, i32 %y) %add = add i64 %mul, %z ret i64 %add @@ -8410,6 +8206,15 @@ define i64 @mul_u24_zext_add64(i32 %x, i32 %y, i64 %z) { ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: mul_u24_zext_add64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mul_u32_u24_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] %mul = call i32 @llvm.amdgcn.mul.u24(i32 %x, i32 %y) %mul.zext = zext i32 %mul to i64 %add = add i64 %mul.zext, %z diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll index 54bd78e2ea127..66f159f5ab7b2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll @@ -75,26 +75,15 @@ entry: ; Make sure we do not violate constant bus restriction with 3 scalar inputs and simingly inlinable literal. 
define amdgpu_ps void @test_llvm_amdgcn_fdot2_bf16_bf16_sis( -; SDAG-GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_sis: -; SDAG-GFX11: ; %bb.0: ; %entry -; SDAG-GFX11-NEXT: v_mov_b32_e32 v2, s1 -; SDAG-GFX11-NEXT: s_mov_b32 s1, 0x10001 -; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; SDAG-GFX11-NEXT: v_dot2_bf16_bf16 v2, s0, s1, v2 -; SDAG-GFX11-NEXT: global_store_b16 v[0:1], v2, off -; SDAG-GFX11-NEXT: s_nop 0 -; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; SDAG-GFX11-NEXT: s_endpgm -; -; GISEL-GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_sis: -; GISEL-GFX11: ; %bb.0: ; %entry -; GISEL-GFX11-NEXT: v_mov_b32_e32 v2, 0x10001 -; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_dot2_bf16_bf16 v2, s0, v2, s1 -; GISEL-GFX11-NEXT: global_store_b16 v[0:1], v2, off -; GISEL-GFX11-NEXT: s_nop 0 -; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GISEL-GFX11-NEXT: s_endpgm +; GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_sis: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dot2_bf16_bf16 v2, s0, 0x10001, v2 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, <2 x i16> inreg %a, i16 inreg %c) { diff --git a/llvm/test/CodeGen/AMDGPU/pk_max_f16_literal.ll b/llvm/test/CodeGen/AMDGPU/pk_max_f16_literal.ll index 81918f5ca0eff..e96570d99472e 100644 --- a/llvm/test/CodeGen/AMDGPU/pk_max_f16_literal.ll +++ b/llvm/test/CodeGen/AMDGPU/pk_max_f16_literal.ll @@ -23,7 +23,7 @@ bb: %tmp1 = zext i32 %tmp to i64 %tmp2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i64 %tmp1 %tmp3 = load <2 x half>, ptr addrspace(1) %tmp2, align 4 - %tmp4 = tail call <2 x half> @llvm.maxnum.v2f16(<2 x half> %tmp3, <2 x half> ) + %tmp4 = tail call <2 x half> @llvm.maxnum.v2f16(<2 x half> %tmp3, <2 x half> ) store <2 x half> %tmp4, ptr addrspace(1) %tmp2, align 4 ret void } @@ -96,7 +96,7 @@ bb: ; GCN-LABEL: {{^}}test_pk_max_f16_literal_0_41c8: ; GFX9: s_mov_b32 [[C:s[0-9]+]], 0x41c80000 ; GFX9: v_pk_max_f16 v{{[0-9]+}}, v{{[0-9]+}}, [[C]]{{$}} -; GFX10: v_pk_max_f16 v{{[0-9]+}}, 0x41c8, v{{[0-9]+}} op_sel:[1,0] op_sel_hi:[0,1]{{$}} +; GFX10: v_pk_max_f16 v{{[0-9]+}}, 0x41c80000, v{{[0-9]+}}{{$}} define amdgpu_kernel void @test_pk_max_f16_literal_0_41c8(ptr addrspace(1) nocapture %arg) { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll index 536b2d054272e..3c654e9e2c9e1 100644 --- a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll +++ b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll @@ -1622,14 +1622,14 @@ define <2 x i16> @v_mul_add_1_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX9-LABEL: v_mul_add_1_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_add_1_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] ; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %add = add <2 x i16> %y, @@ -1665,14 +1665,14 @@ define <2 x i16> 
@v_mul_add_1_v2i16_commute(<2 x i16> %x, <2 x i16> %y) { ; GFX9-LABEL: v_mul_add_1_v2i16_commute: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_mul_lo_u16 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_add_1_v2i16_commute: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] ; GFX10-NEXT: v_pk_mul_lo_u16 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %add = add <2 x i16> %y, @@ -1886,14 +1886,14 @@ define <2 x i16> @v_mul_add_2_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX9-LABEL: v_mul_add_2_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v1, v1, -2 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 2 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_add_2_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v1, v1, -2 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_add_u16 v1, v1, 2 op_sel_hi:[1,0] ; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %add = add <2 x i16> %y, @@ -2929,14 +2929,14 @@ define <2 x i16> @v_mul_5_add_1_v2i16(<2 x i16> %arg) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, 5 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_5_add_1_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, 5 op_sel_hi:[1,0] -; GFX10-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX10-NEXT: s_setpc_b64 s[30:31] %mul = mul <2 x i16> %arg, %add = add <2 x i16> %mul, diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll index a8ae8c0c38792..73f2834f0cdf4 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -2399,7 +2399,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0xc400 op_sel:[0,1] op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0xc4000000 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -2410,7 +2410,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0xc400 op_sel:[0,1] op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0xc4000000 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2534,7 +2534,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x4400 op_sel:[0,1] op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x44000000 
; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -2545,7 +2545,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0x4400 op_sel:[0,1] op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0x44000000 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2645,76 +2645,40 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg32_neg32: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg32_neg32: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffe0ffe0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, v2 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg32_neg32: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg32_neg32: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0xffe0, v1 op_sel_hi:[0,1] -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_test_v2i16_x_add_neg32_neg32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg32_neg32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm +; GFX10-LABEL: 
v_test_v2i16_x_add_neg32_neg32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg32_neg32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0xffe0, v1 op_sel_hi:[0,1] -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-LABEL: v_test_v2i16_x_add_neg32_neg32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -2803,76 +2767,40 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_test_v2i16_x_add_0_neg32: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_test_v2i16_x_add_0_neg32: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffe00000 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, v2 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX10-SDAG-LABEL: v_test_v2i16_x_add_0_neg32: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_test_v2i16_x_add_0_neg32: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0xffe0, v1 op_sel:[1,0] 
op_sel_hi:[0,1] -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_test_v2i16_x_add_0_neg32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_test_v2i16_x_add_0_neg32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm +; GFX10-LABEL: v_test_v2i16_x_add_0_neg32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_test_v2i16_x_add_0_neg32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0xffe0, v1 op_sel:[1,0] op_sel_hi:[0,1] -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-LABEL: v_test_v2i16_x_add_0_neg32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -2963,76 +2891,40 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg32_0: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg32_0: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffe0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, v2 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg32_0: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg32_0: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0xffe0, v1 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_test_v2i16_x_add_neg32_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg32_0: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm +; GFX10-LABEL: v_test_v2i16_x_add_neg32_0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg32_0: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0xffe0, v1 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-LABEL: v_test_v2i16_x_add_neg32_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -3128,75 +3020,40 
@@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg16_neg16: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg16_neg16: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg16_neg16: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg16_neg16: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_test_v2i16_x_add_neg16_neg16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg16_neg16: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm +; GFX10-LABEL: v_test_v2i16_x_add_neg16_neg16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg16_neg16: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-LABEL: v_test_v2i16_x_add_neg16_neg16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -3265,95 +3122,60 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_0_neg16: -; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, -16 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-GISEL-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 -; VI-GISEL-NEXT: s_endpgm -; -; GFX9-SDAG-LABEL: v_test_v2i16_x_add_0_neg16: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_test_v2i16_x_add_0_neg16: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, -16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX10-SDAG-LABEL: v_test_v2i16_x_add_0_neg16: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; 
VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, -16 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-GISEL-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; -; GFX10-GISEL-LABEL: v_test_v2i16_x_add_0_neg16: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v1, -16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_test_v2i16_x_add_0_neg16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_test_v2i16_x_add_0_neg16: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm +; GFX10-LABEL: v_test_v2i16_x_add_0_neg16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_test_v2i16_x_add_0_neg16: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v1, -16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-LABEL: v_test_v2i16_x_add_0_neg16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: 
v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -3444,75 +3266,40 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg16_0: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, 16 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg16_0: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, -16 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg16_0: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, 16 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg16_0: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v1, -16 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_test_v2i16_x_add_neg16_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_sub_u16 v1, v1, 16 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg16_0: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, 16 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm +; GFX10-LABEL: v_test_v2i16_x_add_neg16_0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_sub_u16 v1, v1, 16 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; 
GFX10-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg16_0: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v1, -16 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-LABEL: v_test_v2i16_x_add_neg16_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_u16 v1, v1, 16 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -3613,9 +3400,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_mov_b32 s2, 0x3c003c00 +; GFX9-SDAG-NEXT: s_mov_b32 s2, 0xc400c400 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, s2 +; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, s2 ; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-SDAG-NEXT: s_endpgm ; @@ -3631,53 +3418,29 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm ; -; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, 0x3c00 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0xc400, v1 op_sel_hi:[0,1] -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, 0x3c00 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm +; GFX10-LABEL: v_test_v2i16_x_add_neg_fpone: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_u16 v1, 0xc400c400, v1 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0xc400, v1 op_sel_hi:[0,1] -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-LABEL: v_test_v2i16_x_add_neg_fpone: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v1, 0xc400c400, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -3778,9 +3541,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_mov_b32 s2, 0xbc00bc00 +; GFX9-SDAG-NEXT: s_mov_b32 s2, 0x44004400 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, s2 +; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, s2 ; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-SDAG-NEXT: s_endpgm ; @@ -3796,53 +3559,29 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm ; -; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, 0xbc00 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0x4400, v1 op_sel_hi:[0,1] -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, 0xbc00 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; 
GFX11-SDAG-NEXT: s_endpgm +; GFX10-LABEL: v_test_v2i16_x_add_neg_negfpone: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_u16 v1, 0x44004400, v1 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0x4400, v1 op_sel_hi:[0,1] -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-LABEL: v_test_v2i16_x_add_neg_negfpone: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v1, 0x44004400, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -3937,77 +3676,40 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg_fptwo: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_mov_b32 s2, 0xc000c000 -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, s2 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg_fptwo: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x40004000 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, v2 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_fptwo: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, 0xc000 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_fptwo: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: 
v_pk_add_u16 v1, 0x4000, v1 op_sel_hi:[0,1] -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_test_v2i16_x_add_neg_fptwo: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v1, v1, 2.0 op_sel:[0,1] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_fptwo: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, 0xc000 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm +; GFX10-LABEL: v_test_v2i16_x_add_neg_fptwo: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_u16 v1, v1, 2.0 op_sel:[0,1] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_fptwo: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0x4000, v1 op_sel_hi:[0,1] -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-LABEL: v_test_v2i16_x_add_neg_fptwo: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v1, v1, 2.0 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -4102,77 +3804,40 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg_negfptwo: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_mov_b32 s2, 0x40004000 -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, s2 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg_negfptwo: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xc000c000 -; GFX9-GISEL-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, v2 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_negfptwo: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, 0x4000 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_negfptwo: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0xc000, v1 op_sel_hi:[0,1] -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_test_v2i16_x_add_neg_negfptwo: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v1, v1, -2.0 op_sel:[0,1] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_negfptwo: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, 0x4000 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm +; GFX10-LABEL: v_test_v2i16_x_add_neg_negfptwo: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_u16 v1, v1, -2.0 op_sel:[0,1] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_negfptwo: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0xc000, v1 op_sel_hi:[0,1] -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-LABEL: v_test_v2i16_x_add_neg_negfptwo: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v1, v1, -2.0 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm 
%tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -4260,76 +3925,40 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_test_v2i16_x_add_undef_neg32: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_test_v2i16_x_add_undef_neg32: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffe00000 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, v2 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX10-SDAG-LABEL: v_test_v2i16_x_add_undef_neg32: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_test_v2i16_x_add_undef_neg32: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0xffe0, v1 op_sel:[1,0] op_sel_hi:[0,1] -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_test_v2i16_x_add_undef_neg32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_test_v2i16_x_add_undef_neg32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm +; GFX10-LABEL: v_test_v2i16_x_add_undef_neg32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] +; 
GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_test_v2i16_x_add_undef_neg32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0xffe0, v1 op_sel:[1,0] op_sel_hi:[0,1] -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-LABEL: v_test_v2i16_x_add_undef_neg32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -4455,7 +4084,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0xffe0, v1 op_sel_hi:[0,1] +; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0xffffffe0, v1 ; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; @@ -4479,7 +4108,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0xffe0, v1 op_sel_hi:[0,1] +; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0xffffffe0, v1 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll index 9a6851c162d09..b237703537bff 100644 --- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -6,7 +6,7 @@ ; GFX9: s_load_dword [[VAL:s[0-9]+]] ; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]] ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]] -; GFX9: v_pk_sub_u16 [[ADD:v[0-9]+]], [[MAX]], -2 op_sel_hi:[1,0] +; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2 op_sel_hi:[1,0] ; CIVI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 ; CIVI: s_sub_i32 @@ -30,7 +30,7 @@ define amdgpu_kernel void @s_abs_v2i16(ptr addrspace(1) %out, <2 x i16> %val) #0 ; GFX9: global_load_dword [[VAL:v[0-9]+]] ; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]] ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]] -; GFX9: v_pk_sub_u16 [[ADD:v[0-9]+]], [[MAX]], -2 op_sel_hi:[1,0] +; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2 op_sel_hi:[1,0] ; VI-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 ; VI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, @@ -70,7 +70,7 @@ define amdgpu_kernel void @v_abs_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9: s_load_dword [[VAL:s[0-9]+]] ; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]] ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]] -; GFX9: v_pk_sub_u16 [[ADD:v[0-9]+]], [[MAX]], -2 
op_sel_hi:[1,0] +; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2 op_sel_hi:[1,0] define amdgpu_kernel void @s_abs_v2i16_2(ptr addrspace(1) %out, <2 x i16> %val) #0 { %z0 = insertelement <2 x i16> undef, i16 0, i16 0 %z1 = insertelement <2 x i16> %z0, i16 0, i16 1 @@ -88,7 +88,7 @@ define amdgpu_kernel void @s_abs_v2i16_2(ptr addrspace(1) %out, <2 x i16> %val) ; GFX9: global_load_dword [[VAL:v[0-9]+]] ; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]] ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]] -; GFX9: v_pk_sub_u16 [[ADD:v[0-9]+]], [[MAX]], -2 op_sel_hi:[1,0] +; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2 op_sel_hi:[1,0] define amdgpu_kernel void @v_abs_v2i16_2(ptr addrspace(1) %out, ptr addrspace(1) %src) #0 { %z0 = insertelement <2 x i16> undef, i16 0, i16 0 %z1 = insertelement <2 x i16> %z0, i16 0, i16 1 @@ -111,8 +111,8 @@ define amdgpu_kernel void @v_abs_v2i16_2(ptr addrspace(1) %out, ptr addrspace(1) ; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, s[[#LOAD + 3]] ; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], s[[#LOAD + 2]], [[SUB0]] ; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], s[[#LOAD + 3]], [[SUB1]] -; GFX9-DAG: v_pk_sub_u16 [[ADD0:v[0-9]+]], [[MAX0]], -2 op_sel_hi:[1,0] -; GFX9-DAG: v_pk_sub_u16 [[ADD1:v[0-9]+]], [[MAX1]], -2 op_sel_hi:[1,0] +; GFX9-DAG: v_pk_add_u16 [[ADD0:v[0-9]+]], [[MAX0]], 2 op_sel_hi:[1,0] +; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2 op_sel_hi:[1,0] define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0 { %z0 = insertelement <4 x i16> undef, i16 0, i16 0 %z1 = insertelement <4 x i16> %z0, i16 0, i16 1 @@ -135,11 +135,11 @@ define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0 ; GFX9-DAG: v_pk_sub_i16 [[SUB0:v[0-9]+]], 0, v[[VAL0]] ; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], v[[VAL0]], [[SUB0]] -; GFX9-DAG: v_pk_sub_u16 [[ADD0:v[0-9]+]], [[MAX0]], -2 op_sel_hi:[1,0] +; GFX9-DAG: v_pk_add_u16 [[ADD0:v[0-9]+]], [[MAX0]], 2 op_sel_hi:[1,0] ; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, v[[VAL1]] ; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], v[[VAL1]], [[SUB1]] -; GFX9-DAG: v_pk_sub_u16 [[ADD1:v[0-9]+]], [[MAX1]], -2 op_sel_hi:[1,0] +; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2 op_sel_hi:[1,0] define amdgpu_kernel void @v_abs_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %src) #0 { %z0 = insertelement <4 x i16> undef, i16 0, i16 0 %z1 = insertelement <4 x i16> %z0, i16 0, i16 1 diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll index aedf06d66428a..a2712ecf74f04 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -427,7 +427,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_sub_i16 v0, v0, -1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -460,7 +460,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_sub_i16 v0, v0, -1 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; @@ -473,7 +473,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b32 s3, 
0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_sub_i16 v0, v0, -1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -562,13 +562,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s4, 1.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4 +; GFX9-NEXT: v_pk_sub_i16 v0, v0, 1.0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -600,7 +599,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0x3f80 op_sel:[0,1] op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_sub_i16 v0, v0, 1.0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; @@ -613,7 +612,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_pk_sub_i16 v0, v0, 0x3f80 op_sel:[0,1] op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_sub_i16 v0, v0, 1.0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll index e46992ccbbc57..819e5e81e4894 100644 --- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll @@ -369,13 +369,13 @@ define <2 x i16> @vec_smax_smin(<2 x i16> %src) { ; SDAG-GFX9-NEXT: v_pk_min_i16 v0, v0, s4 op_sel_hi:[1,0] ; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: vec_smax_smin: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_i16 v0, v0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] -; GFX11-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-LABEL: vec_smax_smin: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: v_pk_max_i16 v0, v0, 0 +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-VI-LABEL: vec_smax_smin: ; GISEL-VI: ; %bb.0: @@ -396,6 +396,14 @@ define <2 x i16> @vec_smax_smin(<2 x i16> %src) { ; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0xff00ff ; GISEL-GFX9-NEXT: v_pk_min_i16 v0, v0, v1 ; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-LABEL: vec_smax_smin: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_pk_max_i16 v0, v0, 0 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] %src.max = call <2 x i16> @llvm.smax.v2i16(<2 x i16> %src, <2 x i16> ) %src.clamp = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %src.max, <2 x i16> ) ret <2 x i16> %src.clamp @@ -548,13 +556,13 @@ define <2 x i16> @vec_smin_smax(<2 x i16> %src) { ; SDAG-GFX9-NEXT: v_pk_max_i16 v0, v0, 0 ; 
SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: vec_smin_smax: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_i16 v0, v0, 0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-LABEL: vec_smin_smax: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX11-NEXT: v_pk_max_i16 v0, v0, 0 +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-VI-LABEL: vec_smin_smax: ; GISEL-VI: ; %bb.0: @@ -575,7 +583,17 @@ define <2 x i16> @vec_smin_smax(<2 x i16> %src) { ; GISEL-GFX9-NEXT: v_pk_min_i16 v0, v0, v1 ; GISEL-GFX9-NEXT: v_pk_max_i16 v0, v0, 0 ; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-LABEL: vec_smin_smax: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_pk_max_i16 v0, v0, 0 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] %src.min = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %src, <2 x i16> ) %src.clamp = call <2 x i16> @llvm.smax.v2i16(<2 x i16> %src.min, <2 x i16> ) ret <2 x i16> %src.clamp } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11: {{.*}} diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3p.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3p.s index 45a320a3e358e..829b0ebb8d8ac 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3p.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3p.s @@ -463,7 +463,7 @@ v_pk_add_i16 v5, ttmp15, src_scc // GFX11: [0x05,0x40,0x02,0xcc,0x7b,0xfa,0x01,0x18] v_pk_add_i16 v5, m0, 0.5 -// GFX11: [0x05,0x40,0x02,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x02,0xcc,0x7d,0xe0,0x01,0x18] v_pk_add_i16 v5, exec_lo, -1 // GFX11: [0x05,0x40,0x02,0xcc,0x7e,0x82,0x01,0x18] @@ -477,9 +477,12 @@ v_pk_add_i16 v5, null, exec_lo v_pk_add_i16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX11: [0x05,0x58,0x02,0xcc,0xc1,0xfe,0x00,0x00] -v_pk_add_i16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] +v_pk_add_i16 v5, 0x3800, m0 op_sel:[0,0] op_sel_hi:[1,1] // GFX11: [0x05,0x40,0x02,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +v_pk_add_i16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] +// GFX11: [0x05,0x40,0x02,0xcc,0xf0,0xfa,0x00,0x18] + v_pk_add_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX11: [0x05,0x48,0x02,0xcc,0xfd,0xd4,0x00,0x10] @@ -508,7 +511,7 @@ v_pk_add_u16 v5, ttmp15, src_scc // GFX11: [0x05,0x40,0x0a,0xcc,0x7b,0xfa,0x01,0x18] v_pk_add_u16 v5, m0, 0.5 -// GFX11: [0x05,0x40,0x0a,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x0a,0xcc,0x7d,0xe0,0x01,0x18] v_pk_add_u16 v5, exec_lo, -1 // GFX11: [0x05,0x40,0x0a,0xcc,0x7e,0x82,0x01,0x18] @@ -523,7 +526,7 @@ v_pk_add_u16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX11: [0x05,0x58,0x0a,0xcc,0xc1,0xfe,0x00,0x00] v_pk_add_u16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX11: [0x05,0x40,0x0a,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x0a,0xcc,0xf0,0xfa,0x00,0x18] v_pk_add_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX11: [0x05,0x48,0x0a,0xcc,0xfd,0xd4,0x00,0x10] @@ -553,7 +556,7 @@ v_pk_ashrrev_i16 v5, ttmp15, src_scc // GFX11: [0x05,0x40,0x06,0xcc,0x7b,0xfa,0x01,0x18] v_pk_ashrrev_i16 v5, m0, 0.5 -// GFX11: 
[0x05,0x40,0x06,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x06,0xcc,0x7d,0xe0,0x01,0x18] v_pk_ashrrev_i16 v5, exec_lo, -1 // GFX11: [0x05,0x40,0x06,0xcc,0x7e,0x82,0x01,0x18] @@ -568,7 +571,7 @@ v_pk_ashrrev_i16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX11: [0x05,0x58,0x06,0xcc,0xc1,0xfe,0x00,0x00] v_pk_ashrrev_i16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX11: [0x05,0x40,0x06,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x06,0xcc,0xf0,0xfa,0x00,0x18] v_pk_ashrrev_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX11: [0x05,0x48,0x06,0xcc,0xfd,0xd4,0x00,0x10] @@ -643,7 +646,7 @@ v_pk_lshlrev_b16 v5, ttmp15, src_scc // GFX11: [0x05,0x40,0x04,0xcc,0x7b,0xfa,0x01,0x18] v_pk_lshlrev_b16 v5, m0, 0.5 -// GFX11: [0x05,0x40,0x04,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x04,0xcc,0x7d,0xe0,0x01,0x18] v_pk_lshlrev_b16 v5, exec_lo, -1 // GFX11: [0x05,0x40,0x04,0xcc,0x7e,0x82,0x01,0x18] @@ -658,7 +661,7 @@ v_pk_lshlrev_b16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX11: [0x05,0x58,0x04,0xcc,0xc1,0xfe,0x00,0x00] v_pk_lshlrev_b16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX11: [0x05,0x40,0x04,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x04,0xcc,0xf0,0xfa,0x00,0x18] v_pk_lshlrev_b16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX11: [0x05,0x48,0x04,0xcc,0xfd,0xd4,0x00,0x10] @@ -688,7 +691,7 @@ v_pk_lshrrev_b16 v5, ttmp15, src_scc // GFX11: [0x05,0x40,0x05,0xcc,0x7b,0xfa,0x01,0x18] v_pk_lshrrev_b16 v5, m0, 0.5 -// GFX11: [0x05,0x40,0x05,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x05,0xcc,0x7d,0xe0,0x01,0x18] v_pk_lshrrev_b16 v5, exec_lo, -1 // GFX11: [0x05,0x40,0x05,0xcc,0x7e,0x82,0x01,0x18] @@ -703,7 +706,7 @@ v_pk_lshrrev_b16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX11: [0x05,0x58,0x05,0xcc,0xc1,0xfe,0x00,0x00] v_pk_lshrrev_b16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX11: [0x05,0x40,0x05,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x05,0xcc,0xf0,0xfa,0x00,0x18] v_pk_lshrrev_b16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX11: [0x05,0x48,0x05,0xcc,0xfd,0xd4,0x00,0x10] @@ -733,7 +736,7 @@ v_pk_mad_i16 v5, ttmp15, src_scc, ttmp15 // GFX11: [0x05,0x40,0x00,0xcc,0x7b,0xfa,0xed,0x19] v_pk_mad_i16 v5, m0, 0.5, m0 op_sel_hi:[0,0,0] -// GFX11: [0x05,0x00,0x00,0xcc,0x7d,0xfe,0xf5,0x01,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x00,0x00,0xcc,0x7d,0xe0,0xf5,0x01] v_pk_mad_i16 v5, exec_lo, -1, vcc_hi op_sel_hi:[0,0,1] // GFX11: [0x05,0x40,0x00,0xcc,0x7e,0x82,0xad,0x01] @@ -748,7 +751,7 @@ v_pk_mad_i16 v5, -1, exec_hi, src_scc op_sel:[0,0,0] op_sel_hi:[1,1,1] // GFX11: [0x05,0x40,0x00,0xcc,0xc1,0xfe,0xf4,0x1b] v_pk_mad_i16 v5, 0.5, m0, 0.5 op_sel:[1,0,0] op_sel_hi:[0,1,1] -// GFX11: [0x05,0x48,0x00,0xcc,0xff,0xfa,0xfc,0x13,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x48,0x00,0xcc,0xf0,0xfa,0xc0,0x13] v_pk_mad_i16 v5, src_scc, vcc_lo, -1 op_sel:[0,1,0] op_sel_hi:[1,0,1] // GFX11: [0x05,0x50,0x00,0xcc,0xfd,0xd4,0x04,0x0b] @@ -778,7 +781,7 @@ v_pk_mad_u16 v5, ttmp15, src_scc, ttmp15 // GFX11: [0x05,0x40,0x09,0xcc,0x7b,0xfa,0xed,0x19] v_pk_mad_u16 v5, m0, 0.5, m0 op_sel_hi:[0,0,0] -// GFX11: [0x05,0x00,0x09,0xcc,0x7d,0xfe,0xf5,0x01,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x00,0x09,0xcc,0x7d,0xe0,0xf5,0x01] v_pk_mad_u16 v5, exec_lo, -1, vcc_hi op_sel_hi:[0,0,1] // GFX11: [0x05,0x40,0x09,0xcc,0x7e,0x82,0xad,0x01] @@ -793,7 +796,7 @@ v_pk_mad_u16 v5, -1, exec_hi, src_scc op_sel:[0,0,0] op_sel_hi:[1,1,1] // GFX11: 
[0x05,0x40,0x09,0xcc,0xc1,0xfe,0xf4,0x1b] v_pk_mad_u16 v5, 0.5, m0, 0.5 op_sel:[1,0,0] op_sel_hi:[0,1,1] -// GFX11: [0x05,0x48,0x09,0xcc,0xff,0xfa,0xfc,0x13,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x48,0x09,0xcc,0xf0,0xfa,0xc0,0x13] v_pk_mad_u16 v5, src_scc, vcc_lo, -1 op_sel:[0,1,0] op_sel_hi:[1,0,1] // GFX11: [0x05,0x50,0x09,0xcc,0xfd,0xd4,0x04,0x0b] @@ -868,7 +871,7 @@ v_pk_max_i16 v5, ttmp15, src_scc // GFX11: [0x05,0x40,0x07,0xcc,0x7b,0xfa,0x01,0x18] v_pk_max_i16 v5, m0, 0.5 -// GFX11: [0x05,0x40,0x07,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x07,0xcc,0x7d,0xe0,0x01,0x18] v_pk_max_i16 v5, exec_lo, -1 // GFX11: [0x05,0x40,0x07,0xcc,0x7e,0x82,0x01,0x18] @@ -883,7 +886,7 @@ v_pk_max_i16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX11: [0x05,0x58,0x07,0xcc,0xc1,0xfe,0x00,0x00] v_pk_max_i16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX11: [0x05,0x40,0x07,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x07,0xcc,0xf0,0xfa,0x00,0x18] v_pk_max_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX11: [0x05,0x48,0x07,0xcc,0xfd,0xd4,0x00,0x10] @@ -913,7 +916,7 @@ v_pk_max_u16 v5, ttmp15, src_scc // GFX11: [0x05,0x40,0x0c,0xcc,0x7b,0xfa,0x01,0x18] v_pk_max_u16 v5, m0, 0.5 -// GFX11: [0x05,0x40,0x0c,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x0c,0xcc,0x7d,0xe0,0x01,0x18] v_pk_max_u16 v5, exec_lo, -1 // GFX11: [0x05,0x40,0x0c,0xcc,0x7e,0x82,0x01,0x18] @@ -928,7 +931,7 @@ v_pk_max_u16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX11: [0x05,0x58,0x0c,0xcc,0xc1,0xfe,0x00,0x00] v_pk_max_u16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX11: [0x05,0x40,0x0c,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x0c,0xcc,0xf0,0xfa,0x00,0x18] v_pk_max_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX11: [0x05,0x48,0x0c,0xcc,0xfd,0xd4,0x00,0x10] @@ -1003,7 +1006,7 @@ v_pk_min_i16 v5, ttmp15, src_scc // GFX11: [0x05,0x40,0x08,0xcc,0x7b,0xfa,0x01,0x18] v_pk_min_i16 v5, m0, 0.5 -// GFX11: [0x05,0x40,0x08,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x08,0xcc,0x7d,0xe0,0x01,0x18] v_pk_min_i16 v5, exec_lo, -1 // GFX11: [0x05,0x40,0x08,0xcc,0x7e,0x82,0x01,0x18] @@ -1018,7 +1021,7 @@ v_pk_min_i16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX11: [0x05,0x58,0x08,0xcc,0xc1,0xfe,0x00,0x00] v_pk_min_i16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX11: [0x05,0x40,0x08,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x08,0xcc,0xf0,0xfa,0x00,0x18] v_pk_min_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX11: [0x05,0x48,0x08,0xcc,0xfd,0xd4,0x00,0x10] @@ -1048,7 +1051,7 @@ v_pk_min_u16 v5, ttmp15, src_scc // GFX11: [0x05,0x40,0x0d,0xcc,0x7b,0xfa,0x01,0x18] v_pk_min_u16 v5, m0, 0.5 -// GFX11: [0x05,0x40,0x0d,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x0d,0xcc,0x7d,0xe0,0x01,0x18] v_pk_min_u16 v5, exec_lo, -1 // GFX11: [0x05,0x40,0x0d,0xcc,0x7e,0x82,0x01,0x18] @@ -1063,7 +1066,7 @@ v_pk_min_u16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX11: [0x05,0x58,0x0d,0xcc,0xc1,0xfe,0x00,0x00] v_pk_min_u16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX11: [0x05,0x40,0x0d,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x0d,0xcc,0xf0,0xfa,0x00,0x18] v_pk_min_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX11: [0x05,0x48,0x0d,0xcc,0xfd,0xd4,0x00,0x10] @@ -1138,7 +1141,7 @@ v_pk_mul_lo_u16 v5, ttmp15, src_scc // GFX11: [0x05,0x40,0x01,0xcc,0x7b,0xfa,0x01,0x18] v_pk_mul_lo_u16 v5, m0, 0.5 -// GFX11: 
[0x05,0x40,0x01,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x01,0xcc,0x7d,0xe0,0x01,0x18] v_pk_mul_lo_u16 v5, exec_lo, -1 // GFX11: [0x05,0x40,0x01,0xcc,0x7e,0x82,0x01,0x18] @@ -1153,7 +1156,7 @@ v_pk_mul_lo_u16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX11: [0x05,0x58,0x01,0xcc,0xc1,0xfe,0x00,0x00] v_pk_mul_lo_u16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX11: [0x05,0x40,0x01,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x01,0xcc,0xf0,0xfa,0x00,0x18] v_pk_mul_lo_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX11: [0x05,0x48,0x01,0xcc,0xfd,0xd4,0x00,0x10] @@ -1183,7 +1186,7 @@ v_pk_sub_i16 v5, ttmp15, src_scc // GFX11: [0x05,0x40,0x03,0xcc,0x7b,0xfa,0x01,0x18] v_pk_sub_i16 v5, m0, 0.5 -// GFX11: [0x05,0x40,0x03,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x03,0xcc,0x7d,0xe0,0x01,0x18] v_pk_sub_i16 v5, exec_lo, -1 // GFX11: [0x05,0x40,0x03,0xcc,0x7e,0x82,0x01,0x18] @@ -1198,7 +1201,7 @@ v_pk_sub_i16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX11: [0x05,0x58,0x03,0xcc,0xc1,0xfe,0x00,0x00] v_pk_sub_i16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX11: [0x05,0x40,0x03,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x03,0xcc,0xf0,0xfa,0x00,0x18] v_pk_sub_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX11: [0x05,0x48,0x03,0xcc,0xfd,0xd4,0x00,0x10] @@ -1228,7 +1231,7 @@ v_pk_sub_u16 v5, ttmp15, src_scc // GFX11: [0x05,0x40,0x0b,0xcc,0x7b,0xfa,0x01,0x18] v_pk_sub_u16 v5, m0, 0.5 -// GFX11: [0x05,0x40,0x0b,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x0b,0xcc,0x7d,0xe0,0x01,0x18] v_pk_sub_u16 v5, exec_lo, -1 // GFX11: [0x05,0x40,0x0b,0xcc,0x7e,0x82,0x01,0x18] @@ -1243,7 +1246,7 @@ v_pk_sub_u16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX11: [0x05,0x58,0x0b,0xcc,0xc1,0xfe,0x00,0x00] v_pk_sub_u16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX11: [0x05,0x40,0x0b,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x0b,0xcc,0xf0,0xfa,0x00,0x18] v_pk_sub_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX11: [0x05,0x48,0x0b,0xcc,0xfd,0xd4,0x00,0x10] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p.s index 9a21f7a2eb560..a8347fb7f08bd 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p.s @@ -463,7 +463,7 @@ v_pk_add_i16 v5, ttmp15, src_scc // GFX12: [0x05,0x40,0x02,0xcc,0x7b,0xfa,0x01,0x18] v_pk_add_i16 v5, m0, 0.5 -// GFX12: [0x05,0x40,0x02,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x02,0xcc,0x7d,0xe0,0x01,0x18] v_pk_add_i16 v5, exec_lo, -1 // GFX12: [0x05,0x40,0x02,0xcc,0x7e,0x82,0x01,0x18] @@ -478,7 +478,7 @@ v_pk_add_i16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX12: [0x05,0x58,0x02,0xcc,0xc1,0xfe,0x00,0x00] v_pk_add_i16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX12: [0x05,0x40,0x02,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x02,0xcc,0xf0,0xfa,0x00,0x18] v_pk_add_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX12: [0x05,0x48,0x02,0xcc,0xfd,0xd4,0x00,0x10] @@ -508,7 +508,7 @@ v_pk_add_u16 v5, ttmp15, src_scc // GFX12: [0x05,0x40,0x0a,0xcc,0x7b,0xfa,0x01,0x18] v_pk_add_u16 v5, m0, 0.5 -// GFX12: [0x05,0x40,0x0a,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x0a,0xcc,0x7d,0xe0,0x01,0x18] v_pk_add_u16 v5, exec_lo, -1 // GFX12: [0x05,0x40,0x0a,0xcc,0x7e,0x82,0x01,0x18] @@ -523,7 +523,7 @@ v_pk_add_u16 v5, -1, exec_hi op_sel:[1,1] 
op_sel_hi:[0,0] // GFX12: [0x05,0x58,0x0a,0xcc,0xc1,0xfe,0x00,0x00] v_pk_add_u16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX12: [0x05,0x40,0x0a,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x0a,0xcc,0xf0,0xfa,0x00,0x18] v_pk_add_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX12: [0x05,0x48,0x0a,0xcc,0xfd,0xd4,0x00,0x10] @@ -553,7 +553,7 @@ v_pk_ashrrev_i16 v5, ttmp15, src_scc // GFX12: [0x05,0x40,0x06,0xcc,0x7b,0xfa,0x01,0x18] v_pk_ashrrev_i16 v5, m0, 0.5 -// GFX12: [0x05,0x40,0x06,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x06,0xcc,0x7d,0xe0,0x01,0x18] v_pk_ashrrev_i16 v5, exec_lo, -1 // GFX12: [0x05,0x40,0x06,0xcc,0x7e,0x82,0x01,0x18] @@ -568,7 +568,7 @@ v_pk_ashrrev_i16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX12: [0x05,0x58,0x06,0xcc,0xc1,0xfe,0x00,0x00] v_pk_ashrrev_i16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX12: [0x05,0x40,0x06,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x06,0xcc,0xf0,0xfa,0x00,0x18] v_pk_ashrrev_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX12: [0x05,0x48,0x06,0xcc,0xfd,0xd4,0x00,0x10] @@ -643,7 +643,7 @@ v_pk_lshlrev_b16 v5, ttmp15, src_scc // GFX12: [0x05,0x40,0x04,0xcc,0x7b,0xfa,0x01,0x18] v_pk_lshlrev_b16 v5, m0, 0.5 -// GFX12: [0x05,0x40,0x04,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x04,0xcc,0x7d,0xe0,0x01,0x18] v_pk_lshlrev_b16 v5, exec_lo, -1 // GFX12: [0x05,0x40,0x04,0xcc,0x7e,0x82,0x01,0x18] @@ -658,6 +658,9 @@ v_pk_lshlrev_b16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX12: [0x05,0x58,0x04,0xcc,0xc1,0xfe,0x00,0x00] v_pk_lshlrev_b16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] +// GFX12: [0x05,0x40,0x04,0xcc,0xf0,0xfa,0x00,0x18] + +v_pk_lshlrev_b16 v5, 0x3800, m0 op_sel:[0,0] op_sel_hi:[1,1] // GFX12: [0x05,0x40,0x04,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] v_pk_lshlrev_b16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] @@ -688,7 +691,7 @@ v_pk_lshrrev_b16 v5, ttmp15, src_scc // GFX12: [0x05,0x40,0x05,0xcc,0x7b,0xfa,0x01,0x18] v_pk_lshrrev_b16 v5, m0, 0.5 -// GFX12: [0x05,0x40,0x05,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x05,0xcc,0x7d,0xe0,0x01,0x18] v_pk_lshrrev_b16 v5, exec_lo, -1 // GFX12: [0x05,0x40,0x05,0xcc,0x7e,0x82,0x01,0x18] @@ -703,7 +706,7 @@ v_pk_lshrrev_b16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX12: [0x05,0x58,0x05,0xcc,0xc1,0xfe,0x00,0x00] v_pk_lshrrev_b16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX12: [0x05,0x40,0x05,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x05,0xcc,0xf0,0xfa,0x00,0x18] v_pk_lshrrev_b16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX12: [0x05,0x48,0x05,0xcc,0xfd,0xd4,0x00,0x10] @@ -733,7 +736,7 @@ v_pk_mad_i16 v5, ttmp15, src_scc, ttmp15 // GFX12: [0x05,0x40,0x00,0xcc,0x7b,0xfa,0xed,0x19] v_pk_mad_i16 v5, m0, 0.5, m0 op_sel_hi:[0,0,0] -// GFX12: [0x05,0x00,0x00,0xcc,0x7d,0xfe,0xf5,0x01,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x00,0x00,0xcc,0x7d,0xe0,0xf5,0x01] v_pk_mad_i16 v5, exec_lo, -1, vcc_hi op_sel_hi:[0,0,1] // GFX12: [0x05,0x40,0x00,0xcc,0x7e,0x82,0xad,0x01] @@ -748,7 +751,7 @@ v_pk_mad_i16 v5, -1, exec_hi, src_scc op_sel:[0,0,0] op_sel_hi:[1,1,1] // GFX12: [0x05,0x40,0x00,0xcc,0xc1,0xfe,0xf4,0x1b] v_pk_mad_i16 v5, 0.5, m0, 0.5 op_sel:[1,0,0] op_sel_hi:[0,1,1] -// GFX12: [0x05,0x48,0x00,0xcc,0xff,0xfa,0xfc,0x13,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x48,0x00,0xcc,0xf0,0xfa,0xc0,0x13] v_pk_mad_i16 v5, src_scc, vcc_lo, -1 op_sel:[0,1,0] op_sel_hi:[1,0,1] // GFX12: 
[0x05,0x50,0x00,0xcc,0xfd,0xd4,0x04,0x0b] @@ -778,7 +781,7 @@ v_pk_mad_u16 v5, ttmp15, src_scc, ttmp15 // GFX12: [0x05,0x40,0x09,0xcc,0x7b,0xfa,0xed,0x19] v_pk_mad_u16 v5, m0, 0.5, m0 op_sel_hi:[0,0,0] -// GFX12: [0x05,0x00,0x09,0xcc,0x7d,0xfe,0xf5,0x01,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x00,0x09,0xcc,0x7d,0xe0,0xf5,0x01] v_pk_mad_u16 v5, exec_lo, -1, vcc_hi op_sel_hi:[0,0,1] // GFX12: [0x05,0x40,0x09,0xcc,0x7e,0x82,0xad,0x01] @@ -793,7 +796,7 @@ v_pk_mad_u16 v5, -1, exec_hi, src_scc op_sel:[0,0,0] op_sel_hi:[1,1,1] // GFX12: [0x05,0x40,0x09,0xcc,0xc1,0xfe,0xf4,0x1b] v_pk_mad_u16 v5, 0.5, m0, 0.5 op_sel:[1,0,0] op_sel_hi:[0,1,1] -// GFX12: [0x05,0x48,0x09,0xcc,0xff,0xfa,0xfc,0x13,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x48,0x09,0xcc,0xf0,0xfa,0xc0,0x13] v_pk_mad_u16 v5, src_scc, vcc_lo, -1 op_sel:[0,1,0] op_sel_hi:[1,0,1] // GFX12: [0x05,0x50,0x09,0xcc,0xfd,0xd4,0x04,0x0b] @@ -868,7 +871,7 @@ v_pk_max_i16 v5, ttmp15, src_scc // GFX12: [0x05,0x40,0x07,0xcc,0x7b,0xfa,0x01,0x18] v_pk_max_i16 v5, m0, 0.5 -// GFX12: [0x05,0x40,0x07,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x07,0xcc,0x7d,0xe0,0x01,0x18] v_pk_max_i16 v5, exec_lo, -1 // GFX12: [0x05,0x40,0x07,0xcc,0x7e,0x82,0x01,0x18] @@ -883,7 +886,7 @@ v_pk_max_i16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX12: [0x05,0x58,0x07,0xcc,0xc1,0xfe,0x00,0x00] v_pk_max_i16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX12: [0x05,0x40,0x07,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x07,0xcc,0xf0,0xfa,0x00,0x18] v_pk_max_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX12: [0x05,0x48,0x07,0xcc,0xfd,0xd4,0x00,0x10] @@ -913,7 +916,7 @@ v_pk_max_u16 v5, ttmp15, src_scc // GFX12: [0x05,0x40,0x0c,0xcc,0x7b,0xfa,0x01,0x18] v_pk_max_u16 v5, m0, 0.5 -// GFX12: [0x05,0x40,0x0c,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x0c,0xcc,0x7d,0xe0,0x01,0x18] v_pk_max_u16 v5, exec_lo, -1 // GFX12: [0x05,0x40,0x0c,0xcc,0x7e,0x82,0x01,0x18] @@ -928,7 +931,7 @@ v_pk_max_u16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX12: [0x05,0x58,0x0c,0xcc,0xc1,0xfe,0x00,0x00] v_pk_max_u16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX12: [0x05,0x40,0x0c,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x0c,0xcc,0xf0,0xfa,0x00,0x18] v_pk_max_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX12: [0x05,0x48,0x0c,0xcc,0xfd,0xd4,0x00,0x10] @@ -1003,7 +1006,7 @@ v_pk_min_i16 v5, ttmp15, src_scc // GFX12: [0x05,0x40,0x08,0xcc,0x7b,0xfa,0x01,0x18] v_pk_min_i16 v5, m0, 0.5 -// GFX12: [0x05,0x40,0x08,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x08,0xcc,0x7d,0xe0,0x01,0x18] v_pk_min_i16 v5, exec_lo, -1 // GFX12: [0x05,0x40,0x08,0xcc,0x7e,0x82,0x01,0x18] @@ -1018,7 +1021,7 @@ v_pk_min_i16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX12: [0x05,0x58,0x08,0xcc,0xc1,0xfe,0x00,0x00] v_pk_min_i16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX12: [0x05,0x40,0x08,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x08,0xcc,0xf0,0xfa,0x00,0x18] v_pk_min_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX12: [0x05,0x48,0x08,0xcc,0xfd,0xd4,0x00,0x10] @@ -1048,7 +1051,7 @@ v_pk_min_u16 v5, ttmp15, src_scc // GFX12: [0x05,0x40,0x0d,0xcc,0x7b,0xfa,0x01,0x18] v_pk_min_u16 v5, m0, 0.5 -// GFX12: [0x05,0x40,0x0d,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x0d,0xcc,0x7d,0xe0,0x01,0x18] v_pk_min_u16 v5, exec_lo, -1 // GFX12: [0x05,0x40,0x0d,0xcc,0x7e,0x82,0x01,0x18] @@ -1063,7 +1066,7 @@ v_pk_min_u16 v5, 
-1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX12: [0x05,0x58,0x0d,0xcc,0xc1,0xfe,0x00,0x00] v_pk_min_u16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX12: [0x05,0x40,0x0d,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x0d,0xcc,0xf0,0xfa,0x00,0x18] v_pk_min_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX12: [0x05,0x48,0x0d,0xcc,0xfd,0xd4,0x00,0x10] @@ -1138,7 +1141,7 @@ v_pk_mul_lo_u16 v5, ttmp15, src_scc // GFX12: [0x05,0x40,0x01,0xcc,0x7b,0xfa,0x01,0x18] v_pk_mul_lo_u16 v5, m0, 0.5 -// GFX12: [0x05,0x40,0x01,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x01,0xcc,0x7d,0xe0,0x01,0x18] v_pk_mul_lo_u16 v5, exec_lo, -1 // GFX12: [0x05,0x40,0x01,0xcc,0x7e,0x82,0x01,0x18] @@ -1153,7 +1156,7 @@ v_pk_mul_lo_u16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX12: [0x05,0x58,0x01,0xcc,0xc1,0xfe,0x00,0x00] v_pk_mul_lo_u16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX12: [0x05,0x40,0x01,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x01,0xcc,0xf0,0xfa,0x00,0x18] v_pk_mul_lo_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX12: [0x05,0x48,0x01,0xcc,0xfd,0xd4,0x00,0x10] @@ -1183,7 +1186,7 @@ v_pk_sub_i16 v5, ttmp15, src_scc // GFX12: [0x05,0x40,0x03,0xcc,0x7b,0xfa,0x01,0x18] v_pk_sub_i16 v5, m0, 0.5 -// GFX12: [0x05,0x40,0x03,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x03,0xcc,0x7d,0xe0,0x01,0x18] v_pk_sub_i16 v5, exec_lo, -1 // GFX12: [0x05,0x40,0x03,0xcc,0x7e,0x82,0x01,0x18] @@ -1198,7 +1201,7 @@ v_pk_sub_i16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX12: [0x05,0x58,0x03,0xcc,0xc1,0xfe,0x00,0x00] v_pk_sub_i16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX12: [0x05,0x40,0x03,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x03,0xcc,0xf0,0xfa,0x00,0x18] v_pk_sub_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX12: [0x05,0x48,0x03,0xcc,0xfd,0xd4,0x00,0x10] @@ -1228,7 +1231,7 @@ v_pk_sub_u16 v5, ttmp15, src_scc // GFX12: [0x05,0x40,0x0b,0xcc,0x7b,0xfa,0x01,0x18] v_pk_sub_u16 v5, m0, 0.5 -// GFX12: [0x05,0x40,0x0b,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x0b,0xcc,0x7d,0xe0,0x01,0x18] v_pk_sub_u16 v5, exec_lo, -1 // GFX12: [0x05,0x40,0x0b,0xcc,0x7e,0x82,0x01,0x18] @@ -1243,7 +1246,7 @@ v_pk_sub_u16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX12: [0x05,0x58,0x0b,0xcc,0xc1,0xfe,0x00,0x00] v_pk_sub_u16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX12: [0x05,0x40,0x0b,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x0b,0xcc,0xf0,0xfa,0x00,0x18] v_pk_sub_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX12: [0x05,0x48,0x0b,0xcc,0xfd,0xd4,0x00,0x10] diff --git a/llvm/test/MC/AMDGPU/literalv216.s b/llvm/test/MC/AMDGPU/literalv216.s index 5b1c7a76ca853..c695bc3600c38 100644 --- a/llvm/test/MC/AMDGPU/literalv216.s +++ b/llvm/test/MC/AMDGPU/literalv216.s @@ -113,6 +113,10 @@ v_pk_add_f16 v1, 0x0001, v2 // GFX10: v_pk_add_f16 v1, 1, v2 ; encoding: [0x01,0x40,0x0f,0xcc,0x81,0x04,0x02,0x18] v_pk_add_f16 v1, 0xffff, v2 +// NOGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: literal operands are not supported +// GFX10: v_pk_add_f16 v1, 0xffff, v2 ; encoding: [0x01,0x40,0x0f,0xcc,0xff,0x04,0x02,0x18,0xff,0xff,0x00,0x00] + +v_pk_add_f16 v1, 0xffffffff, v2 // GFX9: v_pk_add_f16 v1, -1, v2 ; encoding: [0x01,0x40,0x8f,0xd3,0xc1,0x04,0x02,0x18] // GFX10: v_pk_add_f16 v1, -1, v2 ; encoding: [0x01,0x40,0x0f,0xcc,0xc1,0x04,0x02,0x18] @@ -153,6 +157,10 @@ v_pk_add_f16 v1, 0x3118, v2 // GFX10: v_pk_add_f16 v1, 
0.15915494, v2 ; encoding: [0x01,0x40,0x0f,0xcc,0xf8,0x04,0x02,0x18] v_pk_add_f16 v1, 65535, v2 +// NOGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: literal operands are not supported +// GFX10: v_pk_add_f16 v1, 0xffff, v2 ; encoding: [0x01,0x40,0x0f,0xcc,0xff,0x04,0x02,0x18,0xff,0xff,0x00,0x00] + +v_pk_add_f16 v1, 4294967295, v2 // GFX9: v_pk_add_f16 v1, -1, v2 ; encoding: [0x01,0x40,0x8f,0xd3,0xc1,0x04,0x02,0x18] // GFX10: v_pk_add_f16 v1, -1, v2 ; encoding: [0x01,0x40,0x0f,0xcc,0xc1,0x04,0x02,0x18] @@ -242,7 +250,7 @@ v_pk_add_f16 v5, v1, 0.1234 v_pk_add_u16 v5, v1, 0.1234 // NOGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: literal operands are not supported -// GFX10: v_pk_add_u16 v5, v1, 0x2fe6 ; encoding: [0x05,0x40,0x0a,0xcc,0x01,0xff,0x01,0x18,0xe6,0x2f,0x00,0x00] +// GFX10: v_pk_add_u16 v5, v1, 0x3dfcb924 ; encoding: [0x05,0x40,0x0a,0xcc,0x01,0xff,0x01,0x18,0x24,0xb9,0xfc,0x3d] v_pk_fma_f16 v5, 0.1234, v2, v3 // NOGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: literal operands are not supported @@ -258,23 +266,23 @@ v_pk_fma_f16 v5, v1, v2, 0.1234 v_pk_mad_i16 v5, 0.1234, v2, v3 // NOGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: literal operands are not supported -// GFX10: v_pk_mad_i16 v5, 0x2fe6, v2, v3 ; encoding: [0x05,0x40,0x00,0xcc,0xff,0x04,0x0e,0x1c,0xe6,0x2f,0x00,0x00] +// GFX10: v_pk_mad_i16 v5, 0x3dfcb924, v2, v3 ; encoding: [0x05,0x40,0x00,0xcc,0xff,0x04,0x0e,0x1c,0x24,0xb9,0xfc,0x3d] v_pk_mad_i16 v5, v1, 0.1234, v3 // NOGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: literal operands are not supported -// GFX10: v_pk_mad_i16 v5, v1, 0x2fe6, v3 ; encoding: [0x05,0x40,0x00,0xcc,0x01,0xff,0x0d,0x1c,0xe6,0x2f,0x00,0x00] +// GFX10: v_pk_mad_i16 v5, v1, 0x3dfcb924, v3 ; encoding: [0x05,0x40,0x00,0xcc,0x01,0xff,0x0d,0x1c,0x24,0xb9,0xfc,0x3d] v_pk_mad_i16 v5, v1, v2, 0.1234 // NOGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: literal operands are not supported -// GFX10: v_pk_mad_i16 v5, v1, v2, 0x2fe6 ; encoding: [0x05,0x40,0x00,0xcc,0x01,0x05,0xfe,0x1b,0xe6,0x2f,0x00,0x00] +// GFX10: v_pk_mad_i16 v5, v1, v2, 0x3dfcb924 ; encoding: [0x05,0x40,0x00,0xcc,0x01,0x05,0xfe,0x1b,0x24,0xb9,0xfc,0x3d] v_pk_add_f16 v5, v1, 123456.0 // NOGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction // NOGFX10: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction v_pk_add_u16 v5, v1, 123456.0 -// NOGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction -// NOGFX10: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// NOGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: literal operands are not supported +// GFX10: v_pk_add_u16 v5, v1, 0x47f12000 ; encoding: [0x05,0x40,0x0a,0xcc,0x01,0xff,0x01,0x18,0x00,0x20,0xf1,0x47] //===----------------------------------------------------------------------===// // Packed VOP2 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3p_literalv216.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3p_literalv216.txt index e42d0de5db86f..a022c79fe97e6 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3p_literalv216.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3p_literalv216.txt @@ -79,7 +79,7 @@ # GFX10: v_pk_fma_f16 v5, -1, -2, -3 ; encoding: [0x05,0x40,0x0e,0xcc,0xc1,0x84,0x0d,0x1b] 0x05,0x40,0x0e,0xcc,0xc1,0x84,0x0d,0x1b -# GFX10: v_pk_mad_i16 v5, 0x3c00, 0x4000, 0x4400 ; encoding: [0x05,0x40,0x00,0xcc,0xff,0xfe,0xfd,0x1b,0x00,0x3c,0x00,0x00] +# GFX10: v_pk_mad_i16 v5, 1.0, 2.0, 4.0 ; encoding: [0x05,0x40,0x00,0xcc,0xf2,0xe8,0xd9,0x1b] 0x05,0x40,0x00,0xcc,0xf2,0xe8,0xd9,0x1b # GFX10: v_pk_mad_u16 v5, -1, -2, -3 ; encoding: 
[0x05,0x40,0x09,0xcc,0xc1,0x84,0x0d,0x1b] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p.txt index bc2cb5f06c4f4..838e6e0d814d5 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p.txt @@ -466,7 +466,7 @@ # GFX11: v_pk_add_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x02,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x02,0xcc,0x7b,0xfa,0x01,0x18 -# GFX11: v_pk_add_i16 v5, m0, 0x3800 +# GFX11: v_pk_add_i16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x02,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x02,0xcc,0x7d,0xe0,0x01,0x18 # GFX11: v_pk_add_i16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x02,0xcc,0x7e,0x82,0x01,0x18] @@ -481,7 +481,7 @@ # GFX11: v_pk_add_i16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x02,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x02,0xcc,0xc1,0xfe,0x00,0x18 -# GFX11: v_pk_add_i16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX11: v_pk_add_i16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x02,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x02,0xcc,0xf0,0xfa,0x00,0x00 # GFX11: v_pk_add_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x02,0xcc,0xfd,0xd4,0x00,0x10] @@ -511,7 +511,7 @@ # GFX11: v_pk_add_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x0a,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x0a,0xcc,0x7b,0xfa,0x01,0x18 -# GFX11: v_pk_add_u16 v5, m0, 0x3800 +# GFX11: v_pk_add_u16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x0a,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x0a,0xcc,0x7d,0xe0,0x01,0x18 # GFX11: v_pk_add_u16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0x82,0x01,0x18] @@ -526,7 +526,7 @@ # GFX11: v_pk_add_u16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x0a,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x0a,0xcc,0xc1,0xfe,0x00,0x18 -# GFX11: v_pk_add_u16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX11: v_pk_add_u16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x0a,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x0a,0xcc,0xf0,0xfa,0x00,0x00 # GFX11: v_pk_add_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x0a,0xcc,0xfd,0xd4,0x00,0x10] @@ -556,7 +556,7 @@ # GFX11: v_pk_ashrrev_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x06,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x06,0xcc,0x7b,0xfa,0x01,0x18 -# GFX11: v_pk_ashrrev_i16 v5, m0, 0x3800 +# GFX11: v_pk_ashrrev_i16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x06,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x06,0xcc,0x7d,0xe0,0x01,0x18 # GFX11: v_pk_ashrrev_i16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x06,0xcc,0x7e,0x82,0x01,0x18] @@ -571,7 +571,7 @@ # GFX11: v_pk_ashrrev_i16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x06,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x06,0xcc,0xc1,0xfe,0x00,0x18 -# GFX11: v_pk_ashrrev_i16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX11: v_pk_ashrrev_i16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x06,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x06,0xcc,0xf0,0xfa,0x00,0x00 # GFX11: v_pk_ashrrev_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x06,0xcc,0xfd,0xd4,0x00,0x10] @@ -646,7 +646,7 @@ # GFX11: v_pk_lshlrev_b16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x04,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x04,0xcc,0x7b,0xfa,0x01,0x18 -# GFX11: v_pk_lshlrev_b16 v5, m0, 0x3800 +# GFX11: v_pk_lshlrev_b16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x04,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x04,0xcc,0x7d,0xe0,0x01,0x18 # GFX11: v_pk_lshlrev_b16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x04,0xcc,0x7e,0x82,0x01,0x18] @@ -661,7 
+661,7 @@ # GFX11: v_pk_lshlrev_b16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x04,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x04,0xcc,0xc1,0xfe,0x00,0x18 -# GFX11: v_pk_lshlrev_b16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX11: v_pk_lshlrev_b16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x04,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x04,0xcc,0xf0,0xfa,0x00,0x00 # GFX11: v_pk_lshlrev_b16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x04,0xcc,0xfd,0xd4,0x00,0x10] @@ -691,7 +691,7 @@ # GFX11: v_pk_lshrrev_b16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x05,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x05,0xcc,0x7b,0xfa,0x01,0x18 -# GFX11: v_pk_lshrrev_b16 v5, m0, 0x3800 +# GFX11: v_pk_lshrrev_b16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x05,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x05,0xcc,0x7d,0xe0,0x01,0x18 # GFX11: v_pk_lshrrev_b16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x05,0xcc,0x7e,0x82,0x01,0x18] @@ -706,7 +706,7 @@ # GFX11: v_pk_lshrrev_b16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x05,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x05,0xcc,0xc1,0xfe,0x00,0x18 -# GFX11: v_pk_lshrrev_b16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX11: v_pk_lshrrev_b16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x05,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x05,0xcc,0xf0,0xfa,0x00,0x00 # GFX11: v_pk_lshrrev_b16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x05,0xcc,0xfd,0xd4,0x00,0x10] @@ -736,7 +736,7 @@ # GFX11: v_pk_mad_i16 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x40,0x00,0xcc,0x7b,0xfa,0xed,0x19] 0x05,0x40,0x00,0xcc,0x7b,0xfa,0xed,0x19 -# GFX11: v_pk_mad_i16 v5, m0, 0x3800, m0 +# GFX11: v_pk_mad_i16 v5, m0, 0.5, m0 ; encoding: [0x05,0x40,0x00,0xcc,0x7d,0xe0,0xf5,0x19] 0x05,0x40,0x00,0xcc,0x7d,0xe0,0xf5,0x19 # GFX11: v_pk_mad_i16 v5, exec_lo, -1, vcc_hi op_sel_hi:[0,0,0] ; encoding: [0x05,0x00,0x00,0xcc,0x7e,0x82,0xad,0x01] @@ -751,7 +751,7 @@ # GFX11: v_pk_mad_i16 v5, -1, exec_hi, src_scc op_sel:[1,1,1] op_sel_hi:[1,0,0] ; encoding: [0x05,0x38,0x00,0xcc,0xc1,0xfe,0xf4,0x0b] 0x05,0x38,0x00,0xcc,0xc1,0xfe,0xf4,0x0b -# GFX11: v_pk_mad_i16 v5, 0x3800, m0, 0x3800 op_sel:[1,0,0] op_sel_hi:[0,1,1] +# GFX11: v_pk_mad_i16 v5, 0.5, m0, 0.5 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; encoding: [0x05,0x48,0x00,0xcc,0xf0,0xfa,0xc0,0x13] 0x05,0x48,0x00,0xcc,0xf0,0xfa,0xc0,0x13 # GFX11: v_pk_mad_i16 v5, src_scc, vcc_lo, -1 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; encoding: [0x05,0x50,0x00,0xcc,0xfd,0xd4,0x04,0x0b] @@ -781,7 +781,7 @@ # GFX11: v_pk_mad_u16 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x40,0x09,0xcc,0x7b,0xfa,0xed,0x19] 0x05,0x40,0x09,0xcc,0x7b,0xfa,0xed,0x19 -# GFX11: v_pk_mad_u16 v5, m0, 0x3800, m0 +# GFX11: v_pk_mad_u16 v5, m0, 0.5, m0 ; encoding: [0x05,0x40,0x09,0xcc,0x7d,0xe0,0xf5,0x19] 0x05,0x40,0x09,0xcc,0x7d,0xe0,0xf5,0x19 # GFX11: v_pk_mad_u16 v5, exec_lo, -1, vcc_hi op_sel_hi:[0,0,0] ; encoding: [0x05,0x00,0x09,0xcc,0x7e,0x82,0xad,0x01] @@ -796,7 +796,7 @@ # GFX11: v_pk_mad_u16 v5, -1, exec_hi, src_scc op_sel:[1,1,1] op_sel_hi:[1,0,0] ; encoding: [0x05,0x38,0x09,0xcc,0xc1,0xfe,0xf4,0x0b] 0x05,0x38,0x09,0xcc,0xc1,0xfe,0xf4,0x0b -# GFX11: v_pk_mad_u16 v5, 0x3800, m0, 0x3800 op_sel:[1,0,0] op_sel_hi:[0,1,1] +# GFX11: v_pk_mad_u16 v5, 0.5, m0, 0.5 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; encoding: [0x05,0x48,0x09,0xcc,0xf0,0xfa,0xc0,0x13] 0x05,0x48,0x09,0xcc,0xf0,0xfa,0xc0,0x13 # GFX11: v_pk_mad_u16 v5, src_scc, vcc_lo, -1 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; encoding: [0x05,0x50,0x09,0xcc,0xfd,0xd4,0x04,0x0b] @@ -871,7 +871,7 @@ # GFX11: 
v_pk_max_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x07,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x07,0xcc,0x7b,0xfa,0x01,0x18 -# GFX11: v_pk_max_i16 v5, m0, 0x3800 +# GFX11: v_pk_max_i16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x07,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x07,0xcc,0x7d,0xe0,0x01,0x18 # GFX11: v_pk_max_i16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x07,0xcc,0x7e,0x82,0x01,0x18] @@ -886,7 +886,7 @@ # GFX11: v_pk_max_i16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x07,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x07,0xcc,0xc1,0xfe,0x00,0x18 -# GFX11: v_pk_max_i16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX11: v_pk_max_i16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x07,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x07,0xcc,0xf0,0xfa,0x00,0x00 # GFX11: v_pk_max_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x07,0xcc,0xfd,0xd4,0x00,0x10] @@ -916,7 +916,7 @@ # GFX11: v_pk_max_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x0c,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x0c,0xcc,0x7b,0xfa,0x01,0x18 -# GFX11: v_pk_max_u16 v5, m0, 0x3800 +# GFX11: v_pk_max_u16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x0c,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x0c,0xcc,0x7d,0xe0,0x01,0x18 # GFX11: v_pk_max_u16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x0c,0xcc,0x7e,0x82,0x01,0x18] @@ -931,7 +931,7 @@ # GFX11: v_pk_max_u16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x0c,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x0c,0xcc,0xc1,0xfe,0x00,0x18 -# GFX11: v_pk_max_u16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX11: v_pk_max_u16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x0c,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x0c,0xcc,0xf0,0xfa,0x00,0x00 # GFX11: v_pk_max_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x0c,0xcc,0xfd,0xd4,0x00,0x10] @@ -1006,7 +1006,7 @@ # GFX11: v_pk_min_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x08,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x08,0xcc,0x7b,0xfa,0x01,0x18 -# GFX11: v_pk_min_i16 v5, m0, 0x3800 +# GFX11: v_pk_min_i16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x08,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x08,0xcc,0x7d,0xe0,0x01,0x18 # GFX11: v_pk_min_i16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x08,0xcc,0x7e,0x82,0x01,0x18] @@ -1021,7 +1021,7 @@ # GFX11: v_pk_min_i16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x08,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x08,0xcc,0xc1,0xfe,0x00,0x18 -# GFX11: v_pk_min_i16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX11: v_pk_min_i16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x08,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x08,0xcc,0xf0,0xfa,0x00,0x00 # GFX11: v_pk_min_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x08,0xcc,0xfd,0xd4,0x00,0x10] @@ -1051,7 +1051,7 @@ # GFX11: v_pk_min_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x0d,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x0d,0xcc,0x7b,0xfa,0x01,0x18 -# GFX11: v_pk_min_u16 v5, m0, 0x3800 +# GFX11: v_pk_min_u16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x0d,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x0d,0xcc,0x7d,0xe0,0x01,0x18 # GFX11: v_pk_min_u16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x0d,0xcc,0x7e,0x82,0x01,0x18] @@ -1066,7 +1066,7 @@ # GFX11: v_pk_min_u16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x0d,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x0d,0xcc,0xc1,0xfe,0x00,0x18 -# GFX11: v_pk_min_u16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX11: v_pk_min_u16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x0d,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x0d,0xcc,0xf0,0xfa,0x00,0x00 # GFX11: v_pk_min_u16 v5, 
src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x0d,0xcc,0xfd,0xd4,0x00,0x10] @@ -1141,7 +1141,7 @@ # GFX11: v_pk_mul_lo_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x01,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x01,0xcc,0x7b,0xfa,0x01,0x18 -# GFX11: v_pk_mul_lo_u16 v5, m0, 0x3800 +# GFX11: v_pk_mul_lo_u16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x01,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x01,0xcc,0x7d,0xe0,0x01,0x18 # GFX11: v_pk_mul_lo_u16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x01,0xcc,0x7e,0x82,0x01,0x18] @@ -1156,7 +1156,7 @@ # GFX11: v_pk_mul_lo_u16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x01,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x01,0xcc,0xc1,0xfe,0x00,0x18 -# GFX11: v_pk_mul_lo_u16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX11: v_pk_mul_lo_u16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x01,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x01,0xcc,0xf0,0xfa,0x00,0x00 # GFX11: v_pk_mul_lo_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x01,0xcc,0xfd,0xd4,0x00,0x10] @@ -1186,7 +1186,7 @@ # GFX11: v_pk_sub_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x03,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x03,0xcc,0x7b,0xfa,0x01,0x18 -# GFX11: v_pk_sub_i16 v5, m0, 0x3800 +# GFX11: v_pk_sub_i16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x03,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x03,0xcc,0x7d,0xe0,0x01,0x18 # GFX11: v_pk_sub_i16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x03,0xcc,0x7e,0x82,0x01,0x18] @@ -1201,7 +1201,7 @@ # GFX11: v_pk_sub_i16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x03,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x03,0xcc,0xc1,0xfe,0x00,0x18 -# GFX11: v_pk_sub_i16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX11: v_pk_sub_i16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x03,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x03,0xcc,0xf0,0xfa,0x00,0x00 # GFX11: v_pk_sub_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x03,0xcc,0xfd,0xd4,0x00,0x10] @@ -1231,7 +1231,7 @@ # GFX11: v_pk_sub_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x0b,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x0b,0xcc,0x7b,0xfa,0x01,0x18 -# GFX11: v_pk_sub_u16 v5, m0, 0x3800 +# GFX11: v_pk_sub_u16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x0b,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x0b,0xcc,0x7d,0xe0,0x01,0x18 # GFX11: v_pk_sub_u16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x0b,0xcc,0x7e,0x82,0x01,0x18] @@ -1246,7 +1246,7 @@ # GFX11: v_pk_sub_u16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x0b,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x0b,0xcc,0xc1,0xfe,0x00,0x18 -# GFX11: v_pk_sub_u16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX11: v_pk_sub_u16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x0b,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x0b,0xcc,0xf0,0xfa,0x00,0x00 # GFX11: v_pk_sub_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x0b,0xcc,0xfd,0xd4,0x00,0x10] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p.txt index 373cd71261449..44d8995c5c436 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p.txt @@ -463,7 +463,7 @@ # GFX12: v_pk_add_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x02,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x02,0xcc,0x7b,0xfa,0x01,0x18 -# GFX12: v_pk_add_i16 v5, m0, 0x3800 +# GFX12: v_pk_add_i16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x02,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x02,0xcc,0x7d,0xe0,0x01,0x18 # GFX12: v_pk_add_i16 v5, exec_lo, -1 ; encoding: 
[0x05,0x40,0x02,0xcc,0x7e,0x82,0x01,0x18] @@ -478,7 +478,7 @@ # GFX12: v_pk_add_i16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x02,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x02,0xcc,0xc1,0xfe,0x00,0x18 -# GFX12: v_pk_add_i16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX12: v_pk_add_i16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x02,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x02,0xcc,0xf0,0xfa,0x00,0x00 # GFX12: v_pk_add_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x02,0xcc,0xfd,0xd4,0x00,0x10] @@ -508,7 +508,7 @@ # GFX12: v_pk_add_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x0a,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x0a,0xcc,0x7b,0xfa,0x01,0x18 -# GFX12: v_pk_add_u16 v5, m0, 0x3800 +# GFX12: v_pk_add_u16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x0a,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x0a,0xcc,0x7d,0xe0,0x01,0x18 # GFX12: v_pk_add_u16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0x82,0x01,0x18] @@ -523,7 +523,7 @@ # GFX12: v_pk_add_u16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x0a,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x0a,0xcc,0xc1,0xfe,0x00,0x18 -# GFX12: v_pk_add_u16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX12: v_pk_add_u16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x0a,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x0a,0xcc,0xf0,0xfa,0x00,0x00 # GFX12: v_pk_add_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x0a,0xcc,0xfd,0xd4,0x00,0x10] @@ -553,7 +553,7 @@ # GFX12: v_pk_ashrrev_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x06,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x06,0xcc,0x7b,0xfa,0x01,0x18 -# GFX12: v_pk_ashrrev_i16 v5, m0, 0x3800 +# GFX12: v_pk_ashrrev_i16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x06,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x06,0xcc,0x7d,0xe0,0x01,0x18 # GFX12: v_pk_ashrrev_i16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x06,0xcc,0x7e,0x82,0x01,0x18] @@ -568,7 +568,7 @@ # GFX12: v_pk_ashrrev_i16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x06,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x06,0xcc,0xc1,0xfe,0x00,0x18 -# GFX12: v_pk_ashrrev_i16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX12: v_pk_ashrrev_i16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x06,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x06,0xcc,0xf0,0xfa,0x00,0x00 # GFX12: v_pk_ashrrev_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x06,0xcc,0xfd,0xd4,0x00,0x10] @@ -643,7 +643,7 @@ # GFX12: v_pk_lshlrev_b16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x04,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x04,0xcc,0x7b,0xfa,0x01,0x18 -# GFX12: v_pk_lshlrev_b16 v5, m0, 0x3800 +# GFX12: v_pk_lshlrev_b16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x04,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x04,0xcc,0x7d,0xe0,0x01,0x18 # GFX12: v_pk_lshlrev_b16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x04,0xcc,0x7e,0x82,0x01,0x18] @@ -658,7 +658,7 @@ # GFX12: v_pk_lshlrev_b16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x04,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x04,0xcc,0xc1,0xfe,0x00,0x18 -# GFX12: v_pk_lshlrev_b16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX12: v_pk_lshlrev_b16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x04,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x04,0xcc,0xf0,0xfa,0x00,0x00 # GFX12: v_pk_lshlrev_b16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x04,0xcc,0xfd,0xd4,0x00,0x10] @@ -688,7 +688,7 @@ # GFX12: v_pk_lshrrev_b16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x05,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x05,0xcc,0x7b,0xfa,0x01,0x18 -# GFX12: v_pk_lshrrev_b16 v5, m0, 
0x3800 +# GFX12: v_pk_lshrrev_b16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x05,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x05,0xcc,0x7d,0xe0,0x01,0x18 # GFX12: v_pk_lshrrev_b16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x05,0xcc,0x7e,0x82,0x01,0x18] @@ -703,7 +703,7 @@ # GFX12: v_pk_lshrrev_b16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x05,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x05,0xcc,0xc1,0xfe,0x00,0x18 -# GFX12: v_pk_lshrrev_b16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX12: v_pk_lshrrev_b16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x05,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x05,0xcc,0xf0,0xfa,0x00,0x00 # GFX12: v_pk_lshrrev_b16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x05,0xcc,0xfd,0xd4,0x00,0x10] @@ -733,7 +733,7 @@ # GFX12: v_pk_mad_i16 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x40,0x00,0xcc,0x7b,0xfa,0xed,0x19] 0x05,0x40,0x00,0xcc,0x7b,0xfa,0xed,0x19 -# GFX12: v_pk_mad_i16 v5, m0, 0x3800, m0 +# GFX12: v_pk_mad_i16 v5, m0, 0.5, m0 ; encoding: [0x05,0x40,0x00,0xcc,0x7d,0xe0,0xf5,0x19] 0x05,0x40,0x00,0xcc,0x7d,0xe0,0xf5,0x19 # GFX12: v_pk_mad_i16 v5, exec_lo, -1, vcc_hi op_sel_hi:[0,0,0] ; encoding: [0x05,0x00,0x00,0xcc,0x7e,0x82,0xad,0x01] @@ -748,7 +748,7 @@ # GFX12: v_pk_mad_i16 v5, -1, exec_hi, src_scc op_sel:[1,1,1] op_sel_hi:[1,0,0] ; encoding: [0x05,0x38,0x00,0xcc,0xc1,0xfe,0xf4,0x0b] 0x05,0x38,0x00,0xcc,0xc1,0xfe,0xf4,0x0b -# GFX12: v_pk_mad_i16 v5, 0x3800, m0, 0x3800 op_sel:[1,0,0] op_sel_hi:[0,1,1] +# GFX12: v_pk_mad_i16 v5, 0.5, m0, 0.5 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; encoding: [0x05,0x48,0x00,0xcc,0xf0,0xfa,0xc0,0x13] 0x05,0x48,0x00,0xcc,0xf0,0xfa,0xc0,0x13 # GFX12: v_pk_mad_i16 v5, src_scc, vcc_lo, -1 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; encoding: [0x05,0x50,0x00,0xcc,0xfd,0xd4,0x04,0x0b] @@ -778,7 +778,7 @@ # GFX12: v_pk_mad_u16 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x40,0x09,0xcc,0x7b,0xfa,0xed,0x19] 0x05,0x40,0x09,0xcc,0x7b,0xfa,0xed,0x19 -# GFX12: v_pk_mad_u16 v5, m0, 0x3800, m0 +# GFX12: v_pk_mad_u16 v5, m0, 0.5, m0 ; encoding: [0x05,0x40,0x09,0xcc,0x7d,0xe0,0xf5,0x19] 0x05,0x40,0x09,0xcc,0x7d,0xe0,0xf5,0x19 # GFX12: v_pk_mad_u16 v5, exec_lo, -1, vcc_hi op_sel_hi:[0,0,0] ; encoding: [0x05,0x00,0x09,0xcc,0x7e,0x82,0xad,0x01] @@ -793,7 +793,7 @@ # GFX12: v_pk_mad_u16 v5, -1, exec_hi, src_scc op_sel:[1,1,1] op_sel_hi:[1,0,0] ; encoding: [0x05,0x38,0x09,0xcc,0xc1,0xfe,0xf4,0x0b] 0x05,0x38,0x09,0xcc,0xc1,0xfe,0xf4,0x0b -# GFX12: v_pk_mad_u16 v5, 0x3800, m0, 0x3800 op_sel:[1,0,0] op_sel_hi:[0,1,1] +# GFX12: v_pk_mad_u16 v5, 0.5, m0, 0.5 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; encoding: [0x05,0x48,0x09,0xcc,0xf0,0xfa,0xc0,0x13] 0x05,0x48,0x09,0xcc,0xf0,0xfa,0xc0,0x13 # GFX12: v_pk_mad_u16 v5, src_scc, vcc_lo, -1 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; encoding: [0x05,0x50,0x09,0xcc,0xfd,0xd4,0x04,0x0b] @@ -868,7 +868,7 @@ # GFX12: v_pk_max_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x07,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x07,0xcc,0x7b,0xfa,0x01,0x18 -# GFX12: v_pk_max_i16 v5, m0, 0x3800 +# GFX12: v_pk_max_i16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x07,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x07,0xcc,0x7d,0xe0,0x01,0x18 # GFX12: v_pk_max_i16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x07,0xcc,0x7e,0x82,0x01,0x18] @@ -883,7 +883,7 @@ # GFX12: v_pk_max_i16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x07,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x07,0xcc,0xc1,0xfe,0x00,0x18 -# GFX12: v_pk_max_i16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX12: v_pk_max_i16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: 
[0x05,0x58,0x07,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x07,0xcc,0xf0,0xfa,0x00,0x00 # GFX12: v_pk_max_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x07,0xcc,0xfd,0xd4,0x00,0x10] @@ -913,7 +913,7 @@ # GFX12: v_pk_max_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x0c,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x0c,0xcc,0x7b,0xfa,0x01,0x18 -# GFX12: v_pk_max_u16 v5, m0, 0x3800 +# GFX12: v_pk_max_u16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x0c,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x0c,0xcc,0x7d,0xe0,0x01,0x18 # GFX12: v_pk_max_u16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x0c,0xcc,0x7e,0x82,0x01,0x18] @@ -928,7 +928,7 @@ # GFX12: v_pk_max_u16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x0c,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x0c,0xcc,0xc1,0xfe,0x00,0x18 -# GFX12: v_pk_max_u16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX12: v_pk_max_u16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x0c,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x0c,0xcc,0xf0,0xfa,0x00,0x00 # GFX12: v_pk_max_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x0c,0xcc,0xfd,0xd4,0x00,0x10] @@ -1003,7 +1003,7 @@ # GFX12: v_pk_min_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x08,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x08,0xcc,0x7b,0xfa,0x01,0x18 -# GFX12: v_pk_min_i16 v5, m0, 0x3800 +# GFX12: v_pk_min_i16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x08,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x08,0xcc,0x7d,0xe0,0x01,0x18 # GFX12: v_pk_min_i16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x08,0xcc,0x7e,0x82,0x01,0x18] @@ -1018,7 +1018,7 @@ # GFX12: v_pk_min_i16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x08,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x08,0xcc,0xc1,0xfe,0x00,0x18 -# GFX12: v_pk_min_i16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX12: v_pk_min_i16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x08,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x08,0xcc,0xf0,0xfa,0x00,0x00 # GFX12: v_pk_min_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x08,0xcc,0xfd,0xd4,0x00,0x10] @@ -1048,7 +1048,7 @@ # GFX12: v_pk_min_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x0d,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x0d,0xcc,0x7b,0xfa,0x01,0x18 -# GFX12: v_pk_min_u16 v5, m0, 0x3800 +# GFX12: v_pk_min_u16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x0d,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x0d,0xcc,0x7d,0xe0,0x01,0x18 # GFX12: v_pk_min_u16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x0d,0xcc,0x7e,0x82,0x01,0x18] @@ -1063,7 +1063,7 @@ # GFX12: v_pk_min_u16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x0d,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x0d,0xcc,0xc1,0xfe,0x00,0x18 -# GFX12: v_pk_min_u16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX12: v_pk_min_u16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x0d,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x0d,0xcc,0xf0,0xfa,0x00,0x00 # GFX12: v_pk_min_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x0d,0xcc,0xfd,0xd4,0x00,0x10] @@ -1138,7 +1138,7 @@ # GFX12: v_pk_mul_lo_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x01,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x01,0xcc,0x7b,0xfa,0x01,0x18 -# GFX12: v_pk_mul_lo_u16 v5, m0, 0x3800 +# GFX12: v_pk_mul_lo_u16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x01,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x01,0xcc,0x7d,0xe0,0x01,0x18 # GFX12: v_pk_mul_lo_u16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x01,0xcc,0x7e,0x82,0x01,0x18] @@ -1153,7 +1153,7 @@ # GFX12: v_pk_mul_lo_u16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x01,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x01,0xcc,0xc1,0xfe,0x00,0x18 -# 
GFX12: v_pk_mul_lo_u16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX12: v_pk_mul_lo_u16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x01,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x01,0xcc,0xf0,0xfa,0x00,0x00 # GFX12: v_pk_mul_lo_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x01,0xcc,0xfd,0xd4,0x00,0x10] @@ -1183,7 +1183,7 @@ # GFX12: v_pk_sub_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x03,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x03,0xcc,0x7b,0xfa,0x01,0x18 -# GFX12: v_pk_sub_i16 v5, m0, 0x3800 +# GFX12: v_pk_sub_i16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x03,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x03,0xcc,0x7d,0xe0,0x01,0x18 # GFX12: v_pk_sub_i16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x03,0xcc,0x7e,0x82,0x01,0x18] @@ -1198,7 +1198,7 @@ # GFX12: v_pk_sub_i16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x03,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x03,0xcc,0xc1,0xfe,0x00,0x18 -# GFX12: v_pk_sub_i16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX12: v_pk_sub_i16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x03,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x03,0xcc,0xf0,0xfa,0x00,0x00 # GFX12: v_pk_sub_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x03,0xcc,0xfd,0xd4,0x00,0x10] @@ -1228,7 +1228,7 @@ # GFX12: v_pk_sub_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x0b,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x0b,0xcc,0x7b,0xfa,0x01,0x18 -# GFX12: v_pk_sub_u16 v5, m0, 0x3800 +# GFX12: v_pk_sub_u16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x0b,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x0b,0xcc,0x7d,0xe0,0x01,0x18 # GFX12: v_pk_sub_u16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x0b,0xcc,0x7e,0x82,0x01,0x18] @@ -1243,7 +1243,7 @@ # GFX12: v_pk_sub_u16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x0b,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x0b,0xcc,0xc1,0xfe,0x00,0x18 -# GFX12: v_pk_sub_u16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX12: v_pk_sub_u16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x0b,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x0b,0xcc,0xf0,0xfa,0x00,0x00 # GFX12: v_pk_sub_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x0b,0xcc,0xfd,0xd4,0x00,0x10] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3p.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3p.txt index 215453d0331d4..003ece98beeb6 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3p.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3p.txt @@ -42,10 +42,10 @@ # CHECK: v_pk_mad_i16 v5, -1, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0xc1,0x04,0x0e,0x1c] 0x05,0x40,0x80,0xd3,0xc1,0x04,0x0e,0x1c -# CHECK: v_pk_mad_i16 v5, 0x3800, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0xff,0x04,0x0e,0x1c] +# CHECK: v_pk_mad_i16 v5, 0.5, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0xf0,0x04,0x0e,0x1c] 0x05,0x40,0x80,0xd3,0xf0,0x04,0x0e,0x1c -# CHECK: v_pk_mad_i16 v5, 0xc400, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0xff,0x04,0x0e,0x1c] +# CHECK: v_pk_mad_i16 v5, -4.0, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0xf7,0x04,0x0e,0x1c] 0x05,0x40,0x80,0xd3,0xf7,0x04,0x0e,0x1c # CHECK: v_pk_mad_i16 v5, v1, v255, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xff,0x0f,0x1c] @@ -84,10 +84,10 @@ # CHECK: v_pk_mad_i16 v5, v1, -1, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x83,0x0d,0x1c] 0x05,0x40,0x80,0xd3,0x01,0x83,0x0d,0x1c -# CHECK: v_pk_mad_i16 v5, v1, 0x3800, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xff,0x0d,0x1c] +# CHECK: v_pk_mad_i16 v5, v1, 0.5, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xe1,0x0d,0x1c] 0x05,0x40,0x80,0xd3,0x01,0xe1,0x0d,0x1c -# CHECK: 
v_pk_mad_i16 v5, v1, 0xc400, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xff,0x0d,0x1c] +# CHECK: v_pk_mad_i16 v5, v1, -4.0, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xef,0x0d,0x1c] 0x05,0x40,0x80,0xd3,0x01,0xef,0x0d,0x1c # CHECK: v_pk_mad_i16 v5, v1, v2, v255 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0xfe,0x1f] @@ -126,10 +126,10 @@ # CHECK: v_pk_mad_i16 v5, v1, v2, -1 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0x06,0x1b] 0x05,0x40,0x80,0xd3,0x01,0x05,0x06,0x1b -# CHECK: v_pk_mad_i16 v5, v1, v2, 0x3800 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0xfe,0x1b] +# CHECK: v_pk_mad_i16 v5, v1, v2, 0.5 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0xc2,0x1b] 0x05,0x40,0x80,0xd3,0x01,0x05,0xc2,0x1b -# CHECK: v_pk_mad_i16 v5, v1, v2, 0xc400 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0xfe,0x1b] +# CHECK: v_pk_mad_i16 v5, v1, v2, -4.0 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0xde,0x1b] 0x05,0x40,0x80,0xd3,0x01,0x05,0xde,0x1b # CHECK: v_pk_mad_i16 v5, v1, v2, v3 op_sel:[1,0,0] ; encoding: [0x05,0x48,0x80,0xd3,0x01,0x05,0x0e,0x1c] @@ -201,10 +201,10 @@ # CHECK: v_pk_mul_lo_u16 v5, -1, v2 ; encoding: [0x05,0x40,0x81,0xd3,0xc1,0x04,0x02,0x18] 0x05,0x00,0x81,0xd3,0xc1,0x04,0x02,0x18 -# CHECK: v_pk_mul_lo_u16 v5, 0x3800, v2 ; encoding: [0x05,0x40,0x81,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_mul_lo_u16 v5, 0.5, v2 ; encoding: [0x05,0x40,0x81,0xd3,0xf0,0x04,0x02,0x18] 0x05,0x00,0x81,0xd3,0xf0,0x04,0x02,0x18 -# CHECK: v_pk_mul_lo_u16 v5, 0xc400, v2 ; encoding: [0x05,0x40,0x81,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_mul_lo_u16 v5, -4.0, v2 ; encoding: [0x05,0x40,0x81,0xd3,0xf7,0x04,0x02,0x18] 0x05,0x00,0x81,0xd3,0xf7,0x04,0x02,0x18 # CHECK: v_pk_mul_lo_u16 v5, v1, v255 ; encoding: [0x05,0x40,0x81,0xd3,0x01,0xff,0x03,0x18] @@ -243,10 +243,10 @@ # CHECK: v_pk_mul_lo_u16 v5, v1, -1 ; encoding: [0x05,0x40,0x81,0xd3,0x01,0x83,0x01,0x18] 0x05,0x00,0x81,0xd3,0x01,0x83,0x01,0x18 -# CHECK: v_pk_mul_lo_u16 v5, v1, 0x3800 ; encoding: [0x05,0x40,0x81,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_mul_lo_u16 v5, v1, 0.5 ; encoding: [0x05,0x40,0x81,0xd3,0x01,0xe1,0x01,0x18] 0x05,0x00,0x81,0xd3,0x01,0xe1,0x01,0x18 -# CHECK: v_pk_mul_lo_u16 v5, v1, 0xc400 ; encoding: [0x05,0x40,0x81,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_mul_lo_u16 v5, v1, -4.0 ; encoding: [0x05,0x40,0x81,0xd3,0x01,0xef,0x01,0x18] 0x05,0x00,0x81,0xd3,0x01,0xef,0x01,0x18 # CHECK: v_pk_mul_lo_u16 v5, v1, v2 op_sel:[1,0] ; encoding: [0x05,0x48,0x81,0xd3,0x01,0x05,0x02,0x18] @@ -309,10 +309,10 @@ # CHECK: v_pk_add_i16 v5, -1, v2 ; encoding: [0x05,0x40,0x82,0xd3,0xc1,0x04,0x02,0x18] 0x05,0x00,0x82,0xd3,0xc1,0x04,0x02,0x18 -# CHECK: v_pk_add_i16 v5, 0x3800, v2 ; encoding: [0x05,0x40,0x82,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_add_i16 v5, 0.5, v2 ; encoding: [0x05,0x40,0x82,0xd3,0xf0,0x04,0x02,0x18] 0x05,0x00,0x82,0xd3,0xf0,0x04,0x02,0x18 -# CHECK: v_pk_add_i16 v5, 0xc400, v2 ; encoding: [0x05,0x40,0x82,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_add_i16 v5, -4.0, v2 ; encoding: [0x05,0x40,0x82,0xd3,0xf7,0x04,0x02,0x18] 0x05,0x00,0x82,0xd3,0xf7,0x04,0x02,0x18 # CHECK: v_pk_add_i16 v5, v1, v255 ; encoding: [0x05,0x40,0x82,0xd3,0x01,0xff,0x03,0x18] @@ -351,10 +351,10 @@ # CHECK: v_pk_add_i16 v5, v1, -1 ; encoding: [0x05,0x40,0x82,0xd3,0x01,0x83,0x01,0x18] 0x05,0x00,0x82,0xd3,0x01,0x83,0x01,0x18 -# CHECK: v_pk_add_i16 v5, v1, 0x3800 ; encoding: [0x05,0x40,0x82,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_add_i16 v5, v1, 0.5 ; encoding: [0x05,0x40,0x82,0xd3,0x01,0xe1,0x01,0x18] 0x05,0x00,0x82,0xd3,0x01,0xe1,0x01,0x18 -# CHECK: v_pk_add_i16 v5, v1, 0xc400 ; encoding: 
[0x05,0x40,0x82,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_add_i16 v5, v1, -4.0 ; encoding: [0x05,0x40,0x82,0xd3,0x01,0xef,0x01,0x18] 0x05,0x00,0x82,0xd3,0x01,0xef,0x01,0x18 # CHECK: v_pk_add_i16 v5, v1, v2 op_sel:[1,0] ; encoding: [0x05,0x48,0x82,0xd3,0x01,0x05,0x02,0x18] @@ -420,10 +420,10 @@ # CHECK: v_pk_sub_i16 v5, -1, v2 ; encoding: [0x05,0x40,0x83,0xd3,0xc1,0x04,0x02,0x18] 0x05,0x00,0x83,0xd3,0xc1,0x04,0x02,0x18 -# CHECK: v_pk_sub_i16 v5, 0x3800, v2 ; encoding: [0x05,0x40,0x83,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_sub_i16 v5, 0.5, v2 ; encoding: [0x05,0x40,0x83,0xd3,0xf0,0x04,0x02,0x18] 0x05,0x00,0x83,0xd3,0xf0,0x04,0x02,0x18 -# CHECK: v_pk_sub_i16 v5, 0xc400, v2 ; encoding: [0x05,0x40,0x83,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_sub_i16 v5, -4.0, v2 ; encoding: [0x05,0x40,0x83,0xd3,0xf7,0x04,0x02,0x18] 0x05,0x00,0x83,0xd3,0xf7,0x04,0x02,0x18 # CHECK: v_pk_sub_i16 v5, v1, v255 ; encoding: [0x05,0x40,0x83,0xd3,0x01,0xff,0x03,0x18] @@ -462,10 +462,10 @@ # CHECK: v_pk_sub_i16 v5, v1, -1 ; encoding: [0x05,0x40,0x83,0xd3,0x01,0x83,0x01,0x18] 0x05,0x00,0x83,0xd3,0x01,0x83,0x01,0x18 -# CHECK: v_pk_sub_i16 v5, v1, 0x3800 ; encoding: [0x05,0x40,0x83,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_sub_i16 v5, v1, 0.5 ; encoding: [0x05,0x40,0x83,0xd3,0x01,0xe1,0x01,0x18] 0x05,0x00,0x83,0xd3,0x01,0xe1,0x01,0x18 -# CHECK: v_pk_sub_i16 v5, v1, 0xc400 ; encoding: [0x05,0x40,0x83,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_sub_i16 v5, v1, -4.0 ; encoding: [0x05,0x40,0x83,0xd3,0x01,0xef,0x01,0x18] 0x05,0x00,0x83,0xd3,0x01,0xef,0x01,0x18 # CHECK: v_pk_sub_i16 v5, v1, v2 op_sel:[1,0] ; encoding: [0x05,0x48,0x83,0xd3,0x01,0x05,0x02,0x18] @@ -531,10 +531,10 @@ # CHECK: v_pk_lshlrev_b16 v5, -1, v2 ; encoding: [0x05,0x40,0x84,0xd3,0xc1,0x04,0x02,0x18] 0x05,0x00,0x84,0xd3,0xc1,0x04,0x02,0x18 -# CHECK: v_pk_lshlrev_b16 v5, 0x3800, v2 ; encoding: [0x05,0x40,0x84,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_lshlrev_b16 v5, 0.5, v2 ; encoding: [0x05,0x40,0x84,0xd3,0xf0,0x04,0x02,0x18] 0x05,0x00,0x84,0xd3,0xf0,0x04,0x02,0x18 -# CHECK: v_pk_lshlrev_b16 v5, 0xc400, v2 ; encoding: [0x05,0x40,0x84,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_lshlrev_b16 v5, -4.0, v2 ; encoding: [0x05,0x40,0x84,0xd3,0xf7,0x04,0x02,0x18] 0x05,0x00,0x84,0xd3,0xf7,0x04,0x02,0x18 # CHECK: v_pk_lshlrev_b16 v5, v1, v255 ; encoding: [0x05,0x40,0x84,0xd3,0x01,0xff,0x03,0x18] @@ -573,10 +573,10 @@ # CHECK: v_pk_lshlrev_b16 v5, v1, -1 ; encoding: [0x05,0x40,0x84,0xd3,0x01,0x83,0x01,0x18] 0x05,0x00,0x84,0xd3,0x01,0x83,0x01,0x18 -# CHECK: v_pk_lshlrev_b16 v5, v1, 0x3800 ; encoding: [0x05,0x40,0x84,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_lshlrev_b16 v5, v1, 0.5 ; encoding: [0x05,0x40,0x84,0xd3,0x01,0xe1,0x01,0x18] 0x05,0x00,0x84,0xd3,0x01,0xe1,0x01,0x18 -# CHECK: v_pk_lshlrev_b16 v5, v1, 0xc400 ; encoding: [0x05,0x40,0x84,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_lshlrev_b16 v5, v1, -4.0 ; encoding: [0x05,0x40,0x84,0xd3,0x01,0xef,0x01,0x18] 0x05,0x00,0x84,0xd3,0x01,0xef,0x01,0x18 # CHECK: v_pk_lshlrev_b16 v5, v1, v2 op_sel:[1,0] ; encoding: [0x05,0x48,0x84,0xd3,0x01,0x05,0x02,0x18] @@ -639,10 +639,10 @@ # CHECK: v_pk_lshrrev_b16 v5, -1, v2 ; encoding: [0x05,0x40,0x85,0xd3,0xc1,0x04,0x02,0x18] 0x05,0x00,0x85,0xd3,0xc1,0x04,0x02,0x18 -# CHECK: v_pk_lshrrev_b16 v5, 0x3800, v2 ; encoding: [0x05,0x40,0x85,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_lshrrev_b16 v5, 0.5, v2 ; encoding: [0x05,0x40,0x85,0xd3,0xf0,0x04,0x02,0x18] 0x05,0x00,0x85,0xd3,0xf0,0x04,0x02,0x18 -# CHECK: v_pk_lshrrev_b16 v5, 0xc400, v2 ; encoding: [0x05,0x40,0x85,0xd3,0xff,0x04,0x02,0x18] +# CHECK: 
v_pk_lshrrev_b16 v5, -4.0, v2 ; encoding: [0x05,0x40,0x85,0xd3,0xf7,0x04,0x02,0x18] 0x05,0x00,0x85,0xd3,0xf7,0x04,0x02,0x18 # CHECK: v_pk_lshrrev_b16 v5, v1, v255 ; encoding: [0x05,0x40,0x85,0xd3,0x01,0xff,0x03,0x18] @@ -681,10 +681,10 @@ # CHECK: v_pk_lshrrev_b16 v5, v1, -1 ; encoding: [0x05,0x40,0x85,0xd3,0x01,0x83,0x01,0x18] 0x05,0x00,0x85,0xd3,0x01,0x83,0x01,0x18 -# CHECK: v_pk_lshrrev_b16 v5, v1, 0x3800 ; encoding: [0x05,0x40,0x85,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_lshrrev_b16 v5, v1, 0.5 ; encoding: [0x05,0x40,0x85,0xd3,0x01,0xe1,0x01,0x18] 0x05,0x00,0x85,0xd3,0x01,0xe1,0x01,0x18 -# CHECK: v_pk_lshrrev_b16 v5, v1, 0xc400 ; encoding: [0x05,0x40,0x85,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_lshrrev_b16 v5, v1, -4.0 ; encoding: [0x05,0x40,0x85,0xd3,0x01,0xef,0x01,0x18] 0x05,0x00,0x85,0xd3,0x01,0xef,0x01,0x18 # CHECK: v_pk_lshrrev_b16 v5, v1, v2 op_sel:[1,0] ; encoding: [0x05,0x48,0x85,0xd3,0x01,0x05,0x02,0x18] @@ -747,10 +747,10 @@ # CHECK: v_pk_ashrrev_i16 v5, -1, v2 ; encoding: [0x05,0x40,0x86,0xd3,0xc1,0x04,0x02,0x18] 0x05,0x00,0x86,0xd3,0xc1,0x04,0x02,0x18 -# CHECK: v_pk_ashrrev_i16 v5, 0x3800, v2 ; encoding: [0x05,0x40,0x86,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_ashrrev_i16 v5, 0.5, v2 ; encoding: [0x05,0x40,0x86,0xd3,0xf0,0x04,0x02,0x18] 0x05,0x00,0x86,0xd3,0xf0,0x04,0x02,0x18 -# CHECK: v_pk_ashrrev_i16 v5, 0xc400, v2 ; encoding: [0x05,0x40,0x86,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_ashrrev_i16 v5, -4.0, v2 ; encoding: [0x05,0x40,0x86,0xd3,0xf7,0x04,0x02,0x18] 0x05,0x00,0x86,0xd3,0xf7,0x04,0x02,0x18 # CHECK: v_pk_ashrrev_i16 v5, v1, v255 ; encoding: [0x05,0x40,0x86,0xd3,0x01,0xff,0x03,0x18] @@ -789,10 +789,10 @@ # CHECK: v_pk_ashrrev_i16 v5, v1, -1 ; encoding: [0x05,0x40,0x86,0xd3,0x01,0x83,0x01,0x18] 0x05,0x00,0x86,0xd3,0x01,0x83,0x01,0x18 -# CHECK: v_pk_ashrrev_i16 v5, v1, 0x3800 ; encoding: [0x05,0x40,0x86,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_ashrrev_i16 v5, v1, 0.5 ; encoding: [0x05,0x40,0x86,0xd3,0x01,0xe1,0x01,0x18] 0x05,0x00,0x86,0xd3,0x01,0xe1,0x01,0x18 -# CHECK: v_pk_ashrrev_i16 v5, v1, 0xc400 ; encoding: [0x05,0x40,0x86,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_ashrrev_i16 v5, v1, -4.0 ; encoding: [0x05,0x40,0x86,0xd3,0x01,0xef,0x01,0x18] 0x05,0x00,0x86,0xd3,0x01,0xef,0x01,0x18 # CHECK: v_pk_ashrrev_i16 v5, v1, v2 op_sel:[1,0] ; encoding: [0x05,0x48,0x86,0xd3,0x01,0x05,0x02,0x18] @@ -855,10 +855,10 @@ # CHECK: v_pk_max_i16 v5, -1, v2 ; encoding: [0x05,0x40,0x87,0xd3,0xc1,0x04,0x02,0x18] 0x05,0x00,0x87,0xd3,0xc1,0x04,0x02,0x18 -# CHECK: v_pk_max_i16 v5, 0x3800, v2 ; encoding: [0x05,0x40,0x87,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_max_i16 v5, 0.5, v2 ; encoding: [0x05,0x40,0x87,0xd3,0xf0,0x04,0x02,0x18] 0x05,0x00,0x87,0xd3,0xf0,0x04,0x02,0x18 -# CHECK: v_pk_max_i16 v5, 0xc400, v2 ; encoding: [0x05,0x40,0x87,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_max_i16 v5, -4.0, v2 ; encoding: [0x05,0x40,0x87,0xd3,0xf7,0x04,0x02,0x18] 0x05,0x00,0x87,0xd3,0xf7,0x04,0x02,0x18 # CHECK: v_pk_max_i16 v5, v1, v255 ; encoding: [0x05,0x40,0x87,0xd3,0x01,0xff,0x03,0x18] @@ -897,10 +897,10 @@ # CHECK: v_pk_max_i16 v5, v1, -1 ; encoding: [0x05,0x40,0x87,0xd3,0x01,0x83,0x01,0x18] 0x05,0x00,0x87,0xd3,0x01,0x83,0x01,0x18 -# CHECK: v_pk_max_i16 v5, v1, 0x3800 ; encoding: [0x05,0x40,0x87,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_max_i16 v5, v1, 0.5 ; encoding: [0x05,0x40,0x87,0xd3,0x01,0xe1,0x01,0x18] 0x05,0x00,0x87,0xd3,0x01,0xe1,0x01,0x18 -# CHECK: v_pk_max_i16 v5, v1, 0xc400 ; encoding: [0x05,0x40,0x87,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_max_i16 v5, v1, -4.0 ; encoding: 
[0x05,0x40,0x87,0xd3,0x01,0xef,0x01,0x18] 0x05,0x00,0x87,0xd3,0x01,0xef,0x01,0x18 # CHECK: v_pk_max_i16 v5, v1, v2 op_sel:[1,0] ; encoding: [0x05,0x48,0x87,0xd3,0x01,0x05,0x02,0x18] @@ -963,10 +963,10 @@ # CHECK: v_pk_min_i16 v5, -1, v2 ; encoding: [0x05,0x40,0x88,0xd3,0xc1,0x04,0x02,0x18] 0x05,0x00,0x88,0xd3,0xc1,0x04,0x02,0x18 -# CHECK: v_pk_min_i16 v5, 0x3800, v2 ; encoding: [0x05,0x40,0x88,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_min_i16 v5, 0.5, v2 ; encoding: [0x05,0x40,0x88,0xd3,0xf0,0x04,0x02,0x18] 0x05,0x00,0x88,0xd3,0xf0,0x04,0x02,0x18 -# CHECK: v_pk_min_i16 v5, 0xc400, v2 ; encoding: [0x05,0x40,0x88,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_min_i16 v5, -4.0, v2 ; encoding: [0x05,0x40,0x88,0xd3,0xf7,0x04,0x02,0x18] 0x05,0x00,0x88,0xd3,0xf7,0x04,0x02,0x18 # CHECK: v_pk_min_i16 v5, v1, v255 ; encoding: [0x05,0x40,0x88,0xd3,0x01,0xff,0x03,0x18] @@ -1005,10 +1005,10 @@ # CHECK: v_pk_min_i16 v5, v1, -1 ; encoding: [0x05,0x40,0x88,0xd3,0x01,0x83,0x01,0x18] 0x05,0x00,0x88,0xd3,0x01,0x83,0x01,0x18 -# CHECK: v_pk_min_i16 v5, v1, 0x3800 ; encoding: [0x05,0x40,0x88,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_min_i16 v5, v1, 0.5 ; encoding: [0x05,0x40,0x88,0xd3,0x01,0xe1,0x01,0x18] 0x05,0x00,0x88,0xd3,0x01,0xe1,0x01,0x18 -# CHECK: v_pk_min_i16 v5, v1, 0xc400 ; encoding: [0x05,0x40,0x88,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_min_i16 v5, v1, -4.0 ; encoding: [0x05,0x40,0x88,0xd3,0x01,0xef,0x01,0x18] 0x05,0x00,0x88,0xd3,0x01,0xef,0x01,0x18 # CHECK: v_pk_min_i16 v5, v1, v2 op_sel:[1,0] ; encoding: [0x05,0x48,0x88,0xd3,0x01,0x05,0x02,0x18] @@ -1071,10 +1071,10 @@ # CHECK: v_pk_mad_u16 v5, -1, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0xc1,0x04,0x0e,0x1c] 0x05,0x40,0x89,0xd3,0xc1,0x04,0x0e,0x1c -# CHECK: v_pk_mad_u16 v5, 0x3800, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0xff,0x04,0x0e,0x1c] +# CHECK: v_pk_mad_u16 v5, 0.5, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0xf0,0x04,0x0e,0x1c] 0x05,0x40,0x89,0xd3,0xf0,0x04,0x0e,0x1c -# CHECK: v_pk_mad_u16 v5, 0xc400, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0xff,0x04,0x0e,0x1c] +# CHECK: v_pk_mad_u16 v5, -4.0, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0xf7,0x04,0x0e,0x1c] 0x05,0x40,0x89,0xd3,0xf7,0x04,0x0e,0x1c # CHECK: v_pk_mad_u16 v5, v1, v255, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xff,0x0f,0x1c] @@ -1113,10 +1113,10 @@ # CHECK: v_pk_mad_u16 v5, v1, -1, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x83,0x0d,0x1c] 0x05,0x40,0x89,0xd3,0x01,0x83,0x0d,0x1c -# CHECK: v_pk_mad_u16 v5, v1, 0x3800, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xff,0x0d,0x1c] +# CHECK: v_pk_mad_u16 v5, v1, 0.5, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xe1,0x0d,0x1c] 0x05,0x40,0x89,0xd3,0x01,0xe1,0x0d,0x1c -# CHECK: v_pk_mad_u16 v5, v1, 0xc400, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xff,0x0d,0x1c] +# CHECK: v_pk_mad_u16 v5, v1, -4.0, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xef,0x0d,0x1c] 0x05,0x40,0x89,0xd3,0x01,0xef,0x0d,0x1c # CHECK: v_pk_mad_u16 v5, v1, v2, v255 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0xfe,0x1f] @@ -1155,10 +1155,10 @@ # CHECK: v_pk_mad_u16 v5, v1, v2, -1 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0x06,0x1b] 0x05,0x40,0x89,0xd3,0x01,0x05,0x06,0x1b -# CHECK: v_pk_mad_u16 v5, v1, v2, 0x3800 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0xfe,0x1b] +# CHECK: v_pk_mad_u16 v5, v1, v2, 0.5 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0xc2,0x1b] 0x05,0x40,0x89,0xd3,0x01,0x05,0xc2,0x1b -# CHECK: v_pk_mad_u16 v5, v1, v2, 0xc400 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0xfe,0x1b] +# CHECK: v_pk_mad_u16 v5, v1, v2, -4.0 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0xde,0x1b] 
0x05,0x40,0x89,0xd3,0x01,0x05,0xde,0x1b # CHECK: v_pk_mad_u16 v5, v1, v2, v3 op_sel:[1,0,0] ; encoding: [0x05,0x48,0x89,0xd3,0x01,0x05,0x0e,0x1c] @@ -1230,10 +1230,10 @@ # CHECK: v_pk_add_u16 v5, -1, v2 ; encoding: [0x05,0x40,0x8a,0xd3,0xc1,0x04,0x02,0x18] 0x05,0x00,0x8a,0xd3,0xc1,0x04,0x02,0x18 -# CHECK: v_pk_add_u16 v5, 0x3800, v2 ; encoding: [0x05,0x40,0x8a,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_add_u16 v5, 0.5, v2 ; encoding: [0x05,0x40,0x8a,0xd3,0xf0,0x04,0x02,0x18] 0x05,0x00,0x8a,0xd3,0xf0,0x04,0x02,0x18 -# CHECK: v_pk_add_u16 v5, 0xc400, v2 ; encoding: [0x05,0x40,0x8a,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_add_u16 v5, -4.0, v2 ; encoding: [0x05,0x40,0x8a,0xd3,0xf7,0x04,0x02,0x18] 0x05,0x00,0x8a,0xd3,0xf7,0x04,0x02,0x18 # CHECK: v_pk_add_u16 v5, v1, v255 ; encoding: [0x05,0x40,0x8a,0xd3,0x01,0xff,0x03,0x18] @@ -1272,10 +1272,10 @@ # CHECK: v_pk_add_u16 v5, v1, -1 ; encoding: [0x05,0x40,0x8a,0xd3,0x01,0x83,0x01,0x18] 0x05,0x00,0x8a,0xd3,0x01,0x83,0x01,0x18 -# CHECK: v_pk_add_u16 v5, v1, 0x3800 ; encoding: [0x05,0x40,0x8a,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_add_u16 v5, v1, 0.5 ; encoding: [0x05,0x40,0x8a,0xd3,0x01,0xe1,0x01,0x18] 0x05,0x00,0x8a,0xd3,0x01,0xe1,0x01,0x18 -# CHECK: v_pk_add_u16 v5, v1, 0xc400 ; encoding: [0x05,0x40,0x8a,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_add_u16 v5, v1, -4.0 ; encoding: [0x05,0x40,0x8a,0xd3,0x01,0xef,0x01,0x18] 0x05,0x00,0x8a,0xd3,0x01,0xef,0x01,0x18 # CHECK: v_pk_add_u16 v5, v1, v2 op_sel:[1,0] ; encoding: [0x05,0x48,0x8a,0xd3,0x01,0x05,0x02,0x18] @@ -1341,10 +1341,10 @@ # CHECK: v_pk_sub_u16 v5, -1, v2 ; encoding: [0x05,0x40,0x8b,0xd3,0xc1,0x04,0x02,0x18] 0x05,0x00,0x8b,0xd3,0xc1,0x04,0x02,0x18 -# CHECK: v_pk_sub_u16 v5, 0x3800, v2 ; encoding: [0x05,0x40,0x8b,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_sub_u16 v5, 0.5, v2 ; encoding: [0x05,0x40,0x8b,0xd3,0xf0,0x04,0x02,0x18] 0x05,0x00,0x8b,0xd3,0xf0,0x04,0x02,0x18 -# CHECK: v_pk_sub_u16 v5, 0xc400, v2 ; encoding: [0x05,0x40,0x8b,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_sub_u16 v5, -4.0, v2 ; encoding: [0x05,0x40,0x8b,0xd3,0xf7,0x04,0x02,0x18] 0x05,0x00,0x8b,0xd3,0xf7,0x04,0x02,0x18 # CHECK: v_pk_sub_u16 v5, v1, v255 ; encoding: [0x05,0x40,0x8b,0xd3,0x01,0xff,0x03,0x18] @@ -1383,10 +1383,10 @@ # CHECK: v_pk_sub_u16 v5, v1, -1 ; encoding: [0x05,0x40,0x8b,0xd3,0x01,0x83,0x01,0x18] 0x05,0x00,0x8b,0xd3,0x01,0x83,0x01,0x18 -# CHECK: v_pk_sub_u16 v5, v1, 0x3800 ; encoding: [0x05,0x40,0x8b,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_sub_u16 v5, v1, 0.5 ; encoding: [0x05,0x40,0x8b,0xd3,0x01,0xe1,0x01,0x18] 0x05,0x00,0x8b,0xd3,0x01,0xe1,0x01,0x18 -# CHECK: v_pk_sub_u16 v5, v1, 0xc400 ; encoding: [0x05,0x40,0x8b,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_sub_u16 v5, v1, -4.0 ; encoding: [0x05,0x40,0x8b,0xd3,0x01,0xef,0x01,0x18] 0x05,0x00,0x8b,0xd3,0x01,0xef,0x01,0x18 # CHECK: v_pk_sub_u16 v5, v1, v2 op_sel:[1,0] ; encoding: [0x05,0x48,0x8b,0xd3,0x01,0x05,0x02,0x18] @@ -1452,10 +1452,10 @@ # CHECK: v_pk_max_u16 v5, -1, v2 ; encoding: [0x05,0x40,0x8c,0xd3,0xc1,0x04,0x02,0x18] 0x05,0x00,0x8c,0xd3,0xc1,0x04,0x02,0x18 -# CHECK: v_pk_max_u16 v5, 0x3800, v2 ; encoding: [0x05,0x40,0x8c,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_max_u16 v5, 0.5, v2 ; encoding: [0x05,0x40,0x8c,0xd3,0xf0,0x04,0x02,0x18] 0x05,0x00,0x8c,0xd3,0xf0,0x04,0x02,0x18 -# CHECK: v_pk_max_u16 v5, 0xc400, v2 ; encoding: [0x05,0x40,0x8c,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_max_u16 v5, -4.0, v2 ; encoding: [0x05,0x40,0x8c,0xd3,0xf7,0x04,0x02,0x18] 0x05,0x00,0x8c,0xd3,0xf7,0x04,0x02,0x18 # CHECK: v_pk_max_u16 v5, v1, v255 ; encoding: 
[0x05,0x40,0x8c,0xd3,0x01,0xff,0x03,0x18] @@ -1494,10 +1494,10 @@ # CHECK: v_pk_max_u16 v5, v1, -1 ; encoding: [0x05,0x40,0x8c,0xd3,0x01,0x83,0x01,0x18] 0x05,0x00,0x8c,0xd3,0x01,0x83,0x01,0x18 -# CHECK: v_pk_max_u16 v5, v1, 0x3800 ; encoding: [0x05,0x40,0x8c,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_max_u16 v5, v1, 0.5 ; encoding: [0x05,0x40,0x8c,0xd3,0x01,0xe1,0x01,0x18] 0x05,0x00,0x8c,0xd3,0x01,0xe1,0x01,0x18 -# CHECK: v_pk_max_u16 v5, v1, 0xc400 ; encoding: [0x05,0x40,0x8c,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_max_u16 v5, v1, -4.0 ; encoding: [0x05,0x40,0x8c,0xd3,0x01,0xef,0x01,0x18] 0x05,0x00,0x8c,0xd3,0x01,0xef,0x01,0x18 # CHECK: v_pk_max_u16 v5, v1, v2 op_sel:[1,0] ; encoding: [0x05,0x48,0x8c,0xd3,0x01,0x05,0x02,0x18] @@ -1560,10 +1560,10 @@ # CHECK: v_pk_min_u16 v5, -1, v2 ; encoding: [0x05,0x40,0x8d,0xd3,0xc1,0x04,0x02,0x18] 0x05,0x00,0x8d,0xd3,0xc1,0x04,0x02,0x18 -# CHECK: v_pk_min_u16 v5, 0x3800, v2 ; encoding: [0x05,0x40,0x8d,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_min_u16 v5, 0.5, v2 ; encoding: [0x05,0x40,0x8d,0xd3,0xf0,0x04,0x02,0x18] 0x05,0x00,0x8d,0xd3,0xf0,0x04,0x02,0x18 -# CHECK: v_pk_min_u16 v5, 0xc400, v2 ; encoding: [0x05,0x40,0x8d,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_min_u16 v5, -4.0, v2 ; encoding: [0x05,0x40,0x8d,0xd3,0xf7,0x04,0x02,0x18] 0x05,0x00,0x8d,0xd3,0xf7,0x04,0x02,0x18 # CHECK: v_pk_min_u16 v5, v1, v255 ; encoding: [0x05,0x40,0x8d,0xd3,0x01,0xff,0x03,0x18] @@ -1602,10 +1602,10 @@ # CHECK: v_pk_min_u16 v5, v1, -1 ; encoding: [0x05,0x40,0x8d,0xd3,0x01,0x83,0x01,0x18] 0x05,0x00,0x8d,0xd3,0x01,0x83,0x01,0x18 -# CHECK: v_pk_min_u16 v5, v1, 0x3800 ; encoding: [0x05,0x40,0x8d,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_min_u16 v5, v1, 0.5 ; encoding: [0x05,0x40,0x8d,0xd3,0x01,0xe1,0x01,0x18] 0x05,0x00,0x8d,0xd3,0x01,0xe1,0x01,0x18 -# CHECK: v_pk_min_u16 v5, v1, 0xc400 ; encoding: [0x05,0x40,0x8d,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_min_u16 v5, v1, -4.0 ; encoding: [0x05,0x40,0x8d,0xd3,0x01,0xef,0x01,0x18] 0x05,0x00,0x8d,0xd3,0x01,0xef,0x01,0x18 # CHECK: v_pk_min_u16 v5, v1, v2 op_sel:[1,0] ; encoding: [0x05,0x48,0x8d,0xd3,0x01,0x05,0x02,0x18] From 3f2e670671e718579cf98f1bf0087b6d0ee4919c Mon Sep 17 00:00:00 2001 From: Erich Keane Date: Wed, 3 Jan 2024 15:16:08 -0800 Subject: [PATCH 177/313] Revert "[Clang][Sema] Diagnose unexpanded packs in the template argument lists of function template specializations" (#76876) Reverts llvm/llvm-project#76677 See discussion here: https://github.com/llvm/llvm-project/pull/76677 --- clang/docs/ReleaseNotes.rst | 1 - clang/lib/Sema/SemaDecl.cpp | 89 ++++++++++--------- .../CXX/temp/temp.decls/temp.variadic/p5.cpp | 12 --- 3 files changed, 46 insertions(+), 56 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 778ce0e0e52d0..a3107c4a69532 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -518,7 +518,6 @@ Improvements to Clang's diagnostics - Clang now diagnoses definitions of friend function specializations, e.g. ``friend void f<>(int) {}``. - Clang now diagnoses narrowing conversions involving const references. (`#63151: `_). -- Clang now diagnoses unexpanded packs within the template argument lists of function template specializations. 
Improvements to Clang's time-trace diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 8e46c4984d93d..2de631941325f 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -9900,15 +9900,15 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC, // Match up the template parameter lists with the scope specifier, then // determine whether we have a template or a template specialization. bool Invalid = false; - TemplateIdAnnotation *TemplateId = - D.getName().getKind() == UnqualifiedIdKind::IK_TemplateId - ? D.getName().TemplateId - : nullptr; TemplateParameterList *TemplateParams = MatchTemplateParametersToScopeSpecifier( D.getDeclSpec().getBeginLoc(), D.getIdentifierLoc(), - D.getCXXScopeSpec(), TemplateId, TemplateParamLists, isFriend, - isMemberSpecialization, Invalid); + D.getCXXScopeSpec(), + D.getName().getKind() == UnqualifiedIdKind::IK_TemplateId + ? D.getName().TemplateId + : nullptr, + TemplateParamLists, isFriend, isMemberSpecialization, + Invalid); if (TemplateParams) { // Check that we can declare a template here. if (CheckTemplateDeclScope(S, TemplateParams)) @@ -9921,11 +9921,6 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC, if (Name.getNameKind() == DeclarationName::CXXDestructorName) { Diag(NewFD->getLocation(), diag::err_destructor_template); NewFD->setInvalidDecl(); - // Function template with explicit template arguments. - } else if (TemplateId) { - Diag(D.getIdentifierLoc(), diag::err_function_template_partial_spec) - << SourceRange(TemplateId->LAngleLoc, TemplateId->RAngleLoc); - NewFD->setInvalidDecl(); } // If we're adding a template to a dependent context, we may need to @@ -9978,11 +9973,6 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC, << FixItHint::CreateRemoval(RemoveRange) << FixItHint::CreateInsertion(InsertLoc, "<>"); Invalid = true; - - // Recover by faking up an empty template argument list. - HasExplicitTemplateArgs = true; - TemplateArgs.setLAngleLoc(InsertLoc); - TemplateArgs.setRAngleLoc(InsertLoc); } } } else { @@ -9996,33 +9986,6 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC, if (TemplateParamLists.size() > 0) // For source fidelity, store all the template param lists. NewFD->setTemplateParameterListsInfo(Context, TemplateParamLists); - - // "friend void foo<>(int);" is an implicit specialization decl. - if (isFriend && TemplateId) - isFunctionTemplateSpecialization = true; - } - - // If this is a function template specialization and the unqualified-id of - // the declarator-id is a template-id, convert the template argument list - // into our AST format and check for unexpanded packs. - if (isFunctionTemplateSpecialization && TemplateId) { - HasExplicitTemplateArgs = true; - - TemplateArgs.setLAngleLoc(TemplateId->LAngleLoc); - TemplateArgs.setRAngleLoc(TemplateId->RAngleLoc); - ASTTemplateArgsPtr TemplateArgsPtr(TemplateId->getTemplateArgs(), - TemplateId->NumArgs); - translateTemplateArguments(TemplateArgsPtr, TemplateArgs); - - // FIXME: Should we check for unexpanded packs if this was an (invalid) - // declaration of a function template partial specialization? Should we - // consider the unexpanded pack context to be a partial specialization? - for (const TemplateArgumentLoc &ArgLoc : TemplateArgs.arguments()) { - if (DiagnoseUnexpandedParameterPack( - ArgLoc, isFriend ? 
UPPC_FriendDeclaration - : UPPC_ExplicitSpecialization)) - NewFD->setInvalidDecl(); - } } if (Invalid) { @@ -10475,6 +10438,46 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC, diag::ext_operator_new_delete_declared_inline) << NewFD->getDeclName(); + // If the declarator is a template-id, translate the parser's template + // argument list into our AST format. + if (D.getName().getKind() == UnqualifiedIdKind::IK_TemplateId) { + TemplateIdAnnotation *TemplateId = D.getName().TemplateId; + TemplateArgs.setLAngleLoc(TemplateId->LAngleLoc); + TemplateArgs.setRAngleLoc(TemplateId->RAngleLoc); + ASTTemplateArgsPtr TemplateArgsPtr(TemplateId->getTemplateArgs(), + TemplateId->NumArgs); + translateTemplateArguments(TemplateArgsPtr, + TemplateArgs); + + HasExplicitTemplateArgs = true; + + if (NewFD->isInvalidDecl()) { + HasExplicitTemplateArgs = false; + } else if (FunctionTemplate) { + // Function template with explicit template arguments. + Diag(D.getIdentifierLoc(), diag::err_function_template_partial_spec) + << SourceRange(TemplateId->LAngleLoc, TemplateId->RAngleLoc); + + HasExplicitTemplateArgs = false; + } else if (isFriend) { + // "friend void foo<>(int);" is an implicit specialization decl. + isFunctionTemplateSpecialization = true; + } else { + assert(isFunctionTemplateSpecialization && + "should have a 'template<>' for this decl"); + } + } else if (isFriend && isFunctionTemplateSpecialization) { + // This combination is only possible in a recovery case; the user + // wrote something like: + // template <> friend void foo(int); + // which we're recovering from as if the user had written: + // friend void foo<>(int); + // Go ahead and fake up a template id. + HasExplicitTemplateArgs = true; + TemplateArgs.setLAngleLoc(D.getIdentifierLoc()); + TemplateArgs.setRAngleLoc(D.getIdentifierLoc()); + } + // We do not add HD attributes to specializations here because // they may have different constexpr-ness compared to their // templates and, after maybeAddCUDAHostDeviceAttrs() is applied, diff --git a/clang/test/CXX/temp/temp.decls/temp.variadic/p5.cpp b/clang/test/CXX/temp/temp.decls/temp.variadic/p5.cpp index 3c500c2c4dc4a..30ce6b40e1fb5 100644 --- a/clang/test/CXX/temp/temp.decls/temp.variadic/p5.cpp +++ b/clang/test/CXX/temp/temp.decls/temp.variadic/p5.cpp @@ -376,11 +376,6 @@ namespace Specializations { template struct PrimaryClass; // expected-error{{partial specialization contains unexpanded parameter pack 'Ts'}} - template - void PrimaryFunction(); - template - void PrimaryFunction(); // expected-error{{function template partial specialization is not allowed}} - #if __cplusplus >= 201402L template constexpr int PrimaryVar = 0; @@ -397,13 +392,6 @@ namespace Specializations { template struct InnerClass; // expected-error{{partial specialization contains unexpanded parameter pack 'Ts'}} - template - void InnerFunction(); - template<> - void InnerFunction(); // expected-error{{explicit specialization contains unexpanded parameter pack 'Ts'}} - - friend void PrimaryFunction(); // expected-error{{friend declaration contains unexpanded parameter pack 'Ts'}} - #if __cplusplus >= 201402L template constexpr static int InnerVar = 0; From a25b66217f41564f933dd1d6d2993d990615c47b Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Wed, 3 Jan 2024 15:22:42 -0800 Subject: [PATCH 178/313] [NFC][llvm-exegesis] Remove redundant register initial values argument This patch removes the redundant RegisterInitialValues parameter from assembleToStream and friends as it is included within 
the BenchmarkKey struct that is also passed to all the functions that need this information. --- llvm/tools/llvm-exegesis/lib/Assembler.cpp | 24 +++++++++---------- llvm/tools/llvm-exegesis/lib/Assembler.h | 6 ++--- .../llvm-exegesis/lib/BenchmarkRunner.cpp | 1 - .../llvm-exegesis/Common/AssemblerUtils.h | 3 +-- 4 files changed, 14 insertions(+), 20 deletions(-) diff --git a/llvm/tools/llvm-exegesis/lib/Assembler.cpp b/llvm/tools/llvm-exegesis/lib/Assembler.cpp index 96b5a068ff21f..9f03a4e3a5a6f 100644 --- a/llvm/tools/llvm-exegesis/lib/Assembler.cpp +++ b/llvm/tools/llvm-exegesis/lib/Assembler.cpp @@ -49,10 +49,11 @@ static const Align kFunctionAlignment(4096); // Fills the given basic block with register setup code, and returns true if // all registers could be setup correctly. -static bool generateSnippetSetupCode( - const ExegesisTarget &ET, const MCSubtargetInfo *const MSI, - ArrayRef RegisterInitialValues, BasicBlockFiller &BBF, - const BenchmarkKey &Key, bool GenerateMemoryInstructions) { +static bool generateSnippetSetupCode(const ExegesisTarget &ET, + const MCSubtargetInfo *const MSI, + BasicBlockFiller &BBF, + const BenchmarkKey &Key, + bool GenerateMemoryInstructions) { bool IsSnippetSetupComplete = true; if (GenerateMemoryInstructions) { BBF.addInstructions(ET.generateMemoryInitialSetup()); @@ -75,7 +76,7 @@ static bool generateSnippetSetupCode( Register StackPointerRegister = BBF.MF.getSubtarget() .getTargetLowering() ->getStackPointerRegisterToSaveRestore(); - for (const RegisterValue &RV : RegisterInitialValues) { + for (const RegisterValue &RV : Key.RegisterInitialValues) { if (GenerateMemoryInstructions) { // If we're generating memory instructions, don't load in the value for // the register with the stack pointer as it will be used later to finish @@ -93,7 +94,7 @@ static bool generateSnippetSetupCode( #ifdef HAVE_LIBPFM BBF.addInstructions(ET.configurePerfCounter(PERF_EVENT_IOC_RESET, true)); #endif // HAVE_LIBPFM - for (const RegisterValue &RV : RegisterInitialValues) { + for (const RegisterValue &RV : Key.RegisterInitialValues) { // Load in the stack register now as we're done using it elsewhere // and need to set the value in preparation for executing the // snippet. @@ -242,10 +243,8 @@ BitVector getFunctionReservedRegs(const TargetMachine &TM) { Error assembleToStream(const ExegesisTarget &ET, std::unique_ptr TM, - ArrayRef LiveIns, - ArrayRef RegisterInitialValues, - const FillFunction &Fill, raw_pwrite_stream &AsmStream, - const BenchmarkKey &Key, + ArrayRef LiveIns, const FillFunction &Fill, + raw_pwrite_stream &AsmStream, const BenchmarkKey &Key, bool GenerateMemoryInstructions) { auto Context = std::make_unique(); std::unique_ptr Module = @@ -275,7 +274,7 @@ Error assembleToStream(const ExegesisTarget &ET, } std::vector RegistersSetUp; - for (const auto &InitValue : RegisterInitialValues) { + for (const auto &InitValue : Key.RegisterInitialValues) { RegistersSetUp.push_back(InitValue.Register); } FunctionFiller Sink(MF, std::move(RegistersSetUp)); @@ -294,8 +293,7 @@ Error assembleToStream(const ExegesisTarget &ET, } const bool IsSnippetSetupComplete = generateSnippetSetupCode( - ET, TM->getMCSubtargetInfo(), RegisterInitialValues, Entry, Key, - GenerateMemoryInstructions); + ET, TM->getMCSubtargetInfo(), Entry, Key, GenerateMemoryInstructions); // If the snippet setup is not complete, we disable liveliness tracking. This // means that we won't know what values are in the registers. 
diff --git a/llvm/tools/llvm-exegesis/lib/Assembler.h b/llvm/tools/llvm-exegesis/lib/Assembler.h index abc5aa7be8cfe..d85d7fdcf04f5 100644 --- a/llvm/tools/llvm-exegesis/lib/Assembler.h +++ b/llvm/tools/llvm-exegesis/lib/Assembler.h @@ -90,10 +90,8 @@ using FillFunction = std::function; // AsmStream, the temporary function is eventually discarded. Error assembleToStream(const ExegesisTarget &ET, std::unique_ptr TM, - ArrayRef LiveIns, - ArrayRef RegisterInitialValues, - const FillFunction &Fill, raw_pwrite_stream &AsmStreamm, - const BenchmarkKey &Key, + ArrayRef LiveIns, const FillFunction &Fill, + raw_pwrite_stream &AsmStreamm, const BenchmarkKey &Key, bool GenerateMemoryInstructions); // Creates an ObjectFile in the format understood by the host. diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp index 1ee59a86ebbdc..5f08c67bfc89a 100644 --- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp @@ -484,7 +484,6 @@ Expected> BenchmarkRunner::assembleSnippet( raw_svector_ostream OS(Buffer); if (Error E = assembleToStream( State.getExegesisTarget(), State.createTargetMachine(), BC.LiveIns, - BC.Key.RegisterInitialValues, Repetitor.Repeat(Instructions, MinInstructions, LoopBodySize, GenerateMemoryInstructions), OS, BC.Key, GenerateMemoryInstructions)) { diff --git a/llvm/unittests/tools/llvm-exegesis/Common/AssemblerUtils.h b/llvm/unittests/tools/llvm-exegesis/Common/AssemblerUtils.h index 2804a6e69e824..9cf63931e6dd5 100644 --- a/llvm/unittests/tools/llvm-exegesis/Common/AssemblerUtils.h +++ b/llvm/unittests/tools/llvm-exegesis/Common/AssemblerUtils.h @@ -81,8 +81,7 @@ class MachineFunctionGeneratorBaseTest : public ::testing::Test { BenchmarkKey Key; Key.RegisterInitialValues = RegisterInitialValues; EXPECT_FALSE(assembleToStream(*ET, createTargetMachine(), /*LiveIns=*/{}, - RegisterInitialValues, Fill, AsmStream, Key, - false)); + Fill, AsmStream, Key, false)); Expected ExecFunc = ExecutableFunction::create( createTargetMachine(), getObjectFromBuffer(AsmStream.str())); From 428cf71ffa01c579e0ebf66b7d74eb9cb8d8f3f8 Mon Sep 17 00:00:00 2001 From: Derek Schuff Date: Thu, 21 Dec 2023 14:56:03 -0800 Subject: [PATCH 179/313] Reland "[WebAssembly][Object]Use file offset as function symbol address for linked files (#76198)" WebAssembly doesn't have a single virtual memory space the way other object formats or architectures do, so "addresses" mean different things depending on the context. Function symbol addresses in object files are offsets from the start of the code section. This is good for linking and relocation. However when dealing with linked binaries, offsets from the start of the file/module are more often used (e.g. for stack traces in browsers), and are more useful for use cases like binary size attribution. This PR changes Object to use the file offset instead of the section offset for function symbols, but only for linked (non-DSO) files. This is a reland of fc5f51cf with a fix for the MSan failure (it was not caused by this change, but it was revealed by the new tests). 
--- llvm/lib/Object/WasmObjectFile.cpp | 17 ++++- llvm/test/tools/llvm-nm/wasm/linked.yaml | 74 ++++++++++++++++++ .../wasm/linked-symbol-table.yaml | 75 +++++++++++++++++++ 3 files changed, 162 insertions(+), 4 deletions(-) create mode 100644 llvm/test/tools/llvm-nm/wasm/linked.yaml create mode 100644 llvm/test/tools/llvm-objdump/wasm/linked-symbol-table.yaml diff --git a/llvm/lib/Object/WasmObjectFile.cpp b/llvm/lib/Object/WasmObjectFile.cpp index 40665d686cf93..94cd96968ff20 100644 --- a/llvm/lib/Object/WasmObjectFile.cpp +++ b/llvm/lib/Object/WasmObjectFile.cpp @@ -1351,6 +1351,7 @@ Error WasmObjectFile::parseExportSection(ReadContext &Ctx) { break; case wasm::WASM_EXTERNAL_TABLE: Info.Kind = wasm::WASM_SYMBOL_TYPE_TABLE; + Info.ElementIndex = Ex.Index; break; default: return make_error("unexpected export kind", @@ -1667,10 +1668,18 @@ Expected WasmObjectFile::getSymbolName(DataRefImpl Symb) const { Expected WasmObjectFile::getSymbolAddress(DataRefImpl Symb) const { auto &Sym = getWasmSymbol(Symb); if (Sym.Info.Kind == wasm::WASM_SYMBOL_TYPE_FUNCTION && - isDefinedFunctionIndex(Sym.Info.ElementIndex)) - return getDefinedFunction(Sym.Info.ElementIndex).CodeSectionOffset; - else - return getSymbolValue(Symb); + isDefinedFunctionIndex(Sym.Info.ElementIndex)) { + // For object files, use the section offset. The linker relies on this. + // For linked files, use the file offset. This behavior matches the way + // browsers print stack traces and is useful for binary size analysis. + // (see https://webassembly.github.io/spec/web-api/index.html#conventions) + uint32_t Adjustment = isRelocatableObject() || isSharedObject() + ? 0 + : Sections[CodeSection].Offset; + return getDefinedFunction(Sym.Info.ElementIndex).CodeSectionOffset + + Adjustment; + } + return getSymbolValue(Symb); } uint64_t WasmObjectFile::getWasmSymbolValue(const WasmSymbol &Sym) const { diff --git a/llvm/test/tools/llvm-nm/wasm/linked.yaml b/llvm/test/tools/llvm-nm/wasm/linked.yaml new file mode 100644 index 0000000000000..992c1811743b7 --- /dev/null +++ b/llvm/test/tools/llvm-nm/wasm/linked.yaml @@ -0,0 +1,74 @@ +# RUN: yaml2obj %s -o %t.wasm +# RUN: llvm-nm %t.wasm | FileCheck %s + +# CHECK: 0000009f T my_func_export +# CHECK-NEXT: 0000002a D my_global_export +# CHECK-NEXT: 00000000 D my_table_export + +--- !WASM +FileHeader: + Version: 0x1 +Sections: + - Type: TYPE + Signatures: + - Index: 0 + ParamTypes: [] + ReturnTypes: [] + - Type: IMPORT + Imports: + - Module: env + Field: foo + Kind: FUNCTION + SigIndex: 0 + - Module: env + Field: bar + Kind: GLOBAL + GlobalType: I32 + GlobalMutable: true + - Module: env + Field: memory + Kind: MEMORY + Memory: + Minimum: 0x1 + - Type: FUNCTION + FunctionTypes: [ 0 ] + - Type: TABLE + Tables: + - Index: 0 + ElemType: FUNCREF + Limits: + Flags: [ HAS_MAX ] + Minimum: 0x1 + Maximum: 0x1 + - Type: GLOBAL + Globals: + - Index: 1 + Mutable: false + Type: I32 + InitExpr: + Opcode: I32_CONST + Value: 42 + - Type: EXPORT + Exports: + - Name: my_func_export + Kind: FUNCTION + Index: 1 + - Name: my_global_export + Kind: GLOBAL + Index: 1 + - Name: my_table_export + Kind: TABLE + Index: 0 + - Type: CODE + Functions: + - Index: 1 + Locals: + Body: 00 + - Type: DATA + Segments: + - SectionOffset: 0 + InitFlags: 0 + Offset: + Opcode: I32_CONST + Value: 0 + Content: '' diff --git a/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table.yaml b/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table.yaml new file mode 100644 index 0000000000000..6dd949a441496 --- /dev/null +++ 
b/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table.yaml @@ -0,0 +1,75 @@ +# RUN: yaml2obj %s -o %t.wasm +# RUN: llvm-objdump -t %t.wasm | FileCheck %s +# +# CHECK: SYMBOL TABLE: +# CHECK-NEXT: 0000009f g F CODE my_func_export +# CHECK-NEXT: 0000002a g O DATA my_global_export +# CHECK-NEXT: 00000000 g TABLE my_table_export + +--- !WASM +FileHeader: + Version: 0x1 +Sections: + - Type: TYPE + Signatures: + - Index: 0 + ParamTypes: [] + ReturnTypes: [] + - Type: IMPORT + Imports: + - Module: env + Field: foo + Kind: FUNCTION + SigIndex: 0 + - Module: env + Field: bar + Kind: GLOBAL + GlobalType: I32 + GlobalMutable: true + - Module: env + Field: memory + Kind: MEMORY + Memory: + Minimum: 0x1 + - Type: FUNCTION + FunctionTypes: [ 0 ] + - Type: TABLE + Tables: + - Index: 0 + ElemType: FUNCREF + Limits: + Flags: [ HAS_MAX ] + Minimum: 0x1 + Maximum: 0x1 + - Type: GLOBAL + Globals: + - Index: 1 + Mutable: false + Type: I32 + InitExpr: + Opcode: I32_CONST + Value: 42 + - Type: EXPORT + Exports: + - Name: my_func_export + Kind: FUNCTION + Index: 1 + - Name: my_global_export + Kind: GLOBAL + Index: 1 + - Name: my_table_export + Kind: TABLE + Index: 0 + - Type: CODE + Functions: + - Index: 1 + Locals: + Body: 00 + - Type: DATA + Segments: + - SectionOffset: 0 + InitFlags: 0 + Offset: + Opcode: I32_CONST + Value: 0 + Content: '' From 7a4c49756db161ebcce08c7bc860a569aad7f276 Mon Sep 17 00:00:00 2001 From: Boian Petkantchin Date: Wed, 3 Jan 2024 15:47:11 -0800 Subject: [PATCH 180/313] [mlir][mesh] Use one type for mesh axis (#76830) Make all ops and attributes use the types MeshAxis and MeshAxesAttr instead of int16_t, int32_t, DenseI16ArrayAttr and DenseI32ArrayAttr. --- mlir/include/mlir/Dialect/Mesh/IR/MeshBase.td | 16 +++++++------- mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h | 12 +++++++--- mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td | 4 ++-- .../Mesh/Interfaces/ShardingInterface.h | 4 ++-- mlir/lib/Dialect/Mesh/IR/MeshOps.cpp | 18 +++++++-------- .../Mesh/Interfaces/ShardingInterface.cpp | 22 +++++++++---------- .../Dialect/Mesh/Transforms/Spmdization.cpp | 20 ++++++++--------- 7 files changed, 51 insertions(+), 45 deletions(-) diff --git a/mlir/include/mlir/Dialect/Mesh/IR/MeshBase.td b/mlir/include/mlir/Dialect/Mesh/IR/MeshBase.td index a9d30dfbb9a76..060d54b82efa6 100644 --- a/mlir/include/mlir/Dialect/Mesh/IR/MeshBase.td +++ b/mlir/include/mlir/Dialect/Mesh/IR/MeshBase.td @@ -80,8 +80,8 @@ def MeshSharding : AttrDef { let parameters = (ins AttrParameter<"::mlir::SymbolRefAttr", "cluster placed">:$cluster, - ArrayRefParameter<"::mlir::DenseI32ArrayAttr">:$split_axes, - OptionalArrayRefParameter<"int32_t">:$partial_axes, + ArrayRefParameter<"MeshAxesAttr">:$split_axes, + OptionalArrayRefParameter<"MeshAxis">:$partial_axes, OptionalParameter<"::mlir::mesh::Partial">:$partial_type ); @@ -146,18 +146,18 @@ def MeshSharding : AttrDef { let builders = [ AttrBuilder<(ins "SymbolRefAttr":$cluster, - "ArrayRef>":$split_axes, - "ArrayRef": $partial_axes, + "ArrayRef>":$split_axes, + "ArrayRef": $partial_axes, "mesh::Partial": $partial_type), [{ - SmallVector splitAxesAttr = llvm::map_to_vector( - split_axes, [&](ArrayRef array) { - return DenseI32ArrayAttr::get($_ctxt, array); + SmallVector splitAxesAttr = llvm::map_to_vector( + split_axes, [&](ArrayRef array) { + return MeshAxesAttr::get($_ctxt, array); }); return $_get($_ctxt, cluster, splitAxesAttr, partial_axes, partial_type); }]>, AttrBuilder<(ins "SymbolRefAttr":$cluster, - "ArrayRef>":$split_axes), [{ + "ArrayRef>":$split_axes), [{ return 
MeshShardingAttr::get($_ctxt, cluster, split_axes, {}, Partial::Sum); }]> ]; diff --git a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h index ce7d5d045122d..83452dcc2e8ab 100644 --- a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h +++ b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h @@ -17,6 +17,15 @@ #include "mlir/Interfaces/SideEffectInterfaces.h" #include +namespace mlir { +namespace mesh { + +using MeshAxis = int16_t; +using MeshAxesAttr = DenseI16ArrayAttr; + +} // namespace mesh +} // namespace mlir + #include "mlir/Dialect/Mesh/IR/MeshOpsDialect.h.inc" #include "mlir/Dialect/Mesh/IR/MeshOpsEnums.h.inc" @@ -30,9 +39,6 @@ namespace mlir { namespace mesh { -using MeshAxis = int16_t; -using MeshAxesAttr = DenseI16ArrayAttr; - bool isReductionLoop(IteratorType iType); bool areReductionAndPartialMatch(IteratorType iType, Partial partial); diff --git a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td index 1ed54b6519e4d..1934bdfb42705 100644 --- a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td +++ b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td @@ -114,7 +114,7 @@ def Mesh_ClusterShapeOp : Mesh_Op<"cluster_shape", [Pure, DeclareOpInterfaceMeth let builders = [ OpBuilder<(ins "::mlir::mesh::ClusterOp":$mesh)>, - OpBuilder<(ins "StringRef":$mesh, "ArrayRef":$axes)> + OpBuilder<(ins "StringRef":$mesh, "ArrayRef":$axes)> ]; } @@ -228,7 +228,7 @@ def Mesh_ProcessIndexOp : Mesh_Op<"process_index", [Pure, DeclareOpInterfaceMeth }]; let builders = [ OpBuilder<(ins "::mlir::mesh::ClusterOp":$mesh)>, - OpBuilder<(ins "StringRef":$mesh, "ArrayRef":$axes)> + OpBuilder<(ins "StringRef":$mesh, "ArrayRef":$axes)> ]; } diff --git a/mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.h b/mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.h index 270955a3036e8..201c0151754eb 100644 --- a/mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.h +++ b/mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.h @@ -18,8 +18,8 @@ class Operation; namespace mesh { -using ShardingArray = SmallVector>; -using ShardingArrayRef = ArrayRef>; +using ShardingArray = SmallVector>; +using ShardingArrayRef = ArrayRef>; struct ShardingOption { // An array of int array. The sub-array at the i-th position signifies the diff --git a/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp b/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp index de4f58d54e8ca..c3d8f1d456106 100644 --- a/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp +++ b/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp @@ -266,15 +266,15 @@ void ClusterShapeOp::build(OpBuilder &odsBuilder, OperationState &odsState, LogicalResult MeshShardingAttr::verify(function_ref emitError, - SymbolRefAttr, ArrayRef splitAxes, - ArrayRef partialAxes, Partial) { + SymbolRefAttr, ArrayRef splitAxes, + ArrayRef partialAxes, Partial) { // TODO: At present cluster symbol ref is not verified. This is due to the // difficulty in fetching the corresponding symbol op based on an attribute. 
- llvm::SmallSet visitedAxes; + llvm::SmallSet visitedAxes; - auto checkMeshAxis = [&](ArrayRef axesArray) -> LogicalResult { - for (int32_t axis : axesArray) { + auto checkMeshAxis = [&](ArrayRef axesArray) -> LogicalResult { + for (MeshAxis axis : axesArray) { if (axis < 0) return emitError() << "mesh axis is expected to be non-negative"; if (!visitedAxes.insert(axis).second) @@ -283,8 +283,8 @@ MeshShardingAttr::verify(function_ref emitError, return success(); }; - for (DenseI32ArrayAttr subAxes : splitAxes) { - ArrayRef subAxesArray = subAxes.asArrayRef(); + for (MeshAxesAttr subAxes : splitAxes) { + ArrayRef subAxesArray = subAxes.asArrayRef(); if (failed(checkMeshAxis(subAxesArray))) return failure(); } @@ -318,10 +318,10 @@ bool MeshShardingAttr::operator==(MeshShardingAttr rhs) const { return llvm::all_of(llvm::make_range(getSplitAxes().begin() + minSize, getSplitAxes().end()), - std::mem_fn(&DenseI32ArrayAttr::empty)) && + std::mem_fn(&MeshAxesAttr::empty)) && llvm::all_of(llvm::make_range(rhs.getSplitAxes().begin() + minSize, rhs.getSplitAxes().end()), - std::mem_fn(&DenseI32ArrayAttr::empty)); + std::mem_fn(&MeshAxesAttr::empty)); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Mesh/Interfaces/ShardingInterface.cpp b/mlir/lib/Dialect/Mesh/Interfaces/ShardingInterface.cpp index a6f2f435f36d6..ee885ab16b7b0 100644 --- a/mlir/lib/Dialect/Mesh/Interfaces/ShardingInterface.cpp +++ b/mlir/lib/Dialect/Mesh/Interfaces/ShardingInterface.cpp @@ -216,7 +216,7 @@ namespace { static LogicalResult fillShardingOption(Operation *op, ShardingOption &shardingOption, SymbolRefAttr cluster, - ArrayRef meshAxes, + ArrayRef meshAxes, unsigned loopIdx) { if ((shardingOption.cluster && cluster && shardingOption.cluster != cluster) || @@ -230,7 +230,7 @@ static LogicalResult fillShardingOption(Operation *op, if (i == loopIdx) continue; - for (int32_t axis : meshAxes) { + for (MeshAxis axis : meshAxes) { if (llvm::is_contained(shardingOption.shardingArray[i], axis)) { LLVM_DEBUG(DBGS() << "sharding option conflicts because mesh axes " << axis << " duplicate"); @@ -260,7 +260,7 @@ FailureOr mesh::detail::defaultGetShardingOption( SmallVector maps = shardingOp.getIndexingMaps(); unsigned numOperands = op->getNumOperands(); shardingOption.shardingArray.resize(loopTypes.size()); - llvm::SmallVector partialMeshAxes; + llvm::SmallVector partialMeshAxes; Partial partialType; llvm::SmallSet visitedLoopIndices; bool anyShardingInResultsOrOperands = false; @@ -277,7 +277,7 @@ FailureOr mesh::detail::defaultGetShardingOption( // shardingOption[index] for (auto it : llvm::zip(map.getResults(), shardAttr.getSplitAxes())) { AffineExpr expr = std::get<0>(it); - ArrayRef axes = std::get<1>(it).asArrayRef(); + ArrayRef axes = std::get<1>(it).asArrayRef(); auto dim = cast(expr); unsigned index = dim.getPosition(); visitedLoopIndices.insert(index); @@ -288,7 +288,7 @@ FailureOr mesh::detail::defaultGetShardingOption( // Handle the partial axes: at this stage, the exact loop index/indices // cannot be decided because there could be multiple reduction loops. - ArrayRef partialAxes = shardAttr.getPartialAxes(); + ArrayRef partialAxes = shardAttr.getPartialAxes(); if (!partialAxes.empty()) { if (!partialMeshAxes.empty()) return op->emitOpError() << "at most one result with partial axes is " @@ -321,7 +321,7 @@ FailureOr mesh::detail::defaultGetShardingOption( // then the operands with multiple loop indices. 
for (auto it : llvm::zip(map.getResults(), shardAttr.getSplitAxes())) { AffineExpr expr = std::get<0>(it); - ArrayRef axes = std::get<1>(it).asArrayRef(); + ArrayRef axes = std::get<1>(it).asArrayRef(); FailureOr> loopIndices = checkOperandAffineExpr(expr, numDims); if (failed(loopIndices)) @@ -362,7 +362,7 @@ FailureOr mesh::detail::defaultGetShardingOption( if (!partialMeshAxes.empty()) { bool anyNonEmptyReductionLoop = llvm::any_of( llvm::enumerate(shardingOption.shardingArray), [&](auto it) { - SmallVector &subArray = it.value(); + SmallVector &subArray = it.value(); int64_t idx = it.index(); return isReductionLoop(loopTypes[idx]) && !subArray.empty(); }); @@ -406,8 +406,8 @@ static LogicalResult addShardOp(OpBuilder &b, OpResult result, return success(); auto resultType = result.getType().cast(); - SmallVector> splitAxes(resultType.getRank()); - SmallVector partialAxes; + SmallVector> splitAxes(resultType.getRank()); + SmallVector partialAxes; // process the split axes for (auto it : llvm::enumerate(map.getResults())) { @@ -431,7 +431,7 @@ static LogicalResult addShardOp(OpBuilder &b, OpResult result, assert(partialType == curPartialType && "Only one reduction type is supported"); partialType = curPartialType; - const SmallVector &axis = std::get<1>(it); + const SmallVector &axis = std::get<1>(it); partialAxes.append(axis); } } @@ -459,7 +459,7 @@ static LogicalResult addShardOp(OpBuilder &b, OpOperand &opOperand, return success(); Value operand = opOperand.get(); auto operandType = operand.getType().cast(); - SmallVector> splitAxes(operandType.getRank()); + SmallVector> splitAxes(operandType.getRank()); unsigned numDims = map.getNumDims(); for (auto it : llvm::enumerate(map.getResults())) { int64_t idx = it.index(); diff --git a/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp b/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp index 8d7e89662131a..37b8653595965 100644 --- a/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp +++ b/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp @@ -147,7 +147,7 @@ handlePartialAxesDuringResharding(OpBuilder &builder, .getResult() .cast>(); - llvm::SmallVector remainingPartialAxes; + llvm::SmallVector remainingPartialAxes; llvm::copy_if(sourceShardingPartialAxesSet, std::back_inserter(allReduceMeshAxes), [&targetShardingPartialAxesSet](Axis a) { @@ -163,17 +163,17 @@ handlePartialAxesDuringResharding(OpBuilder &builder, static MeshShardingAttr targetShardingInSplitLastAxis(MLIRContext *ctx, MeshShardingAttr sourceSharding, int64_t splitTensorAxis, MeshAxis splitMeshAxis) { - SmallVector targetShardingSplitAxes = + SmallVector targetShardingSplitAxes = llvm::to_vector(sourceSharding.getSplitAxes()); while (static_cast(targetShardingSplitAxes.size()) <= splitTensorAxis) { - targetShardingSplitAxes.push_back(DenseI32ArrayAttr::get(ctx, {})); + targetShardingSplitAxes.push_back(MeshAxesAttr::get(ctx, {})); } auto targetSplitAxes = llvm::to_vector(targetShardingSplitAxes[splitTensorAxis].asArrayRef()); targetSplitAxes.push_back(splitMeshAxis); targetShardingSplitAxes[splitTensorAxis] = - DenseI32ArrayAttr::get(ctx, targetSplitAxes); + MeshAxesAttr::get(ctx, targetSplitAxes); return MeshShardingAttr::get( ctx, sourceSharding.getCluster(), targetShardingSplitAxes, sourceSharding.getPartialAxes(), sourceSharding.getPartialType()); @@ -356,7 +356,7 @@ static MeshShardingAttr targetShardingInUnsplitLastAxis(MLIRContext *ctx, MeshShardingAttr sourceSharding, int64_t splitTensorAxis) { - SmallVector targetShardingSplitAxes = + SmallVector targetShardingSplitAxes = 
llvm::to_vector(sourceSharding.getSplitAxes()); assert(static_cast(targetShardingSplitAxes.size()) > splitTensorAxis); @@ -365,7 +365,7 @@ targetShardingInUnsplitLastAxis(MLIRContext *ctx, targetSplitAxes.pop_back(); targetShardingSplitAxes[splitTensorAxis] = - DenseI32ArrayAttr::get(ctx, targetSplitAxes); + MeshAxesAttr::get(ctx, targetSplitAxes); return MeshShardingAttr::get( ctx, sourceSharding.getCluster(), targetShardingSplitAxes, sourceSharding.getPartialAxes(), sourceSharding.getPartialType()); @@ -475,11 +475,11 @@ static MeshShardingAttr targetShardingInMoveLastAxis(MLIRContext *ctx, MeshShardingAttr sourceSharding, int64_t sourceTensorAxis, int64_t targetTensorAxis) { - SmallVector targetShardingSplitAxes = + SmallVector targetShardingSplitAxes = llvm::to_vector(sourceSharding.getSplitAxes()); while (static_cast(targetShardingSplitAxes.size()) <= targetTensorAxis) { - targetShardingSplitAxes.push_back(DenseI32ArrayAttr::get(ctx, {})); + targetShardingSplitAxes.push_back(MeshAxesAttr::get(ctx, {})); } auto sourceSplitAxes = @@ -488,13 +488,13 @@ targetShardingInMoveLastAxis(MLIRContext *ctx, MeshShardingAttr sourceSharding, auto meshAxis = sourceSplitAxes.back(); sourceSplitAxes.pop_back(); targetShardingSplitAxes[sourceTensorAxis] = - DenseI32ArrayAttr::get(ctx, sourceSplitAxes); + MeshAxesAttr::get(ctx, sourceSplitAxes); auto targetSplitAxes = llvm::to_vector(targetShardingSplitAxes[targetTensorAxis].asArrayRef()); targetSplitAxes.push_back(meshAxis); targetShardingSplitAxes[targetTensorAxis] = - DenseI32ArrayAttr::get(ctx, targetSplitAxes); + MeshAxesAttr::get(ctx, targetSplitAxes); return MeshShardingAttr::get( ctx, sourceSharding.getCluster(), targetShardingSplitAxes, From e68a0320a1592bf408ac6458efa2d1c548cfed7a Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Wed, 3 Jan 2024 15:48:05 -0800 Subject: [PATCH 181/313] [libc] fix -Wcast-function-type via union rather than reinterpret_cast (#76875) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The GCC build is producing the following diagnostic: llvm-project/libc/src/signal/linux/signal_utils.h: In member function ‘__llvm_libc_18_0_0_git::KernelSigaction& __llvm_libc_18_0_0_git::KernelSigaction::operator=(const sigaction&)’: llvm-project/libc/src/signal/linux/signal_utils.h:38:20: warning: cast between incompatible function types from ‘void (*)(int, siginfo_t*, void*)’ to ‘void (*)(int)’ [-Wcast-function-type] 38 | sa_handler = reinterpret_cast(sa.sa_sigaction); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ llvm-project/libc/src/signal/linux/signal_utils.h: In member function ‘__llvm_libc_18_0_0_git::KernelSigaction::operator sigaction() const’: llvm-project/libc/src/signal/linux/signal_utils.h:51:25: warning: cast between incompatible function types from ‘void (*)(int)’ to ‘void (*)(int, siginfo_t*, void*)’ [-Wcast-function-type] 51 | sa.sa_sigaction = reinterpret_cast(sa_handler); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Two issues here: 1. Clang supports -Wcast-function-type, but not as part of the -Wextra group. 2. The existing implementation tried to work around the oddity that is the kernel's struct sigaction != POSIX via reinterpret_cast in a way that's not compatible with -Wcast-function-type. Just use a union which is well defined (and two function pointers are the same size.) 
Link: https://github.com/llvm/llvm-project/issues/76872 Fixes: https://github.com/llvm/llvm-project/issues/74617 --- libc/src/signal/linux/signal_utils.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/libc/src/signal/linux/signal_utils.h b/libc/src/signal/linux/signal_utils.h index ca6fd3aeb1a6e..5e9dd9a5c53ab 100644 --- a/libc/src/signal/linux/signal_utils.h +++ b/libc/src/signal/linux/signal_utils.h @@ -27,15 +27,12 @@ namespace LIBC_NAMESPACE { // handler taking siginfo_t * argument, one can set sa_handler to sa_sigaction // if SA_SIGINFO is set in sa_flags. struct KernelSigaction { - using HandlerType = void(int); - using SiginfoHandlerType = void(int, siginfo_t *, void *); - LIBC_INLINE KernelSigaction &operator=(const struct sigaction &sa) { sa_flags = sa.sa_flags; sa_restorer = sa.sa_restorer; sa_mask = sa.sa_mask; if (sa_flags & SA_SIGINFO) { - sa_handler = reinterpret_cast(sa.sa_sigaction); + sa_sigaction = sa.sa_sigaction; } else { sa_handler = sa.sa_handler; } @@ -48,13 +45,16 @@ struct KernelSigaction { sa.sa_mask = sa_mask; sa.sa_restorer = sa_restorer; if (sa_flags & SA_SIGINFO) - sa.sa_sigaction = reinterpret_cast(sa_handler); + sa.sa_sigaction = sa_sigaction; else sa.sa_handler = sa_handler; return sa; } - HandlerType *sa_handler; + union { + void (*sa_handler)(int); + void (*sa_sigaction)(int, siginfo_t *, void *); + }; unsigned long sa_flags; void (*sa_restorer)(void); // Our public definition of sigset_t matches that of the kernel's definition. From bdaedffc43a608ef5fdc8a0e0c524be0a3a8f72e Mon Sep 17 00:00:00 2001 From: Alex Langford Date: Wed, 3 Jan 2024 15:50:23 -0800 Subject: [PATCH 182/313] [lldb] Speculatively fix TestBreakpointSerialization on Windows See: https://lab.llvm.org/buildbot/#/builders/219/builds/7961/steps/6/logs/stdio --- .../breakpoint/serialize/TestBreakpointSerialization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/test/API/functionalities/breakpoint/serialize/TestBreakpointSerialization.py b/lldb/test/API/functionalities/breakpoint/serialize/TestBreakpointSerialization.py index b6cc3d9989a69..985bafabdc5bc 100644 --- a/lldb/test/API/functionalities/breakpoint/serialize/TestBreakpointSerialization.py +++ b/lldb/test/API/functionalities/breakpoint/serialize/TestBreakpointSerialization.py @@ -55,7 +55,7 @@ def test_resolver_serialization(self): self.setup_targets_and_cleanup() exe_path = self.getBuildArtifact("a.out") - exe_module = self.orig_target.module[exe_path] + exe_module = self.orig_target.module["a.out"] self.assertTrue( exe_module.IsValid(), "Failed to find the executable module in target" ) From ddfbca8b08941125b48039236a03635252298611 Mon Sep 17 00:00:00 2001 From: YunQiang Su Date: Thu, 4 Jan 2024 08:04:22 +0800 Subject: [PATCH 183/313] Clang/MIPS: Use -mnan value for -mabs if not specified (#71157) On most hardware, FCSR.ABS2008 is set the value same with FCSR.NAN2008. Let's use this behaivor by default. With this commit, `clang -target mips -mnan=2008 -c fabs.c` will imply `-mabs=2008`. And of course, `clang -mnan=2008 -mabs=legacy` can continue workable like previous. 
Co-authored-by: YunQiang Su --- clang/lib/Driver/ToolChains/Arch/Mips.cpp | 8 ++++++-- clang/test/Driver/mips-features.c | 8 +++++++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Arch/Mips.cpp b/clang/lib/Driver/ToolChains/Arch/Mips.cpp index f9f14c01b2b9f..fe9d112b8800b 100644 --- a/clang/lib/Driver/ToolChains/Arch/Mips.cpp +++ b/clang/lib/Driver/ToolChains/Arch/Mips.cpp @@ -221,6 +221,7 @@ void mips::getMIPSTargetFeatures(const Driver &D, const llvm::Triple &Triple, bool IsN64 = ABIName == "64"; bool IsPIC = false; bool NonPIC = false; + bool HasNaN2008Opt = false; Arg *LastPICArg = Args.getLastArg(options::OPT_fPIC, options::OPT_fno_PIC, options::OPT_fpic, options::OPT_fno_pic, @@ -285,9 +286,10 @@ void mips::getMIPSTargetFeatures(const Driver &D, const llvm::Triple &Triple, if (Arg *A = Args.getLastArg(options::OPT_mnan_EQ)) { StringRef Val = StringRef(A->getValue()); if (Val == "2008") { - if (mips::getIEEE754Standard(CPUName) & mips::Std2008) + if (mips::getIEEE754Standard(CPUName) & mips::Std2008) { Features.push_back("+nan2008"); - else { + HasNaN2008Opt = true; + } else { Features.push_back("-nan2008"); D.Diag(diag::warn_target_unsupported_nan2008) << CPUName; } @@ -323,6 +325,8 @@ void mips::getMIPSTargetFeatures(const Driver &D, const llvm::Triple &Triple, D.Diag(diag::err_drv_unsupported_option_argument) << A->getSpelling() << Val; } + } else if (HasNaN2008Opt) { + Features.push_back("+abs2008"); } AddTargetFeature(Args, Features, options::OPT_msingle_float, diff --git a/clang/test/Driver/mips-features.c b/clang/test/Driver/mips-features.c index 5ae566774959f..fad6009ffb89b 100644 --- a/clang/test/Driver/mips-features.c +++ b/clang/test/Driver/mips-features.c @@ -213,7 +213,13 @@ // RUN: %clang -target mips-linux-gnu -march=mips32r3 -### -c %s \ // RUN: -mnan=legacy -mnan=2008 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-NAN2008 %s -// CHECK-NAN2008: "-target-feature" "+nan2008" +// CHECK-NAN2008: "-target-feature" "+nan2008" "-target-feature" "+abs2008" +// +// -mnan=2008 -mabs=legacy +// RUN: %clang -target mips-linux-gnu -march=mips32r3 -### -c %s \ +// RUN: -mabs=legacy -mnan=2008 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-ABSLEGACYNAN2008 %s +// CHECK-ABSLEGACYNAN2008: "-target-feature" "+nan2008" "-target-feature" "-abs2008" // // -mnan=legacy // RUN: %clang -target mips-linux-gnu -march=mips32r3 -### -c %s \ From 7df28fd61aa4603846b3ce16f9f988ccc780a584 Mon Sep 17 00:00:00 2001 From: Micah Weston Date: Wed, 3 Jan 2024 19:17:44 -0500 Subject: [PATCH 184/313] [SHT_LLVM_BB_ADDR_MAP][AsmPrinter] Implements PGOAnalysisMap emitting in AsmPrinter with tests. (#75202) Uses machine analyses to emit PGOAnalysisMap into the bb-addr-map ELF section. Implements filecheck tests to verify emitting new fields. This patch emits optional PGO related analyses into the bb-addr-map ELF section during AsmPrinter. This currently supports Function Entry Count, Machine Block Frequencies. and Machine Branch Probabilities. Each is independently enabled via the `feature` byte of `bb-addr-map` for the given function. A part of [RFC - PGO Accuracy Metrics: Emitting and Evaluating Branch and Block Analysis](https://discourse.llvm.org/t/rfc-pgo-accuracy-metrics-emitting-and-evaluating-branch-and-block-analysis/73902). 
--- llvm/docs/Extensions.rst | 84 ++++++++++++ llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 80 ++++++++++- ...ic-block-sections-labels-empty-function.ll | 6 +- ...asic-block-sections-labels-pgo-features.ll | 127 ++++++++++++++++++ 4 files changed, 292 insertions(+), 5 deletions(-) create mode 100644 llvm/test/CodeGen/X86/basic-block-sections-labels-pgo-features.ll diff --git a/llvm/docs/Extensions.rst b/llvm/docs/Extensions.rst index 6e94840897832..74ca8cb0aa687 100644 --- a/llvm/docs/Extensions.rst +++ b/llvm/docs/Extensions.rst @@ -451,6 +451,90 @@ Example: .uleb128 .LBB_END0_1-.LBB0_1 # BB_1 size .byte y # BB_1 metadata +PGO Analysis Map +"""""""""""""""" + +PGO related analysis data can be emitted after each function within the +``SHT_LLVM_BB_ADDR_MAP`` through the optional ``pgo-analysis-map`` flag. +Supported analyses currently are Function Entry Count, Basic Block Frequencies, +and Branch Probabilities. + +Each analysis is enabled or disabled via a bit in the feature byte. Currently +those bits are: + +#. Function Entry Count - Number of times the function was called as taken + from a PGO profile. This will always be zero if PGO was not used or the + function was not encountered in the profile. + +#. Basic Block Frequencies - Encoded as raw block frequency value taken from + MBFI analysis. This value is an integer that encodes the relative frequency + compared to the entry block. More information can be found in + 'llvm/Support/BlockFrequency.h'. + +#. Branch Probabilities - Encoded as raw numerator for branch probability + taken from MBPI analysis. This value is the numerator for a fixed point ratio + defined in 'llvm/Support/BranchProbability.h'. It indicates the probability + that the block is followed by a given successor block during execution. + +This extra data requires version 2 or above. This is necessary since successors +of basic blocks won't know their index but will know their BB ID. + +Example of BBAddrMap with PGO data: + +.. code-block:: gas + + .section ".llvm_bb_addr_map","",@llvm_bb_addr_map + .byte 2 # version number + .byte 7 # feature byte - PGO analyses enabled mask + .quad .Lfunc_begin0 # address of the function + .uleb128 4 # number of basic blocks + # BB record for BB_0 + .uleb128 0 # BB_0 BB ID + .uleb128 .Lfunc_begin0-.Lfunc_begin0 # BB_0 offset relative to function entry (always zero) + .uleb128 .LBB_END0_0-.Lfunc_begin0 # BB_0 size + .byte 0x18 # BB_0 metadata (multiple successors) + # BB record for BB_1 + .uleb128 1 # BB_1 BB ID + .uleb128 .LBB0_1-.LBB_END0_0 # BB_1 offset relative to the end of last block (BB_0). + .uleb128 .LBB_END0_1-.LBB0_1 # BB_1 size + .byte 0x0 # BB_1 metadata (two successors) + # BB record for BB_2 + .uleb128 2 # BB_2 BB ID + .uleb128 .LBB0_2-.LBB_END1_0 # BB_2 offset relative to the end of last block (BB_1). + .uleb128 .LBB_END0_2-.LBB0_2 # BB_2 size + .byte 0x0 # BB_2 metadata (one successor) + # BB record for BB_3 + .uleb128 3 # BB_3 BB ID + .uleb128 .LBB0_3-.LBB_END0_2 # BB_3 offset relative to the end of last block (BB_2). 
+ .uleb128 .LBB_END0_3-.LBB0_3 # BB_3 size + .byte 0x0 # BB_3 metadata (zero successors) + # PGO Analysis Map + .uleb128 1000 # function entry count (only when enabled) + # PGO data record for BB_0 + .uleb128 1000 # BB_0 basic block frequency (only when enabled) + .uleb128 3 # BB_0 successors count (only enabled with branch probabilities) + .uleb128 1 # BB_0 successor 1 BB ID (only enabled with branch probabilities) + .uleb128 0x22222222 # BB_0 successor 1 branch probability (only enabled with branch probabilities) + .uleb128 2 # BB_0 successor 2 BB ID (only enabled with branch probabilities) + .uleb128 0x33333333 # BB_0 successor 2 branch probability (only enabled with branch probabilities) + .uleb128 3 # BB_0 successor 3 BB ID (only enabled with branch probabilities) + .uleb128 0xaaaaaaaa # BB_0 successor 3 branch probability (only enabled with branch probabilities) + # PGO data record for BB_1 + .uleb128 133 # BB_1 basic block frequency (only when enabled) + .uleb128 2 # BB_1 successors count (only enabled with branch probabilities) + .uleb128 2 # BB_1 successor 1 BB ID (only enabled with branch probabilities) + .uleb128 0x11111111 # BB_1 successor 1 branch probability (only enabled with branch probabilities) + .uleb128 3 # BB_1 successor 2 BB ID (only enabled with branch probabilities) + .uleb128 0x11111111 # BB_1 successor 2 branch probability (only enabled with branch probabilities) + # PGO data record for BB_2 + .uleb128 18 # BB_2 basic block frequency (only when enabled) + .uleb128 1 # BB_2 successors count (only enabled with branch probabilities) + .uleb128 3 # BB_2 successor 1 BB ID (only enabled with branch probabilities) + .uleb128 0xffffffff # BB_2 successor 1 branch probability (only enabled with branch probabilities) + # PGO data record for BB_3 + .uleb128 1000 # BB_3 basic block frequency (only when enabled) + .uleb128 0 # BB_3 successors count (only enabled with branch probabilities) + ``SHT_LLVM_OFFLOADING`` Section (offloading data) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This section stores the binary data used to perform offloading device linking diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 4dd27702786e4..7df1c82bf357f 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -40,6 +40,7 @@ #include "llvm/CodeGen/GCMetadataPrinter.h" #include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -140,6 +141,26 @@ static cl::opt BasicBlockProfileDump( "performed with -basic-block-sections=labels. Enabling this " "flag during in-process ThinLTO is not supported.")); +// This is a replication of fields of object::PGOAnalysisMap::Features. It +// should match the order of the fields so that +// `object::PGOAnalysisMap::Features::decode(PgoAnalysisMapFeatures.getBits())` +// succeeds. 
+enum class PGOMapFeaturesEnum { + FuncEntryCount, + BBFreq, + BrProb, +}; +static cl::bits PgoAnalysisMapFeatures( + "pgo-analysis-map", cl::Hidden, cl::CommaSeparated, + cl::values(clEnumValN(PGOMapFeaturesEnum::FuncEntryCount, + "func-entry-count", "Function Entry Count"), + clEnumValN(PGOMapFeaturesEnum::BBFreq, "bb-freq", + "Basic Block Frequency"), + clEnumValN(PGOMapFeaturesEnum::BrProb, "br-prob", + "Branch Probability")), + cl::desc("Enable extended information within the BBAddrMap that is " + "extracted from PGO related analysis.")); + const char DWARFGroupName[] = "dwarf"; const char DWARFGroupDescription[] = "DWARF Emission"; const char DbgTimerName[] = "emit"; @@ -428,6 +449,7 @@ void AsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); } bool AsmPrinter::doInitialization(Module &M) { @@ -1379,7 +1401,8 @@ void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) { uint8_t BBAddrMapVersion = OutStreamer->getContext().getBBAddrMapVersion(); OutStreamer->emitInt8(BBAddrMapVersion); OutStreamer->AddComment("feature"); - OutStreamer->emitInt8(0); + auto FeaturesBits = static_cast(PgoAnalysisMapFeatures.getBits()); + OutStreamer->emitInt8(FeaturesBits); OutStreamer->AddComment("function address"); OutStreamer->emitSymbolValue(FunctionSymbol, getPointerSize()); OutStreamer->AddComment("number of basic blocks"); @@ -1409,6 +1432,51 @@ void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) { OutStreamer->emitULEB128IntValue(getBBAddrMapMetadata(MBB)); PrevMBBEndSymbol = MBB.getEndSymbol(); } + + if (FeaturesBits != 0) { + assert(BBAddrMapVersion >= 2 && + "PGOAnalysisMap only supports version 2 or later"); + + auto FeatEnable = + cantFail(object::PGOAnalysisMap::Features::decode(FeaturesBits)); + + if (FeatEnable.FuncEntryCount) { + OutStreamer->AddComment("function entry count"); + auto MaybeEntryCount = MF.getFunction().getEntryCount(); + OutStreamer->emitULEB128IntValue( + MaybeEntryCount ? MaybeEntryCount->getCount() : 0); + } + const MachineBlockFrequencyInfo *MBFI = + FeatEnable.BBFreq + ? &getAnalysis().getBFI() + : nullptr; + const MachineBranchProbabilityInfo *MBPI = + FeatEnable.BrProb ? &getAnalysis() + : nullptr; + + if (FeatEnable.BBFreq || FeatEnable.BrProb) { + for (const MachineBasicBlock &MBB : MF) { + if (FeatEnable.BBFreq) { + OutStreamer->AddComment("basic block frequency"); + OutStreamer->emitULEB128IntValue( + MBFI->getBlockFreq(&MBB).getFrequency()); + } + if (FeatEnable.BrProb) { + unsigned SuccCount = MBB.succ_size(); + OutStreamer->AddComment("basic block successor count"); + OutStreamer->emitULEB128IntValue(SuccCount); + for (const MachineBasicBlock *SuccMBB : MBB.successors()) { + OutStreamer->AddComment("successor BB ID"); + OutStreamer->emitULEB128IntValue(SuccMBB->getBBID()->BaseID); + OutStreamer->AddComment("successor branch probability"); + OutStreamer->emitULEB128IntValue( + MBPI->getEdgeProbability(&MBB, SuccMBB).getNumerator()); + } + } + } + } + } + OutStreamer->popSection(); } @@ -1934,8 +2002,14 @@ void AsmPrinter::emitFunctionBody() { // Emit section containing BB address offsets and their metadata, when // BB labels are requested for this function. Skip empty functions. 
- if (MF->hasBBLabels() && HasAnyRealCode) - emitBBAddrMapSection(*MF); + if (HasAnyRealCode) { + if (MF->hasBBLabels()) + emitBBAddrMapSection(*MF); + else if (PgoAnalysisMapFeatures.getBits() != 0) + MF->getContext().reportWarning( + SMLoc(), "pgo-analysis-map is enabled for function " + MF->getName() + + " but it does not have labels"); + } // Emit sections containing instruction and function PCs. emitPCSections(*MF); diff --git a/llvm/test/CodeGen/X86/basic-block-sections-labels-empty-function.ll b/llvm/test/CodeGen/X86/basic-block-sections-labels-empty-function.ll index 7b7bbb95fb4e2..42d09212e6691 100644 --- a/llvm/test/CodeGen/X86/basic-block-sections-labels-empty-function.ll +++ b/llvm/test/CodeGen/X86/basic-block-sections-labels-empty-function.ll @@ -1,5 +1,6 @@ ;; Verify that the BB address map is not emitted for empty functions. -; RUN: llc < %s -mtriple=x86_64 -basic-block-sections=labels | FileCheck %s +; RUN: llc < %s -mtriple=x86_64 -basic-block-sections=labels | FileCheck %s --check-prefixes=CHECK,BASIC +; RUN: llc < %s -mtriple=x86_64 -basic-block-sections=labels -pgo-analysis-map=func-entry-count,bb-freq | FileCheck %s --check-prefixes=CHECK,PGO define void @empty_func() { entry: @@ -19,5 +20,6 @@ entry: ; CHECK: .Lfunc_begin1: ; CHECK: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text{{$}} ; CHECK-NEXT: .byte 2 # version -; CHECK-NEXT: .byte 0 # feature +; BASIC-NEXT: .byte 0 # feature +; PGO-NEXT: .byte 3 # feature ; CHECK-NEXT: .quad .Lfunc_begin1 # function address diff --git a/llvm/test/CodeGen/X86/basic-block-sections-labels-pgo-features.ll b/llvm/test/CodeGen/X86/basic-block-sections-labels-pgo-features.ll new file mode 100644 index 0000000000000..92d3c88b4f601 --- /dev/null +++ b/llvm/test/CodeGen/X86/basic-block-sections-labels-pgo-features.ll @@ -0,0 +1,127 @@ +; Check the basic block sections labels option +; RUN: llc < %s -mtriple=x86_64 -function-sections -unique-section-names=true -basic-block-sections=labels | FileCheck %s --check-prefixes=CHECK,BASIC + +;; Also verify this holds for all PGO features enabled +; RUN: llc < %s -mtriple=x86_64 -function-sections -unique-section-names=true -basic-block-sections=labels -pgo-analysis-map=func-entry-count,bb-freq,br-prob | FileCheck %s --check-prefixes=CHECK,PGO-ALL,PGO-FEC,PGO-BBF,PGO-BRP + +;; Also verify that pgo extension only includes the enabled feature +; RUN: llc < %s -mtriple=x86_64 -function-sections -unique-section-names=true -basic-block-sections=labels -pgo-analysis-map=func-entry-count | FileCheck %s --check-prefixes=CHECK,PGO-FEC,FEC-ONLY +; RUN: llc < %s -mtriple=x86_64 -function-sections -unique-section-names=true -basic-block-sections=labels -pgo-analysis-map=bb-freq | FileCheck %s --check-prefixes=CHECK,PGO-BBF,BBF-ONLY +; RUN: llc < %s -mtriple=x86_64 -function-sections -unique-section-names=true -basic-block-sections=labels -pgo-analysis-map=br-prob | FileCheck %s --check-prefixes=CHECK,PGO-BRP,BRP-ONLY + + +define void @_Z3bazb(i1 zeroext, i1 zeroext) personality ptr @__gxx_personality_v0 !prof !0 { + br i1 %0, label %3, label %8, !prof !1 + +3: + %4 = invoke i32 @_Z3barv() + to label %8 unwind label %6 + br label %10 + +6: + landingpad { ptr, i32 } + catch ptr null + br label %12 + +8: + %9 = call i32 @_Z3foov() + br i1 %1, label %12, label %10, !prof !2 + +10: + %11 = select i1 %1, ptr blockaddress(@_Z3bazb, %3), ptr blockaddress(@_Z3bazb, %12) ; [#uses=1] + indirectbr ptr %11, [label %3, label %12], !prof !3 + +12: + ret void +} + +declare i32 @_Z3barv() #1 + +declare i32 @_Z3foov() 
#1 + +declare i32 @__gxx_personality_v0(...) + +!0 = !{!"function_entry_count", i64 100} +!1 = !{!"branch_weights", i32 80, i32 20} +!2 = !{!"branch_weights", i32 70, i32 10} +!3 = !{!"branch_weights", i32 15, i32 5} + +; CHECK: .section .text._Z3bazb,"ax",@progbits{{$}} +; CHECK-LABEL: _Z3bazb: +; CHECK-LABEL: .Lfunc_begin0: +; CHECK-LABEL: .LBB_END0_0: +; CHECK-LABEL: .LBB0_1: +; CHECK-LABEL: .LBB_END0_1: +; CHECK-LABEL: .LBB0_2: +; CHECK-LABEL: .LBB_END0_2: +; CHECK-LABEL: .LBB0_3: +; CHECK-LABEL: .LBB_END0_3: +; CHECK-LABEL: .Lfunc_end0: + +; CHECK: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text._Z3bazb{{$}} +; CHECK-NEXT: .byte 2 # version +; BASIC-NEXT: .byte 0 # feature +; PGO-ALL-NEXT: .byte 7 # feature +; FEC-ONLY-NEXT:.byte 1 # feature +; BBF-ONLY-NEXT:.byte 2 # feature +; BRP-ONLY-NEXT:.byte 4 # feature +; CHECK-NEXT: .quad .Lfunc_begin0 # function address +; CHECK-NEXT: .byte 6 # number of basic blocks +; CHECK-NEXT: .byte 0 # BB id +; CHECK-NEXT: .uleb128 .Lfunc_begin0-.Lfunc_begin0 +; CHECK-NEXT: .uleb128 .LBB_END0_0-.Lfunc_begin0 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .byte 1 # BB id +; CHECK-NEXT: .uleb128 .LBB0_1-.LBB_END0_0 +; CHECK-NEXT: .uleb128 .LBB_END0_1-.LBB0_1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .byte 3 # BB id +; CHECK-NEXT: .uleb128 .LBB0_2-.LBB_END0_1 +; CHECK-NEXT: .uleb128 .LBB_END0_2-.LBB0_2 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .byte 5 # BB id +; CHECK-NEXT: .uleb128 .LBB0_3-.LBB_END0_2 +; CHECK-NEXT: .uleb128 .LBB_END0_3-.LBB0_3 +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 4 # BB id +; CHECK-NEXT: .uleb128 .LBB0_4-.LBB_END0_3 +; CHECK-NEXT: .uleb128 .LBB_END0_4-.LBB0_4 +; CHECK-NEXT: .byte 16 +; CHECK-NEXT: .byte 2 # BB id +; CHECK-NEXT: .uleb128 .LBB0_5-.LBB_END0_4 +; CHECK-NEXT: .uleb128 .LBB_END0_5-.LBB0_5 +; CHECK-NEXT: .byte 4 + +;; PGO Analysis Map +; PGO-FEC-NEXT: .byte 100 # function entry count +; PGO-BBF-NEXT: .ascii "\271\235\376\332\245\200\356\017" # basic block frequency +; PGO-BRP-NEXT: .byte 2 # basic block successor count +; PGO-BRP-NEXT: .byte 1 # successor BB ID +; PGO-BRP-NEXT: .ascii "\346\314\231\263\006" # successor branch probability +; PGO-BRP-NEXT: .byte 3 # successor BB ID +; PGO-BRP-NEXT: .ascii "\232\263\346\314\001" # successor branch probability +; PGO-BBF-NEXT: .ascii "\202\301\341\375\205\200\200\003" # basic block frequency +; PGO-BRP-NEXT: .byte 2 # basic block successor count +; PGO-BRP-NEXT: .byte 3 # successor BB ID +; PGO-BRP-NEXT: .ascii "\200\360\377\377\007" # successor branch probability +; PGO-BRP-NEXT: .byte 2 # successor BB ID +; PGO-BRP-NEXT: .ascii "\200\020" # successor branch probability +; PGO-BBF-NEXT: .ascii "\200\200\200\200\200\200\200 " # basic block frequency +; PGO-BRP-NEXT: .byte 2 # basic block successor count +; PGO-BRP-NEXT: .byte 5 # successor BB ID +; PGO-BRP-NEXT: .ascii "\200\200\200\200\007" # successor branch probability +; PGO-BRP-NEXT: .byte 4 # successor BB ID +; PGO-BRP-NEXT: .ascii "\200\200\200\200\001" # successor branch probability +; PGO-BBF-NEXT: .ascii "\271\235\376\332\245\200\356\017" # basic block frequency +; PGO-BRP-NEXT: .byte 0 # basic block successor count +; PGO-BBF-NEXT: .ascii "\210\214\356\257\200\200\230\002" # basic block frequency +; PGO-BRP-NEXT: .byte 2 # basic block successor count +; PGO-BRP-NEXT: .byte 1 # successor BB ID +; PGO-BRP-NEXT: .ascii "\200\200\200\200\006" # successor branch probability +; PGO-BRP-NEXT: .byte 5 # successor BB ID +; PGO-BRP-NEXT: .ascii "\200\200\200\200\002" # successor branch probability +; PGO-BBF-NEXT: .ascii 
"\235\323\243\200#" # basic block frequency +; PGO-BRP-NEXT: .byte 1 # basic block successor count +; PGO-BRP-NEXT: .byte 5 # successor BB ID +; PGO-BRP-NEXT: .ascii "\200\200\200\200\b" # successor branch probability + From dd9681f839c23e9caa6c495fb5a0df3625715348 Mon Sep 17 00:00:00 2001 From: Shengchen Kan Date: Thu, 4 Jan 2024 10:12:12 +0800 Subject: [PATCH 185/313] [X86][MC] Support encoding/decoding for APX variant INC/DEC/ADCX/ADOX instructions (#76721) Four variants: promoted legacy, ND (new data destination), NF (no flags update) and NF_ND (NF + ND). The syntax of NF instructions is aligned with GNU binutils. https://sourceware.org/pipermail/binutils/2023-September/129545.html --- .../X86/MCTargetDesc/X86MCCodeEmitter.cpp | 3 + llvm/lib/Target/X86/X86InstrArithmetic.td | 171 ++++++++++++++---- llvm/lib/Target/X86/X86InstrUtils.td | 4 +- llvm/test/MC/Disassembler/X86/apx/adx.txt | 66 +++++++ llvm/test/MC/Disassembler/X86/apx/dec.txt | 130 +++++++++++++ llvm/test/MC/Disassembler/X86/apx/inc.txt | 130 +++++++++++++ llvm/test/MC/X86/apx/adx-att.s | 53 ++++++ llvm/test/MC/X86/apx/adx-intel.s | 50 +++++ llvm/test/MC/X86/apx/dec-att.s | 101 +++++++++++ llvm/test/MC/X86/apx/dec-intel.s | 98 ++++++++++ llvm/test/MC/X86/apx/inc-att.s | 101 +++++++++++ llvm/test/MC/X86/apx/inc-intel.s | 98 ++++++++++ llvm/test/TableGen/x86-fold-tables.inc | 32 ++++ 13 files changed, 1003 insertions(+), 34 deletions(-) create mode 100644 llvm/test/MC/Disassembler/X86/apx/adx.txt create mode 100644 llvm/test/MC/Disassembler/X86/apx/dec.txt create mode 100644 llvm/test/MC/Disassembler/X86/apx/inc.txt create mode 100644 llvm/test/MC/X86/apx/adx-att.s create mode 100644 llvm/test/MC/X86/apx/adx-intel.s create mode 100644 llvm/test/MC/X86/apx/dec-att.s create mode 100644 llvm/test/MC/X86/apx/dec-intel.s create mode 100644 llvm/test/MC/X86/apx/inc-att.s create mode 100644 llvm/test/MC/X86/apx/inc-intel.s diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 924956295e7c6..f7c361393fea6 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -1650,6 +1650,9 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) ++SrcRegNum; + if (IsND) // Skip new data destination + ++CurOp; + emitRegModRMByte(MI.getOperand(SrcRegNum), getX86RegNum(MI.getOperand(CurOp)), CB); CurOp = SrcRegNum + 1; diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td index 08f5a8860b84a..ed9f45bdd9d15 100644 --- a/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -184,52 +184,139 @@ def IMUL64rmi32 : IMulOpMI_R; //===----------------------------------------------------------------------===// // INC and DEC Instructions // -class IncOpR_RF : UnaryOpR_RF<0xFF, MRM0r, "inc", t, null_frag> { +class IncOpR_RF : UnaryOpR_RF<0xFF, MRM0r, "inc", t, null_frag, ndd> { let Pattern = [(set t.RegClass:$dst, EFLAGS, (X86add_flag_nocf t.RegClass:$src1, 1))]; } -class DecOpR_RF : UnaryOpR_RF<0xFF, MRM1r, "dec", t, null_frag> { +class DecOpR_RF : UnaryOpR_RF<0xFF, MRM1r, "dec", t, null_frag, ndd> { let Pattern = [(set t.RegClass:$dst, EFLAGS, (X86sub_flag_nocf t.RegClass:$src1, 1))]; } -class IncOpM_M : UnaryOpM_MF<0xFF, MRM0m, "inc", t, null_frag> { +class IncOpR_R : UnaryOpR_R<0xFF, MRM0r, "inc", t, null_frag, ndd>; +class DecOpR_R : UnaryOpR_R<0xFF, MRM1r, 
"dec", t, null_frag, ndd>; +class IncOpM_MF : UnaryOpM_MF<0xFF, MRM0m, "inc", t, null_frag> { let Pattern = [(store (add (t.LoadNode addr:$src1), 1), addr:$src1), (implicit EFLAGS)]; } -class DecOpM_M : UnaryOpM_MF<0xFF, MRM1m, "dec", t, null_frag> { +class DecOpM_MF : UnaryOpM_MF<0xFF, MRM1m, "dec", t, null_frag> { let Pattern = [(store (add (t.LoadNode addr:$src1), -1), addr:$src1), (implicit EFLAGS)]; } +class IncOpM_RF : UnaryOpM_RF<0xFF, MRM0m, "inc", t, null_frag> { + let Pattern = [(set t.RegClass:$dst, EFLAGS, (add (t.LoadNode addr:$src1), 1))]; +} +class DecOpM_RF : UnaryOpM_RF<0xFF, MRM1m, "dec", t, null_frag> { + let Pattern = [(set t.RegClass:$dst, EFLAGS, (add (t.LoadNode addr:$src1), -1))]; +} +class IncOpM_M : UnaryOpM_M<0xFF, MRM0m, "inc", t, null_frag>; +class DecOpM_M : UnaryOpM_M<0xFF, MRM1m, "dec", t, null_frag>; +class IncOpM_R : UnaryOpM_R<0xFF, MRM0m, "inc", t, null_frag>; +class DecOpM_R : UnaryOpM_R<0xFF, MRM1m, "dec", t, null_frag>; + // IncDec_Alt - Instructions like "inc reg" short forms. // Short forms only valid in 32-bit mode. Selected during MCInst lowering. class IncDec_Alt o, string m, X86TypeInfo t> : UnaryOpR_RF, Requires<[Not64BitMode]>; let isConvertibleToThreeAddress = 1 in { -def INC16r_alt : IncDec_Alt<0x40, "inc", Xi16>, OpSize16; -def INC32r_alt : IncDec_Alt<0x40, "inc", Xi32>, OpSize32; -def DEC16r_alt : IncDec_Alt<0x48, "dec", Xi16>, OpSize16; -def DEC32r_alt : IncDec_Alt<0x48, "dec", Xi32>, OpSize32; -def INC8r : IncOpR_RF; -def INC16r : IncOpR_RF, OpSize16; -def INC32r : IncOpR_RF, OpSize32; -def INC64r : IncOpR_RF; -def DEC8r : DecOpR_RF; -def DEC16r : DecOpR_RF, OpSize16; -def DEC32r : DecOpR_RF, OpSize32; -def DEC64r : DecOpR_RF; + def INC16r_alt : IncDec_Alt<0x40, "inc", Xi16>, OpSize16; + def INC32r_alt : IncDec_Alt<0x40, "inc", Xi32>, OpSize32; + def DEC16r_alt : IncDec_Alt<0x48, "dec", Xi16>, OpSize16; + def DEC32r_alt : IncDec_Alt<0x48, "dec", Xi32>, OpSize32; + let Predicates = [NoNDD] in { + def INC8r : IncOpR_RF; + def INC16r : IncOpR_RF, OpSize16; + def INC32r : IncOpR_RF, OpSize32; + def INC64r : IncOpR_RF; + def DEC8r : DecOpR_RF; + def DEC16r : DecOpR_RF, OpSize16; + def DEC32r : DecOpR_RF, OpSize32; + def DEC64r : DecOpR_RF; + } + let Predicates = [HasNDD, In64BitMode] in { + def INC8r_ND : IncOpR_RF; + def INC16r_ND : IncOpR_RF, PD; + def INC32r_ND : IncOpR_RF; + def INC64r_ND : IncOpR_RF; + def DEC8r_ND : DecOpR_RF; + def DEC16r_ND : DecOpR_RF, PD; + def DEC32r_ND : DecOpR_RF; + def DEC64r_ND : DecOpR_RF; + } + let Predicates = [In64BitMode], Pattern = [(null_frag)] in { + def INC8r_NF : IncOpR_R, NF; + def INC16r_NF : IncOpR_R, NF, PD; + def INC32r_NF : IncOpR_R, NF; + def INC64r_NF : IncOpR_R, NF; + def DEC8r_NF : DecOpR_R, NF; + def DEC16r_NF : DecOpR_R, NF, PD; + def DEC32r_NF : DecOpR_R, NF; + def DEC64r_NF : DecOpR_R, NF; + def INC8r_NF_ND : IncOpR_R, NF; + def INC16r_NF_ND : IncOpR_R, NF, PD; + def INC32r_NF_ND : IncOpR_R, NF; + def INC64r_NF_ND : IncOpR_R, NF; + def DEC8r_NF_ND : DecOpR_R, NF; + def DEC16r_NF_ND : DecOpR_R, NF, PD; + def DEC32r_NF_ND : DecOpR_R, NF; + def DEC64r_NF_ND : DecOpR_R, NF; + def INC8r_EVEX : IncOpR_RF, PL; + def INC16r_EVEX : IncOpR_RF, PL, PD; + def INC32r_EVEX : IncOpR_RF, PL; + def INC64r_EVEX : IncOpR_RF, PL; + def DEC8r_EVEX : DecOpR_RF, PL; + def DEC16r_EVEX : DecOpR_RF, PL, PD; + def DEC32r_EVEX : DecOpR_RF, PL; + def DEC64r_EVEX : DecOpR_RF, PL; + } } let Predicates = [UseIncDec] in { -def INC8m : IncOpM_M; -def INC16m : IncOpM_M, OpSize16; -def INC32m : IncOpM_M, OpSize32; -def 
DEC8m : DecOpM_M; -def DEC16m : DecOpM_M, OpSize16; -def DEC32m : DecOpM_M, OpSize32; + def INC8m : IncOpM_MF; + def INC16m : IncOpM_MF, OpSize16; + def INC32m : IncOpM_MF, OpSize32; + def DEC8m : DecOpM_MF; + def DEC16m : DecOpM_MF, OpSize16; + def DEC32m : DecOpM_MF, OpSize32; } let Predicates = [UseIncDec, In64BitMode] in { -def INC64m : IncOpM_M; -def DEC64m : DecOpM_M; + def INC64m : IncOpM_MF; + def DEC64m : DecOpM_MF; +} +let Predicates = [HasNDD, In64BitMode, UseIncDec] in { + def INC8m_ND : IncOpM_RF; + def INC16m_ND : IncOpM_RF, PD; + def INC32m_ND : IncOpM_RF; + def DEC8m_ND : DecOpM_RF; + def DEC16m_ND : DecOpM_RF, PD; + def DEC32m_ND : DecOpM_RF; + def INC64m_ND : IncOpM_RF; + def DEC64m_ND : DecOpM_RF; +} +let Predicates = [In64BitMode], Pattern = [(null_frag)] in { + def INC8m_NF : IncOpM_M, NF; + def INC16m_NF : IncOpM_M, NF, PD; + def INC32m_NF : IncOpM_M, NF; + def INC64m_NF : IncOpM_M, NF; + def DEC8m_NF : DecOpM_M, NF; + def DEC16m_NF : DecOpM_M, NF, PD; + def DEC32m_NF : DecOpM_M, NF; + def DEC64m_NF : DecOpM_M, NF; + def INC8m_NF_ND : IncOpM_R, NF; + def INC16m_NF_ND : IncOpM_R, NF, PD; + def INC32m_NF_ND : IncOpM_R, NF; + def INC64m_NF_ND : IncOpM_R, NF; + def DEC8m_NF_ND : DecOpM_R, NF; + def DEC16m_NF_ND : DecOpM_R, NF, PD; + def DEC32m_NF_ND : DecOpM_R, NF; + def DEC64m_NF_ND : DecOpM_R, NF; + def INC8m_EVEX : IncOpM_MF, PL; + def INC16m_EVEX : IncOpM_MF, PL, PD; + def INC32m_EVEX : IncOpM_MF, PL; + def INC64m_EVEX : IncOpM_MF, PL; + def DEC8m_EVEX : DecOpM_MF, PL; + def DEC16m_EVEX : DecOpM_MF, PL, PD; + def DEC32m_EVEX : DecOpM_MF, PL; + def DEC64m_EVEX : DecOpM_MF, PL; } //===----------------------------------------------------------------------===// @@ -1119,14 +1206,34 @@ defm MULX64 : MulX, REX_W; // We don't have patterns for these as there is no advantage over ADC for // most code. 
let Form = MRMSrcReg in { -def ADCX32rr : BinOpRRF_RF<0xF6, "adcx", Xi32, null_frag>, T8, PD; -def ADCX64rr : BinOpRRF_RF<0xF6, "adcx", Xi64, null_frag>, T8, PD; -def ADOX32rr : BinOpRRF_RF<0xF6, "adox", Xi32, null_frag>, T8, XS; -def ADOX64rr : BinOpRRF_RF<0xF6, "adox", Xi64, null_frag>, T8, XS; + def ADCX32rr : BinOpRRF_RF<0xF6, "adcx", Xi32>, T8, PD; + def ADCX64rr : BinOpRRF_RF<0xF6, "adcx", Xi64>, T8, PD; + def ADOX32rr : BinOpRRF_RF<0xF6, "adox", Xi32>, T8, XS; + def ADOX64rr : BinOpRRF_RF<0xF6, "adox", Xi64>, T8, XS; + let Predicates =[In64BitMode] in { + def ADCX32rr_EVEX : BinOpRRF_RF<0x66, "adcx", Xi32>, EVEX, T_MAP4, PD; + def ADCX64rr_EVEX : BinOpRRF_RF<0x66, "adcx", Xi64>, EVEX, T_MAP4, PD; + def ADOX32rr_EVEX : BinOpRRF_RF<0x66, "adox", Xi32>, EVEX, T_MAP4, XS; + def ADOX64rr_EVEX : BinOpRRF_RF<0x66, "adox", Xi64>, EVEX, T_MAP4, XS; + def ADCX32rr_ND : BinOpRRF_RF<0x66, "adcx", Xi32, null_frag, 1>, PD; + def ADCX64rr_ND : BinOpRRF_RF<0x66, "adcx", Xi64, null_frag, 1>, PD; + def ADOX32rr_ND : BinOpRRF_RF<0x66, "adox", Xi32, null_frag, 1>, XS; + def ADOX64rr_ND : BinOpRRF_RF<0x66, "adox", Xi64, null_frag, 1>, XS; + } } let Form = MRMSrcMem in { -def ADCX32rm : BinOpRMF_RF<0xF6, "adcx", Xi32, null_frag>, T8, PD; -def ADCX64rm : BinOpRMF_RF<0xF6, "adcx", Xi64, null_frag>, T8, PD; -def ADOX32rm : BinOpRMF_RF<0xF6, "adox", Xi32, null_frag>, T8, XS; -def ADOX64rm : BinOpRMF_RF<0xF6, "adox", Xi64, null_frag>, T8, XS; + def ADCX32rm : BinOpRMF_RF<0xF6, "adcx", Xi32>, T8, PD; + def ADCX64rm : BinOpRMF_RF<0xF6, "adcx", Xi64>, T8, PD; + def ADOX32rm : BinOpRMF_RF<0xF6, "adox", Xi32>, T8, XS; + def ADOX64rm : BinOpRMF_RF<0xF6, "adox", Xi64>, T8, XS; + let Predicates =[In64BitMode] in { + def ADCX32rm_EVEX : BinOpRMF_RF<0x66, "adcx", Xi32>, EVEX, T_MAP4, PD; + def ADCX64rm_EVEX : BinOpRMF_RF<0x66, "adcx", Xi64>, EVEX, T_MAP4, PD; + def ADOX32rm_EVEX : BinOpRMF_RF<0x66, "adox", Xi32>, EVEX, T_MAP4, XS; + def ADOX64rm_EVEX : BinOpRMF_RF<0x66, "adox", Xi64>, EVEX, T_MAP4, XS; + def ADCX32rm_ND : BinOpRMF_RF<0x66, "adcx", Xi32, null_frag, 1>, PD; + def ADCX64rm_ND : BinOpRMF_RF<0x66, "adcx", Xi64, null_frag, 1>, PD; + def ADOX32rm_ND : BinOpRMF_RF<0x66, "adox", Xi32, null_frag, 1>, XS; + def ADOX64rm_ND : BinOpRMF_RF<0x66, "adox", Xi64, null_frag, 1>, XS; + } } diff --git a/llvm/lib/Target/X86/X86InstrUtils.td b/llvm/lib/Target/X86/X86InstrUtils.td index da85922a018d6..132941a5734cc 100644 --- a/llvm/lib/Target/X86/X86InstrUtils.td +++ b/llvm/lib/Target/X86/X86InstrUtils.td @@ -1005,7 +1005,7 @@ class BinOpRR_RF_Rev o, string m, X86TypeInfo t, bit ndd = 0> } // BinOpRRF_RF - Instructions that read "reg, reg", write "reg" and read/write // EFLAGS. -class BinOpRRF_RF o, string m, X86TypeInfo t, SDPatternOperator node, bit ndd = 0> +class BinOpRRF_RF o, string m, X86TypeInfo t, SDPatternOperator node = null_frag, bit ndd = 0> : BinOpRR o, string m, X86TypeInfo t, SDPatternOperator node, bit (t.LoadNode addr:$src2)))]>, DefEFLAGS, NDD; // BinOpRMF_RF - Instructions that read "reg, [mem]", write "reg" and read/write // EFLAGS. 
-class BinOpRMF_RF o, string m, X86TypeInfo t, SDPatternOperator node, bit ndd = 0> +class BinOpRMF_RF o, string m, X86TypeInfo t, SDPatternOperator node = null_frag, bit ndd = 0> : BinOpRM, diff --git a/llvm/test/MC/Disassembler/X86/apx/adx.txt b/llvm/test/MC/Disassembler/X86/apx/adx.txt new file mode 100644 index 0000000000000..926cd14bef110 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/apx/adx.txt @@ -0,0 +1,66 @@ +# RUN: llvm-mc -triple x86_64 -disassemble %s | FileCheck %s --check-prefix=ATT +# RUN: llvm-mc -triple x86_64 -disassemble -output-asm-variant=1 %s | FileCheck %s --check-prefix=INTEL + +# ATT: adcxl %r16d, %r17d +# INTEL: adcx r17d, r16d +0x62,0xec,0x7d,0x08,0x66,0xc8 + +# ATT: adcxl %r16d, %r17d, %r18d +# INTEL: adcx r18d, r17d, r16d +0x62,0xec,0x6d,0x10,0x66,0xc8 + +# ATT: adcxq %r16, %r17 +# INTEL: adcx r17, r16 +0x62,0xec,0xfd,0x08,0x66,0xc8 + +# ATT: adcxq %r16, %r17, %r18 +# INTEL: adcx r18, r17, r16 +0x62,0xec,0xed,0x10,0x66,0xc8 + +# ATT: adcxl (%r16), %r17d +# INTEL: adcx r17d, dword ptr [r16] +0x62,0xec,0x7d,0x08,0x66,0x08 + +# ATT: adcxl (%r16), %r17d, %r18d +# INTEL: adcx r18d, r17d, dword ptr [r16] +0x62,0xec,0x6d,0x10,0x66,0x08 + +# ATT: adcxq (%r16), %r17 +# INTEL: adcx r17, qword ptr [r16] +0x62,0xec,0xfd,0x08,0x66,0x08 + +# ATT: adcxq (%r16), %r17, %r18 +# INTEL: adcx r18, r17, qword ptr [r16] +0x62,0xec,0xed,0x10,0x66,0x08 + +# ATT: adoxl %r16d, %r17d +# INTEL: adox r17d, r16d +0x62,0xec,0x7e,0x08,0x66,0xc8 + +# ATT: adoxl %r16d, %r17d, %r18d +# INTEL: adox r18d, r17d, r16d +0x62,0xec,0x6e,0x10,0x66,0xc8 + +# ATT: adoxq %r16, %r17 +# INTEL: adox r17, r16 +0x62,0xec,0xfe,0x08,0x66,0xc8 + +# ATT: adoxq %r16, %r17, %r18 +# INTEL: adox r18, r17, r16 +0x62,0xec,0xee,0x10,0x66,0xc8 + +# ATT: adoxl (%r16), %r17d +# INTEL: adox r17d, dword ptr [r16] +0x62,0xec,0x7e,0x08,0x66,0x08 + +# ATT: adoxl (%r16), %r17d, %r18d +# INTEL: adox r18d, r17d, dword ptr [r16] +0x62,0xec,0x6e,0x10,0x66,0x08 + +# ATT: adoxq (%r16), %r17 +# INTEL: adox r17, qword ptr [r16] +0x62,0xec,0xfe,0x08,0x66,0x08 + +# ATT: adoxq (%r16), %r17, %r18 +# INTEL: adox r18, r17, qword ptr [r16] +0x62,0xec,0xee,0x10,0x66,0x08 diff --git a/llvm/test/MC/Disassembler/X86/apx/dec.txt b/llvm/test/MC/Disassembler/X86/apx/dec.txt new file mode 100644 index 0000000000000..029cf19255857 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/apx/dec.txt @@ -0,0 +1,130 @@ +# RUN: llvm-mc -triple x86_64 -disassemble %s | FileCheck %s --check-prefix=ATT +# RUN: llvm-mc -triple x86_64 -disassemble -output-asm-variant=1 %s | FileCheck %s --check-prefix=INTEL + +# ATT: {evex} decb %bl +# INTEL: {evex} dec bl +0x62,0xf4,0x7c,0x08,0xfe,0xcb + +# ATT: {nf} decb %bl +# INTEL: {nf} dec bl +0x62,0xf4,0x7c,0x0c,0xfe,0xcb + +# ATT: decb %bl, %bl +# INTEL: dec bl, bl +0x62,0xf4,0x64,0x18,0xfe,0xcb + +# ATT: {nf} decb %bl, %bl +# INTEL: {nf} dec bl, bl +0x62,0xf4,0x64,0x1c,0xfe,0xcb + +# ATT: {evex} decw %dx +# INTEL: {evex} dec dx +0x62,0xf4,0x7d,0x08,0xff,0xca + +# ATT: {nf} decw %dx +# INTEL: {nf} dec dx +0x62,0xf4,0x7d,0x0c,0xff,0xca + +# ATT: decw %dx, %dx +# INTEL: dec dx, dx +0x62,0xf4,0x6d,0x18,0xff,0xca + +# ATT: {nf} decw %dx, %dx +# INTEL: {nf} dec dx, dx +0x62,0xf4,0x6d,0x1c,0xff,0xca + +# ATT: {evex} decl %ecx +# INTEL: {evex} dec ecx +0x62,0xf4,0x7c,0x08,0xff,0xc9 + +# ATT: {nf} decl %ecx +# INTEL: {nf} dec ecx +0x62,0xf4,0x7c,0x0c,0xff,0xc9 + +# ATT: decl %ecx, %ecx +# INTEL: dec ecx, ecx +0x62,0xf4,0x74,0x18,0xff,0xc9 + +# ATT: {nf} decl %ecx, %ecx +# INTEL: {nf} dec ecx, ecx +0x62,0xf4,0x74,0x1c,0xff,0xc9 + +# 
ATT: {evex} decq %r9 +# INTEL: {evex} dec r9 +0x62,0xd4,0xfc,0x08,0xff,0xc9 + +# ATT: {nf} decq %r9 +# INTEL: {nf} dec r9 +0x62,0xd4,0xfc,0x0c,0xff,0xc9 + +# ATT: decq %r9, %r9 +# INTEL: dec r9, r9 +0x62,0xd4,0xb4,0x18,0xff,0xc9 + +# ATT: {nf} decq %r9, %r9 +# INTEL: {nf} dec r9, r9 +0x62,0xd4,0xb4,0x1c,0xff,0xc9 + +# ATT: {evex} decb 291(%r8,%rax,4) +# INTEL: {evex} dec byte ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7c,0x08,0xfe,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} decb 291(%r8,%rax,4) +# INTEL: {nf} dec byte ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7c,0x0c,0xfe,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: decb 291(%r8,%rax,4), %bl +# INTEL: dec bl, byte ptr [r8 + 4*rax + 291] +0x62,0xd4,0x64,0x18,0xfe,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} decb 291(%r8,%rax,4), %bl +# INTEL: {nf} dec bl, byte ptr [r8 + 4*rax + 291] +0x62,0xd4,0x64,0x1c,0xfe,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} decw 291(%r8,%rax,4) +# INTEL: {evex} dec word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7d,0x08,0xff,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} decw 291(%r8,%rax,4) +# INTEL: {nf} dec word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7d,0x0c,0xff,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: decw 291(%r8,%rax,4), %dx +# INTEL: dec dx, word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x6d,0x18,0xff,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} decw 291(%r8,%rax,4), %dx +# INTEL: {nf} dec dx, word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x6d,0x1c,0xff,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} decl 291(%r8,%rax,4) +# INTEL: {evex} dec dword ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7c,0x08,0xff,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} decl 291(%r8,%rax,4) +# INTEL: {nf} dec dword ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7c,0x0c,0xff,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: decl 291(%r8,%rax,4), %ecx +# INTEL: dec ecx, dword ptr [r8 + 4*rax + 291] +0x62,0xd4,0x74,0x18,0xff,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} decl 291(%r8,%rax,4), %ecx +# INTEL: {nf} dec ecx, dword ptr [r8 + 4*rax + 291] +0x62,0xd4,0x74,0x1c,0xff,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} decq 291(%r8,%rax,4) +# INTEL: {evex} dec qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xfc,0x08,0xff,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} decq 291(%r8,%rax,4) +# INTEL: {nf} dec qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xfc,0x0c,0xff,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: decq 291(%r8,%rax,4), %r9 +# INTEL: dec r9, qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xb4,0x18,0xff,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} decq 291(%r8,%rax,4), %r9 +# INTEL: {nf} dec r9, qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xb4,0x1c,0xff,0x8c,0x80,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/inc.txt b/llvm/test/MC/Disassembler/X86/apx/inc.txt new file mode 100644 index 0000000000000..0470319b4f6cd --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/apx/inc.txt @@ -0,0 +1,130 @@ +# RUN: llvm-mc -triple x86_64 -disassemble %s | FileCheck %s --check-prefix=ATT +# RUN: llvm-mc -triple x86_64 -disassemble -output-asm-variant=1 %s | FileCheck %s --check-prefix=INTEL + +# ATT: {evex} incb %bl +# INTEL: {evex} inc bl +0x62,0xf4,0x7c,0x08,0xfe,0xc3 + +# ATT: {nf} incb %bl +# INTEL: {nf} inc bl +0x62,0xf4,0x7c,0x0c,0xfe,0xc3 + +# ATT: incb %bl, %bl +# INTEL: inc bl, bl +0x62,0xf4,0x64,0x18,0xfe,0xc3 + +# ATT: {nf} incb %bl, %bl +# INTEL: {nf} inc bl, bl +0x62,0xf4,0x64,0x1c,0xfe,0xc3 + +# ATT: {evex} incw %dx +# INTEL: {evex} inc dx +0x62,0xf4,0x7d,0x08,0xff,0xc2 + +# ATT: {nf} incw %dx +# INTEL: {nf} inc dx +0x62,0xf4,0x7d,0x0c,0xff,0xc2 + +# ATT: incw %dx, %dx +# INTEL: inc dx, dx 
+0x62,0xf4,0x6d,0x18,0xff,0xc2 + +# ATT: {nf} incw %dx, %dx +# INTEL: {nf} inc dx, dx +0x62,0xf4,0x6d,0x1c,0xff,0xc2 + +# ATT: {evex} incl %ecx +# INTEL: {evex} inc ecx +0x62,0xf4,0x7c,0x08,0xff,0xc1 + +# ATT: {nf} incl %ecx +# INTEL: {nf} inc ecx +0x62,0xf4,0x7c,0x0c,0xff,0xc1 + +# ATT: incl %ecx, %ecx +# INTEL: inc ecx, ecx +0x62,0xf4,0x74,0x18,0xff,0xc1 + +# ATT: {nf} incl %ecx, %ecx +# INTEL: {nf} inc ecx, ecx +0x62,0xf4,0x74,0x1c,0xff,0xc1 + +# ATT: {evex} incq %r9 +# INTEL: {evex} inc r9 +0x62,0xd4,0xfc,0x08,0xff,0xc1 + +# ATT: {nf} incq %r9 +# INTEL: {nf} inc r9 +0x62,0xd4,0xfc,0x0c,0xff,0xc1 + +# ATT: incq %r9, %r9 +# INTEL: inc r9, r9 +0x62,0xd4,0xb4,0x18,0xff,0xc1 + +# ATT: {nf} incq %r9, %r9 +# INTEL: {nf} inc r9, r9 +0x62,0xd4,0xb4,0x1c,0xff,0xc1 + +# ATT: {evex} incb 291(%r8,%rax,4) +# INTEL: {evex} inc byte ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7c,0x08,0xfe,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} incb 291(%r8,%rax,4) +# INTEL: {nf} inc byte ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7c,0x0c,0xfe,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: incb 291(%r8,%rax,4), %bl +# INTEL: inc bl, byte ptr [r8 + 4*rax + 291] +0x62,0xd4,0x64,0x18,0xfe,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} incb 291(%r8,%rax,4), %bl +# INTEL: {nf} inc bl, byte ptr [r8 + 4*rax + 291] +0x62,0xd4,0x64,0x1c,0xfe,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} incw 291(%r8,%rax,4) +# INTEL: {evex} inc word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7d,0x08,0xff,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} incw 291(%r8,%rax,4) +# INTEL: {nf} inc word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7d,0x0c,0xff,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: incw 291(%r8,%rax,4), %dx +# INTEL: inc dx, word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x6d,0x18,0xff,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} incw 291(%r8,%rax,4), %dx +# INTEL: {nf} inc dx, word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x6d,0x1c,0xff,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} incl 291(%r8,%rax,4) +# INTEL: {evex} inc dword ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7c,0x08,0xff,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} incl 291(%r8,%rax,4) +# INTEL: {nf} inc dword ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7c,0x0c,0xff,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: incl 291(%r8,%rax,4), %ecx +# INTEL: inc ecx, dword ptr [r8 + 4*rax + 291] +0x62,0xd4,0x74,0x18,0xff,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} incl 291(%r8,%rax,4), %ecx +# INTEL: {nf} inc ecx, dword ptr [r8 + 4*rax + 291] +0x62,0xd4,0x74,0x1c,0xff,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} incq 291(%r8,%rax,4) +# INTEL: {evex} inc qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xfc,0x08,0xff,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} incq 291(%r8,%rax,4) +# INTEL: {nf} inc qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xfc,0x0c,0xff,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: incq 291(%r8,%rax,4), %r9 +# INTEL: inc r9, qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xb4,0x18,0xff,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} incq 291(%r8,%rax,4), %r9 +# INTEL: {nf} inc r9, qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xb4,0x1c,0xff,0x84,0x80,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/X86/apx/adx-att.s b/llvm/test/MC/X86/apx/adx-att.s new file mode 100644 index 0000000000000..185d27897f816 --- /dev/null +++ b/llvm/test/MC/X86/apx/adx-att.s @@ -0,0 +1,53 @@ +# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s +# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR + +# ERROR-COUNT-16: error: +# ERROR-NOT: error: +# CHECK: adcxl %r16d, %r17d +# CHECK: encoding: [0x62,0xec,0x7d,0x08,0x66,0xc8] + adcxl %r16d, 
%r17d +# CHECK: adcxl %r16d, %r17d, %r18d +# CHECK: encoding: [0x62,0xec,0x6d,0x10,0x66,0xc8] + adcxl %r16d, %r17d, %r18d +# CHECK: adcxq %r16, %r17 +# CHECK: encoding: [0x62,0xec,0xfd,0x08,0x66,0xc8] + adcxq %r16, %r17 +# CHECK: adcxq %r16, %r17, %r18 +# CHECK: encoding: [0x62,0xec,0xed,0x10,0x66,0xc8] + adcxq %r16, %r17, %r18 +# CHECK: adcxl (%r16), %r17d +# CHECK: encoding: [0x62,0xec,0x7d,0x08,0x66,0x08] + adcxl (%r16), %r17d +# CHECK: adcxl (%r16), %r17d, %r18d +# CHECK: encoding: [0x62,0xec,0x6d,0x10,0x66,0x08] + adcxl (%r16), %r17d, %r18d +# CHECK: adcxq (%r16), %r17 +# CHECK: encoding: [0x62,0xec,0xfd,0x08,0x66,0x08] + adcxq (%r16), %r17 +# CHECK: adcxq (%r16), %r17, %r18 +# CHECK: encoding: [0x62,0xec,0xed,0x10,0x66,0x08] + adcxq (%r16), %r17, %r18 +# CHECK: adoxl %r16d, %r17d +# CHECK: encoding: [0x62,0xec,0x7e,0x08,0x66,0xc8] + adoxl %r16d, %r17d +# CHECK: adoxl %r16d, %r17d, %r18d +# CHECK: encoding: [0x62,0xec,0x6e,0x10,0x66,0xc8] + adoxl %r16d, %r17d, %r18d +# CHECK: adoxq %r16, %r17 +# CHECK: encoding: [0x62,0xec,0xfe,0x08,0x66,0xc8] + adoxq %r16, %r17 +# CHECK: adoxq %r16, %r17, %r18 +# CHECK: encoding: [0x62,0xec,0xee,0x10,0x66,0xc8] + adoxq %r16, %r17, %r18 +# CHECK: adoxl (%r16), %r17d +# CHECK: encoding: [0x62,0xec,0x7e,0x08,0x66,0x08] + adoxl (%r16), %r17d +# CHECK: adoxl (%r16), %r17d, %r18d +# CHECK: encoding: [0x62,0xec,0x6e,0x10,0x66,0x08] + adoxl (%r16), %r17d, %r18d +# CHECK: adoxq (%r16), %r17 +# CHECK: encoding: [0x62,0xec,0xfe,0x08,0x66,0x08] + adoxq (%r16), %r17 +# CHECK: adoxq (%r16), %r17, %r18 +# CHECK: encoding: [0x62,0xec,0xee,0x10,0x66,0x08] + adoxq (%r16), %r17, %r18 diff --git a/llvm/test/MC/X86/apx/adx-intel.s b/llvm/test/MC/X86/apx/adx-intel.s new file mode 100644 index 0000000000000..0cbe9df1852cc --- /dev/null +++ b/llvm/test/MC/X86/apx/adx-intel.s @@ -0,0 +1,50 @@ +# RUN: llvm-mc -triple x86_64 -show-encoding -x86-asm-syntax=intel -output-asm-variant=1 %s | FileCheck %s + +# CHECK: adcx r17d, r16d +# CHECK: encoding: [0x62,0xec,0x7d,0x08,0x66,0xc8] + adcx r17d, r16d +# CHECK: adcx r18d, r17d, r16d +# CHECK: encoding: [0x62,0xec,0x6d,0x10,0x66,0xc8] + adcx r18d, r17d, r16d +# CHECK: adcx r17, r16 +# CHECK: encoding: [0x62,0xec,0xfd,0x08,0x66,0xc8] + adcx r17, r16 +# CHECK: adcx r18, r17, r16 +# CHECK: encoding: [0x62,0xec,0xed,0x10,0x66,0xc8] + adcx r18, r17, r16 +# CHECK: adcx r17d, dword ptr [r16] +# CHECK: encoding: [0x62,0xec,0x7d,0x08,0x66,0x08] + adcx r17d, dword ptr [r16] +# CHECK: adcx r18d, r17d, dword ptr [r16] +# CHECK: encoding: [0x62,0xec,0x6d,0x10,0x66,0x08] + adcx r18d, r17d, dword ptr [r16] +# CHECK: adcx r17, qword ptr [r16] +# CHECK: encoding: [0x62,0xec,0xfd,0x08,0x66,0x08] + adcx r17, qword ptr [r16] +# CHECK: adcx r18, r17, qword ptr [r16] +# CHECK: encoding: [0x62,0xec,0xed,0x10,0x66,0x08] + adcx r18, r17, qword ptr [r16] +# CHECK: adox r17d, r16d +# CHECK: encoding: [0x62,0xec,0x7e,0x08,0x66,0xc8] + adox r17d, r16d +# CHECK: adox r18d, r17d, r16d +# CHECK: encoding: [0x62,0xec,0x6e,0x10,0x66,0xc8] + adox r18d, r17d, r16d +# CHECK: adox r17, r16 +# CHECK: encoding: [0x62,0xec,0xfe,0x08,0x66,0xc8] + adox r17, r16 +# CHECK: adox r18, r17, r16 +# CHECK: encoding: [0x62,0xec,0xee,0x10,0x66,0xc8] + adox r18, r17, r16 +# CHECK: adox r17d, dword ptr [r16] +# CHECK: encoding: [0x62,0xec,0x7e,0x08,0x66,0x08] + adox r17d, dword ptr [r16] +# CHECK: adox r18d, r17d, dword ptr [r16] +# CHECK: encoding: [0x62,0xec,0x6e,0x10,0x66,0x08] + adox r18d, r17d, dword ptr [r16] +# CHECK: adox r17, qword ptr [r16] +# CHECK: encoding: 
[0x62,0xec,0xfe,0x08,0x66,0x08] + adox r17, qword ptr [r16] +# CHECK: adox r18, r17, qword ptr [r16] +# CHECK: encoding: [0x62,0xec,0xee,0x10,0x66,0x08] + adox r18, r17, qword ptr [r16] diff --git a/llvm/test/MC/X86/apx/dec-att.s b/llvm/test/MC/X86/apx/dec-att.s new file mode 100644 index 0000000000000..a5d3cc49b1135 --- /dev/null +++ b/llvm/test/MC/X86/apx/dec-att.s @@ -0,0 +1,101 @@ +# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s +# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR + +# ERROR-COUNT-32: error: +# ERROR-NOT: error: +# CHECK: {evex} decb %bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xfe,0xcb] + {evex} decb %bl +# CHECK: {nf} decb %bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xfe,0xcb] + {nf} decb %bl +# CHECK: decb %bl, %bl +# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xfe,0xcb] + decb %bl, %bl +# CHECK: {nf} decb %bl, %bl +# CHECK: encoding: [0x62,0xf4,0x64,0x1c,0xfe,0xcb] + {nf} decb %bl, %bl +# CHECK: {evex} decw %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xff,0xca] + {evex} decw %dx +# CHECK: {nf} decw %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xff,0xca] + {nf} decw %dx +# CHECK: decw %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xff,0xca] + decw %dx, %dx +# CHECK: {nf} decw %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xff,0xca] + {nf} decw %dx, %dx +# CHECK: {evex} decl %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xff,0xc9] + {evex} decl %ecx +# CHECK: {nf} decl %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xff,0xc9] + {nf} decl %ecx +# CHECK: decl %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xff,0xc9] + decl %ecx, %ecx +# CHECK: {nf} decl %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xff,0xc9] + {nf} decl %ecx, %ecx +# CHECK: {evex} decq %r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xff,0xc9] + {evex} decq %r9 +# CHECK: {nf} decq %r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xff,0xc9] + {nf} decq %r9 +# CHECK: decq %r9, %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xff,0xc9] + decq %r9, %r9 +# CHECK: {nf} decq %r9, %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xff,0xc9] + {nf} decq %r9, %r9 +# CHECK: {evex} decb 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xfe,0x8c,0x80,0x23,0x01,0x00,0x00] + {evex} decb 291(%r8,%rax,4) +# CHECK: {nf} decb 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xfe,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} decb 291(%r8,%rax,4) +# CHECK: decb 291(%r8,%rax,4), %bl +# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xfe,0x8c,0x80,0x23,0x01,0x00,0x00] + decb 291(%r8,%rax,4), %bl +# CHECK: {nf} decb 291(%r8,%rax,4), %bl +# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xfe,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} decb 291(%r8,%rax,4), %bl +# CHECK: {evex} decw 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xff,0x8c,0x80,0x23,0x01,0x00,0x00] + {evex} decw 291(%r8,%rax,4) +# CHECK: {nf} decw 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xff,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} decw 291(%r8,%rax,4) +# CHECK: decw 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xff,0x8c,0x80,0x23,0x01,0x00,0x00] + decw 291(%r8,%rax,4), %dx +# CHECK: {nf} decw 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xff,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} decw 291(%r8,%rax,4), %dx +# CHECK: {evex} decl 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xff,0x8c,0x80,0x23,0x01,0x00,0x00] + {evex} decl 291(%r8,%rax,4) +# CHECK: {nf} decl 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xff,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} 
decl 291(%r8,%rax,4) +# CHECK: decl 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xff,0x8c,0x80,0x23,0x01,0x00,0x00] + decl 291(%r8,%rax,4), %ecx +# CHECK: {nf} decl 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xff,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} decl 291(%r8,%rax,4), %ecx +# CHECK: {evex} decq 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xff,0x8c,0x80,0x23,0x01,0x00,0x00] + {evex} decq 291(%r8,%rax,4) +# CHECK: {nf} decq 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xff,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} decq 291(%r8,%rax,4) +# CHECK: decq 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xff,0x8c,0x80,0x23,0x01,0x00,0x00] + decq 291(%r8,%rax,4), %r9 +# CHECK: {nf} decq 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xff,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} decq 291(%r8,%rax,4), %r9 diff --git a/llvm/test/MC/X86/apx/dec-intel.s b/llvm/test/MC/X86/apx/dec-intel.s new file mode 100644 index 0000000000000..5fde06d024efb --- /dev/null +++ b/llvm/test/MC/X86/apx/dec-intel.s @@ -0,0 +1,98 @@ +# RUN: llvm-mc -triple x86_64 -show-encoding -x86-asm-syntax=intel -output-asm-variant=1 %s | FileCheck %s + +# CHECK: {evex} dec bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xfe,0xcb] + {evex} dec bl +# CHECK: {nf} dec bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xfe,0xcb] + {nf} dec bl +# CHECK: dec bl, bl +# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xfe,0xcb] + dec bl, bl +# CHECK: {nf} dec bl, bl +# CHECK: encoding: [0x62,0xf4,0x64,0x1c,0xfe,0xcb] + {nf} dec bl, bl +# CHECK: {evex} dec dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xff,0xca] + {evex} dec dx +# CHECK: {nf} dec dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xff,0xca] + {nf} dec dx +# CHECK: dec dx, dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xff,0xca] + dec dx, dx +# CHECK: {nf} dec dx, dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xff,0xca] + {nf} dec dx, dx +# CHECK: {evex} dec ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xff,0xc9] + {evex} dec ecx +# CHECK: {nf} dec ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xff,0xc9] + {nf} dec ecx +# CHECK: dec ecx, ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xff,0xc9] + dec ecx, ecx +# CHECK: {nf} dec ecx, ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xff,0xc9] + {nf} dec ecx, ecx +# CHECK: {evex} dec r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xff,0xc9] + {evex} dec r9 +# CHECK: {nf} dec r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xff,0xc9] + {nf} dec r9 +# CHECK: dec r9, r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xff,0xc9] + dec r9, r9 +# CHECK: {nf} dec r9, r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xff,0xc9] + {nf} dec r9, r9 +# CHECK: {evex} dec byte ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xfe,0x8c,0x80,0x23,0x01,0x00,0x00] + {evex} dec byte ptr [r8 + 4*rax + 291] +# CHECK: {nf} dec byte ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xfe,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} dec byte ptr [r8 + 4*rax + 291] +# CHECK: dec bl, byte ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xfe,0x8c,0x80,0x23,0x01,0x00,0x00] + dec bl, byte ptr [r8 + 4*rax + 291] +# CHECK: {nf} dec bl, byte ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xfe,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} dec bl, byte ptr [r8 + 4*rax + 291] +# CHECK: {evex} dec word ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xff,0x8c,0x80,0x23,0x01,0x00,0x00] + {evex} dec word ptr [r8 + 4*rax + 291] +# CHECK: {nf} dec word ptr [r8 + 4*rax + 291] +# 
CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xff,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} dec word ptr [r8 + 4*rax + 291] +# CHECK: dec dx, word ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xff,0x8c,0x80,0x23,0x01,0x00,0x00] + dec dx, word ptr [r8 + 4*rax + 291] +# CHECK: {nf} dec dx, word ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xff,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} dec dx, word ptr [r8 + 4*rax + 291] +# CHECK: {evex} dec dword ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xff,0x8c,0x80,0x23,0x01,0x00,0x00] + {evex} dec dword ptr [r8 + 4*rax + 291] +# CHECK: {nf} dec dword ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xff,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} dec dword ptr [r8 + 4*rax + 291] +# CHECK: dec ecx, dword ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xff,0x8c,0x80,0x23,0x01,0x00,0x00] + dec ecx, dword ptr [r8 + 4*rax + 291] +# CHECK: {nf} dec ecx, dword ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xff,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} dec ecx, dword ptr [r8 + 4*rax + 291] +# CHECK: {evex} dec qword ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xff,0x8c,0x80,0x23,0x01,0x00,0x00] + {evex} dec qword ptr [r8 + 4*rax + 291] +# CHECK: {nf} dec qword ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xff,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} dec qword ptr [r8 + 4*rax + 291] +# CHECK: dec r9, qword ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xff,0x8c,0x80,0x23,0x01,0x00,0x00] + dec r9, qword ptr [r8 + 4*rax + 291] +# CHECK: {nf} dec r9, qword ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xff,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} dec r9, qword ptr [r8 + 4*rax + 291] diff --git a/llvm/test/MC/X86/apx/inc-att.s b/llvm/test/MC/X86/apx/inc-att.s new file mode 100644 index 0000000000000..5e08ae3089496 --- /dev/null +++ b/llvm/test/MC/X86/apx/inc-att.s @@ -0,0 +1,101 @@ +# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s +# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR + +# ERROR-COUNT-32: error: +# ERROR-NOT: error: +# CHECK: {evex} incb %bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xfe,0xc3] + {evex} incb %bl +# CHECK: {nf} incb %bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xfe,0xc3] + {nf} incb %bl +# CHECK: incb %bl, %bl +# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xfe,0xc3] + incb %bl, %bl +# CHECK: {nf} incb %bl, %bl +# CHECK: encoding: [0x62,0xf4,0x64,0x1c,0xfe,0xc3] + {nf} incb %bl, %bl +# CHECK: {evex} incw %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xff,0xc2] + {evex} incw %dx +# CHECK: {nf} incw %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xff,0xc2] + {nf} incw %dx +# CHECK: incw %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xff,0xc2] + incw %dx, %dx +# CHECK: {nf} incw %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xff,0xc2] + {nf} incw %dx, %dx +# CHECK: {evex} incl %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xff,0xc1] + {evex} incl %ecx +# CHECK: {nf} incl %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xff,0xc1] + {nf} incl %ecx +# CHECK: incl %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xff,0xc1] + incl %ecx, %ecx +# CHECK: {nf} incl %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xff,0xc1] + {nf} incl %ecx, %ecx +# CHECK: {evex} incq %r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xff,0xc1] + {evex} incq %r9 +# CHECK: {nf} incq %r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xff,0xc1] + {nf} incq %r9 +# CHECK: 
incq %r9, %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xff,0xc1] + incq %r9, %r9 +# CHECK: {nf} incq %r9, %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xff,0xc1] + {nf} incq %r9, %r9 +# CHECK: {evex} incb 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xfe,0x84,0x80,0x23,0x01,0x00,0x00] + {evex} incb 291(%r8,%rax,4) +# CHECK: {nf} incb 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xfe,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} incb 291(%r8,%rax,4) +# CHECK: incb 291(%r8,%rax,4), %bl +# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xfe,0x84,0x80,0x23,0x01,0x00,0x00] + incb 291(%r8,%rax,4), %bl +# CHECK: {nf} incb 291(%r8,%rax,4), %bl +# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xfe,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} incb 291(%r8,%rax,4), %bl +# CHECK: {evex} incw 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xff,0x84,0x80,0x23,0x01,0x00,0x00] + {evex} incw 291(%r8,%rax,4) +# CHECK: {nf} incw 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xff,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} incw 291(%r8,%rax,4) +# CHECK: incw 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xff,0x84,0x80,0x23,0x01,0x00,0x00] + incw 291(%r8,%rax,4), %dx +# CHECK: {nf} incw 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xff,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} incw 291(%r8,%rax,4), %dx +# CHECK: {evex} incl 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xff,0x84,0x80,0x23,0x01,0x00,0x00] + {evex} incl 291(%r8,%rax,4) +# CHECK: {nf} incl 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xff,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} incl 291(%r8,%rax,4) +# CHECK: incl 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xff,0x84,0x80,0x23,0x01,0x00,0x00] + incl 291(%r8,%rax,4), %ecx +# CHECK: {nf} incl 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xff,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} incl 291(%r8,%rax,4), %ecx +# CHECK: {evex} incq 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xff,0x84,0x80,0x23,0x01,0x00,0x00] + {evex} incq 291(%r8,%rax,4) +# CHECK: {nf} incq 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xff,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} incq 291(%r8,%rax,4) +# CHECK: incq 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xff,0x84,0x80,0x23,0x01,0x00,0x00] + incq 291(%r8,%rax,4), %r9 +# CHECK: {nf} incq 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xff,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} incq 291(%r8,%rax,4), %r9 diff --git a/llvm/test/MC/X86/apx/inc-intel.s b/llvm/test/MC/X86/apx/inc-intel.s new file mode 100644 index 0000000000000..e35ce06433bd2 --- /dev/null +++ b/llvm/test/MC/X86/apx/inc-intel.s @@ -0,0 +1,98 @@ +# RUN: llvm-mc -triple x86_64 -show-encoding -x86-asm-syntax=intel -output-asm-variant=1 %s | FileCheck %s + +# CHECK: {evex} inc bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xfe,0xc3] + {evex} inc bl +# CHECK: {nf} inc bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xfe,0xc3] + {nf} inc bl +# CHECK: inc bl, bl +# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xfe,0xc3] + inc bl, bl +# CHECK: {nf} inc bl, bl +# CHECK: encoding: [0x62,0xf4,0x64,0x1c,0xfe,0xc3] + {nf} inc bl, bl +# CHECK: {evex} inc dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xff,0xc2] + {evex} inc dx +# CHECK: {nf} inc dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xff,0xc2] + {nf} inc dx +# CHECK: inc dx, dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xff,0xc2] + inc dx, dx +# CHECK: {nf} inc dx, dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xff,0xc2] + {nf} inc dx, dx +# CHECK: 
{evex} inc ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xff,0xc1] + {evex} inc ecx +# CHECK: {nf} inc ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xff,0xc1] + {nf} inc ecx +# CHECK: inc ecx, ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xff,0xc1] + inc ecx, ecx +# CHECK: {nf} inc ecx, ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xff,0xc1] + {nf} inc ecx, ecx +# CHECK: {evex} inc r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xff,0xc1] + {evex} inc r9 +# CHECK: {nf} inc r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xff,0xc1] + {nf} inc r9 +# CHECK: inc r9, r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xff,0xc1] + inc r9, r9 +# CHECK: {nf} inc r9, r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xff,0xc1] + {nf} inc r9, r9 +# CHECK: {evex} inc byte ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xfe,0x84,0x80,0x23,0x01,0x00,0x00] + {evex} inc byte ptr [r8 + 4*rax + 291] +# CHECK: {nf} inc byte ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xfe,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} inc byte ptr [r8 + 4*rax + 291] +# CHECK: inc bl, byte ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xfe,0x84,0x80,0x23,0x01,0x00,0x00] + inc bl, byte ptr [r8 + 4*rax + 291] +# CHECK: {nf} inc bl, byte ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xfe,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} inc bl, byte ptr [r8 + 4*rax + 291] +# CHECK: {evex} inc word ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xff,0x84,0x80,0x23,0x01,0x00,0x00] + {evex} inc word ptr [r8 + 4*rax + 291] +# CHECK: {nf} inc word ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xff,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} inc word ptr [r8 + 4*rax + 291] +# CHECK: inc dx, word ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xff,0x84,0x80,0x23,0x01,0x00,0x00] + inc dx, word ptr [r8 + 4*rax + 291] +# CHECK: {nf} inc dx, word ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xff,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} inc dx, word ptr [r8 + 4*rax + 291] +# CHECK: {evex} inc dword ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xff,0x84,0x80,0x23,0x01,0x00,0x00] + {evex} inc dword ptr [r8 + 4*rax + 291] +# CHECK: {nf} inc dword ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xff,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} inc dword ptr [r8 + 4*rax + 291] +# CHECK: inc ecx, dword ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xff,0x84,0x80,0x23,0x01,0x00,0x00] + inc ecx, dword ptr [r8 + 4*rax + 291] +# CHECK: {nf} inc ecx, dword ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xff,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} inc ecx, dword ptr [r8 + 4*rax + 291] +# CHECK: {evex} inc qword ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xff,0x84,0x80,0x23,0x01,0x00,0x00] + {evex} inc qword ptr [r8 + 4*rax + 291] +# CHECK: {nf} inc qword ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xff,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} inc qword ptr [r8 + 4*rax + 291] +# CHECK: inc r9, qword ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xff,0x84,0x80,0x23,0x01,0x00,0x00] + inc r9, qword ptr [r8 + 4*rax + 291] +# CHECK: {nf} inc r9, qword ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xff,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} inc r9, qword ptr [r8 + 4*rax + 291] diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc index 8ac46a5b66924..4e37285b08ba2 100644 --- 
a/llvm/test/TableGen/x86-fold-tables.inc +++ b/llvm/test/TableGen/x86-fold-tables.inc @@ -75,13 +75,21 @@ static const X86FoldTableEntry Table2Addr[] = { {X86::BTS32ri8, X86::BTS32mi8, TB_NO_REVERSE}, {X86::BTS64ri8, X86::BTS64mi8, TB_NO_REVERSE}, {X86::DEC16r, X86::DEC16m, TB_NO_REVERSE}, + {X86::DEC16r_NF, X86::DEC16m_NF, TB_NO_REVERSE}, {X86::DEC32r, X86::DEC32m, TB_NO_REVERSE}, + {X86::DEC32r_NF, X86::DEC32m_NF, TB_NO_REVERSE}, {X86::DEC64r, X86::DEC64m, TB_NO_REVERSE}, + {X86::DEC64r_NF, X86::DEC64m_NF, TB_NO_REVERSE}, {X86::DEC8r, X86::DEC8m, TB_NO_REVERSE}, + {X86::DEC8r_NF, X86::DEC8m_NF, TB_NO_REVERSE}, {X86::INC16r, X86::INC16m, TB_NO_REVERSE}, + {X86::INC16r_NF, X86::INC16m_NF, TB_NO_REVERSE}, {X86::INC32r, X86::INC32m, TB_NO_REVERSE}, + {X86::INC32r_NF, X86::INC32m_NF, TB_NO_REVERSE}, {X86::INC64r, X86::INC64m, TB_NO_REVERSE}, + {X86::INC64r_NF, X86::INC64m_NF, TB_NO_REVERSE}, {X86::INC8r, X86::INC8m, TB_NO_REVERSE}, + {X86::INC8r_NF, X86::INC8m_NF, TB_NO_REVERSE}, {X86::NEG16r, X86::NEG16m, TB_NO_REVERSE}, {X86::NEG16r_NF, X86::NEG16m_NF, TB_NO_REVERSE}, {X86::NEG32r, X86::NEG32m, TB_NO_REVERSE}, @@ -604,12 +612,28 @@ static const X86FoldTableEntry Table1[] = { {X86::CVTTSS2SI64rr_Int, X86::CVTTSS2SI64rm_Int, TB_NO_REVERSE}, {X86::CVTTSS2SIrr, X86::CVTTSS2SIrm, 0}, {X86::CVTTSS2SIrr_Int, X86::CVTTSS2SIrm_Int, TB_NO_REVERSE}, + {X86::DEC16r_ND, X86::DEC16m_ND, 0}, + {X86::DEC16r_NF_ND, X86::DEC16m_NF_ND, 0}, + {X86::DEC32r_ND, X86::DEC32m_ND, 0}, + {X86::DEC32r_NF_ND, X86::DEC32m_NF_ND, 0}, + {X86::DEC64r_ND, X86::DEC64m_ND, 0}, + {X86::DEC64r_NF_ND, X86::DEC64m_NF_ND, 0}, + {X86::DEC8r_ND, X86::DEC8m_ND, 0}, + {X86::DEC8r_NF_ND, X86::DEC8m_NF_ND, 0}, {X86::IMUL16rri, X86::IMUL16rmi, 0}, {X86::IMUL16rri8, X86::IMUL16rmi8, 0}, {X86::IMUL32rri, X86::IMUL32rmi, 0}, {X86::IMUL32rri8, X86::IMUL32rmi8, 0}, {X86::IMUL64rri32, X86::IMUL64rmi32, 0}, {X86::IMUL64rri8, X86::IMUL64rmi8, 0}, + {X86::INC16r_ND, X86::INC16m_ND, 0}, + {X86::INC16r_NF_ND, X86::INC16m_NF_ND, 0}, + {X86::INC32r_ND, X86::INC32m_ND, 0}, + {X86::INC32r_NF_ND, X86::INC32m_NF_ND, 0}, + {X86::INC64r_ND, X86::INC64m_ND, 0}, + {X86::INC64r_NF_ND, X86::INC64m_NF_ND, 0}, + {X86::INC8r_ND, X86::INC8m_ND, 0}, + {X86::INC8r_NF_ND, X86::INC8m_NF_ND, 0}, {X86::KMOVBkk, X86::KMOVBkm, TB_NO_REVERSE}, {X86::KMOVBkk_EVEX, X86::KMOVBkm_EVEX, TB_NO_REVERSE}, {X86::KMOVDkk, X86::KMOVDkm, 0}, @@ -1533,7 +1557,11 @@ static const X86FoldTableEntry Table2[] = { {X86::ADC8rr, X86::ADC8rm, 0}, {X86::ADC8rr_ND, X86::ADC8rm_ND, 0}, {X86::ADCX32rr, X86::ADCX32rm, 0}, + {X86::ADCX32rr_EVEX, X86::ADCX32rm_EVEX, 0}, + {X86::ADCX32rr_ND, X86::ADCX32rm_ND, 0}, {X86::ADCX64rr, X86::ADCX64rm, 0}, + {X86::ADCX64rr_EVEX, X86::ADCX64rm_EVEX, 0}, + {X86::ADCX64rr_ND, X86::ADCX64rm_ND, 0}, {X86::ADD16rr, X86::ADD16rm, 0}, {X86::ADD16rr_ND, X86::ADD16rm_ND, 0}, {X86::ADD16rr_NF, X86::ADD16rm_NF, 0}, @@ -1559,7 +1587,11 @@ static const X86FoldTableEntry Table2[] = { {X86::ADDSUBPDrr, X86::ADDSUBPDrm, TB_ALIGN_16}, {X86::ADDSUBPSrr, X86::ADDSUBPSrm, TB_ALIGN_16}, {X86::ADOX32rr, X86::ADOX32rm, 0}, + {X86::ADOX32rr_EVEX, X86::ADOX32rm_EVEX, 0}, + {X86::ADOX32rr_ND, X86::ADOX32rm_ND, 0}, {X86::ADOX64rr, X86::ADOX64rm, 0}, + {X86::ADOX64rr_EVEX, X86::ADOX64rm_EVEX, 0}, + {X86::ADOX64rr_ND, X86::ADOX64rm_ND, 0}, {X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16}, {X86::AESDECrr, X86::AESDECrm, TB_ALIGN_16}, {X86::AESENCLASTrr, X86::AESENCLASTrm, TB_ALIGN_16}, From ddf0096a92e4c2852fd57321f02cbd78e596943c Mon Sep 17 00:00:00 2001 From: XinWang10 
<108658776+XinWang10@users.noreply.github.com> Date: Thu, 4 Jan 2024 10:42:27 +0800 Subject: [PATCH 186/313] [NFC][X86] Reorg MC tests for APX promoted instrs (#76697) As suggested in https://github.com/llvm/llvm-project/pull/76210, this patch re-organize the mc tests for apx promoted instrs, instr tests within same cpuid would be listed in one test. Also add explicit prefix {evex} tests and 8 displacement memory test, promoted instrs need set No_CD8 to avoid AVX512 compress encoding. --- llvm/lib/Target/X86/X86InstrMisc.td | 12 +- llvm/lib/Target/X86/X86InstrShiftRotate.td | 2 +- .../test/MC/Disassembler/X86/apx/amx-tile.txt | 30 +++ llvm/test/MC/Disassembler/X86/apx/bmi2.txt | 240 +++++++++++++++++ llvm/test/MC/Disassembler/X86/apx/cet.txt | 42 +++ .../MC/Disassembler/X86/apx/cmpccxadd.txt | 184 ++++++++++--- llvm/test/MC/Disassembler/X86/apx/invept.txt | 6 - llvm/test/MC/Disassembler/X86/apx/invpcid.txt | 4 + llvm/test/MC/Disassembler/X86/apx/invvpid.txt | 6 - .../MC/Disassembler/X86/apx/movdir64b.txt | 8 + llvm/test/MC/Disassembler/X86/apx/movdiri.txt | 8 + llvm/test/MC/Disassembler/X86/apx/mulx.txt | 18 -- llvm/test/MC/Disassembler/X86/apx/rorx.txt | 18 -- llvm/test/MC/Disassembler/X86/apx/sarx.txt | 18 -- llvm/test/MC/Disassembler/X86/apx/sha.txt | 124 +++++++++ .../test/MC/Disassembler/X86/apx/sha1msg1.txt | 10 - .../test/MC/Disassembler/X86/apx/sha1msg2.txt | 10 - .../MC/Disassembler/X86/apx/sha1nexte.txt | 10 - .../MC/Disassembler/X86/apx/sha1rnds4.txt | 10 - .../MC/Disassembler/X86/apx/sha256msg1.txt | 10 - .../MC/Disassembler/X86/apx/sha256msg2.txt | 10 - .../MC/Disassembler/X86/apx/sha256rnds2.txt | 10 - llvm/test/MC/Disassembler/X86/apx/shlx.txt | 18 -- llvm/test/MC/Disassembler/X86/apx/shrx.txt | 18 -- llvm/test/MC/Disassembler/X86/apx/vmx.txt | 22 ++ llvm/test/MC/Disassembler/X86/apx/wrssd.txt | 6 - llvm/test/MC/Disassembler/X86/apx/wrssq.txt | 6 - llvm/test/MC/Disassembler/X86/apx/wrussd.txt | 6 - llvm/test/MC/Disassembler/X86/apx/wrussq.txt | 6 - llvm/test/MC/X86/apx/amx-tile-att.s | 33 ++- llvm/test/MC/X86/apx/amx-tile-intel.s | 30 +++ llvm/test/MC/X86/apx/bmi2-att.s | 243 ++++++++++++++++++ llvm/test/MC/X86/apx/bmi2-intel.s | 239 +++++++++++++++++ llvm/test/MC/X86/apx/cet-att.s | 45 ++++ llvm/test/MC/X86/apx/cet-intel.s | 41 +++ llvm/test/MC/X86/apx/cmpccxadd-att.s | 186 +++++++++++--- llvm/test/MC/X86/apx/cmpccxadd-intel.s | 184 ++++++++++--- llvm/test/MC/X86/apx/invept-att.s | 8 - llvm/test/MC/X86/apx/invept-intel.s | 5 - llvm/test/MC/X86/apx/invpcid-att.s | 6 +- llvm/test/MC/X86/apx/invpcid-intel.s | 4 + llvm/test/MC/X86/apx/invvpid-att.s | 9 - llvm/test/MC/X86/apx/invvpid-intel.s | 5 - llvm/test/MC/X86/apx/movdir64b-att.s | 10 +- llvm/test/MC/X86/apx/movdir64b-intel.s | 8 + llvm/test/MC/X86/apx/movdiri-att.s | 10 +- llvm/test/MC/X86/apx/movdiri-intel.s | 8 + llvm/test/MC/X86/apx/mulx-att.s | 20 -- llvm/test/MC/X86/apx/mulx-intel.s | 17 -- llvm/test/MC/X86/apx/pdep-att.s | 20 -- llvm/test/MC/X86/apx/pdep-intel.s | 17 -- llvm/test/MC/X86/apx/pext-att.s | 20 -- llvm/test/MC/X86/apx/pext-intel.s | 17 -- llvm/test/MC/X86/apx/rorx-att.s | 20 -- llvm/test/MC/X86/apx/rorx-intel.s | 17 -- llvm/test/MC/X86/apx/sarx-att.s | 20 -- llvm/test/MC/X86/apx/sarx-intel.s | 17 -- llvm/test/MC/X86/apx/sha-att.s | 127 +++++++++ llvm/test/MC/X86/apx/sha-intel.s | 123 +++++++++ llvm/test/MC/X86/apx/sha1msg1-att.s | 9 - llvm/test/MC/X86/apx/sha1msg1-intel.s | 9 - llvm/test/MC/X86/apx/sha1msg2-att.s | 9 - llvm/test/MC/X86/apx/sha1msg2-intel.s | 9 - llvm/test/MC/X86/apx/sha1nexte-att.s | 9 - 
llvm/test/MC/X86/apx/sha1nexte-intel.s | 9 - llvm/test/MC/X86/apx/sha1rnds4-att.s | 9 - llvm/test/MC/X86/apx/sha1rnds4-intel.s | 9 - llvm/test/MC/X86/apx/sha256msg1-att.s | 9 - llvm/test/MC/X86/apx/sha256msg1-intel.s | 9 - llvm/test/MC/X86/apx/sha256msg2-att.s | 9 - llvm/test/MC/X86/apx/sha256msg2-intel.s | 9 - llvm/test/MC/X86/apx/sha256rnds2-att.s | 13 - llvm/test/MC/X86/apx/sha256rnds2-intel.s | 14 - llvm/test/MC/X86/apx/shlx-att.s | 20 -- llvm/test/MC/X86/apx/shlx-intel.s | 17 -- llvm/test/MC/X86/apx/shrx-att.s | 20 -- llvm/test/MC/X86/apx/shrx-intel.s | 17 -- llvm/test/MC/X86/apx/vmx-att.s | 25 ++ llvm/test/MC/X86/apx/vmx-intel.s | 21 ++ llvm/test/MC/X86/apx/wrssd-att.s | 8 - llvm/test/MC/X86/apx/wrssd-intel.s | 5 - llvm/test/MC/X86/apx/wrssq-att.s | 8 - llvm/test/MC/X86/apx/wrssq-intel.s | 5 - llvm/test/MC/X86/apx/wrussd-att.s | 8 - llvm/test/MC/X86/apx/wrussd-intel.s | 5 - llvm/test/MC/X86/apx/wrussq-att.s | 8 - llvm/test/MC/X86/apx/wrussq-intel.s | 5 - 87 files changed, 1911 insertions(+), 777 deletions(-) create mode 100644 llvm/test/MC/Disassembler/X86/apx/bmi2.txt create mode 100644 llvm/test/MC/Disassembler/X86/apx/cet.txt delete mode 100644 llvm/test/MC/Disassembler/X86/apx/invept.txt delete mode 100644 llvm/test/MC/Disassembler/X86/apx/invvpid.txt delete mode 100644 llvm/test/MC/Disassembler/X86/apx/mulx.txt delete mode 100644 llvm/test/MC/Disassembler/X86/apx/rorx.txt delete mode 100644 llvm/test/MC/Disassembler/X86/apx/sarx.txt create mode 100644 llvm/test/MC/Disassembler/X86/apx/sha.txt delete mode 100644 llvm/test/MC/Disassembler/X86/apx/sha1msg1.txt delete mode 100644 llvm/test/MC/Disassembler/X86/apx/sha1msg2.txt delete mode 100644 llvm/test/MC/Disassembler/X86/apx/sha1nexte.txt delete mode 100644 llvm/test/MC/Disassembler/X86/apx/sha1rnds4.txt delete mode 100644 llvm/test/MC/Disassembler/X86/apx/sha256msg1.txt delete mode 100644 llvm/test/MC/Disassembler/X86/apx/sha256msg2.txt delete mode 100644 llvm/test/MC/Disassembler/X86/apx/sha256rnds2.txt delete mode 100644 llvm/test/MC/Disassembler/X86/apx/shlx.txt delete mode 100644 llvm/test/MC/Disassembler/X86/apx/shrx.txt create mode 100644 llvm/test/MC/Disassembler/X86/apx/vmx.txt delete mode 100644 llvm/test/MC/Disassembler/X86/apx/wrssd.txt delete mode 100644 llvm/test/MC/Disassembler/X86/apx/wrssq.txt delete mode 100644 llvm/test/MC/Disassembler/X86/apx/wrussd.txt delete mode 100644 llvm/test/MC/Disassembler/X86/apx/wrussq.txt create mode 100644 llvm/test/MC/X86/apx/bmi2-att.s create mode 100644 llvm/test/MC/X86/apx/bmi2-intel.s create mode 100644 llvm/test/MC/X86/apx/cet-att.s create mode 100644 llvm/test/MC/X86/apx/cet-intel.s delete mode 100644 llvm/test/MC/X86/apx/invept-att.s delete mode 100644 llvm/test/MC/X86/apx/invept-intel.s delete mode 100644 llvm/test/MC/X86/apx/invvpid-att.s delete mode 100644 llvm/test/MC/X86/apx/invvpid-intel.s delete mode 100644 llvm/test/MC/X86/apx/mulx-att.s delete mode 100644 llvm/test/MC/X86/apx/mulx-intel.s delete mode 100644 llvm/test/MC/X86/apx/pdep-att.s delete mode 100644 llvm/test/MC/X86/apx/pdep-intel.s delete mode 100644 llvm/test/MC/X86/apx/pext-att.s delete mode 100644 llvm/test/MC/X86/apx/pext-intel.s delete mode 100644 llvm/test/MC/X86/apx/rorx-att.s delete mode 100644 llvm/test/MC/X86/apx/rorx-intel.s delete mode 100644 llvm/test/MC/X86/apx/sarx-att.s delete mode 100644 llvm/test/MC/X86/apx/sarx-intel.s create mode 100644 llvm/test/MC/X86/apx/sha-att.s create mode 100644 llvm/test/MC/X86/apx/sha-intel.s delete mode 100644 llvm/test/MC/X86/apx/sha1msg1-att.s delete mode 
100644 llvm/test/MC/X86/apx/sha1msg1-intel.s delete mode 100644 llvm/test/MC/X86/apx/sha1msg2-att.s delete mode 100644 llvm/test/MC/X86/apx/sha1msg2-intel.s delete mode 100644 llvm/test/MC/X86/apx/sha1nexte-att.s delete mode 100644 llvm/test/MC/X86/apx/sha1nexte-intel.s delete mode 100644 llvm/test/MC/X86/apx/sha1rnds4-att.s delete mode 100644 llvm/test/MC/X86/apx/sha1rnds4-intel.s delete mode 100644 llvm/test/MC/X86/apx/sha256msg1-att.s delete mode 100644 llvm/test/MC/X86/apx/sha256msg1-intel.s delete mode 100644 llvm/test/MC/X86/apx/sha256msg2-att.s delete mode 100644 llvm/test/MC/X86/apx/sha256msg2-intel.s delete mode 100644 llvm/test/MC/X86/apx/sha256rnds2-att.s delete mode 100644 llvm/test/MC/X86/apx/sha256rnds2-intel.s delete mode 100644 llvm/test/MC/X86/apx/shlx-att.s delete mode 100644 llvm/test/MC/X86/apx/shlx-intel.s delete mode 100644 llvm/test/MC/X86/apx/shrx-att.s delete mode 100644 llvm/test/MC/X86/apx/shrx-intel.s create mode 100644 llvm/test/MC/X86/apx/vmx-att.s create mode 100644 llvm/test/MC/X86/apx/vmx-intel.s delete mode 100644 llvm/test/MC/X86/apx/wrssd-att.s delete mode 100644 llvm/test/MC/X86/apx/wrssd-intel.s delete mode 100644 llvm/test/MC/X86/apx/wrssq-att.s delete mode 100644 llvm/test/MC/X86/apx/wrssq-intel.s delete mode 100644 llvm/test/MC/X86/apx/wrussd-att.s delete mode 100644 llvm/test/MC/X86/apx/wrussd-intel.s delete mode 100644 llvm/test/MC/X86/apx/wrussq-att.s delete mode 100644 llvm/test/MC/X86/apx/wrussq-intel.s diff --git a/llvm/lib/Target/X86/X86InstrMisc.td b/llvm/lib/Target/X86/X86InstrMisc.td index 772ed2a9dbe56..97c625a64cfc0 100644 --- a/llvm/lib/Target/X86/X86InstrMisc.td +++ b/llvm/lib/Target/X86/X86InstrMisc.td @@ -1353,22 +1353,22 @@ multiclass bmi_pdep_pext, - VEX, VVVV, Sched<[WriteALU]>; + NoCD8, VVVV, Sched<[WriteALU]>; def rm#Suffix : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (OpNode RC:$src1, (ld_frag addr:$src2)))]>, - VEX, VVVV, Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>; + NoCD8, VVVV, Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>; } let Predicates = [HasBMI2, NoEGPR] in { defm PDEP32 : bmi_pdep_pext<"pdep{l}", GR32, i32mem, - X86pdep, loadi32>, T8, XD; + X86pdep, loadi32>, T8, XD, VEX; defm PDEP64 : bmi_pdep_pext<"pdep{q}", GR64, i64mem, - X86pdep, loadi64>, T8, XD, REX_W; + X86pdep, loadi64>, T8, XD, REX_W, VEX; defm PEXT32 : bmi_pdep_pext<"pext{l}", GR32, i32mem, - X86pext, loadi32>, T8, XS; + X86pext, loadi32>, T8, XS, VEX; defm PEXT64 : bmi_pdep_pext<"pext{q}", GR64, i64mem, - X86pext, loadi64>, T8, XS, REX_W; + X86pext, loadi64>, T8, XS, REX_W, VEX; } let Predicates = [HasBMI2, HasEGPR] in { diff --git a/llvm/lib/Target/X86/X86InstrShiftRotate.td b/llvm/lib/Target/X86/X86InstrShiftRotate.td index d13e3b7af69a9..f951894db1890 100644 --- a/llvm/lib/Target/X86/X86InstrShiftRotate.td +++ b/llvm/lib/Target/X86/X86InstrShiftRotate.td @@ -868,7 +868,7 @@ let Predicates = [HasBMI2, NoEGPR] in { defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem>, T8, PD, REX_W; } -let Predicates = [HasBMI2, HasEGPR] in { +let Predicates = [HasBMI2, HasEGPR, In64BitMode] in { defm RORX32 : bmi_rotate<"rorx{l}", GR32, i32mem, "_EVEX">, EVEX; defm RORX64 : bmi_rotate<"rorx{q}", GR64, i64mem, "_EVEX">, REX_W, EVEX; defm SARX32 : bmi_shift<"sarx{l}", GR32, i32mem, "_EVEX">, T8, XS, EVEX; diff --git a/llvm/test/MC/Disassembler/X86/apx/amx-tile.txt b/llvm/test/MC/Disassembler/X86/apx/amx-tile.txt index 960c40cfc4b15..f2d1812801ccd 100644 --- 
a/llvm/test/MC/Disassembler/X86/apx/amx-tile.txt +++ b/llvm/test/MC/Disassembler/X86/apx/amx-tile.txt @@ -1,22 +1,52 @@ # RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT # RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL +## ldtilecfg + +# ATT: ldtilecfg 123(%rax,%rbx,4) +# INTEL: ldtilecfg [rax + 4*rbx + 123] +0x62,0xf2,0x7c,0x08,0x49,0x44,0x98,0x7b + # ATT: ldtilecfg 291(%r28,%r29,4) # INTEL: ldtilecfg [r28 + 4*r29 + 291] 0x62,0x9a,0x78,0x08,0x49,0x84,0xac,0x23,0x01,0x00,0x00 +## sttilecfg + +# ATT: sttilecfg 123(%rax,%rbx,4) +# INTEL: sttilecfg [rax + 4*rbx + 123] +0x62,0xf2,0x7d,0x08,0x49,0x44,0x98,0x7b + # ATT: sttilecfg 291(%r28,%r29,4) # INTEL: sttilecfg [r28 + 4*r29 + 291] 0x62,0x9a,0x79,0x08,0x49,0x84,0xac,0x23,0x01,0x00,0x00 +## tileloadd + +# ATT: tileloadd 123(%rax,%rbx,4), %tmm6 +# INTEL: tileloadd tmm6, [rax + 4*rbx + 123] +0x62,0xf2,0x7f,0x08,0x4b,0x74,0x98,0x7b + # ATT: tileloadd 291(%r28,%r29,4), %tmm6 # INTEL: tileloadd tmm6, [r28 + 4*r29 + 291] 0x62,0x9a,0x7b,0x08,0x4b,0xb4,0xac,0x23,0x01,0x00,0x00 +## tileloaddt1 + +# ATT: tileloaddt1 123(%rax,%rbx,4), %tmm6 +# INTEL: tileloaddt1 tmm6, [rax + 4*rbx + 123] +0x62,0xf2,0x7d,0x08,0x4b,0x74,0x98,0x7b + # ATT: tileloaddt1 291(%r28,%r29,4), %tmm6 # INTEL: tileloaddt1 tmm6, [r28 + 4*r29 + 291] 0x62,0x9a,0x79,0x08,0x4b,0xb4,0xac,0x23,0x01,0x00,0x00 +## tilestored + +# ATT: tilestored %tmm6, 123(%rax,%rbx,4) +# INTEL: tilestored [rax + 4*rbx + 123], tmm6 +0x62,0xf2,0x7e,0x08,0x4b,0x74,0x98,0x7b + # ATT: tilestored %tmm6, 291(%r28,%r29,4) # INTEL: tilestored [r28 + 4*r29 + 291], tmm6 0x62,0x9a,0x7a,0x08,0x4b,0xb4,0xac,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/bmi2.txt b/llvm/test/MC/Disassembler/X86/apx/bmi2.txt new file mode 100644 index 0000000000000..0fb11f4061f1b --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/apx/bmi2.txt @@ -0,0 +1,240 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +## mulx + +# ATT: mulxl %ecx, %edx, %r10d +# INTEL: mulx r10d, edx, ecx +0x62,0x72,0x6f,0x08,0xf6,0xd1 + +# ATT: mulxq %r9, %r15, %r11 +# INTEL: mulx r11, r15, r9 +0x62,0x52,0x87,0x08,0xf6,0xd9 + +# ATT: mulxl 123(%rax,%rbx,4), %ecx, %edx +# INTEL: mulx edx, ecx, dword ptr [rax + 4*rbx + 123] +0x62,0xf2,0x77,0x08,0xf6,0x94,0x98,0x7b,0x00,0x00,0x00 + +# ATT: mulxq 123(%rax,%rbx,4), %r9, %r15 +# INTEL: mulx r15, r9, qword ptr [rax + 4*rbx + 123] +0x62,0x72,0xb7,0x08,0xf6,0xbc,0x98,0x7b,0x00,0x00,0x00 + +# ATT: mulxl %r18d, %r22d, %r26d +# INTEL: mulx r26d, r22d, r18d +0x62,0x6a,0x4f,0x00,0xf6,0xd2 + +# ATT: mulxq %r19, %r23, %r27 +# INTEL: mulx r27, r23, r19 +0x62,0x6a,0xc7,0x00,0xf6,0xdb + +# ATT: mulxl 291(%r28,%r29,4), %r18d, %r22d +# INTEL: mulx r22d, r18d, dword ptr [r28 + 4*r29 + 291] +0x62,0x8a,0x6b,0x00,0xf6,0xb4,0xac,0x23,0x01,0x00,0x00 + +# ATT: mulxq 291(%r28,%r29,4), %r19, %r23 +# INTEL: mulx r23, r19, qword ptr [r28 + 4*r29 + 291] +0x62,0x8a,0xe3,0x00,0xf6,0xbc,0xac,0x23,0x01,0x00,0x00 + +## pdep + +# ATT: pdepl %ecx, %edx, %r10d +# INTEL: pdep r10d, edx, ecx +0x62,0x72,0x6f,0x08,0xf5,0xd1 + +# ATT: pdepq %r9, %r15, %r11 +# INTEL: pdep r11, r15, r9 +0x62,0x52,0x87,0x08,0xf5,0xd9 + +# ATT: pdepl 123(%rax,%rbx,4), %ecx, %edx +# INTEL: pdep edx, ecx, dword ptr [rax + 4*rbx + 123] +0x62,0xf2,0x77,0x08,0xf5,0x54,0x98,0x7b + +# ATT: 
pdepq 123(%rax,%rbx,4), %r9, %r15 +# INTEL: pdep r15, r9, qword ptr [rax + 4*rbx + 123] +0x62,0x72,0xb7,0x08,0xf5,0x7c,0x98,0x7b + +# ATT: pdepl %r18d, %r22d, %r26d +# INTEL: pdep r26d, r22d, r18d +0x62,0x6a,0x4f,0x00,0xf5,0xd2 + +# ATT: pdepq %r19, %r23, %r27 +# INTEL: pdep r27, r23, r19 +0x62,0x6a,0xc7,0x00,0xf5,0xdb + +# ATT: pdepl 291(%r28,%r29,4), %r18d, %r22d +# INTEL: pdep r22d, r18d, dword ptr [r28 + 4*r29 + 291] +0x62,0x8a,0x6b,0x00,0xf5,0xb4,0xac,0x23,0x01,0x00,0x00 + +# ATT: pdepq 291(%r28,%r29,4), %r19, %r23 +# INTEL: pdep r23, r19, qword ptr [r28 + 4*r29 + 291] +0x62,0x8a,0xe3,0x00,0xf5,0xbc,0xac,0x23,0x01,0x00,0x00 + +## pext + +# ATT: pextl %ecx, %edx, %r10d +# INTEL: pext r10d, edx, ecx +0x62,0x72,0x6e,0x08,0xf5,0xd1 + +# ATT: pextq %r9, %r15, %r11 +# INTEL: pext r11, r15, r9 +0x62,0x52,0x86,0x08,0xf5,0xd9 + +# ATT: pextl 123(%rax,%rbx,4), %ecx, %edx +# INTEL: pext edx, ecx, dword ptr [rax + 4*rbx + 123] +0x62,0xf2,0x76,0x08,0xf5,0x54,0x98,0x7b + +# ATT: pextq 123(%rax,%rbx,4), %r9, %r15 +# INTEL: pext r15, r9, qword ptr [rax + 4*rbx + 123] +0x62,0x72,0xb6,0x08,0xf5,0x7c,0x98,0x7b + +# ATT: pextl %r18d, %r22d, %r26d +# INTEL: pext r26d, r22d, r18d +0x62,0x6a,0x4e,0x00,0xf5,0xd2 + +# ATT: pextq %r19, %r23, %r27 +# INTEL: pext r27, r23, r19 +0x62,0x6a,0xc6,0x00,0xf5,0xdb + +# ATT: pextl 291(%r28,%r29,4), %r18d, %r22d +# INTEL: pext r22d, r18d, dword ptr [r28 + 4*r29 + 291] +0x62,0x8a,0x6a,0x00,0xf5,0xb4,0xac,0x23,0x01,0x00,0x00 + +# ATT: pextq 291(%r28,%r29,4), %r19, %r23 +# INTEL: pext r23, r19, qword ptr [r28 + 4*r29 + 291] +0x62,0x8a,0xe2,0x00,0xf5,0xbc,0xac,0x23,0x01,0x00,0x00 + +## rorx + +# ATT: rorxl $123, %ecx, %edx +# INTEL: rorx edx, ecx, 123 +0x62,0xf3,0x7f,0x08,0xf0,0xd1,0x7b + +# ATT: rorxq $123, %r9, %r15 +# INTEL: rorx r15, r9, 123 +0x62,0x53,0xff,0x08,0xf0,0xf9,0x7b + +# ATT: rorxl $123, 123(%rax,%rbx,4), %ecx +# INTEL: rorx ecx, dword ptr [rax + 4*rbx + 123], 123 +0x62,0xf3,0x7f,0x08,0xf0,0x8c,0x98,0x7b,0x00,0x00,0x00,0x7b + +# ATT: rorxq $123, 123(%rax,%rbx,4), %r9 +# INTEL: rorx r9, qword ptr [rax + 4*rbx + 123], 123 +0x62,0x73,0xff,0x08,0xf0,0x8c,0x98,0x7b,0x00,0x00,0x00,0x7b + +# ATT: rorxl $123, %r18d, %r22d +# INTEL: rorx r22d, r18d, 123 +0x62,0xeb,0x7f,0x08,0xf0,0xf2,0x7b + +# ATT: rorxq $123, %r19, %r23 +# INTEL: rorx r23, r19, 123 +0x62,0xeb,0xff,0x08,0xf0,0xfb,0x7b + +# ATT: rorxl $123, 291(%r28,%r29,4), %r18d +# INTEL: rorx r18d, dword ptr [r28 + 4*r29 + 291], 123 +0x62,0x8b,0x7b,0x08,0xf0,0x94,0xac,0x23,0x01,0x00,0x00,0x7b + +# ATT: rorxq $123, 291(%r28,%r29,4), %r19 +# INTEL: rorx r19, qword ptr [r28 + 4*r29 + 291], 123 +0x62,0x8b,0xfb,0x08,0xf0,0x9c,0xac,0x23,0x01,0x00,0x00,0x7b + +## sarx + +# ATT: sarxl %ecx, %edx, %r10d +# INTEL: sarx r10d, edx, ecx +0x62,0x72,0x76,0x08,0xf7,0xd2 + +# ATT: sarxl %ecx, 123(%rax,%rbx,4), %edx +# INTEL: sarx edx, dword ptr [rax + 4*rbx + 123], ecx +0x62,0xf2,0x76,0x08,0xf7,0x94,0x98,0x7b,0x00,0x00,0x00 + +# ATT: sarxq %r9, %r15, %r11 +# INTEL: sarx r11, r15, r9 +0x62,0x52,0xb6,0x08,0xf7,0xdf + +# ATT: sarxq %r9, 123(%rax,%rbx,4), %r15 +# INTEL: sarx r15, qword ptr [rax + 4*rbx + 123], r9 +0x62,0x72,0xb6,0x08,0xf7,0xbc,0x98,0x7b,0x00,0x00,0x00 + +# ATT: sarxl %r18d, %r22d, %r26d +# INTEL: sarx r26d, r22d, r18d +0x62,0x6a,0x6e,0x00,0xf7,0xd6 + +# ATT: sarxl %r18d, 291(%r28,%r29,4), %r22d +# INTEL: sarx r22d, dword ptr [r28 + 4*r29 + 291], r18d +0x62,0x8a,0x6a,0x00,0xf7,0xb4,0xac,0x23,0x01,0x00,0x00 + +# ATT: sarxq %r19, %r23, %r27 +# INTEL: sarx r27, r23, r19 +0x62,0x6a,0xe6,0x00,0xf7,0xdf + +# ATT: sarxq %r19, 
291(%r28,%r29,4), %r23 +# INTEL: sarx r23, qword ptr [r28 + 4*r29 + 291], r19 +0x62,0x8a,0xe2,0x00,0xf7,0xbc,0xac,0x23,0x01,0x00,0x00 + +## shlx + +# ATT: shlxl %ecx, %edx, %r10d +# INTEL: shlx r10d, edx, ecx +0x62,0x72,0x75,0x08,0xf7,0xd2 + +# ATT: shlxl %ecx, 123(%rax,%rbx,4), %edx +# INTEL: shlx edx, dword ptr [rax + 4*rbx + 123], ecx +0x62,0xf2,0x75,0x08,0xf7,0x94,0x98,0x7b,0x00,0x00,0x00 + +# ATT: shlxq %r9, %r15, %r11 +# INTEL: shlx r11, r15, r9 +0x62,0x52,0xb5,0x08,0xf7,0xdf + +# ATT: shlxq %r9, 123(%rax,%rbx,4), %r15 +# INTEL: shlx r15, qword ptr [rax + 4*rbx + 123], r9 +0x62,0x72,0xb5,0x08,0xf7,0xbc,0x98,0x7b,0x00,0x00,0x00 + +# ATT: shlxl %r18d, %r22d, %r26d +# INTEL: shlx r26d, r22d, r18d +0x62,0x6a,0x6d,0x00,0xf7,0xd6 + +# ATT: shlxl %r18d, 291(%r28,%r29,4), %r22d +# INTEL: shlx r22d, dword ptr [r28 + 4*r29 + 291], r18d +0x62,0x8a,0x69,0x00,0xf7,0xb4,0xac,0x23,0x01,0x00,0x00 + +# ATT: shlxq %r19, %r23, %r27 +# INTEL: shlx r27, r23, r19 +0x62,0x6a,0xe5,0x00,0xf7,0xdf + +# ATT: shlxq %r19, 291(%r28,%r29,4), %r23 +# INTEL: shlx r23, qword ptr [r28 + 4*r29 + 291], r19 +0x62,0x8a,0xe1,0x00,0xf7,0xbc,0xac,0x23,0x01,0x00,0x00 + +## shrx + +# ATT: shrxl %ecx, %edx, %r10d +# INTEL: shrx r10d, edx, ecx +0x62,0x72,0x77,0x08,0xf7,0xd2 + +# ATT: shrxl %ecx, 123(%rax,%rbx,4), %edx +# INTEL: shrx edx, dword ptr [rax + 4*rbx + 123], ecx +0x62,0xf2,0x77,0x08,0xf7,0x94,0x98,0x7b,0x00,0x00,0x00 + +# ATT: shrxq %r9, %r15, %r11 +# INTEL: shrx r11, r15, r9 +0x62,0x52,0xb7,0x08,0xf7,0xdf + +# ATT: shrxq %r9, 123(%rax,%rbx,4), %r15 +# INTEL: shrx r15, qword ptr [rax + 4*rbx + 123], r9 +0x62,0x72,0xb7,0x08,0xf7,0xbc,0x98,0x7b,0x00,0x00,0x00 + +# ATT: shrxl %r18d, %r22d, %r26d +# INTEL: shrx r26d, r22d, r18d +0x62,0x6a,0x6f,0x00,0xf7,0xd6 + +# ATT: shrxl %r18d, 291(%r28,%r29,4), %r22d +# INTEL: shrx r22d, dword ptr [r28 + 4*r29 + 291], r18d +0x62,0x8a,0x6b,0x00,0xf7,0xb4,0xac,0x23,0x01,0x00,0x00 + +# ATT: shrxq %r19, %r23, %r27 +# INTEL: shrx r27, r23, r19 +0x62,0x6a,0xe7,0x00,0xf7,0xdf + +# ATT: shrxq %r19, 291(%r28,%r29,4), %r23 +# INTEL: shrx r23, qword ptr [r28 + 4*r29 + 291], r19 +0x62,0x8a,0xe3,0x00,0xf7,0xbc,0xac,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/cet.txt b/llvm/test/MC/Disassembler/X86/apx/cet.txt new file mode 100644 index 0000000000000..867571b82c06e --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/apx/cet.txt @@ -0,0 +1,42 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +## wrssd + +# ATT: wrssd %ecx, 123(%rax,%rbx,4) +# INTEL: wrssd dword ptr [rax + 4*rbx + 123], ecx +0x62,0xf4,0x7c,0x08,0x66,0x4c,0x98,0x7b + +# ATT: wrssd %r18d, 291(%r28,%r29,4) +# INTEL: wrssd dword ptr [r28 + 4*r29 + 291], r18d +0x62,0x8c,0x78,0x08,0x66,0x94,0xac,0x23,0x01,0x00,0x00 + +## wrssq + +# ATT: wrssq %r9, 123(%rax,%rbx,4) +# INTEL: wrssq qword ptr [rax + 4*rbx + 123], r9 +0x62,0x74,0xfc,0x08,0x66,0x4c,0x98,0x7b + +# ATT: wrssq %r19, 291(%r28,%r29,4) +# INTEL: wrssq qword ptr [r28 + 4*r29 + 291], r19 +0x62,0x8c,0xf8,0x08,0x66,0x9c,0xac,0x23,0x01,0x00,0x00 + +## wrussd + +# ATT: wrussd %ecx, 123(%rax,%rbx,4) +# INTEL: wrussd dword ptr [rax + 4*rbx + 123], ecx +0x62,0xf4,0x7d,0x08,0x65,0x4c,0x98,0x7b + +# ATT: wrussd %r18d, 291(%r28,%r29,4) +# INTEL: wrussd dword ptr [r28 + 4*r29 + 291], r18d +0x62,0x8c,0x79,0x08,0x65,0x94,0xac,0x23,0x01,0x00,0x00 + +## wrussq + +# ATT: wrussq %r9, 123(%rax,%rbx,4) +# INTEL: 
wrussq qword ptr [rax + 4*rbx + 123], r9 +0x62,0x74,0xfd,0x08,0x65,0x4c,0x98,0x7b + +# ATT: wrussq %r19, 291(%r28,%r29,4) +# INTEL: wrussq qword ptr [r28 + 4*r29 + 291], r19 +0x62,0x8c,0xf9,0x08,0x65,0x9c,0xac,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/cmpccxadd.txt b/llvm/test/MC/Disassembler/X86/apx/cmpccxadd.txt index 9f65d4c8d25ce..7a2e09af5b3db 100644 --- a/llvm/test/MC/Disassembler/X86/apx/cmpccxadd.txt +++ b/llvm/test/MC/Disassembler/X86/apx/cmpccxadd.txt @@ -1,6 +1,30 @@ # RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT # RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL +# ATT: cmpaxadd %ecx, %edx, 123(%rax,%rbx,4) +# INTEL: cmpaxadd dword ptr [rax + 4*rbx + 123], edx, ecx +0x62,0xf2,0x75,0x08,0xe7,0x54,0x98,0x7b + +# ATT: cmpaxadd %r9, %r15, 123(%rax,%rbx,4) +# INTEL: cmpaxadd qword ptr [rax + 4*rbx + 123], r15, r9 +0x62,0x72,0xb5,0x08,0xe7,0x7c,0x98,0x7b + +# ATT: cmpaxadd %r18d, %r22d, 291(%r28,%r29,4) +# INTEL: cmpaxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +0x62,0x8a,0x69,0x00,0xe7,0xb4,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmpaxadd %r19, %r23, 291(%r28,%r29,4) +# INTEL: cmpaxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +0x62,0x8a,0xe1,0x00,0xe7,0xbc,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmpbexadd %ecx, %edx, 123(%rax,%rbx,4) +# INTEL: cmpbexadd dword ptr [rax + 4*rbx + 123], edx, ecx +0x62,0xf2,0x75,0x08,0xe6,0x54,0x98,0x7b + +# ATT: cmpbexadd %r9, %r15, 123(%rax,%rbx,4) +# INTEL: cmpbexadd qword ptr [rax + 4*rbx + 123], r15, r9 +0x62,0x72,0xb5,0x08,0xe6,0x7c,0x98,0x7b + # ATT: cmpbexadd %r18d, %r22d, 291(%r28,%r29,4) # INTEL: cmpbexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d 0x62,0x8a,0x69,0x00,0xe6,0xb4,0xac,0x23,0x01,0x00,0x00 @@ -9,6 +33,14 @@ # INTEL: cmpbexadd qword ptr [r28 + 4*r29 + 291], r23, r19 0x62,0x8a,0xe1,0x00,0xe6,0xbc,0xac,0x23,0x01,0x00,0x00 +# ATT: cmpbxadd %ecx, %edx, 123(%rax,%rbx,4) +# INTEL: cmpbxadd dword ptr [rax + 4*rbx + 123], edx, ecx +0x62,0xf2,0x75,0x08,0xe2,0x54,0x98,0x7b + +# ATT: cmpbxadd %r9, %r15, 123(%rax,%rbx,4) +# INTEL: cmpbxadd qword ptr [rax + 4*rbx + 123], r15, r9 +0x62,0x72,0xb5,0x08,0xe2,0x7c,0x98,0x7b + # ATT: cmpbxadd %r18d, %r22d, 291(%r28,%r29,4) # INTEL: cmpbxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d 0x62,0x8a,0x69,0x00,0xe2,0xb4,0xac,0x23,0x01,0x00,0x00 @@ -17,6 +49,62 @@ # INTEL: cmpbxadd qword ptr [r28 + 4*r29 + 291], r23, r19 0x62,0x8a,0xe1,0x00,0xe2,0xbc,0xac,0x23,0x01,0x00,0x00 +# ATT: cmpexadd %ecx, %edx, 123(%rax,%rbx,4) +# INTEL: cmpexadd dword ptr [rax + 4*rbx + 123], edx, ecx +0x62,0xf2,0x75,0x08,0xe4,0x54,0x98,0x7b + +# ATT: cmpexadd %r9, %r15, 123(%rax,%rbx,4) +# INTEL: cmpexadd qword ptr [rax + 4*rbx + 123], r15, r9 +0x62,0x72,0xb5,0x08,0xe4,0x7c,0x98,0x7b + +# ATT: cmpexadd %r18d, %r22d, 291(%r28,%r29,4) +# INTEL: cmpexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +0x62,0x8a,0x69,0x00,0xe4,0xb4,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmpexadd %r19, %r23, 291(%r28,%r29,4) +# INTEL: cmpexadd qword ptr [r28 + 4*r29 + 291], r23, r19 +0x62,0x8a,0xe1,0x00,0xe4,0xbc,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmpgexadd %ecx, %edx, 123(%rax,%rbx,4) +# INTEL: cmpgexadd dword ptr [rax + 4*rbx + 123], edx, ecx +0x62,0xf2,0x75,0x08,0xed,0x54,0x98,0x7b + +# ATT: cmpgexadd %r9, %r15, 123(%rax,%rbx,4) +# INTEL: cmpgexadd qword ptr [rax + 4*rbx + 123], r15, r9 +0x62,0x72,0xb5,0x08,0xed,0x7c,0x98,0x7b + +# ATT: cmpgexadd %r18d, %r22d, 291(%r28,%r29,4) +# INTEL: cmpgexadd dword ptr [r28 + 4*r29 + 291], r22d, 
r18d +0x62,0x8a,0x69,0x00,0xed,0xb4,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmpgexadd %r19, %r23, 291(%r28,%r29,4) +# INTEL: cmpgexadd qword ptr [r28 + 4*r29 + 291], r23, r19 +0x62,0x8a,0xe1,0x00,0xed,0xbc,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmpgxadd %ecx, %edx, 123(%rax,%rbx,4) +# INTEL: cmpgxadd dword ptr [rax + 4*rbx + 123], edx, ecx +0x62,0xf2,0x75,0x08,0xef,0x54,0x98,0x7b + +# ATT: cmpgxadd %r9, %r15, 123(%rax,%rbx,4) +# INTEL: cmpgxadd qword ptr [rax + 4*rbx + 123], r15, r9 +0x62,0x72,0xb5,0x08,0xef,0x7c,0x98,0x7b + +# ATT: cmpgxadd %r18d, %r22d, 291(%r28,%r29,4) +# INTEL: cmpgxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +0x62,0x8a,0x69,0x00,0xef,0xb4,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmpgxadd %r19, %r23, 291(%r28,%r29,4) +# INTEL: cmpgxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +0x62,0x8a,0xe1,0x00,0xef,0xbc,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmplexadd %ecx, %edx, 123(%rax,%rbx,4) +# INTEL: cmplexadd dword ptr [rax + 4*rbx + 123], edx, ecx +0x62,0xf2,0x75,0x08,0xee,0x54,0x98,0x7b + +# ATT: cmplexadd %r9, %r15, 123(%rax,%rbx,4) +# INTEL: cmplexadd qword ptr [rax + 4*rbx + 123], r15, r9 +0x62,0x72,0xb5,0x08,0xee,0x7c,0x98,0x7b + # ATT: cmplexadd %r18d, %r22d, 291(%r28,%r29,4) # INTEL: cmplexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d 0x62,0x8a,0x69,0x00,0xee,0xb4,0xac,0x23,0x01,0x00,0x00 @@ -25,6 +113,14 @@ # INTEL: cmplexadd qword ptr [r28 + 4*r29 + 291], r23, r19 0x62,0x8a,0xe1,0x00,0xee,0xbc,0xac,0x23,0x01,0x00,0x00 +# ATT: cmplxadd %ecx, %edx, 123(%rax,%rbx,4) +# INTEL: cmplxadd dword ptr [rax + 4*rbx + 123], edx, ecx +0x62,0xf2,0x75,0x08,0xec,0x54,0x98,0x7b + +# ATT: cmplxadd %r9, %r15, 123(%rax,%rbx,4) +# INTEL: cmplxadd qword ptr [rax + 4*rbx + 123], r15, r9 +0x62,0x72,0xb5,0x08,0xec,0x7c,0x98,0x7b + # ATT: cmplxadd %r18d, %r22d, 291(%r28,%r29,4) # INTEL: cmplxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d 0x62,0x8a,0x69,0x00,0xec,0xb4,0xac,0x23,0x01,0x00,0x00 @@ -33,29 +129,29 @@ # INTEL: cmplxadd qword ptr [r28 + 4*r29 + 291], r23, r19 0x62,0x8a,0xe1,0x00,0xec,0xbc,0xac,0x23,0x01,0x00,0x00 -# ATT: cmpaxadd %r18d, %r22d, 291(%r28,%r29,4) -# INTEL: cmpaxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d -0x62,0x8a,0x69,0x00,0xe7,0xb4,0xac,0x23,0x01,0x00,0x00 +# ATT: cmpnexadd %ecx, %edx, 123(%rax,%rbx,4) +# INTEL: cmpnexadd dword ptr [rax + 4*rbx + 123], edx, ecx +0x62,0xf2,0x75,0x08,0xe5,0x54,0x98,0x7b -# ATT: cmpaxadd %r19, %r23, 291(%r28,%r29,4) -# INTEL: cmpaxadd qword ptr [r28 + 4*r29 + 291], r23, r19 -0x62,0x8a,0xe1,0x00,0xe7,0xbc,0xac,0x23,0x01,0x00,0x00 +# ATT: cmpnexadd %r9, %r15, 123(%rax,%rbx,4) +# INTEL: cmpnexadd qword ptr [rax + 4*rbx + 123], r15, r9 +0x62,0x72,0xb5,0x08,0xe5,0x7c,0x98,0x7b -# ATT: cmpgxadd %r18d, %r22d, 291(%r28,%r29,4) -# INTEL: cmpgxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d -0x62,0x8a,0x69,0x00,0xef,0xb4,0xac,0x23,0x01,0x00,0x00 +# ATT: cmpnexadd %r18d, %r22d, 291(%r28,%r29,4) +# INTEL: cmpnexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +0x62,0x8a,0x69,0x00,0xe5,0xb4,0xac,0x23,0x01,0x00,0x00 -# ATT: cmpgxadd %r19, %r23, 291(%r28,%r29,4) -# INTEL: cmpgxadd qword ptr [r28 + 4*r29 + 291], r23, r19 -0x62,0x8a,0xe1,0x00,0xef,0xbc,0xac,0x23,0x01,0x00,0x00 +# ATT: cmpnexadd %r19, %r23, 291(%r28,%r29,4) +# INTEL: cmpnexadd qword ptr [r28 + 4*r29 + 291], r23, r19 +0x62,0x8a,0xe1,0x00,0xe5,0xbc,0xac,0x23,0x01,0x00,0x00 -# ATT: cmpgexadd %r18d, %r22d, 291(%r28,%r29,4) -# INTEL: cmpgexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d -0x62,0x8a,0x69,0x00,0xed,0xb4,0xac,0x23,0x01,0x00,0x00 +# ATT: cmpnoxadd %ecx, %edx, 123(%rax,%rbx,4) +# INTEL: cmpnoxadd dword 
ptr [rax + 4*rbx + 123], edx, ecx +0x62,0xf2,0x75,0x08,0xe1,0x54,0x98,0x7b -# ATT: cmpgexadd %r19, %r23, 291(%r28,%r29,4) -# INTEL: cmpgexadd qword ptr [r28 + 4*r29 + 291], r23, r19 -0x62,0x8a,0xe1,0x00,0xed,0xbc,0xac,0x23,0x01,0x00,0x00 +# ATT: cmpnoxadd %r9, %r15, 123(%rax,%rbx,4) +# INTEL: cmpnoxadd qword ptr [rax + 4*rbx + 123], r15, r9 +0x62,0x72,0xb5,0x08,0xe1,0x7c,0x98,0x7b # ATT: cmpnoxadd %r18d, %r22d, 291(%r28,%r29,4) # INTEL: cmpnoxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d @@ -65,6 +161,14 @@ # INTEL: cmpnoxadd qword ptr [r28 + 4*r29 + 291], r23, r19 0x62,0x8a,0xe1,0x00,0xe1,0xbc,0xac,0x23,0x01,0x00,0x00 +# ATT: cmpnpxadd %ecx, %edx, 123(%rax,%rbx,4) +# INTEL: cmpnpxadd dword ptr [rax + 4*rbx + 123], edx, ecx +0x62,0xf2,0x75,0x08,0xeb,0x54,0x98,0x7b + +# ATT: cmpnpxadd %r9, %r15, 123(%rax,%rbx,4) +# INTEL: cmpnpxadd qword ptr [rax + 4*rbx + 123], r15, r9 +0x62,0x72,0xb5,0x08,0xeb,0x7c,0x98,0x7b + # ATT: cmpnpxadd %r18d, %r22d, 291(%r28,%r29,4) # INTEL: cmpnpxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d 0x62,0x8a,0x69,0x00,0xeb,0xb4,0xac,0x23,0x01,0x00,0x00 @@ -73,6 +177,14 @@ # INTEL: cmpnpxadd qword ptr [r28 + 4*r29 + 291], r23, r19 0x62,0x8a,0xe1,0x00,0xeb,0xbc,0xac,0x23,0x01,0x00,0x00 +# ATT: cmpnsxadd %ecx, %edx, 123(%rax,%rbx,4) +# INTEL: cmpnsxadd dword ptr [rax + 4*rbx + 123], edx, ecx +0x62,0xf2,0x75,0x08,0xe9,0x54,0x98,0x7b + +# ATT: cmpnsxadd %r9, %r15, 123(%rax,%rbx,4) +# INTEL: cmpnsxadd qword ptr [rax + 4*rbx + 123], r15, r9 +0x62,0x72,0xb5,0x08,0xe9,0x7c,0x98,0x7b + # ATT: cmpnsxadd %r18d, %r22d, 291(%r28,%r29,4) # INTEL: cmpnsxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d 0x62,0x8a,0x69,0x00,0xe9,0xb4,0xac,0x23,0x01,0x00,0x00 @@ -81,13 +193,13 @@ # INTEL: cmpnsxadd qword ptr [r28 + 4*r29 + 291], r23, r19 0x62,0x8a,0xe1,0x00,0xe9,0xbc,0xac,0x23,0x01,0x00,0x00 -# ATT: cmpnexadd %r18d, %r22d, 291(%r28,%r29,4) -# INTEL: cmpnexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d -0x62,0x8a,0x69,0x00,0xe5,0xb4,0xac,0x23,0x01,0x00,0x00 +# ATT: cmpoxadd %ecx, %edx, 123(%rax,%rbx,4) +# INTEL: cmpoxadd dword ptr [rax + 4*rbx + 123], edx, ecx +0x62,0xf2,0x75,0x08,0xe0,0x54,0x98,0x7b -# ATT: cmpnexadd %r19, %r23, 291(%r28,%r29,4) -# INTEL: cmpnexadd qword ptr [r28 + 4*r29 + 291], r23, r19 -0x62,0x8a,0xe1,0x00,0xe5,0xbc,0xac,0x23,0x01,0x00,0x00 +# ATT: cmpoxadd %r9, %r15, 123(%rax,%rbx,4) +# INTEL: cmpoxadd qword ptr [rax + 4*rbx + 123], r15, r9 +0x62,0x72,0xb5,0x08,0xe0,0x7c,0x98,0x7b # ATT: cmpoxadd %r18d, %r22d, 291(%r28,%r29,4) # INTEL: cmpoxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d @@ -97,6 +209,14 @@ # INTEL: cmpoxadd qword ptr [r28 + 4*r29 + 291], r23, r19 0x62,0x8a,0xe1,0x00,0xe0,0xbc,0xac,0x23,0x01,0x00,0x00 +# ATT: cmppxadd %ecx, %edx, 123(%rax,%rbx,4) +# INTEL: cmppxadd dword ptr [rax + 4*rbx + 123], edx, ecx +0x62,0xf2,0x75,0x08,0xea,0x54,0x98,0x7b + +# ATT: cmppxadd %r9, %r15, 123(%rax,%rbx,4) +# INTEL: cmppxadd qword ptr [rax + 4*rbx + 123], r15, r9 +0x62,0x72,0xb5,0x08,0xea,0x7c,0x98,0x7b + # ATT: cmppxadd %r18d, %r22d, 291(%r28,%r29,4) # INTEL: cmppxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d 0x62,0x8a,0x69,0x00,0xea,0xb4,0xac,0x23,0x01,0x00,0x00 @@ -105,6 +225,14 @@ # INTEL: cmppxadd qword ptr [r28 + 4*r29 + 291], r23, r19 0x62,0x8a,0xe1,0x00,0xea,0xbc,0xac,0x23,0x01,0x00,0x00 +# ATT: cmpsxadd %ecx, %edx, 123(%rax,%rbx,4) +# INTEL: cmpsxadd dword ptr [rax + 4*rbx + 123], edx, ecx +0x62,0xf2,0x75,0x08,0xe8,0x54,0x98,0x7b + +# ATT: cmpsxadd %r9, %r15, 123(%rax,%rbx,4) +# INTEL: cmpsxadd qword ptr [rax + 4*rbx + 123], r15, r9 
+0x62,0x72,0xb5,0x08,0xe8,0x7c,0x98,0x7b + # ATT: cmpsxadd %r18d, %r22d, 291(%r28,%r29,4) # INTEL: cmpsxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d 0x62,0x8a,0x69,0x00,0xe8,0xb4,0xac,0x23,0x01,0x00,0x00 @@ -112,11 +240,3 @@ # ATT: cmpsxadd %r19, %r23, 291(%r28,%r29,4) # INTEL: cmpsxadd qword ptr [r28 + 4*r29 + 291], r23, r19 0x62,0x8a,0xe1,0x00,0xe8,0xbc,0xac,0x23,0x01,0x00,0x00 - -# ATT: cmpexadd %r18d, %r22d, 291(%r28,%r29,4) -# INTEL: cmpexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d -0x62,0x8a,0x69,0x00,0xe4,0xb4,0xac,0x23,0x01,0x00,0x00 - -# ATT: cmpexadd %r19, %r23, 291(%r28,%r29,4) -# INTEL: cmpexadd qword ptr [r28 + 4*r29 + 291], r23, r19 -0x62,0x8a,0xe1,0x00,0xe4,0xbc,0xac,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/invept.txt b/llvm/test/MC/Disassembler/X86/apx/invept.txt deleted file mode 100644 index dc6bcbbb05cca..0000000000000 --- a/llvm/test/MC/Disassembler/X86/apx/invept.txt +++ /dev/null @@ -1,6 +0,0 @@ -# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT -# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL - -# ATT: invept 123(%r28,%r29,4), %r19 -# INTEL: invept r19, xmmword ptr [r28 + 4*r29 + 123] -0x62,0x8c,0x7a,0x08,0xf0,0x5c,0xac,0x7b diff --git a/llvm/test/MC/Disassembler/X86/apx/invpcid.txt b/llvm/test/MC/Disassembler/X86/apx/invpcid.txt index 8987332e9c9d9..8b590cc4ea47c 100644 --- a/llvm/test/MC/Disassembler/X86/apx/invpcid.txt +++ b/llvm/test/MC/Disassembler/X86/apx/invpcid.txt @@ -1,6 +1,10 @@ # RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT # RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL +# ATT: invpcid 123(%rax,%rbx,4), %r9 +# INTEL: invpcid r9, xmmword ptr [rax + 4*rbx + 123] +0x62,0x74,0x7e,0x08,0xf2,0x4c,0x98,0x7b + # ATT: invpcid 291(%r28,%r29,4), %r19 # INTEL: invpcid r19, xmmword ptr [r28 + 4*r29 + 291] 0x62,0x8c,0x7a,0x08,0xf2,0x9c,0xac,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/invvpid.txt b/llvm/test/MC/Disassembler/X86/apx/invvpid.txt deleted file mode 100644 index 05abc29b9b467..0000000000000 --- a/llvm/test/MC/Disassembler/X86/apx/invvpid.txt +++ /dev/null @@ -1,6 +0,0 @@ -# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT -# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL - -# ATT: invvpid 291(%r28,%r29,4), %r19 -# INTEL: invvpid r19, xmmword ptr [r28 + 4*r29 + 291] -0x62,0x8c,0x7a,0x08,0xf1,0x9c,0xac,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/movdir64b.txt b/llvm/test/MC/Disassembler/X86/apx/movdir64b.txt index 81d8f49dbf69d..efb2f41056e32 100644 --- a/llvm/test/MC/Disassembler/X86/apx/movdir64b.txt +++ b/llvm/test/MC/Disassembler/X86/apx/movdir64b.txt @@ -1,6 +1,14 @@ # RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT # RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL +# ATT: movdir64b 123(%eax,%ebx,4), %ecx +# INTEL: movdir64b ecx, zmmword ptr [eax + 4*ebx + 123] +0x67,0x62,0xf4,0x7d,0x08,0xf8,0x4c,0x98,0x7b + +# ATT: movdir64b 123(%rax,%rbx,4), %r9 +# INTEL: movdir64b r9, zmmword ptr [rax + 4*rbx + 123] +0x62,0x74,0x7d,0x08,0xf8,0x4c,0x98,0x7b + # ATT: movdir64b 291(%r28d,%r29d,4), %r18d # INTEL: movdir64b r18d, zmmword ptr [r28d + 
4*r29d + 291] 0x67,0x62,0x8c,0x79,0x08,0xf8,0x94,0xac,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/movdiri.txt b/llvm/test/MC/Disassembler/X86/apx/movdiri.txt index 997d016f0d222..280d01e1cd0bb 100644 --- a/llvm/test/MC/Disassembler/X86/apx/movdiri.txt +++ b/llvm/test/MC/Disassembler/X86/apx/movdiri.txt @@ -1,6 +1,14 @@ # RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT # RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL +# ATT: movdiri %ecx, 123(%eax,%ebx,4) +# INTEL: movdiri dword ptr [eax + 4*ebx + 123], ecx +0x67,0x62,0xf4,0x7c,0x08,0xf9,0x4c,0x98,0x7b + +# ATT: movdiri %r9, 123(%rax,%rbx,4) +# INTEL: movdiri qword ptr [rax + 4*rbx + 123], r9 +0x62,0x74,0xfc,0x08,0xf9,0x4c,0x98,0x7b + # ATT: movdiri %r18d, 291(%r28,%r29,4) # INTEL: movdiri dword ptr [r28 + 4*r29 + 291], r18d 0x62,0x8c,0x78,0x08,0xf9,0x94,0xac,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/mulx.txt b/llvm/test/MC/Disassembler/X86/apx/mulx.txt deleted file mode 100644 index 5d9b53b99a71b..0000000000000 --- a/llvm/test/MC/Disassembler/X86/apx/mulx.txt +++ /dev/null @@ -1,18 +0,0 @@ -# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT -# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL - -# ATT: mulxl %r18d, %r22d, %r26d -# INTEL: mulx r26d, r22d, r18d -0x62,0x6a,0x4f,0x00,0xf6,0xd2 - -# ATT: mulxq %r19, %r23, %r27 -# INTEL: mulx r27, r23, r19 -0x62,0x6a,0xc7,0x00,0xf6,0xdb - -# ATT: mulxl 291(%r28,%r29,4), %r18d, %r22d -# INTEL: mulx r22d, r18d, dword ptr [r28 + 4*r29 + 291] -0x62,0x8a,0x6b,0x00,0xf6,0xb4,0xac,0x23,0x01,0x00,0x00 - -# ATT: mulxq 291(%r28,%r29,4), %r19, %r23 -# INTEL: mulx r23, r19, qword ptr [r28 + 4*r29 + 291] -0x62,0x8a,0xe3,0x00,0xf6,0xbc,0xac,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/rorx.txt b/llvm/test/MC/Disassembler/X86/apx/rorx.txt deleted file mode 100644 index 9860deaea86bd..0000000000000 --- a/llvm/test/MC/Disassembler/X86/apx/rorx.txt +++ /dev/null @@ -1,18 +0,0 @@ -# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT -# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL - -# ATT: rorxl $123, %r18d, %r22d -# INTEL: rorx r22d, r18d, 123 -0x62,0xeb,0x7f,0x08,0xf0,0xf2,0x7b - -# ATT: rorxq $123, %r19, %r23 -# INTEL: rorx r23, r19, 123 -0x62,0xeb,0xff,0x08,0xf0,0xfb,0x7b - -# ATT: rorxl $123, 291(%r28,%r29,4), %r18d -# INTEL: rorx r18d, dword ptr [r28 + 4*r29 + 291], 123 -0x62,0x8b,0x7b,0x08,0xf0,0x94,0xac,0x23,0x01,0x00,0x00,0x7b - -# ATT: rorxq $123, 291(%r28,%r29,4), %r19 -# INTEL: rorx r19, qword ptr [r28 + 4*r29 + 291], 123 -0x62,0x8b,0xfb,0x08,0xf0,0x9c,0xac,0x23,0x01,0x00,0x00,0x7b diff --git a/llvm/test/MC/Disassembler/X86/apx/sarx.txt b/llvm/test/MC/Disassembler/X86/apx/sarx.txt deleted file mode 100644 index 20018f4d4b128..0000000000000 --- a/llvm/test/MC/Disassembler/X86/apx/sarx.txt +++ /dev/null @@ -1,18 +0,0 @@ -# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT -# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL - -# ATT: sarxl %r18d, %r22d, %r26d -# INTEL: sarx r26d, r22d, r18d -0x62,0x6a,0x6e,0x00,0xf7,0xd6 - -# ATT: sarxl %r18d, 291(%r28,%r29,4), %r22d -# INTEL: sarx r22d, dword ptr 
[r28 + 4*r29 + 291], r18d -0x62,0x8a,0x6a,0x00,0xf7,0xb4,0xac,0x23,0x01,0x00,0x00 - -# ATT: sarxq %r19, %r23, %r27 -# INTEL: sarx r27, r23, r19 -0x62,0x6a,0xe6,0x00,0xf7,0xdf - -# ATT: sarxq %r19, 291(%r28,%r29,4), %r23 -# INTEL: sarx r23, qword ptr [r28 + 4*r29 + 291], r19 -0x62,0x8a,0xe2,0x00,0xf7,0xbc,0xac,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/sha.txt b/llvm/test/MC/Disassembler/X86/apx/sha.txt new file mode 100644 index 0000000000000..6385ba754aba4 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/apx/sha.txt @@ -0,0 +1,124 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +## sha1msg1 + +# ATT: sha1msg1 %xmm13, %xmm12 +# INTEL: sha1msg1 xmm12, xmm13 +0x62,0x54,0x7c,0x08,0xd9,0xe5 + +# ATT: sha1msg1 123(%rax,%rbx,4), %xmm12 +# INTEL: sha1msg1 xmm12, xmmword ptr [rax + 4*rbx + 123] +0x62,0x74,0x7c,0x08,0xd9,0x64,0x98,0x7b + +# ATT: sha1msg1 %xmm13, %xmm12 +# INTEL: sha1msg1 xmm12, xmm13 +0x45,0x0f,0x38,0xc9,0xe5 + +# ATT: sha1msg1 291(%r28,%r29,4), %xmm12 +# INTEL: sha1msg1 xmm12, xmmword ptr [r28 + 4*r29 + 291] +0x62,0x1c,0x78,0x08,0xd9,0xa4,0xac,0x23,0x01,0x00,0x00 + +## sha1msg2 + +# ATT: sha1msg2 %xmm13, %xmm12 +# INTEL: sha1msg2 xmm12, xmm13 +0x62,0x54,0x7c,0x08,0xda,0xe5 + +# ATT: sha1msg2 123(%rax,%rbx,4), %xmm12 +# INTEL: sha1msg2 xmm12, xmmword ptr [rax + 4*rbx + 123] +0x62,0x74,0x7c,0x08,0xda,0x64,0x98,0x7b + +# ATT: sha1msg2 %xmm13, %xmm12 +# INTEL: sha1msg2 xmm12, xmm13 +0x45,0x0f,0x38,0xca,0xe5 + +# ATT: sha1msg2 291(%r28,%r29,4), %xmm12 +# INTEL: sha1msg2 xmm12, xmmword ptr [r28 + 4*r29 + 291] +0x62,0x1c,0x78,0x08,0xda,0xa4,0xac,0x23,0x01,0x00,0x00 + +## sha1nexte + +# ATT: sha1nexte %xmm13, %xmm12 +# INTEL: sha1nexte xmm12, xmm13 +0x62,0x54,0x7c,0x08,0xd8,0xe5 + +# ATT: sha1nexte 123(%rax,%rbx,4), %xmm12 +# INTEL: sha1nexte xmm12, xmmword ptr [rax + 4*rbx + 123] +0x62,0x74,0x7c,0x08,0xd8,0x64,0x98,0x7b + +# ATT: sha1nexte %xmm13, %xmm12 +# INTEL: sha1nexte xmm12, xmm13 +0x45,0x0f,0x38,0xc8,0xe5 + +# ATT: sha1nexte 291(%r28,%r29,4), %xmm12 +# INTEL: sha1nexte xmm12, xmmword ptr [r28 + 4*r29 + 291] +0x62,0x1c,0x78,0x08,0xd8,0xa4,0xac,0x23,0x01,0x00,0x00 + +## sha1rnds4 + +# ATT: sha1rnds4 $123, %xmm13, %xmm12 +# INTEL: sha1rnds4 xmm12, xmm13, 123 +0x62,0x54,0x7c,0x08,0xd4,0xe5,0x7b + +# ATT: sha1rnds4 $123, 123(%rax,%rbx,4), %xmm12 +# INTEL: sha1rnds4 xmm12, xmmword ptr [rax + 4*rbx + 123], 123 +0x62,0x74,0x7c,0x08,0xd4,0x64,0x98,0x7b,0x7b + +# ATT: sha1rnds4 $123, %xmm13, %xmm12 +# INTEL: sha1rnds4 xmm12, xmm13, 123 +0x45,0x0f,0x3a,0xcc,0xe5,0x7b + +# ATT: sha1rnds4 $123, 291(%r28,%r29,4), %xmm12 +# INTEL: sha1rnds4 xmm12, xmmword ptr [r28 + 4*r29 + 291], 123 +0x62,0x1c,0x78,0x08,0xd4,0xa4,0xac,0x23,0x01,0x00,0x00,0x7b + +## sha256msg1 + +# ATT: sha256msg1 %xmm13, %xmm12 +# INTEL: sha256msg1 xmm12, xmm13 +0x62,0x54,0x7c,0x08,0xdc,0xe5 + +# ATT: sha256msg1 123(%rax,%rbx,4), %xmm12 +# INTEL: sha256msg1 xmm12, xmmword ptr [rax + 4*rbx + 123] +0x62,0x74,0x7c,0x08,0xdc,0x64,0x98,0x7b + +# ATT: sha256msg1 %xmm13, %xmm12 +# INTEL: sha256msg1 xmm12, xmm13 +0x45,0x0f,0x38,0xcc,0xe5 + +# ATT: sha256msg1 291(%r28,%r29,4), %xmm12 +# INTEL: sha256msg1 xmm12, xmmword ptr [r28 + 4*r29 + 291] +0x62,0x1c,0x78,0x08,0xdc,0xa4,0xac,0x23,0x01,0x00,0x00 + +## sha256msg2 + +# ATT: sha256msg2 %xmm13, %xmm12 +# INTEL: sha256msg2 xmm12, xmm13 +0x62,0x54,0x7c,0x08,0xdd,0xe5 + +# ATT: sha256msg2 
123(%rax,%rbx,4), %xmm12 +# INTEL: sha256msg2 xmm12, xmmword ptr [rax + 4*rbx + 123] +0x62,0x74,0x7c,0x08,0xdd,0x64,0x98,0x7b + +# ATT: sha256msg2 %xmm13, %xmm12 +# INTEL: sha256msg2 xmm12, xmm13 +0x45,0x0f,0x38,0xcd,0xe5 + +# ATT: sha256msg2 291(%r28,%r29,4), %xmm12 +# INTEL: sha256msg2 xmm12, xmmword ptr [r28 + 4*r29 + 291] +0x62,0x1c,0x78,0x08,0xdd,0xa4,0xac,0x23,0x01,0x00,0x00 + +## sha256rnds2 + +# ATT: sha256rnds2 %xmm0, 123(%rax,%rbx,4), %xmm12 +# INTEL: sha256rnds2 xmm12, xmmword ptr [rax + 4*rbx + 123], xmm0 +0x62,0x74,0x7c,0x08,0xdb,0x64,0x98,0x7b + +# ATT: sha256rnds2 %xmm0, %xmm13, %xmm12 +# INTEL: sha256rnds2 xmm12, xmm13, xmm0 +0x45,0x0f,0x38,0xcb,0xe5 + +# ATT: sha256rnds2 %xmm0, 291(%r28,%r29,4), %xmm12 +# INTEL: sha256rnds2 xmm12, xmmword ptr [r28 + 4*r29 + 291], xmm0 +0x62,0x1c,0x78,0x08,0xdb,0xa4,0xac,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/sha1msg1.txt b/llvm/test/MC/Disassembler/X86/apx/sha1msg1.txt deleted file mode 100644 index 1c94fa88a3d3c..0000000000000 --- a/llvm/test/MC/Disassembler/X86/apx/sha1msg1.txt +++ /dev/null @@ -1,10 +0,0 @@ -# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT -# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL - -# ATT: sha1msg1 %xmm13, %xmm12 -# INTEL: sha1msg1 xmm12, xmm13 -0x45,0x0f,0x38,0xc9,0xe5 - -# ATT: sha1msg1 291(%r28,%r29,4), %xmm12 -# INTEL: sha1msg1 xmm12, xmmword ptr [r28 + 4*r29 + 291] -0x62,0x1c,0x78,0x08,0xd9,0xa4,0xac,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/sha1msg2.txt b/llvm/test/MC/Disassembler/X86/apx/sha1msg2.txt deleted file mode 100644 index 5fd17d9f32600..0000000000000 --- a/llvm/test/MC/Disassembler/X86/apx/sha1msg2.txt +++ /dev/null @@ -1,10 +0,0 @@ -# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT -# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL - -# ATT: sha1msg2 %xmm13, %xmm12 -# INTEL: sha1msg2 xmm12, xmm13 -0x45,0x0f,0x38,0xca,0xe5 - -# ATT: sha1msg2 291(%r28,%r29,4), %xmm12 -# INTEL: sha1msg2 xmm12, xmmword ptr [r28 + 4*r29 + 291] -0x62,0x1c,0x78,0x08,0xda,0xa4,0xac,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/sha1nexte.txt b/llvm/test/MC/Disassembler/X86/apx/sha1nexte.txt deleted file mode 100644 index 3c5eae3d7177f..0000000000000 --- a/llvm/test/MC/Disassembler/X86/apx/sha1nexte.txt +++ /dev/null @@ -1,10 +0,0 @@ -# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT -# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL - -# ATT: sha1nexte %xmm13, %xmm12 -# INTEL: sha1nexte xmm12, xmm13 -0x45,0x0f,0x38,0xc8,0xe5 - -# ATT: sha1nexte 291(%r28,%r29,4), %xmm12 -# INTEL: sha1nexte xmm12, xmmword ptr [r28 + 4*r29 + 291] -0x62,0x1c,0x78,0x08,0xd8,0xa4,0xac,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/sha1rnds4.txt b/llvm/test/MC/Disassembler/X86/apx/sha1rnds4.txt deleted file mode 100644 index a05f17739606a..0000000000000 --- a/llvm/test/MC/Disassembler/X86/apx/sha1rnds4.txt +++ /dev/null @@ -1,10 +0,0 @@ -# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT -# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL - -# ATT: sha1rnds4 $123, %xmm13, %xmm12 -# INTEL: sha1rnds4 xmm12, 
xmm13, 123 -0x45,0x0f,0x3a,0xcc,0xe5,0x7b - -# ATT: sha1rnds4 $123, 291(%r28,%r29,4), %xmm12 -# INTEL: sha1rnds4 xmm12, xmmword ptr [r28 + 4*r29 + 291], 123 -0x62,0x1c,0x78,0x08,0xd4,0xa4,0xac,0x23,0x01,0x00,0x00,0x7b diff --git a/llvm/test/MC/Disassembler/X86/apx/sha256msg1.txt b/llvm/test/MC/Disassembler/X86/apx/sha256msg1.txt deleted file mode 100644 index b4c14866647dd..0000000000000 --- a/llvm/test/MC/Disassembler/X86/apx/sha256msg1.txt +++ /dev/null @@ -1,10 +0,0 @@ -# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT -# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL - -# ATT: sha256msg1 %xmm13, %xmm12 -# INTEL: sha256msg1 xmm12, xmm13 -0x45,0x0f,0x38,0xcc,0xe5 - -# ATT: sha256msg1 291(%r28,%r29,4), %xmm12 -# INTEL: sha256msg1 xmm12, xmmword ptr [r28 + 4*r29 + 291] -0x62,0x1c,0x78,0x08,0xdc,0xa4,0xac,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/sha256msg2.txt b/llvm/test/MC/Disassembler/X86/apx/sha256msg2.txt deleted file mode 100644 index 75099b428e2b6..0000000000000 --- a/llvm/test/MC/Disassembler/X86/apx/sha256msg2.txt +++ /dev/null @@ -1,10 +0,0 @@ -# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT -# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL - -# ATT: sha256msg2 %xmm13, %xmm12 -# INTEL: sha256msg2 xmm12, xmm13 -0x45,0x0f,0x38,0xcd,0xe5 - -# ATT: sha256msg2 291(%r28,%r29,4), %xmm12 -# INTEL: sha256msg2 xmm12, xmmword ptr [r28 + 4*r29 + 291] -0x62,0x1c,0x78,0x08,0xdd,0xa4,0xac,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/sha256rnds2.txt b/llvm/test/MC/Disassembler/X86/apx/sha256rnds2.txt deleted file mode 100644 index 1ca60aa9e9b1a..0000000000000 --- a/llvm/test/MC/Disassembler/X86/apx/sha256rnds2.txt +++ /dev/null @@ -1,10 +0,0 @@ -# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT -# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL - -# ATT: sha256rnds2 %xmm0, %xmm13, %xmm12 -# INTEL: sha256rnds2 xmm12, xmm13, xmm0 -0x45,0x0f,0x38,0xcb,0xe5 - -# ATT: sha256rnds2 %xmm0, 291(%r28,%r29,4), %xmm12 -# INTEL: sha256rnds2 xmm12, xmmword ptr [r28 + 4*r29 + 291], xmm0 -0x62,0x1c,0x78,0x08,0xdb,0xa4,0xac,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/shlx.txt b/llvm/test/MC/Disassembler/X86/apx/shlx.txt deleted file mode 100644 index f6d6250bd0631..0000000000000 --- a/llvm/test/MC/Disassembler/X86/apx/shlx.txt +++ /dev/null @@ -1,18 +0,0 @@ -# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT -# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL - -# ATT: shlxl %r18d, %r22d, %r26d -# INTEL: shlx r26d, r22d, r18d -0x62,0x6a,0x6d,0x00,0xf7,0xd6 - -# ATT: shlxl %r18d, 291(%r28,%r29,4), %r22d -# INTEL: shlx r22d, dword ptr [r28 + 4*r29 + 291], r18d -0x62,0x8a,0x69,0x00,0xf7,0xb4,0xac,0x23,0x01,0x00,0x00 - -# ATT: shlxq %r19, %r23, %r27 -# INTEL: shlx r27, r23, r19 -0x62,0x6a,0xe5,0x00,0xf7,0xdf - -# ATT: shlxq %r19, 291(%r28,%r29,4), %r23 -# INTEL: shlx r23, qword ptr [r28 + 4*r29 + 291], r19 -0x62,0x8a,0xe1,0x00,0xf7,0xbc,0xac,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/shrx.txt b/llvm/test/MC/Disassembler/X86/apx/shrx.txt deleted file mode 100644 index 
09750e05c127e..0000000000000 --- a/llvm/test/MC/Disassembler/X86/apx/shrx.txt +++ /dev/null @@ -1,18 +0,0 @@ -# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT -# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL - -# ATT: shrxl %r18d, %r22d, %r26d -# INTEL: shrx r26d, r22d, r18d -0x62,0x6a,0x6f,0x00,0xf7,0xd6 - -# ATT: shrxl %r18d, 291(%r28,%r29,4), %r22d -# INTEL: shrx r22d, dword ptr [r28 + 4*r29 + 291], r18d -0x62,0x8a,0x6b,0x00,0xf7,0xb4,0xac,0x23,0x01,0x00,0x00 - -# ATT: shrxq %r19, %r23, %r27 -# INTEL: shrx r27, r23, r19 -0x62,0x6a,0xe7,0x00,0xf7,0xdf - -# ATT: shrxq %r19, 291(%r28,%r29,4), %r23 -# INTEL: shrx r23, qword ptr [r28 + 4*r29 + 291], r19 -0x62,0x8a,0xe3,0x00,0xf7,0xbc,0xac,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/vmx.txt b/llvm/test/MC/Disassembler/X86/apx/vmx.txt new file mode 100644 index 0000000000000..814830d38a7c7 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/apx/vmx.txt @@ -0,0 +1,22 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +## invept + +# ATT: invept 291(%r28,%r29,4), %r19 +# INTEL: invept r19, xmmword ptr [r28 + 4*r29 + 291] +0x62,0x8c,0x7a,0x08,0xf0,0x9c,0xac,0x23,0x01,0x00,0x00 + +# ATT: invept 123(%rax,%rbx,4), %r9 +# INTEL: invept r9, xmmword ptr [rax + 4*rbx + 123] +0x62,0x74,0x7e,0x08,0xf0,0x4c,0x98,0x7b + +## invvpid + +# ATT: invvpid 291(%r28,%r29,4), %r19 +# INTEL: invvpid r19, xmmword ptr [r28 + 4*r29 + 291] +0x62,0x8c,0x7a,0x08,0xf1,0x9c,0xac,0x23,0x01,0x00,0x00 + +# ATT: invvpid 123(%rax,%rbx,4), %r9 +# INTEL: invvpid r9, xmmword ptr [rax + 4*rbx + 123] +0x62,0x74,0x7e,0x08,0xf1,0x4c,0x98,0x7b diff --git a/llvm/test/MC/Disassembler/X86/apx/wrssd.txt b/llvm/test/MC/Disassembler/X86/apx/wrssd.txt deleted file mode 100644 index 600e85e1440e8..0000000000000 --- a/llvm/test/MC/Disassembler/X86/apx/wrssd.txt +++ /dev/null @@ -1,6 +0,0 @@ -# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT -# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL - -# ATT: wrssd %r18d, 291(%r28,%r29,4) -# INTEL: wrssd dword ptr [r28 + 4*r29 + 291], r18d -0x62,0x8c,0x78,0x08,0x66,0x94,0xac,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/wrssq.txt b/llvm/test/MC/Disassembler/X86/apx/wrssq.txt deleted file mode 100644 index 9f5b26321fd2b..0000000000000 --- a/llvm/test/MC/Disassembler/X86/apx/wrssq.txt +++ /dev/null @@ -1,6 +0,0 @@ -# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT -# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL - -# ATT: wrssq %r19, 291(%r28,%r29,4) -# INTEL: wrssq qword ptr [r28 + 4*r29 + 291], r19 -0x62,0x8c,0xf8,0x08,0x66,0x9c,0xac,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/wrussd.txt b/llvm/test/MC/Disassembler/X86/apx/wrussd.txt deleted file mode 100644 index 1b8b0007e2d32..0000000000000 --- a/llvm/test/MC/Disassembler/X86/apx/wrussd.txt +++ /dev/null @@ -1,6 +0,0 @@ -# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT -# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL - -# 
ATT: wrussd %r18d, 291(%r28,%r29,4) -# INTEL: wrussd dword ptr [r28 + 4*r29 + 291], r18d -0x62,0x8c,0x79,0x08,0x65,0x94,0xac,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/wrussq.txt b/llvm/test/MC/Disassembler/X86/apx/wrussq.txt deleted file mode 100644 index 7ff51f617c5cc..0000000000000 --- a/llvm/test/MC/Disassembler/X86/apx/wrussq.txt +++ /dev/null @@ -1,6 +0,0 @@ -# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT -# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL - -# ATT: wrussq %r19, 291(%r28,%r29,4) -# INTEL: wrussq qword ptr [r28 + 4*r29 + 291], r19 -0x62,0x8c,0xf9,0x08,0x65,0x9c,0xac,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/X86/apx/amx-tile-att.s b/llvm/test/MC/X86/apx/amx-tile-att.s index f4a47c16d1939..427e30a41ea12 100644 --- a/llvm/test/MC/X86/apx/amx-tile-att.s +++ b/llvm/test/MC/X86/apx/amx-tile-att.s @@ -1,24 +1,55 @@ # RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s # RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR -# ERROR-COUNT-5: error: +# ERROR-COUNT-10: error: # ERROR-NOT: error: + +## ldtilecfg + +# CHECK: {evex} ldtilecfg 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x49,0x44,0x98,0x7b] + {evex} ldtilecfg 123(%rax,%rbx,4) + # CHECK: ldtilecfg 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x9a,0x78,0x08,0x49,0x84,0xac,0x23,0x01,0x00,0x00] ldtilecfg 291(%r28,%r29,4) +## sttilecfg + +# CHECK: {evex} sttilecfg 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x49,0x44,0x98,0x7b] + {evex} sttilecfg 123(%rax,%rbx,4) + # CHECK: sttilecfg 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x9a,0x79,0x08,0x49,0x84,0xac,0x23,0x01,0x00,0x00] sttilecfg 291(%r28,%r29,4) +## tileloadd + +# CHECK: {evex} tileloadd 123(%rax,%rbx,4), %tmm6 +# CHECK: encoding: [0x62,0xf2,0x7f,0x08,0x4b,0x74,0x98,0x7b] + {evex} tileloadd 123(%rax,%rbx,4), %tmm6 + # CHECK: tileloadd 291(%r28,%r29,4), %tmm6 # CHECK: encoding: [0x62,0x9a,0x7b,0x08,0x4b,0xb4,0xac,0x23,0x01,0x00,0x00] tileloadd 291(%r28,%r29,4), %tmm6 +## tileloaddt1 + +# CHECK: {evex} tileloaddt1 123(%rax,%rbx,4), %tmm6 +# CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x4b,0x74,0x98,0x7b] + {evex} tileloaddt1 123(%rax,%rbx,4), %tmm6 + # CHECK: tileloaddt1 291(%r28,%r29,4), %tmm6 # CHECK: encoding: [0x62,0x9a,0x79,0x08,0x4b,0xb4,0xac,0x23,0x01,0x00,0x00] tileloaddt1 291(%r28,%r29,4), %tmm6 +## tilestored + +# CHECK: {evex} tilestored %tmm6, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0xf2,0x7e,0x08,0x4b,0x74,0x98,0x7b] + {evex} tilestored %tmm6, 123(%rax,%rbx,4) + # CHECK: tilestored %tmm6, 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x9a,0x7a,0x08,0x4b,0xb4,0xac,0x23,0x01,0x00,0x00] tilestored %tmm6, 291(%r28,%r29,4) diff --git a/llvm/test/MC/X86/apx/amx-tile-intel.s b/llvm/test/MC/X86/apx/amx-tile-intel.s index dd7b87b1806c2..a2a1b31b9d9c9 100644 --- a/llvm/test/MC/X86/apx/amx-tile-intel.s +++ b/llvm/test/MC/X86/apx/amx-tile-intel.s @@ -1,21 +1,51 @@ # RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s +## ldtilecfg + +# CHECK: {evex} ldtilecfg [rax + 4*rbx + 123] +# CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x49,0x44,0x98,0x7b] + {evex} ldtilecfg [rax + 4*rbx + 123] + # CHECK: ldtilecfg [r28 + 4*r29 + 291] # CHECK: encoding: [0x62,0x9a,0x78,0x08,0x49,0x84,0xac,0x23,0x01,0x00,0x00] ldtilecfg [r28 + 4*r29 + 291] +## sttilecfg + +# CHECK: {evex} sttilecfg [rax + 4*rbx + 123] +# CHECK: encoding: 
[0x62,0xf2,0x7d,0x08,0x49,0x44,0x98,0x7b] + {evex} sttilecfg [rax + 4*rbx + 123] + # CHECK: sttilecfg [r28 + 4*r29 + 291] # CHECK: encoding: [0x62,0x9a,0x79,0x08,0x49,0x84,0xac,0x23,0x01,0x00,0x00] sttilecfg [r28 + 4*r29 + 291] +## tileloadd + +# CHECK: {evex} tileloadd tmm6, [rax + 4*rbx + 123] +# CHECK: encoding: [0x62,0xf2,0x7f,0x08,0x4b,0x74,0x98,0x7b] + {evex} tileloadd tmm6, [rax + 4*rbx + 123] + # CHECK: tileloadd tmm6, [r28 + 4*r29 + 291] # CHECK: encoding: [0x62,0x9a,0x7b,0x08,0x4b,0xb4,0xac,0x23,0x01,0x00,0x00] tileloadd tmm6, [r28 + 4*r29 + 291] +## tileloaddt1 + +# CHECK: {evex} tileloaddt1 tmm6, [rax + 4*rbx + 123] +# CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x4b,0x74,0x98,0x7b] + {evex} tileloaddt1 tmm6, [rax + 4*rbx + 123] + # CHECK: tileloaddt1 tmm6, [r28 + 4*r29 + 291] # CHECK: encoding: [0x62,0x9a,0x79,0x08,0x4b,0xb4,0xac,0x23,0x01,0x00,0x00] tileloaddt1 tmm6, [r28 + 4*r29 + 291] +## tilestored + +# CHECK: {evex} tilestored [rax + 4*rbx + 123], tmm6 +# CHECK: encoding: [0x62,0xf2,0x7e,0x08,0x4b,0x74,0x98,0x7b] + {evex} tilestored [rax + 4*rbx + 123], tmm6 + # CHECK: tilestored [r28 + 4*r29 + 291], tmm6 # CHECK: encoding: [0x62,0x9a,0x7a,0x08,0x4b,0xb4,0xac,0x23,0x01,0x00,0x00] tilestored [r28 + 4*r29 + 291], tmm6 diff --git a/llvm/test/MC/X86/apx/bmi2-att.s b/llvm/test/MC/X86/apx/bmi2-att.s new file mode 100644 index 0000000000000..14e8566e799d5 --- /dev/null +++ b/llvm/test/MC/X86/apx/bmi2-att.s @@ -0,0 +1,243 @@ +# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s +# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR + +# ERROR-COUNT-56: error: +# ERROR-NOT: error: + +## mulx + +# CHECK: {evex} mulxl %ecx, %edx, %r10d +# CHECK: encoding: [0x62,0x72,0x6f,0x08,0xf6,0xd1] + {evex} mulxl %ecx, %edx, %r10d + +# CHECK: {evex} mulxq %r9, %r15, %r11 +# CHECK: encoding: [0x62,0x52,0x87,0x08,0xf6,0xd9] + {evex} mulxq %r9, %r15, %r11 + +# CHECK: {evex} mulxl 123(%rax,%rbx,4), %ecx, %edx +# CHECK: encoding: [0x62,0xf2,0x77,0x08,0xf6,0x94,0x98,0x7b,0x00,0x00,0x00] + {evex} mulxl 123(%rax,%rbx,4), %ecx, %edx + +# CHECK: {evex} mulxq 123(%rax,%rbx,4), %r9, %r15 +# CHECK: encoding: [0x62,0x72,0xb7,0x08,0xf6,0xbc,0x98,0x7b,0x00,0x00,0x00] + {evex} mulxq 123(%rax,%rbx,4), %r9, %r15 + +# CHECK: mulxl %r18d, %r22d, %r26d +# CHECK: encoding: [0x62,0x6a,0x4f,0x00,0xf6,0xd2] + mulxl %r18d, %r22d, %r26d + +# CHECK: mulxq %r19, %r23, %r27 +# CHECK: encoding: [0x62,0x6a,0xc7,0x00,0xf6,0xdb] + mulxq %r19, %r23, %r27 + +# CHECK: mulxl 291(%r28,%r29,4), %r18d, %r22d +# CHECK: encoding: [0x62,0x8a,0x6b,0x00,0xf6,0xb4,0xac,0x23,0x01,0x00,0x00] + mulxl 291(%r28,%r29,4), %r18d, %r22d + +# CHECK: mulxq 291(%r28,%r29,4), %r19, %r23 +# CHECK: encoding: [0x62,0x8a,0xe3,0x00,0xf6,0xbc,0xac,0x23,0x01,0x00,0x00] + mulxq 291(%r28,%r29,4), %r19, %r23 + +## pdep + +# CHECK: {evex} pdepl %ecx, %edx, %r10d +# CHECK: encoding: [0x62,0x72,0x6f,0x08,0xf5,0xd1] + {evex} pdepl %ecx, %edx, %r10d + +# CHECK: {evex} pdepq %r9, %r15, %r11 +# CHECK: encoding: [0x62,0x52,0x87,0x08,0xf5,0xd9] + {evex} pdepq %r9, %r15, %r11 + +# CHECK: {evex} pdepl 123(%rax,%rbx,4), %ecx, %edx +# CHECK: encoding: [0x62,0xf2,0x77,0x08,0xf5,0x54,0x98,0x7b] + {evex} pdepl 123(%rax,%rbx,4), %ecx, %edx + +# CHECK: {evex} pdepq 123(%rax,%rbx,4), %r9, %r15 +# CHECK: encoding: [0x62,0x72,0xb7,0x08,0xf5,0x7c,0x98,0x7b] + {evex} pdepq 123(%rax,%rbx,4), %r9, %r15 + +# CHECK: pdepl %r18d, %r22d, %r26d +# CHECK: encoding: [0x62,0x6a,0x4f,0x00,0xf5,0xd2] + pdepl %r18d, %r22d, %r26d + +# CHECK: pdepq %r19, %r23, 
%r27 +# CHECK: encoding: [0x62,0x6a,0xc7,0x00,0xf5,0xdb] + pdepq %r19, %r23, %r27 + +# CHECK: pdepl 291(%r28,%r29,4), %r18d, %r22d +# CHECK: encoding: [0x62,0x8a,0x6b,0x00,0xf5,0xb4,0xac,0x23,0x01,0x00,0x00] + pdepl 291(%r28,%r29,4), %r18d, %r22d + +# CHECK: pdepq 291(%r28,%r29,4), %r19, %r23 +# CHECK: encoding: [0x62,0x8a,0xe3,0x00,0xf5,0xbc,0xac,0x23,0x01,0x00,0x00] + pdepq 291(%r28,%r29,4), %r19, %r23 + +## pext + +# CHECK: {evex} pextl %ecx, %edx, %r10d +# CHECK: encoding: [0x62,0x72,0x6e,0x08,0xf5,0xd1] + {evex} pextl %ecx, %edx, %r10d + +# CHECK: {evex} pextq %r9, %r15, %r11 +# CHECK: encoding: [0x62,0x52,0x86,0x08,0xf5,0xd9] + {evex} pextq %r9, %r15, %r11 + +# CHECK: {evex} pextl 123(%rax,%rbx,4), %ecx, %edx +# CHECK: encoding: [0x62,0xf2,0x76,0x08,0xf5,0x54,0x98,0x7b] + {evex} pextl 123(%rax,%rbx,4), %ecx, %edx + +# CHECK: {evex} pextq 123(%rax,%rbx,4), %r9, %r15 +# CHECK: encoding: [0x62,0x72,0xb6,0x08,0xf5,0x7c,0x98,0x7b] + {evex} pextq 123(%rax,%rbx,4), %r9, %r15 + +# CHECK: pextl %r18d, %r22d, %r26d +# CHECK: encoding: [0x62,0x6a,0x4e,0x00,0xf5,0xd2] + pextl %r18d, %r22d, %r26d + +# CHECK: pextq %r19, %r23, %r27 +# CHECK: encoding: [0x62,0x6a,0xc6,0x00,0xf5,0xdb] + pextq %r19, %r23, %r27 + +# CHECK: pextl 291(%r28,%r29,4), %r18d, %r22d +# CHECK: encoding: [0x62,0x8a,0x6a,0x00,0xf5,0xb4,0xac,0x23,0x01,0x00,0x00] + pextl 291(%r28,%r29,4), %r18d, %r22d + +# CHECK: pextq 291(%r28,%r29,4), %r19, %r23 +# CHECK: encoding: [0x62,0x8a,0xe2,0x00,0xf5,0xbc,0xac,0x23,0x01,0x00,0x00] + pextq 291(%r28,%r29,4), %r19, %r23 + +## rorx + +# CHECK: {evex} rorxl $123, %ecx, %edx +# CHECK: encoding: [0x62,0xf3,0x7f,0x08,0xf0,0xd1,0x7b] + {evex} rorxl $123, %ecx, %edx + +# CHECK: {evex} rorxq $123, %r9, %r15 +# CHECK: encoding: [0x62,0x53,0xff,0x08,0xf0,0xf9,0x7b] + {evex} rorxq $123, %r9, %r15 + +# CHECK: {evex} rorxl $123, 123(%rax,%rbx,4), %ecx +# CHECK: encoding: [0x62,0xf3,0x7f,0x08,0xf0,0x8c,0x98,0x7b,0x00,0x00,0x00,0x7b] + {evex} rorxl $123, 123(%rax,%rbx,4), %ecx + +# CHECK: {evex} rorxq $123, 123(%rax,%rbx,4), %r9 +# CHECK: encoding: [0x62,0x73,0xff,0x08,0xf0,0x8c,0x98,0x7b,0x00,0x00,0x00,0x7b] + {evex} rorxq $123, 123(%rax,%rbx,4), %r9 + +# CHECK: rorxl $123, %r18d, %r22d +# CHECK: encoding: [0x62,0xeb,0x7f,0x08,0xf0,0xf2,0x7b] + rorxl $123, %r18d, %r22d + +# CHECK: rorxq $123, %r19, %r23 +# CHECK: encoding: [0x62,0xeb,0xff,0x08,0xf0,0xfb,0x7b] + rorxq $123, %r19, %r23 + +# CHECK: rorxl $123, 291(%r28,%r29,4), %r18d +# CHECK: encoding: [0x62,0x8b,0x7b,0x08,0xf0,0x94,0xac,0x23,0x01,0x00,0x00,0x7b] + rorxl $123, 291(%r28,%r29,4), %r18d + +# CHECK: rorxq $123, 291(%r28,%r29,4), %r19 +# CHECK: encoding: [0x62,0x8b,0xfb,0x08,0xf0,0x9c,0xac,0x23,0x01,0x00,0x00,0x7b] + rorxq $123, 291(%r28,%r29,4), %r19 + +## sarx + +# CHECK: {evex} sarxl %ecx, %edx, %r10d +# CHECK: encoding: [0x62,0x72,0x76,0x08,0xf7,0xd2] + {evex} sarxl %ecx, %edx, %r10d + +# CHECK: {evex} sarxl %ecx, 123(%rax,%rbx,4), %edx +# CHECK: encoding: [0x62,0xf2,0x76,0x08,0xf7,0x94,0x98,0x7b,0x00,0x00,0x00] + {evex} sarxl %ecx, 123(%rax,%rbx,4), %edx + +# CHECK: {evex} sarxq %r9, %r15, %r11 +# CHECK: encoding: [0x62,0x52,0xb6,0x08,0xf7,0xdf] + {evex} sarxq %r9, %r15, %r11 + +# CHECK: {evex} sarxq %r9, 123(%rax,%rbx,4), %r15 +# CHECK: encoding: [0x62,0x72,0xb6,0x08,0xf7,0xbc,0x98,0x7b,0x00,0x00,0x00] + {evex} sarxq %r9, 123(%rax,%rbx,4), %r15 + +# CHECK: sarxl %r18d, %r22d, %r26d +# CHECK: encoding: [0x62,0x6a,0x6e,0x00,0xf7,0xd6] + sarxl %r18d, %r22d, %r26d + +# CHECK: sarxl %r18d, 291(%r28,%r29,4), %r22d +# CHECK: encoding: 
[0x62,0x8a,0x6a,0x00,0xf7,0xb4,0xac,0x23,0x01,0x00,0x00] + sarxl %r18d, 291(%r28,%r29,4), %r22d + +# CHECK: sarxq %r19, %r23, %r27 +# CHECK: encoding: [0x62,0x6a,0xe6,0x00,0xf7,0xdf] + sarxq %r19, %r23, %r27 + +# CHECK: sarxq %r19, 291(%r28,%r29,4), %r23 +# CHECK: encoding: [0x62,0x8a,0xe2,0x00,0xf7,0xbc,0xac,0x23,0x01,0x00,0x00] + sarxq %r19, 291(%r28,%r29,4), %r23 + +## shlx + +# CHECK: {evex} shlxl %ecx, %edx, %r10d +# CHECK: encoding: [0x62,0x72,0x75,0x08,0xf7,0xd2] + {evex} shlxl %ecx, %edx, %r10d + +# CHECK: {evex} shlxl %ecx, 123(%rax,%rbx,4), %edx +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xf7,0x94,0x98,0x7b,0x00,0x00,0x00] + {evex} shlxl %ecx, 123(%rax,%rbx,4), %edx + +# CHECK: {evex} shlxq %r9, %r15, %r11 +# CHECK: encoding: [0x62,0x52,0xb5,0x08,0xf7,0xdf] + {evex} shlxq %r9, %r15, %r11 + +# CHECK: {evex} shlxq %r9, 123(%rax,%rbx,4), %r15 +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xf7,0xbc,0x98,0x7b,0x00,0x00,0x00] + {evex} shlxq %r9, 123(%rax,%rbx,4), %r15 + +# CHECK: shlxl %r18d, %r22d, %r26d +# CHECK: encoding: [0x62,0x6a,0x6d,0x00,0xf7,0xd6] + shlxl %r18d, %r22d, %r26d + +# CHECK: shlxl %r18d, 291(%r28,%r29,4), %r22d +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xf7,0xb4,0xac,0x23,0x01,0x00,0x00] + shlxl %r18d, 291(%r28,%r29,4), %r22d + +# CHECK: shlxq %r19, %r23, %r27 +# CHECK: encoding: [0x62,0x6a,0xe5,0x00,0xf7,0xdf] + shlxq %r19, %r23, %r27 + +# CHECK: shlxq %r19, 291(%r28,%r29,4), %r23 +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xf7,0xbc,0xac,0x23,0x01,0x00,0x00] + shlxq %r19, 291(%r28,%r29,4), %r23 + +## shrx + +# CHECK: {evex} shrxl %ecx, %edx, %r10d +# CHECK: encoding: [0x62,0x72,0x77,0x08,0xf7,0xd2] + {evex} shrxl %ecx, %edx, %r10d + +# CHECK: {evex} shrxl %ecx, 123(%rax,%rbx,4), %edx +# CHECK: encoding: [0x62,0xf2,0x77,0x08,0xf7,0x94,0x98,0x7b,0x00,0x00,0x00] + {evex} shrxl %ecx, 123(%rax,%rbx,4), %edx + +# CHECK: {evex} shrxq %r9, %r15, %r11 +# CHECK: encoding: [0x62,0x52,0xb7,0x08,0xf7,0xdf] + {evex} shrxq %r9, %r15, %r11 + +# CHECK: {evex} shrxq %r9, 123(%rax,%rbx,4), %r15 +# CHECK: encoding: [0x62,0x72,0xb7,0x08,0xf7,0xbc,0x98,0x7b,0x00,0x00,0x00] + {evex} shrxq %r9, 123(%rax,%rbx,4), %r15 + +# CHECK: shrxl %r18d, %r22d, %r26d +# CHECK: encoding: [0x62,0x6a,0x6f,0x00,0xf7,0xd6] + shrxl %r18d, %r22d, %r26d + +# CHECK: shrxl %r18d, 291(%r28,%r29,4), %r22d +# CHECK: encoding: [0x62,0x8a,0x6b,0x00,0xf7,0xb4,0xac,0x23,0x01,0x00,0x00] + shrxl %r18d, 291(%r28,%r29,4), %r22d + +# CHECK: shrxq %r19, %r23, %r27 +# CHECK: encoding: [0x62,0x6a,0xe7,0x00,0xf7,0xdf] + shrxq %r19, %r23, %r27 + +# CHECK: shrxq %r19, 291(%r28,%r29,4), %r23 +# CHECK: encoding: [0x62,0x8a,0xe3,0x00,0xf7,0xbc,0xac,0x23,0x01,0x00,0x00] + shrxq %r19, 291(%r28,%r29,4), %r23 diff --git a/llvm/test/MC/X86/apx/bmi2-intel.s b/llvm/test/MC/X86/apx/bmi2-intel.s new file mode 100644 index 0000000000000..f21004fdd696a --- /dev/null +++ b/llvm/test/MC/X86/apx/bmi2-intel.s @@ -0,0 +1,239 @@ +# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +## mulx + +# CHECK: {evex} mulx r10d, edx, ecx +# CHECK: encoding: [0x62,0x72,0x6f,0x08,0xf6,0xd1] + {evex} mulx r10d, edx, ecx + +# CHECK: {evex} mulx r11, r15, r9 +# CHECK: encoding: [0x62,0x52,0x87,0x08,0xf6,0xd9] + {evex} mulx r11, r15, r9 + +# CHECK: {evex} mulx edx, ecx, dword ptr [rax + 4*rbx + 123] +# CHECK: encoding: [0x62,0xf2,0x77,0x08,0xf6,0x94,0x98,0x7b,0x00,0x00,0x00] + {evex} mulx edx, ecx, dword ptr [rax + 4*rbx + 123] + +# CHECK: {evex} mulx r15, r9, qword ptr [rax + 4*rbx + 123] +# CHECK: encoding: 
[0x62,0x72,0xb7,0x08,0xf6,0xbc,0x98,0x7b,0x00,0x00,0x00] + {evex} mulx r15, r9, qword ptr [rax + 4*rbx + 123] + +# CHECK: mulx r26d, r22d, r18d +# CHECK: encoding: [0x62,0x6a,0x4f,0x00,0xf6,0xd2] + mulx r26d, r22d, r18d + +# CHECK: mulx r27, r23, r19 +# CHECK: encoding: [0x62,0x6a,0xc7,0x00,0xf6,0xdb] + mulx r27, r23, r19 + +# CHECK: mulx r22d, r18d, dword ptr [r28 + 4*r29 + 291] +# CHECK: encoding: [0x62,0x8a,0x6b,0x00,0xf6,0xb4,0xac,0x23,0x01,0x00,0x00] + mulx r22d, r18d, dword ptr [r28 + 4*r29 + 291] + +# CHECK: mulx r23, r19, qword ptr [r28 + 4*r29 + 291] +# CHECK: encoding: [0x62,0x8a,0xe3,0x00,0xf6,0xbc,0xac,0x23,0x01,0x00,0x00] + mulx r23, r19, qword ptr [r28 + 4*r29 + 291] + +## pdep + +# CHECK: {evex} pdep r10d, edx, ecx +# CHECK: encoding: [0x62,0x72,0x6f,0x08,0xf5,0xd1] + {evex} pdep r10d, edx, ecx + +# CHECK: {evex} pdep r11, r15, r9 +# CHECK: encoding: [0x62,0x52,0x87,0x08,0xf5,0xd9] + {evex} pdep r11, r15, r9 + +# CHECK: {evex} pdep edx, ecx, dword ptr [rax + 4*rbx + 123] +# CHECK: encoding: [0x62,0xf2,0x77,0x08,0xf5,0x54,0x98,0x7b] + {evex} pdep edx, ecx, dword ptr [rax + 4*rbx + 123] + +# CHECK: {evex} pdep r15, r9, qword ptr [rax + 4*rbx + 123] +# CHECK: encoding: [0x62,0x72,0xb7,0x08,0xf5,0x7c,0x98,0x7b] + {evex} pdep r15, r9, qword ptr [rax + 4*rbx + 123] + +# CHECK: pdep r26d, r22d, r18d +# CHECK: encoding: [0x62,0x6a,0x4f,0x00,0xf5,0xd2] + pdep r26d, r22d, r18d + +# CHECK: pdep r27, r23, r19 +# CHECK: encoding: [0x62,0x6a,0xc7,0x00,0xf5,0xdb] + pdep r27, r23, r19 + +# CHECK: pdep r22d, r18d, dword ptr [r28 + 4*r29 + 291] +# CHECK: encoding: [0x62,0x8a,0x6b,0x00,0xf5,0xb4,0xac,0x23,0x01,0x00,0x00] + pdep r22d, r18d, dword ptr [r28 + 4*r29 + 291] + +# CHECK: pdep r23, r19, qword ptr [r28 + 4*r29 + 291] +# CHECK: encoding: [0x62,0x8a,0xe3,0x00,0xf5,0xbc,0xac,0x23,0x01,0x00,0x00] + pdep r23, r19, qword ptr [r28 + 4*r29 + 291] + +## pext + +# CHECK: {evex} pext r10d, edx, ecx +# CHECK: encoding: [0x62,0x72,0x6e,0x08,0xf5,0xd1] + {evex} pext r10d, edx, ecx + +# CHECK: {evex} pext r11, r15, r9 +# CHECK: encoding: [0x62,0x52,0x86,0x08,0xf5,0xd9] + {evex} pext r11, r15, r9 + +# CHECK: {evex} pext edx, ecx, dword ptr [rax + 4*rbx + 123] +# CHECK: encoding: [0x62,0xf2,0x76,0x08,0xf5,0x54,0x98,0x7b] + {evex} pext edx, ecx, dword ptr [rax + 4*rbx + 123] + +# CHECK: {evex} pext r15, r9, qword ptr [rax + 4*rbx + 123] +# CHECK: encoding: [0x62,0x72,0xb6,0x08,0xf5,0x7c,0x98,0x7b] + {evex} pext r15, r9, qword ptr [rax + 4*rbx + 123] + +# CHECK: pext r26d, r22d, r18d +# CHECK: encoding: [0x62,0x6a,0x4e,0x00,0xf5,0xd2] + pext r26d, r22d, r18d + +# CHECK: pext r27, r23, r19 +# CHECK: encoding: [0x62,0x6a,0xc6,0x00,0xf5,0xdb] + pext r27, r23, r19 + +# CHECK: pext r22d, r18d, dword ptr [r28 + 4*r29 + 291] +# CHECK: encoding: [0x62,0x8a,0x6a,0x00,0xf5,0xb4,0xac,0x23,0x01,0x00,0x00] + pext r22d, r18d, dword ptr [r28 + 4*r29 + 291] + +# CHECK: pext r23, r19, qword ptr [r28 + 4*r29 + 291] +# CHECK: encoding: [0x62,0x8a,0xe2,0x00,0xf5,0xbc,0xac,0x23,0x01,0x00,0x00] + pext r23, r19, qword ptr [r28 + 4*r29 + 291] + +## rorx + +# CHECK: {evex} rorx edx, ecx, 123 +# CHECK: encoding: [0x62,0xf3,0x7f,0x08,0xf0,0xd1,0x7b] + {evex} rorx edx, ecx, 123 + +# CHECK: {evex} rorx r15, r9, 123 +# CHECK: encoding: [0x62,0x53,0xff,0x08,0xf0,0xf9,0x7b] + {evex} rorx r15, r9, 123 + +# CHECK: {evex} rorx ecx, dword ptr [rax + 4*rbx + 123], 123 +# CHECK: encoding: [0x62,0xf3,0x7f,0x08,0xf0,0x8c,0x98,0x7b,0x00,0x00,0x00,0x7b] + {evex} rorx ecx, dword ptr [rax + 4*rbx + 123], 123 + +# CHECK: {evex} rorx r9, qword ptr 
[rax + 4*rbx + 123], 123 +# CHECK: encoding: [0x62,0x73,0xff,0x08,0xf0,0x8c,0x98,0x7b,0x00,0x00,0x00,0x7b] + {evex} rorx r9, qword ptr [rax + 4*rbx + 123], 123 + +# CHECK: rorx r22d, r18d, 123 +# CHECK: encoding: [0x62,0xeb,0x7f,0x08,0xf0,0xf2,0x7b] + rorx r22d, r18d, 123 + +# CHECK: rorx r23, r19, 123 +# CHECK: encoding: [0x62,0xeb,0xff,0x08,0xf0,0xfb,0x7b] + rorx r23, r19, 123 + +# CHECK: rorx r18d, dword ptr [r28 + 4*r29 + 291], 123 +# CHECK: encoding: [0x62,0x8b,0x7b,0x08,0xf0,0x94,0xac,0x23,0x01,0x00,0x00,0x7b] + rorx r18d, dword ptr [r28 + 4*r29 + 291], 123 + +# CHECK: rorx r19, qword ptr [r28 + 4*r29 + 291], 123 +# CHECK: encoding: [0x62,0x8b,0xfb,0x08,0xf0,0x9c,0xac,0x23,0x01,0x00,0x00,0x7b] + rorx r19, qword ptr [r28 + 4*r29 + 291], 123 + +## sarx + +# CHECK: {evex} sarx r10d, edx, ecx +# CHECK: encoding: [0x62,0x72,0x76,0x08,0xf7,0xd2] + {evex} sarx r10d, edx, ecx + +# CHECK: {evex} sarx edx, dword ptr [rax + 4*rbx + 123], ecx +# CHECK: encoding: [0x62,0xf2,0x76,0x08,0xf7,0x94,0x98,0x7b,0x00,0x00,0x00] + {evex} sarx edx, dword ptr [rax + 4*rbx + 123], ecx + +# CHECK: {evex} sarx r11, r15, r9 +# CHECK: encoding: [0x62,0x52,0xb6,0x08,0xf7,0xdf] + {evex} sarx r11, r15, r9 + +# CHECK: {evex} sarx r15, qword ptr [rax + 4*rbx + 123], r9 +# CHECK: encoding: [0x62,0x72,0xb6,0x08,0xf7,0xbc,0x98,0x7b,0x00,0x00,0x00] + {evex} sarx r15, qword ptr [rax + 4*rbx + 123], r9 + +# CHECK: sarx r26d, r22d, r18d +# CHECK: encoding: [0x62,0x6a,0x6e,0x00,0xf7,0xd6] + sarx r26d, r22d, r18d + +# CHECK: sarx r22d, dword ptr [r28 + 4*r29 + 291], r18d +# CHECK: encoding: [0x62,0x8a,0x6a,0x00,0xf7,0xb4,0xac,0x23,0x01,0x00,0x00] + sarx r22d, dword ptr [r28 + 4*r29 + 291], r18d + +# CHECK: sarx r27, r23, r19 +# CHECK: encoding: [0x62,0x6a,0xe6,0x00,0xf7,0xdf] + sarx r27, r23, r19 + +# CHECK: sarx r23, qword ptr [r28 + 4*r29 + 291], r19 +# CHECK: encoding: [0x62,0x8a,0xe2,0x00,0xf7,0xbc,0xac,0x23,0x01,0x00,0x00] + sarx r23, qword ptr [r28 + 4*r29 + 291], r19 + +## shlx + +# CHECK: {evex} shlx r10d, edx, ecx +# CHECK: encoding: [0x62,0x72,0x75,0x08,0xf7,0xd2] + {evex} shlx r10d, edx, ecx + +# CHECK: {evex} shlx edx, dword ptr [rax + 4*rbx + 123], ecx +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xf7,0x94,0x98,0x7b,0x00,0x00,0x00] + {evex} shlx edx, dword ptr [rax + 4*rbx + 123], ecx + +# CHECK: {evex} shlx r11, r15, r9 +# CHECK: encoding: [0x62,0x52,0xb5,0x08,0xf7,0xdf] + {evex} shlx r11, r15, r9 + +# CHECK: {evex} shlx r15, qword ptr [rax + 4*rbx + 123], r9 +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xf7,0xbc,0x98,0x7b,0x00,0x00,0x00] + {evex} shlx r15, qword ptr [rax + 4*rbx + 123], r9 + +# CHECK: shlx r26d, r22d, r18d +# CHECK: encoding: [0x62,0x6a,0x6d,0x00,0xf7,0xd6] + shlx r26d, r22d, r18d + +# CHECK: shlx r22d, dword ptr [r28 + 4*r29 + 291], r18d +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xf7,0xb4,0xac,0x23,0x01,0x00,0x00] + shlx r22d, dword ptr [r28 + 4*r29 + 291], r18d + +# CHECK: shlx r27, r23, r19 +# CHECK: encoding: [0x62,0x6a,0xe5,0x00,0xf7,0xdf] + shlx r27, r23, r19 + +# CHECK: shlx r23, qword ptr [r28 + 4*r29 + 291], r19 +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xf7,0xbc,0xac,0x23,0x01,0x00,0x00] + shlx r23, qword ptr [r28 + 4*r29 + 291], r19 + +## shrx + +# CHECK: {evex} shrx r10d, edx, ecx +# CHECK: encoding: [0x62,0x72,0x77,0x08,0xf7,0xd2] + {evex} shrx r10d, edx, ecx + +# CHECK: {evex} shrx edx, dword ptr [rax + 4*rbx + 123], ecx +# CHECK: encoding: [0x62,0xf2,0x77,0x08,0xf7,0x94,0x98,0x7b,0x00,0x00,0x00] + {evex} shrx edx, dword ptr [rax + 4*rbx + 123], ecx + +# CHECK: {evex} shrx r11, r15, r9 +# 
CHECK: encoding: [0x62,0x52,0xb7,0x08,0xf7,0xdf] + {evex} shrx r11, r15, r9 + +# CHECK: {evex} shrx r15, qword ptr [rax + 4*rbx + 123], r9 +# CHECK: encoding: [0x62,0x72,0xb7,0x08,0xf7,0xbc,0x98,0x7b,0x00,0x00,0x00] + {evex} shrx r15, qword ptr [rax + 4*rbx + 123], r9 + +# CHECK: shrx r26d, r22d, r18d +# CHECK: encoding: [0x62,0x6a,0x6f,0x00,0xf7,0xd6] + shrx r26d, r22d, r18d + +# CHECK: shrx r22d, dword ptr [r28 + 4*r29 + 291], r18d +# CHECK: encoding: [0x62,0x8a,0x6b,0x00,0xf7,0xb4,0xac,0x23,0x01,0x00,0x00] + shrx r22d, dword ptr [r28 + 4*r29 + 291], r18d + +# CHECK: shrx r27, r23, r19 +# CHECK: encoding: [0x62,0x6a,0xe7,0x00,0xf7,0xdf] + shrx r27, r23, r19 + +# CHECK: shrx r23, qword ptr [r28 + 4*r29 + 291], r19 +# CHECK: encoding: [0x62,0x8a,0xe3,0x00,0xf7,0xbc,0xac,0x23,0x01,0x00,0x00] + shrx r23, qword ptr [r28 + 4*r29 + 291], r19 diff --git a/llvm/test/MC/X86/apx/cet-att.s b/llvm/test/MC/X86/apx/cet-att.s new file mode 100644 index 0000000000000..ec8614856686b --- /dev/null +++ b/llvm/test/MC/X86/apx/cet-att.s @@ -0,0 +1,45 @@ +# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s +# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR + +# ERROR-COUNT-8: error: +# ERROR-NOT: error: + +## wrssd + +# CHECK: {evex} wrssd %ecx, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x66,0x4c,0x98,0x7b] + {evex} wrssd %ecx, 123(%rax,%rbx,4) + +# CHECK: wrssd %r18d, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x66,0x94,0xac,0x23,0x01,0x00,0x00] + wrssd %r18d, 291(%r28,%r29,4) + +## wrssq + +# CHECK: {evex} wrssq %r9, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0x74,0xfc,0x08,0x66,0x4c,0x98,0x7b] + {evex} wrssq %r9, 123(%rax,%rbx,4) + +# CHECK: wrssq %r19, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x66,0x9c,0xac,0x23,0x01,0x00,0x00] + wrssq %r19, 291(%r28,%r29,4) + +## wrussd + +# CHECK: {evex} wrussd %ecx, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x65,0x4c,0x98,0x7b] + {evex} wrussd %ecx, 123(%rax,%rbx,4) + +# CHECK: wrussd %r18d, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x65,0x94,0xac,0x23,0x01,0x00,0x00] + wrussd %r18d, 291(%r28,%r29,4) + +## wrussq + +# CHECK: {evex} wrussq %r9, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0x74,0xfd,0x08,0x65,0x4c,0x98,0x7b] + {evex} wrussq %r9, 123(%rax,%rbx,4) + +# CHECK: wrussq %r19, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8c,0xf9,0x08,0x65,0x9c,0xac,0x23,0x01,0x00,0x00] + wrussq %r19, 291(%r28,%r29,4) diff --git a/llvm/test/MC/X86/apx/cet-intel.s b/llvm/test/MC/X86/apx/cet-intel.s new file mode 100644 index 0000000000000..e3d1427cd214a --- /dev/null +++ b/llvm/test/MC/X86/apx/cet-intel.s @@ -0,0 +1,41 @@ +# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +## wrssd + +# CHECK: {evex} wrssd dword ptr [rax + 4*rbx + 123], ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x66,0x4c,0x98,0x7b] + {evex} wrssd dword ptr [rax + 4*rbx + 123], ecx + +# CHECK: wrssd dword ptr [r28 + 4*r29 + 291], r18d +# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x66,0x94,0xac,0x23,0x01,0x00,0x00] + wrssd dword ptr [r28 + 4*r29 + 291], r18d + +## wrssq + +# CHECK: {evex} wrssq qword ptr [rax + 4*rbx + 123], r9 +# CHECK: encoding: [0x62,0x74,0xfc,0x08,0x66,0x4c,0x98,0x7b] + {evex} wrssq qword ptr [rax + 4*rbx + 123], r9 + +# CHECK: wrssq qword ptr [r28 + 4*r29 + 291], r19 +# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x66,0x9c,0xac,0x23,0x01,0x00,0x00] + wrssq qword ptr [r28 + 4*r29 + 291], r19 + +## wrussd + +# 
CHECK: {evex} wrussd dword ptr [rax + 4*rbx + 123], ecx +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x65,0x4c,0x98,0x7b] + {evex} wrussd dword ptr [rax + 4*rbx + 123], ecx + +# CHECK: wrussd dword ptr [r28 + 4*r29 + 291], r18d +# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x65,0x94,0xac,0x23,0x01,0x00,0x00] + wrussd dword ptr [r28 + 4*r29 + 291], r18d + +## wrussq + +# CHECK: {evex} wrussq qword ptr [rax + 4*rbx + 123], r9 +# CHECK: encoding: [0x62,0x74,0xfd,0x08,0x65,0x4c,0x98,0x7b] + {evex} wrussq qword ptr [rax + 4*rbx + 123], r9 + +# CHECK: wrussq qword ptr [r28 + 4*r29 + 291], r19 +# CHECK: encoding: [0x62,0x8c,0xf9,0x08,0x65,0x9c,0xac,0x23,0x01,0x00,0x00] + wrussq qword ptr [r28 + 4*r29 + 291], r19 diff --git a/llvm/test/MC/X86/apx/cmpccxadd-att.s b/llvm/test/MC/X86/apx/cmpccxadd-att.s index ce23588a18499..7ff803ad79ecb 100644 --- a/llvm/test/MC/X86/apx/cmpccxadd-att.s +++ b/llvm/test/MC/X86/apx/cmpccxadd-att.s @@ -1,8 +1,32 @@ # RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s # RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR -# ERROR-COUNT-30: error: +# ERROR-COUNT-60: error: # ERROR-NOT: error: +# CHECK: {evex} cmpaxadd %ecx, %edx, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe7,0x54,0x98,0x7b] + {evex} cmpaxadd %ecx, %edx, 123(%rax,%rbx,4) + +# CHECK: {evex} cmpaxadd %r9, %r15, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xe7,0x7c,0x98,0x7b] + {evex} cmpaxadd %r9, %r15, 123(%rax,%rbx,4) + +# CHECK: cmpaxadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe7,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpaxadd %r18d, %r22d, 291(%r28,%r29,4) + +# CHECK: cmpaxadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe7,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpaxadd %r19, %r23, 291(%r28,%r29,4) + +# CHECK: {evex} cmpbexadd %ecx, %edx, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe6,0x54,0x98,0x7b] + {evex} cmpbexadd %ecx, %edx, 123(%rax,%rbx,4) + +# CHECK: {evex} cmpbexadd %r9, %r15, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xe6,0x7c,0x98,0x7b] + {evex} cmpbexadd %r9, %r15, 123(%rax,%rbx,4) + # CHECK: cmpbexadd %r18d, %r22d, 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe6,0xb4,0xac,0x23,0x01,0x00,0x00] cmpbexadd %r18d, %r22d, 291(%r28,%r29,4) @@ -11,6 +35,14 @@ # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe6,0xbc,0xac,0x23,0x01,0x00,0x00] cmpbexadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: {evex} cmpbxadd %ecx, %edx, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe2,0x54,0x98,0x7b] + {evex} cmpbxadd %ecx, %edx, 123(%rax,%rbx,4) + +# CHECK: {evex} cmpbxadd %r9, %r15, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xe2,0x7c,0x98,0x7b] + {evex} cmpbxadd %r9, %r15, 123(%rax,%rbx,4) + # CHECK: cmpbxadd %r18d, %r22d, 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe2,0xb4,0xac,0x23,0x01,0x00,0x00] cmpbxadd %r18d, %r22d, 291(%r28,%r29,4) @@ -19,6 +51,62 @@ # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe2,0xbc,0xac,0x23,0x01,0x00,0x00] cmpbxadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: {evex} cmpexadd %ecx, %edx, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe4,0x54,0x98,0x7b] + {evex} cmpexadd %ecx, %edx, 123(%rax,%rbx,4) + +# CHECK: {evex} cmpexadd %r9, %r15, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xe4,0x7c,0x98,0x7b] + {evex} cmpexadd %r9, %r15, 123(%rax,%rbx,4) + +# CHECK: cmpexadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe4,0xb4,0xac,0x23,0x01,0x00,0x00] 
+ cmpexadd %r18d, %r22d, 291(%r28,%r29,4) + +# CHECK: cmpexadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe4,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpexadd %r19, %r23, 291(%r28,%r29,4) + +# CHECK: {evex} cmpgexadd %ecx, %edx, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xed,0x54,0x98,0x7b] + {evex} cmpgexadd %ecx, %edx, 123(%rax,%rbx,4) + +# CHECK: {evex} cmpgexadd %r9, %r15, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xed,0x7c,0x98,0x7b] + {evex} cmpgexadd %r9, %r15, 123(%rax,%rbx,4) + +# CHECK: cmpgexadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xed,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpgexadd %r18d, %r22d, 291(%r28,%r29,4) + +# CHECK: cmpgexadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xed,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpgexadd %r19, %r23, 291(%r28,%r29,4) + +# CHECK: {evex} cmpgxadd %ecx, %edx, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xef,0x54,0x98,0x7b] + {evex} cmpgxadd %ecx, %edx, 123(%rax,%rbx,4) + +# CHECK: {evex} cmpgxadd %r9, %r15, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xef,0x7c,0x98,0x7b] + {evex} cmpgxadd %r9, %r15, 123(%rax,%rbx,4) + +# CHECK: cmpgxadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xef,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpgxadd %r18d, %r22d, 291(%r28,%r29,4) + +# CHECK: cmpgxadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xef,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpgxadd %r19, %r23, 291(%r28,%r29,4) + +# CHECK: {evex} cmplexadd %ecx, %edx, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xee,0x54,0x98,0x7b] + {evex} cmplexadd %ecx, %edx, 123(%rax,%rbx,4) + +# CHECK: {evex} cmplexadd %r9, %r15, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xee,0x7c,0x98,0x7b] + {evex} cmplexadd %r9, %r15, 123(%rax,%rbx,4) + # CHECK: cmplexadd %r18d, %r22d, 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xee,0xb4,0xac,0x23,0x01,0x00,0x00] cmplexadd %r18d, %r22d, 291(%r28,%r29,4) @@ -27,6 +115,14 @@ # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xee,0xbc,0xac,0x23,0x01,0x00,0x00] cmplexadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: {evex} cmplxadd %ecx, %edx, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xec,0x54,0x98,0x7b] + {evex} cmplxadd %ecx, %edx, 123(%rax,%rbx,4) + +# CHECK: {evex} cmplxadd %r9, %r15, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xec,0x7c,0x98,0x7b] + {evex} cmplxadd %r9, %r15, 123(%rax,%rbx,4) + # CHECK: cmplxadd %r18d, %r22d, 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xec,0xb4,0xac,0x23,0x01,0x00,0x00] cmplxadd %r18d, %r22d, 291(%r28,%r29,4) @@ -35,29 +131,29 @@ # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xec,0xbc,0xac,0x23,0x01,0x00,0x00] cmplxadd %r19, %r23, 291(%r28,%r29,4) -# CHECK: cmpaxadd %r18d, %r22d, 291(%r28,%r29,4) -# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe7,0xb4,0xac,0x23,0x01,0x00,0x00] - cmpaxadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: {evex} cmpnexadd %ecx, %edx, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe5,0x54,0x98,0x7b] + {evex} cmpnexadd %ecx, %edx, 123(%rax,%rbx,4) -# CHECK: cmpaxadd %r19, %r23, 291(%r28,%r29,4) -# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe7,0xbc,0xac,0x23,0x01,0x00,0x00] - cmpaxadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: {evex} cmpnexadd %r9, %r15, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xe5,0x7c,0x98,0x7b] + {evex} cmpnexadd %r9, %r15, 123(%rax,%rbx,4) -# CHECK: cmpgxadd %r18d, %r22d, 291(%r28,%r29,4) -# CHECK: encoding: 
[0x62,0x8a,0x69,0x00,0xef,0xb4,0xac,0x23,0x01,0x00,0x00] - cmpgxadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: cmpnexadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe5,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpnexadd %r18d, %r22d, 291(%r28,%r29,4) -# CHECK: cmpgxadd %r19, %r23, 291(%r28,%r29,4) -# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xef,0xbc,0xac,0x23,0x01,0x00,0x00] - cmpgxadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: cmpnexadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe5,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpnexadd %r19, %r23, 291(%r28,%r29,4) -# CHECK: cmpgexadd %r18d, %r22d, 291(%r28,%r29,4) -# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xed,0xb4,0xac,0x23,0x01,0x00,0x00] - cmpgexadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: {evex} cmpnoxadd %ecx, %edx, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe1,0x54,0x98,0x7b] + {evex} cmpnoxadd %ecx, %edx, 123(%rax,%rbx,4) -# CHECK: cmpgexadd %r19, %r23, 291(%r28,%r29,4) -# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xed,0xbc,0xac,0x23,0x01,0x00,0x00] - cmpgexadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: {evex} cmpnoxadd %r9, %r15, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xe1,0x7c,0x98,0x7b] + {evex} cmpnoxadd %r9, %r15, 123(%rax,%rbx,4) # CHECK: cmpnoxadd %r18d, %r22d, 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe1,0xb4,0xac,0x23,0x01,0x00,0x00] @@ -67,6 +163,14 @@ # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe1,0xbc,0xac,0x23,0x01,0x00,0x00] cmpnoxadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: {evex} cmpnpxadd %ecx, %edx, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xeb,0x54,0x98,0x7b] + {evex} cmpnpxadd %ecx, %edx, 123(%rax,%rbx,4) + +# CHECK: {evex} cmpnpxadd %r9, %r15, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xeb,0x7c,0x98,0x7b] + {evex} cmpnpxadd %r9, %r15, 123(%rax,%rbx,4) + # CHECK: cmpnpxadd %r18d, %r22d, 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xeb,0xb4,0xac,0x23,0x01,0x00,0x00] cmpnpxadd %r18d, %r22d, 291(%r28,%r29,4) @@ -75,6 +179,14 @@ # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xeb,0xbc,0xac,0x23,0x01,0x00,0x00] cmpnpxadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: {evex} cmpnsxadd %ecx, %edx, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe9,0x54,0x98,0x7b] + {evex} cmpnsxadd %ecx, %edx, 123(%rax,%rbx,4) + +# CHECK: {evex} cmpnsxadd %r9, %r15, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xe9,0x7c,0x98,0x7b] + {evex} cmpnsxadd %r9, %r15, 123(%rax,%rbx,4) + # CHECK: cmpnsxadd %r18d, %r22d, 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe9,0xb4,0xac,0x23,0x01,0x00,0x00] cmpnsxadd %r18d, %r22d, 291(%r28,%r29,4) @@ -83,13 +195,13 @@ # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe9,0xbc,0xac,0x23,0x01,0x00,0x00] cmpnsxadd %r19, %r23, 291(%r28,%r29,4) -# CHECK: cmpnexadd %r18d, %r22d, 291(%r28,%r29,4) -# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe5,0xb4,0xac,0x23,0x01,0x00,0x00] - cmpnexadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: {evex} cmpoxadd %ecx, %edx, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe0,0x54,0x98,0x7b] + {evex} cmpoxadd %ecx, %edx, 123(%rax,%rbx,4) -# CHECK: cmpnexadd %r19, %r23, 291(%r28,%r29,4) -# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe5,0xbc,0xac,0x23,0x01,0x00,0x00] - cmpnexadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: {evex} cmpoxadd %r9, %r15, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xe0,0x7c,0x98,0x7b] + {evex} cmpoxadd %r9, %r15, 123(%rax,%rbx,4) # CHECK: cmpoxadd %r18d, %r22d, 291(%r28,%r29,4) # CHECK: encoding: 
[0x62,0x8a,0x69,0x00,0xe0,0xb4,0xac,0x23,0x01,0x00,0x00] @@ -99,6 +211,14 @@ # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe0,0xbc,0xac,0x23,0x01,0x00,0x00] cmpoxadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: {evex} cmppxadd %ecx, %edx, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xea,0x54,0x98,0x7b] + {evex} cmppxadd %ecx, %edx, 123(%rax,%rbx,4) + +# CHECK: {evex} cmppxadd %r9, %r15, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xea,0x7c,0x98,0x7b] + {evex} cmppxadd %r9, %r15, 123(%rax,%rbx,4) + # CHECK: cmppxadd %r18d, %r22d, 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xea,0xb4,0xac,0x23,0x01,0x00,0x00] cmppxadd %r18d, %r22d, 291(%r28,%r29,4) @@ -107,6 +227,14 @@ # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xea,0xbc,0xac,0x23,0x01,0x00,0x00] cmppxadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: {evex} cmpsxadd %ecx, %edx, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe8,0x54,0x98,0x7b] + {evex} cmpsxadd %ecx, %edx, 123(%rax,%rbx,4) + +# CHECK: {evex} cmpsxadd %r9, %r15, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xe8,0x7c,0x98,0x7b] + {evex} cmpsxadd %r9, %r15, 123(%rax,%rbx,4) + # CHECK: cmpsxadd %r18d, %r22d, 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe8,0xb4,0xac,0x23,0x01,0x00,0x00] cmpsxadd %r18d, %r22d, 291(%r28,%r29,4) @@ -114,11 +242,3 @@ # CHECK: cmpsxadd %r19, %r23, 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe8,0xbc,0xac,0x23,0x01,0x00,0x00] cmpsxadd %r19, %r23, 291(%r28,%r29,4) - -# CHECK: cmpexadd %r18d, %r22d, 291(%r28,%r29,4) -# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe4,0xb4,0xac,0x23,0x01,0x00,0x00] - cmpexadd %r18d, %r22d, 291(%r28,%r29,4) - -# CHECK: cmpexadd %r19, %r23, 291(%r28,%r29,4) -# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe4,0xbc,0xac,0x23,0x01,0x00,0x00] - cmpexadd %r19, %r23, 291(%r28,%r29,4) diff --git a/llvm/test/MC/X86/apx/cmpccxadd-intel.s b/llvm/test/MC/X86/apx/cmpccxadd-intel.s index c2630d3d9273b..cace33e59d6a7 100644 --- a/llvm/test/MC/X86/apx/cmpccxadd-intel.s +++ b/llvm/test/MC/X86/apx/cmpccxadd-intel.s @@ -1,5 +1,29 @@ # RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s +# CHECK: {evex} cmpaxadd dword ptr [rax + 4*rbx + 123], edx, ecx +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe7,0x54,0x98,0x7b] + {evex} cmpaxadd dword ptr [rax + 4*rbx + 123], edx, ecx + +# CHECK: {evex} cmpaxadd qword ptr [rax + 4*rbx + 123], r15, r9 +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xe7,0x7c,0x98,0x7b] + {evex} cmpaxadd qword ptr [rax + 4*rbx + 123], r15, r9 + +# CHECK: cmpaxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe7,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpaxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d + +# CHECK: cmpaxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe7,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpaxadd qword ptr [r28 + 4*r29 + 291], r23, r19 + +# CHECK: {evex} cmpbexadd dword ptr [rax + 4*rbx + 123], edx, ecx +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe6,0x54,0x98,0x7b] + {evex} cmpbexadd dword ptr [rax + 4*rbx + 123], edx, ecx + +# CHECK: {evex} cmpbexadd qword ptr [rax + 4*rbx + 123], r15, r9 +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xe6,0x7c,0x98,0x7b] + {evex} cmpbexadd qword ptr [rax + 4*rbx + 123], r15, r9 + # CHECK: cmpbexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe6,0xb4,0xac,0x23,0x01,0x00,0x00] cmpbexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d @@ -8,6 +32,14 @@ # CHECK: encoding: 
[0x62,0x8a,0xe1,0x00,0xe6,0xbc,0xac,0x23,0x01,0x00,0x00] cmpbexadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: {evex} cmpbxadd dword ptr [rax + 4*rbx + 123], edx, ecx +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe2,0x54,0x98,0x7b] + {evex} cmpbxadd dword ptr [rax + 4*rbx + 123], edx, ecx + +# CHECK: {evex} cmpbxadd qword ptr [rax + 4*rbx + 123], r15, r9 +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xe2,0x7c,0x98,0x7b] + {evex} cmpbxadd qword ptr [rax + 4*rbx + 123], r15, r9 + # CHECK: cmpbxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe2,0xb4,0xac,0x23,0x01,0x00,0x00] cmpbxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d @@ -16,6 +48,62 @@ # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe2,0xbc,0xac,0x23,0x01,0x00,0x00] cmpbxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: {evex} cmpexadd dword ptr [rax + 4*rbx + 123], edx, ecx +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe4,0x54,0x98,0x7b] + {evex} cmpexadd dword ptr [rax + 4*rbx + 123], edx, ecx + +# CHECK: {evex} cmpexadd qword ptr [rax + 4*rbx + 123], r15, r9 +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xe4,0x7c,0x98,0x7b] + {evex} cmpexadd qword ptr [rax + 4*rbx + 123], r15, r9 + +# CHECK: cmpexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe4,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d + +# CHECK: cmpexadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe4,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpexadd qword ptr [r28 + 4*r29 + 291], r23, r19 + +# CHECK: {evex} cmpgexadd dword ptr [rax + 4*rbx + 123], edx, ecx +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xed,0x54,0x98,0x7b] + {evex} cmpgexadd dword ptr [rax + 4*rbx + 123], edx, ecx + +# CHECK: {evex} cmpgexadd qword ptr [rax + 4*rbx + 123], r15, r9 +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xed,0x7c,0x98,0x7b] + {evex} cmpgexadd qword ptr [rax + 4*rbx + 123], r15, r9 + +# CHECK: cmpgexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xed,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpgexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d + +# CHECK: cmpgexadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xed,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpgexadd qword ptr [r28 + 4*r29 + 291], r23, r19 + +# CHECK: {evex} cmpgxadd dword ptr [rax + 4*rbx + 123], edx, ecx +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xef,0x54,0x98,0x7b] + {evex} cmpgxadd dword ptr [rax + 4*rbx + 123], edx, ecx + +# CHECK: {evex} cmpgxadd qword ptr [rax + 4*rbx + 123], r15, r9 +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xef,0x7c,0x98,0x7b] + {evex} cmpgxadd qword ptr [rax + 4*rbx + 123], r15, r9 + +# CHECK: cmpgxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xef,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpgxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d + +# CHECK: cmpgxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xef,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpgxadd qword ptr [r28 + 4*r29 + 291], r23, r19 + +# CHECK: {evex} cmplexadd dword ptr [rax + 4*rbx + 123], edx, ecx +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xee,0x54,0x98,0x7b] + {evex} cmplexadd dword ptr [rax + 4*rbx + 123], edx, ecx + +# CHECK: {evex} cmplexadd qword ptr [rax + 4*rbx + 123], r15, r9 +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xee,0x7c,0x98,0x7b] + {evex} cmplexadd qword ptr [rax + 4*rbx + 123], r15, r9 + # CHECK: cmplexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d # CHECK: encoding: 
[0x62,0x8a,0x69,0x00,0xee,0xb4,0xac,0x23,0x01,0x00,0x00] cmplexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d @@ -24,6 +112,14 @@ # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xee,0xbc,0xac,0x23,0x01,0x00,0x00] cmplexadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: {evex} cmplxadd dword ptr [rax + 4*rbx + 123], edx, ecx +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xec,0x54,0x98,0x7b] + {evex} cmplxadd dword ptr [rax + 4*rbx + 123], edx, ecx + +# CHECK: {evex} cmplxadd qword ptr [rax + 4*rbx + 123], r15, r9 +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xec,0x7c,0x98,0x7b] + {evex} cmplxadd qword ptr [rax + 4*rbx + 123], r15, r9 + # CHECK: cmplxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xec,0xb4,0xac,0x23,0x01,0x00,0x00] cmplxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d @@ -32,29 +128,29 @@ # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xec,0xbc,0xac,0x23,0x01,0x00,0x00] cmplxadd qword ptr [r28 + 4*r29 + 291], r23, r19 -# CHECK: cmpaxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d -# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe7,0xb4,0xac,0x23,0x01,0x00,0x00] - cmpaxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: {evex} cmpnexadd dword ptr [rax + 4*rbx + 123], edx, ecx +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe5,0x54,0x98,0x7b] + {evex} cmpnexadd dword ptr [rax + 4*rbx + 123], edx, ecx -# CHECK: cmpaxadd qword ptr [r28 + 4*r29 + 291], r23, r19 -# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe7,0xbc,0xac,0x23,0x01,0x00,0x00] - cmpaxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: {evex} cmpnexadd qword ptr [rax + 4*rbx + 123], r15, r9 +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xe5,0x7c,0x98,0x7b] + {evex} cmpnexadd qword ptr [rax + 4*rbx + 123], r15, r9 -# CHECK: cmpgxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d -# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xef,0xb4,0xac,0x23,0x01,0x00,0x00] - cmpgxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: cmpnexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe5,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpnexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d -# CHECK: cmpgxadd qword ptr [r28 + 4*r29 + 291], r23, r19 -# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xef,0xbc,0xac,0x23,0x01,0x00,0x00] - cmpgxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: cmpnexadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe5,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpnexadd qword ptr [r28 + 4*r29 + 291], r23, r19 -# CHECK: cmpgexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d -# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xed,0xb4,0xac,0x23,0x01,0x00,0x00] - cmpgexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: {evex} cmpnoxadd dword ptr [rax + 4*rbx + 123], edx, ecx +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe1,0x54,0x98,0x7b] + {evex} cmpnoxadd dword ptr [rax + 4*rbx + 123], edx, ecx -# CHECK: cmpgexadd qword ptr [r28 + 4*r29 + 291], r23, r19 -# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xed,0xbc,0xac,0x23,0x01,0x00,0x00] - cmpgexadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: {evex} cmpnoxadd qword ptr [rax + 4*rbx + 123], r15, r9 +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xe1,0x7c,0x98,0x7b] + {evex} cmpnoxadd qword ptr [rax + 4*rbx + 123], r15, r9 # CHECK: cmpnoxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe1,0xb4,0xac,0x23,0x01,0x00,0x00] @@ -64,6 +160,14 @@ # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe1,0xbc,0xac,0x23,0x01,0x00,0x00] cmpnoxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: {evex} cmpnpxadd dword ptr [rax 
+ 4*rbx + 123], edx, ecx +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xeb,0x54,0x98,0x7b] + {evex} cmpnpxadd dword ptr [rax + 4*rbx + 123], edx, ecx + +# CHECK: {evex} cmpnpxadd qword ptr [rax + 4*rbx + 123], r15, r9 +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xeb,0x7c,0x98,0x7b] + {evex} cmpnpxadd qword ptr [rax + 4*rbx + 123], r15, r9 + # CHECK: cmpnpxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xeb,0xb4,0xac,0x23,0x01,0x00,0x00] cmpnpxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d @@ -72,6 +176,14 @@ # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xeb,0xbc,0xac,0x23,0x01,0x00,0x00] cmpnpxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: {evex} cmpnsxadd dword ptr [rax + 4*rbx + 123], edx, ecx +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe9,0x54,0x98,0x7b] + {evex} cmpnsxadd dword ptr [rax + 4*rbx + 123], edx, ecx + +# CHECK: {evex} cmpnsxadd qword ptr [rax + 4*rbx + 123], r15, r9 +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xe9,0x7c,0x98,0x7b] + {evex} cmpnsxadd qword ptr [rax + 4*rbx + 123], r15, r9 + # CHECK: cmpnsxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe9,0xb4,0xac,0x23,0x01,0x00,0x00] cmpnsxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d @@ -80,13 +192,13 @@ # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe9,0xbc,0xac,0x23,0x01,0x00,0x00] cmpnsxadd qword ptr [r28 + 4*r29 + 291], r23, r19 -# CHECK: cmpnexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d -# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe5,0xb4,0xac,0x23,0x01,0x00,0x00] - cmpnexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: {evex} cmpoxadd dword ptr [rax + 4*rbx + 123], edx, ecx +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe0,0x54,0x98,0x7b] + {evex} cmpoxadd dword ptr [rax + 4*rbx + 123], edx, ecx -# CHECK: cmpnexadd qword ptr [r28 + 4*r29 + 291], r23, r19 -# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe5,0xbc,0xac,0x23,0x01,0x00,0x00] - cmpnexadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: {evex} cmpoxadd qword ptr [rax + 4*rbx + 123], r15, r9 +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xe0,0x7c,0x98,0x7b] + {evex} cmpoxadd qword ptr [rax + 4*rbx + 123], r15, r9 # CHECK: cmpoxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe0,0xb4,0xac,0x23,0x01,0x00,0x00] @@ -96,6 +208,14 @@ # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe0,0xbc,0xac,0x23,0x01,0x00,0x00] cmpoxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: {evex} cmppxadd dword ptr [rax + 4*rbx + 123], edx, ecx +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xea,0x54,0x98,0x7b] + {evex} cmppxadd dword ptr [rax + 4*rbx + 123], edx, ecx + +# CHECK: {evex} cmppxadd qword ptr [rax + 4*rbx + 123], r15, r9 +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xea,0x7c,0x98,0x7b] + {evex} cmppxadd qword ptr [rax + 4*rbx + 123], r15, r9 + # CHECK: cmppxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xea,0xb4,0xac,0x23,0x01,0x00,0x00] cmppxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d @@ -104,6 +224,14 @@ # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xea,0xbc,0xac,0x23,0x01,0x00,0x00] cmppxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: {evex} cmpsxadd dword ptr [rax + 4*rbx + 123], edx, ecx +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe8,0x54,0x98,0x7b] + {evex} cmpsxadd dword ptr [rax + 4*rbx + 123], edx, ecx + +# CHECK: {evex} cmpsxadd qword ptr [rax + 4*rbx + 123], r15, r9 +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xe8,0x7c,0x98,0x7b] + {evex} cmpsxadd qword ptr [rax + 4*rbx + 123], r15, r9 + # CHECK: cmpsxadd dword ptr [r28 + 4*r29 + 291], r22d, 
r18d # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe8,0xb4,0xac,0x23,0x01,0x00,0x00] cmpsxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d @@ -111,11 +239,3 @@ # CHECK: cmpsxadd qword ptr [r28 + 4*r29 + 291], r23, r19 # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe8,0xbc,0xac,0x23,0x01,0x00,0x00] cmpsxadd qword ptr [r28 + 4*r29 + 291], r23, r19 - -# CHECK: cmpexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d -# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe4,0xb4,0xac,0x23,0x01,0x00,0x00] - cmpexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d - -# CHECK: cmpexadd qword ptr [r28 + 4*r29 + 291], r23, r19 -# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe4,0xbc,0xac,0x23,0x01,0x00,0x00] - cmpexadd qword ptr [r28 + 4*r29 + 291], r23, r19 diff --git a/llvm/test/MC/X86/apx/invept-att.s b/llvm/test/MC/X86/apx/invept-att.s deleted file mode 100644 index 42840bf0b7308..0000000000000 --- a/llvm/test/MC/X86/apx/invept-att.s +++ /dev/null @@ -1,8 +0,0 @@ -# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s -# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR - -# ERROR-COUNT-1: error: -# ERROR-NOT: error: -# CHECK: invept 123(%r28,%r29,4), %r19 -# CHECK: encoding: [0x62,0x8c,0x7a,0x08,0xf0,0x5c,0xac,0x7b] - invept 123(%r28,%r29,4), %r19 diff --git a/llvm/test/MC/X86/apx/invept-intel.s b/llvm/test/MC/X86/apx/invept-intel.s deleted file mode 100644 index 1c5fac234570d..0000000000000 --- a/llvm/test/MC/X86/apx/invept-intel.s +++ /dev/null @@ -1,5 +0,0 @@ -# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s - -# CHECK: invept r19, xmmword ptr [r28 + 4*r29 + 123] -# CHECK: encoding: [0x62,0x8c,0x7a,0x08,0xf0,0x5c,0xac,0x7b] - invept r19, xmmword ptr [r28 + 4*r29 + 123] diff --git a/llvm/test/MC/X86/apx/invpcid-att.s b/llvm/test/MC/X86/apx/invpcid-att.s index a0e827ecadb05..57dee2e597a64 100644 --- a/llvm/test/MC/X86/apx/invpcid-att.s +++ b/llvm/test/MC/X86/apx/invpcid-att.s @@ -1,8 +1,12 @@ # RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s # RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR -# ERROR-COUNT-1: error: +# ERROR-COUNT-2: error: # ERROR-NOT: error: +# CHECK: {evex} invpcid 123(%rax,%rbx,4), %r9 +# CHECK: encoding: [0x62,0x74,0x7e,0x08,0xf2,0x4c,0x98,0x7b] + {evex} invpcid 123(%rax,%rbx,4), %r9 + # CHECK: invpcid 291(%r28,%r29,4), %r19 # CHECK: encoding: [0x62,0x8c,0x7a,0x08,0xf2,0x9c,0xac,0x23,0x01,0x00,0x00] invpcid 291(%r28,%r29,4), %r19 diff --git a/llvm/test/MC/X86/apx/invpcid-intel.s b/llvm/test/MC/X86/apx/invpcid-intel.s index e9993d26962e2..d959af7b14959 100644 --- a/llvm/test/MC/X86/apx/invpcid-intel.s +++ b/llvm/test/MC/X86/apx/invpcid-intel.s @@ -1,5 +1,9 @@ # RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s +# CHECK: {evex} invpcid r9, xmmword ptr [rax + 4*rbx + 123] +# CHECK: encoding: [0x62,0x74,0x7e,0x08,0xf2,0x4c,0x98,0x7b] + {evex} invpcid r9, xmmword ptr [rax + 4*rbx + 123] + # CHECK: invpcid r19, xmmword ptr [r28 + 4*r29 + 291] # CHECK: encoding: [0x62,0x8c,0x7a,0x08,0xf2,0x9c,0xac,0x23,0x01,0x00,0x00] invpcid r19, xmmword ptr [r28 + 4*r29 + 291] diff --git a/llvm/test/MC/X86/apx/invvpid-att.s b/llvm/test/MC/X86/apx/invvpid-att.s deleted file mode 100644 index a074891214606..0000000000000 --- a/llvm/test/MC/X86/apx/invvpid-att.s +++ /dev/null @@ -1,9 +0,0 @@ -# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s -# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck 
%s --check-prefix=ERROR - -# ERROR-COUNT-1: error: -# ERROR-NOT: error: -# CHECK: invvpid 291(%r28,%r29,4), %r19 -# CHECK: encoding: [0x62,0x8c,0x7a,0x08,0xf1,0x9c,0xac,0x23,0x01,0x00,0x00] - invvpid 291(%r28,%r29,4), %r19 - diff --git a/llvm/test/MC/X86/apx/invvpid-intel.s b/llvm/test/MC/X86/apx/invvpid-intel.s deleted file mode 100644 index cc4e3e9a618d6..0000000000000 --- a/llvm/test/MC/X86/apx/invvpid-intel.s +++ /dev/null @@ -1,5 +0,0 @@ -# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s - -# CHECK: invvpid r19, xmmword ptr [r28 + 4*r29 + 291] -# CHECK: encoding: [0x62,0x8c,0x7a,0x08,0xf1,0x9c,0xac,0x23,0x01,0x00,0x00] - invvpid r19, xmmword ptr [r28 + 4*r29 + 291] diff --git a/llvm/test/MC/X86/apx/movdir64b-att.s b/llvm/test/MC/X86/apx/movdir64b-att.s index bc8f1a90c9ed6..2956c5fd01298 100644 --- a/llvm/test/MC/X86/apx/movdir64b-att.s +++ b/llvm/test/MC/X86/apx/movdir64b-att.s @@ -1,8 +1,16 @@ # RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s # RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR -# ERROR-COUNT-2: error: +# ERROR-COUNT-4: error: # ERROR-NOT: error: +# CHECK: {evex} movdir64b 123(%eax,%ebx,4), %ecx +# CHECK: encoding: [0x67,0x62,0xf4,0x7d,0x08,0xf8,0x4c,0x98,0x7b] + {evex} movdir64b 123(%eax,%ebx,4), %ecx + +# CHECK: {evex} movdir64b 123(%rax,%rbx,4), %r9 +# CHECK: encoding: [0x62,0x74,0x7d,0x08,0xf8,0x4c,0x98,0x7b] + {evex} movdir64b 123(%rax,%rbx,4), %r9 + # CHECK: movdir64b 291(%r28d,%r29d,4), %r18d # CHECK: encoding: [0x67,0x62,0x8c,0x79,0x08,0xf8,0x94,0xac,0x23,0x01,0x00,0x00] movdir64b 291(%r28d,%r29d,4), %r18d diff --git a/llvm/test/MC/X86/apx/movdir64b-intel.s b/llvm/test/MC/X86/apx/movdir64b-intel.s index b34efefeba2da..fac9bb9ecd533 100644 --- a/llvm/test/MC/X86/apx/movdir64b-intel.s +++ b/llvm/test/MC/X86/apx/movdir64b-intel.s @@ -1,5 +1,13 @@ # RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s +# CHECK: {evex} movdir64b ecx, zmmword ptr [eax + 4*ebx + 123] +# CHECK: encoding: [0x67,0x62,0xf4,0x7d,0x08,0xf8,0x4c,0x98,0x7b] + {evex} movdir64b ecx, zmmword ptr [eax + 4*ebx + 123] + +# CHECK: {evex} movdir64b r9, zmmword ptr [rax + 4*rbx + 123] +# CHECK: encoding: [0x62,0x74,0x7d,0x08,0xf8,0x4c,0x98,0x7b] + {evex} movdir64b r9, zmmword ptr [rax + 4*rbx + 123] + # CHECK: movdir64b r18d, zmmword ptr [r28d + 4*r29d + 291] # CHECK: encoding: [0x67,0x62,0x8c,0x79,0x08,0xf8,0x94,0xac,0x23,0x01,0x00,0x00] movdir64b r18d, zmmword ptr [r28d + 4*r29d + 291] diff --git a/llvm/test/MC/X86/apx/movdiri-att.s b/llvm/test/MC/X86/apx/movdiri-att.s index 8bdabf232f5de..f951e31404558 100644 --- a/llvm/test/MC/X86/apx/movdiri-att.s +++ b/llvm/test/MC/X86/apx/movdiri-att.s @@ -1,8 +1,16 @@ # RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s # RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR -# ERROR-COUNT-2: error: +# ERROR-COUNT-4: error: # ERROR-NOT: error: +# CHECK: {evex} movdiri %ecx, 123(%eax,%ebx,4) +# CHECK: encoding: [0x67,0x62,0xf4,0x7c,0x08,0xf9,0x4c,0x98,0x7b] + {evex} movdiri %ecx, 123(%eax,%ebx,4) + +# CHECK: {evex} movdiri %r9, 123(%rax,%rbx,4) +# CHECK: encoding: [0x62,0x74,0xfc,0x08,0xf9,0x4c,0x98,0x7b] + {evex} movdiri %r9, 123(%rax,%rbx,4) + # CHECK: movdiri %r18d, 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x8c,0x78,0x08,0xf9,0x94,0xac,0x23,0x01,0x00,0x00] movdiri %r18d, 291(%r28,%r29,4) diff --git a/llvm/test/MC/X86/apx/movdiri-intel.s 
b/llvm/test/MC/X86/apx/movdiri-intel.s index 1a38384f9f96e..16a5432253e72 100644 --- a/llvm/test/MC/X86/apx/movdiri-intel.s +++ b/llvm/test/MC/X86/apx/movdiri-intel.s @@ -1,5 +1,13 @@ # RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s +# CHECK: {evex} movdiri dword ptr [eax + 4*ebx + 123], ecx +# CHECK: encoding: [0x67,0x62,0xf4,0x7c,0x08,0xf9,0x4c,0x98,0x7b] + {evex} movdiri dword ptr [eax + 4*ebx + 123], ecx + +# CHECK: {evex} movdiri qword ptr [rax + 4*rbx + 123], r9 +# CHECK: encoding: [0x62,0x74,0xfc,0x08,0xf9,0x4c,0x98,0x7b] + {evex} movdiri qword ptr [rax + 4*rbx + 123], r9 + # CHECK: movdiri dword ptr [r28 + 4*r29 + 291], r18d # CHECK: encoding: [0x62,0x8c,0x78,0x08,0xf9,0x94,0xac,0x23,0x01,0x00,0x00] movdiri dword ptr [r28 + 4*r29 + 291], r18d diff --git a/llvm/test/MC/X86/apx/mulx-att.s b/llvm/test/MC/X86/apx/mulx-att.s deleted file mode 100644 index 976a79f469cd6..0000000000000 --- a/llvm/test/MC/X86/apx/mulx-att.s +++ /dev/null @@ -1,20 +0,0 @@ -# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s -# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR - -# ERROR-COUNT-4: error: -# ERROR-NOT: error: -# CHECK: mulxl %r18d, %r22d, %r26d -# CHECK: encoding: [0x62,0x6a,0x4f,0x00,0xf6,0xd2] - mulxl %r18d, %r22d, %r26d - -# CHECK: mulxq %r19, %r23, %r27 -# CHECK: encoding: [0x62,0x6a,0xc7,0x00,0xf6,0xdb] - mulxq %r19, %r23, %r27 - -# CHECK: mulxl 291(%r28,%r29,4), %r18d, %r22d -# CHECK: encoding: [0x62,0x8a,0x6b,0x00,0xf6,0xb4,0xac,0x23,0x01,0x00,0x00] - mulxl 291(%r28,%r29,4), %r18d, %r22d - -# CHECK: mulxq 291(%r28,%r29,4), %r19, %r23 -# CHECK: encoding: [0x62,0x8a,0xe3,0x00,0xf6,0xbc,0xac,0x23,0x01,0x00,0x00] - mulxq 291(%r28,%r29,4), %r19, %r23 diff --git a/llvm/test/MC/X86/apx/mulx-intel.s b/llvm/test/MC/X86/apx/mulx-intel.s deleted file mode 100644 index 3db587502915d..0000000000000 --- a/llvm/test/MC/X86/apx/mulx-intel.s +++ /dev/null @@ -1,17 +0,0 @@ -# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s - -# CHECK: mulx r26d, r22d, r18d -# CHECK: encoding: [0x62,0x6a,0x4f,0x00,0xf6,0xd2] - mulx r26d, r22d, r18d - -# CHECK: mulx r27, r23, r19 -# CHECK: encoding: [0x62,0x6a,0xc7,0x00,0xf6,0xdb] - mulx r27, r23, r19 - -# CHECK: mulx r22d, r18d, dword ptr [r28 + 4*r29 + 291] -# CHECK: encoding: [0x62,0x8a,0x6b,0x00,0xf6,0xb4,0xac,0x23,0x01,0x00,0x00] - mulx r22d, r18d, dword ptr [r28 + 4*r29 + 291] - -# CHECK: mulx r23, r19, qword ptr [r28 + 4*r29 + 291] -# CHECK: encoding: [0x62,0x8a,0xe3,0x00,0xf6,0xbc,0xac,0x23,0x01,0x00,0x00] - mulx r23, r19, qword ptr [r28 + 4*r29 + 291] diff --git a/llvm/test/MC/X86/apx/pdep-att.s b/llvm/test/MC/X86/apx/pdep-att.s deleted file mode 100644 index c319b17e47f6f..0000000000000 --- a/llvm/test/MC/X86/apx/pdep-att.s +++ /dev/null @@ -1,20 +0,0 @@ -# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s -# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR - -# ERROR-COUNT-4: error: -# ERROR-NOT: error: -# CHECK: pdepl %r18d, %r22d, %r26d -# CHECK: encoding: [0x62,0x6a,0x4f,0x00,0xf5,0xd2] - pdepl %r18d, %r22d, %r26d - -# CHECK: pdepq %r19, %r23, %r27 -# CHECK: encoding: [0x62,0x6a,0xc7,0x00,0xf5,0xdb] - pdepq %r19, %r23, %r27 - -# CHECK: pdepl 291(%r28,%r29,4), %r18d, %r22d -# CHECK: encoding: [0x62,0x8a,0x6b,0x00,0xf5,0xb4,0xac,0x23,0x01,0x00,0x00] - pdepl 291(%r28,%r29,4), %r18d, %r22d - -# CHECK: pdepq 291(%r28,%r29,4), %r19, %r23 -# CHECK: 
encoding: [0x62,0x8a,0xe3,0x00,0xf5,0xbc,0xac,0x23,0x01,0x00,0x00] - pdepq 291(%r28,%r29,4), %r19, %r23 diff --git a/llvm/test/MC/X86/apx/pdep-intel.s b/llvm/test/MC/X86/apx/pdep-intel.s deleted file mode 100644 index 0f9e828c021c3..0000000000000 --- a/llvm/test/MC/X86/apx/pdep-intel.s +++ /dev/null @@ -1,17 +0,0 @@ -# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s - -# CHECK: pdep r26d, r22d, r18d -# CHECK: encoding: [0x62,0x6a,0x4f,0x00,0xf5,0xd2] - pdep r26d, r22d, r18d - -# CHECK: pdep r27, r23, r19 -# CHECK: encoding: [0x62,0x6a,0xc7,0x00,0xf5,0xdb] - pdep r27, r23, r19 - -# CHECK: pdep r22d, r18d, dword ptr [r28 + 4*r29 + 291] -# CHECK: encoding: [0x62,0x8a,0x6b,0x00,0xf5,0xb4,0xac,0x23,0x01,0x00,0x00] - pdep r22d, r18d, dword ptr [r28 + 4*r29 + 291] - -# CHECK: pdep r23, r19, qword ptr [r28 + 4*r29 + 291] -# CHECK: encoding: [0x62,0x8a,0xe3,0x00,0xf5,0xbc,0xac,0x23,0x01,0x00,0x00] - pdep r23, r19, qword ptr [r28 + 4*r29 + 291] diff --git a/llvm/test/MC/X86/apx/pext-att.s b/llvm/test/MC/X86/apx/pext-att.s deleted file mode 100644 index c07fa1ac2082a..0000000000000 --- a/llvm/test/MC/X86/apx/pext-att.s +++ /dev/null @@ -1,20 +0,0 @@ -# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s -# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR - -# ERROR-COUNT-4: error: -# ERROR-NOT: error: -# CHECK: pextl %r18d, %r22d, %r26d -# CHECK: encoding: [0x62,0x6a,0x4e,0x00,0xf5,0xd2] - pextl %r18d, %r22d, %r26d - -# CHECK: pextq %r19, %r23, %r27 -# CHECK: encoding: [0x62,0x6a,0xc6,0x00,0xf5,0xdb] - pextq %r19, %r23, %r27 - -# CHECK: pextl 291(%r28,%r29,4), %r18d, %r22d -# CHECK: encoding: [0x62,0x8a,0x6a,0x00,0xf5,0xb4,0xac,0x23,0x01,0x00,0x00] - pextl 291(%r28,%r29,4), %r18d, %r22d - -# CHECK: pextq 291(%r28,%r29,4), %r19, %r23 -# CHECK: encoding: [0x62,0x8a,0xe2,0x00,0xf5,0xbc,0xac,0x23,0x01,0x00,0x00] - pextq 291(%r28,%r29,4), %r19, %r23 diff --git a/llvm/test/MC/X86/apx/pext-intel.s b/llvm/test/MC/X86/apx/pext-intel.s deleted file mode 100644 index 9a7e7d93094a4..0000000000000 --- a/llvm/test/MC/X86/apx/pext-intel.s +++ /dev/null @@ -1,17 +0,0 @@ -# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s - -# CHECK: pext r26d, r22d, r18d -# CHECK: encoding: [0x62,0x6a,0x4e,0x00,0xf5,0xd2] - pext r26d, r22d, r18d - -# CHECK: pext r27, r23, r19 -# CHECK: encoding: [0x62,0x6a,0xc6,0x00,0xf5,0xdb] - pext r27, r23, r19 - -# CHECK: pext r22d, r18d, dword ptr [r28 + 4*r29 + 291] -# CHECK: encoding: [0x62,0x8a,0x6a,0x00,0xf5,0xb4,0xac,0x23,0x01,0x00,0x00] - pext r22d, r18d, dword ptr [r28 + 4*r29 + 291] - -# CHECK: pext r23, r19, qword ptr [r28 + 4*r29 + 291] -# CHECK: encoding: [0x62,0x8a,0xe2,0x00,0xf5,0xbc,0xac,0x23,0x01,0x00,0x00] - pext r23, r19, qword ptr [r28 + 4*r29 + 291] diff --git a/llvm/test/MC/X86/apx/rorx-att.s b/llvm/test/MC/X86/apx/rorx-att.s deleted file mode 100644 index fb613d95c7cb4..0000000000000 --- a/llvm/test/MC/X86/apx/rorx-att.s +++ /dev/null @@ -1,20 +0,0 @@ -# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s -# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR - -# ERROR-COUNT-4: error: -# ERROR-NOT: error: -# CHECK: rorxl $123, %r18d, %r22d -# CHECK: encoding: [0x62,0xeb,0x7f,0x08,0xf0,0xf2,0x7b] - rorxl $123, %r18d, %r22d - -# CHECK: rorxq $123, %r19, %r23 -# CHECK: encoding: [0x62,0xeb,0xff,0x08,0xf0,0xfb,0x7b] - rorxq $123, %r19, %r23 - -# CHECK: rorxl $123, 
291(%r28,%r29,4), %r18d -# CHECK: encoding: [0x62,0x8b,0x7b,0x08,0xf0,0x94,0xac,0x23,0x01,0x00,0x00,0x7b] - rorxl $123, 291(%r28,%r29,4), %r18d - -# CHECK: rorxq $123, 291(%r28,%r29,4), %r19 -# CHECK: encoding: [0x62,0x8b,0xfb,0x08,0xf0,0x9c,0xac,0x23,0x01,0x00,0x00,0x7b] - rorxq $123, 291(%r28,%r29,4), %r19 diff --git a/llvm/test/MC/X86/apx/rorx-intel.s b/llvm/test/MC/X86/apx/rorx-intel.s deleted file mode 100644 index d3e63559cba57..0000000000000 --- a/llvm/test/MC/X86/apx/rorx-intel.s +++ /dev/null @@ -1,17 +0,0 @@ -# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s - -# CHECK: rorx r22d, r18d, 123 -# CHECK: encoding: [0x62,0xeb,0x7f,0x08,0xf0,0xf2,0x7b] - rorx r22d, r18d, 123 - -# CHECK: rorx r23, r19, 123 -# CHECK: encoding: [0x62,0xeb,0xff,0x08,0xf0,0xfb,0x7b] - rorx r23, r19, 123 - -# CHECK: rorx r18d, dword ptr [r28 + 4*r29 + 291], 123 -# CHECK: encoding: [0x62,0x8b,0x7b,0x08,0xf0,0x94,0xac,0x23,0x01,0x00,0x00,0x7b] - rorx r18d, dword ptr [r28 + 4*r29 + 291], 123 - -# CHECK: rorx r19, qword ptr [r28 + 4*r29 + 291], 123 -# CHECK: encoding: [0x62,0x8b,0xfb,0x08,0xf0,0x9c,0xac,0x23,0x01,0x00,0x00,0x7b] - rorx r19, qword ptr [r28 + 4*r29 + 291], 123 diff --git a/llvm/test/MC/X86/apx/sarx-att.s b/llvm/test/MC/X86/apx/sarx-att.s deleted file mode 100644 index a174903d976cb..0000000000000 --- a/llvm/test/MC/X86/apx/sarx-att.s +++ /dev/null @@ -1,20 +0,0 @@ -# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s -# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR - -# ERROR-COUNT-4: error: -# ERROR-NOT: error: -# CHECK: sarxl %r18d, %r22d, %r26d -# CHECK: encoding: [0x62,0x6a,0x6e,0x00,0xf7,0xd6] - sarxl %r18d, %r22d, %r26d - -# CHECK: sarxl %r18d, 291(%r28,%r29,4), %r22d -# CHECK: encoding: [0x62,0x8a,0x6a,0x00,0xf7,0xb4,0xac,0x23,0x01,0x00,0x00] - sarxl %r18d, 291(%r28,%r29,4), %r22d - -# CHECK: sarxq %r19, %r23, %r27 -# CHECK: encoding: [0x62,0x6a,0xe6,0x00,0xf7,0xdf] - sarxq %r19, %r23, %r27 - -# CHECK: sarxq %r19, 291(%r28,%r29,4), %r23 -# CHECK: encoding: [0x62,0x8a,0xe2,0x00,0xf7,0xbc,0xac,0x23,0x01,0x00,0x00] - sarxq %r19, 291(%r28,%r29,4), %r23 diff --git a/llvm/test/MC/X86/apx/sarx-intel.s b/llvm/test/MC/X86/apx/sarx-intel.s deleted file mode 100644 index 962b6ec313b98..0000000000000 --- a/llvm/test/MC/X86/apx/sarx-intel.s +++ /dev/null @@ -1,17 +0,0 @@ -# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s - -# CHECK: sarx r26d, r22d, r18d -# CHECK: encoding: [0x62,0x6a,0x6e,0x00,0xf7,0xd6] - sarx r26d, r22d, r18d - -# CHECK: sarx r22d, dword ptr [r28 + 4*r29 + 291], r18d -# CHECK: encoding: [0x62,0x8a,0x6a,0x00,0xf7,0xb4,0xac,0x23,0x01,0x00,0x00] - sarx r22d, dword ptr [r28 + 4*r29 + 291], r18d - -# CHECK: sarx r27, r23, r19 -# CHECK: encoding: [0x62,0x6a,0xe6,0x00,0xf7,0xdf] - sarx r27, r23, r19 - -# CHECK: sarx r23, qword ptr [r28 + 4*r29 + 291], r19 -# CHECK: encoding: [0x62,0x8a,0xe2,0x00,0xf7,0xbc,0xac,0x23,0x01,0x00,0x00] - sarx r23, qword ptr [r28 + 4*r29 + 291], r19 diff --git a/llvm/test/MC/X86/apx/sha-att.s b/llvm/test/MC/X86/apx/sha-att.s new file mode 100644 index 0000000000000..3b60ad8e6c0de --- /dev/null +++ b/llvm/test/MC/X86/apx/sha-att.s @@ -0,0 +1,127 @@ +# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s +# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR + +# ERROR-COUNT-27: error: +# ERROR-NOT: error: + +## sha1msg1 + +# CHECK: {evex} sha1msg1 
%xmm13, %xmm12 +# CHECK: encoding: [0x62,0x54,0x7c,0x08,0xd9,0xe5] + {evex} sha1msg1 %xmm13, %xmm12 + +# CHECK: {evex} sha1msg1 123(%rax,%rbx,4), %xmm12 +# CHECK: encoding: [0x62,0x74,0x7c,0x08,0xd9,0x64,0x98,0x7b] + {evex} sha1msg1 123(%rax,%rbx,4), %xmm12 + +# CHECK: sha1msg1 %xmm13, %xmm12 +# CHECK: encoding: [0x45,0x0f,0x38,0xc9,0xe5] + sha1msg1 %xmm13, %xmm12 + +# CHECK: sha1msg1 291(%r28,%r29,4), %xmm12 +# CHECK: encoding: [0x62,0x1c,0x78,0x08,0xd9,0xa4,0xac,0x23,0x01,0x00,0x00] + sha1msg1 291(%r28,%r29,4), %xmm12 + +## sha1msg2 + +# CHECK: {evex} sha1msg2 %xmm13, %xmm12 +# CHECK: encoding: [0x62,0x54,0x7c,0x08,0xda,0xe5] + {evex} sha1msg2 %xmm13, %xmm12 + +# CHECK: {evex} sha1msg2 123(%rax,%rbx,4), %xmm12 +# CHECK: encoding: [0x62,0x74,0x7c,0x08,0xda,0x64,0x98,0x7b] + {evex} sha1msg2 123(%rax,%rbx,4), %xmm12 + +# CHECK: sha1msg2 %xmm13, %xmm12 +# CHECK: encoding: [0x45,0x0f,0x38,0xca,0xe5] + sha1msg2 %xmm13, %xmm12 + +# CHECK: sha1msg2 291(%r28,%r29,4), %xmm12 +# CHECK: encoding: [0x62,0x1c,0x78,0x08,0xda,0xa4,0xac,0x23,0x01,0x00,0x00] + sha1msg2 291(%r28,%r29,4), %xmm12 + +## sha1nexte + +# CHECK: {evex} sha1nexte %xmm13, %xmm12 +# CHECK: encoding: [0x62,0x54,0x7c,0x08,0xd8,0xe5] + {evex} sha1nexte %xmm13, %xmm12 + +# CHECK: {evex} sha1nexte 123(%rax,%rbx,4), %xmm12 +# CHECK: encoding: [0x62,0x74,0x7c,0x08,0xd8,0x64,0x98,0x7b] + {evex} sha1nexte 123(%rax,%rbx,4), %xmm12 + +# CHECK: sha1nexte %xmm13, %xmm12 +# CHECK: encoding: [0x45,0x0f,0x38,0xc8,0xe5] + sha1nexte %xmm13, %xmm12 + +# CHECK: sha1nexte 291(%r28,%r29,4), %xmm12 +# CHECK: encoding: [0x62,0x1c,0x78,0x08,0xd8,0xa4,0xac,0x23,0x01,0x00,0x00] + sha1nexte 291(%r28,%r29,4), %xmm12 + +## sha1rnds4 + +# CHECK: {evex} sha1rnds4 $123, %xmm13, %xmm12 +# CHECK: encoding: [0x62,0x54,0x7c,0x08,0xd4,0xe5,0x7b] + {evex} sha1rnds4 $123, %xmm13, %xmm12 + +# CHECK: {evex} sha1rnds4 $123, 123(%rax,%rbx,4), %xmm12 +# CHECK: encoding: [0x62,0x74,0x7c,0x08,0xd4,0x64,0x98,0x7b,0x7b] + {evex} sha1rnds4 $123, 123(%rax,%rbx,4), %xmm12 + +# CHECK: sha1rnds4 $123, %xmm13, %xmm12 +# CHECK: encoding: [0x45,0x0f,0x3a,0xcc,0xe5,0x7b] + sha1rnds4 $123, %xmm13, %xmm12 + +# CHECK: sha1rnds4 $123, 291(%r28,%r29,4), %xmm12 +# CHECK: encoding: [0x62,0x1c,0x78,0x08,0xd4,0xa4,0xac,0x23,0x01,0x00,0x00,0x7b] + sha1rnds4 $123, 291(%r28,%r29,4), %xmm12 + +## sha256msg1 + +# CHECK: {evex} sha256msg1 %xmm13, %xmm12 +# CHECK: encoding: [0x62,0x54,0x7c,0x08,0xdc,0xe5] + {evex} sha256msg1 %xmm13, %xmm12 + +# CHECK: {evex} sha256msg1 123(%rax,%rbx,4), %xmm12 +# CHECK: encoding: [0x62,0x74,0x7c,0x08,0xdc,0x64,0x98,0x7b] + {evex} sha256msg1 123(%rax,%rbx,4), %xmm12 + +# CHECK: sha256msg1 %xmm13, %xmm12 +# CHECK: encoding: [0x45,0x0f,0x38,0xcc,0xe5] + sha256msg1 %xmm13, %xmm12 + +# CHECK: sha256msg1 291(%r28,%r29,4), %xmm12 +# CHECK: encoding: [0x62,0x1c,0x78,0x08,0xdc,0xa4,0xac,0x23,0x01,0x00,0x00] + sha256msg1 291(%r28,%r29,4), %xmm12 + +## sha256msg2 + +# CHECK: {evex} sha256msg2 %xmm13, %xmm12 +# CHECK: encoding: [0x62,0x54,0x7c,0x08,0xdd,0xe5] + {evex} sha256msg2 %xmm13, %xmm12 + +# CHECK: {evex} sha256msg2 123(%rax,%rbx,4), %xmm12 +# CHECK: encoding: [0x62,0x74,0x7c,0x08,0xdd,0x64,0x98,0x7b] + {evex} sha256msg2 123(%rax,%rbx,4), %xmm12 + +# CHECK: sha256msg2 %xmm13, %xmm12 +# CHECK: encoding: [0x45,0x0f,0x38,0xcd,0xe5] + sha256msg2 %xmm13, %xmm12 + +# CHECK: sha256msg2 291(%r28,%r29,4), %xmm12 +# CHECK: encoding: [0x62,0x1c,0x78,0x08,0xdd,0xa4,0xac,0x23,0x01,0x00,0x00] + sha256msg2 291(%r28,%r29,4), %xmm12 + +## sha256rnds2 + +# CHECK: {evex} sha256rnds2 %xmm0, 
123(%rax,%rbx,4), %xmm12 +# CHECK: encoding: [0x62,0x74,0x7c,0x08,0xdb,0x64,0x98,0x7b] + {evex} sha256rnds2 %xmm0, 123(%rax,%rbx,4), %xmm12 + +# CHECK: sha256rnds2 %xmm0, %xmm13, %xmm12 +# CHECK: encoding: [0x45,0x0f,0x38,0xcb,0xe5] + sha256rnds2 %xmm0, %xmm13, %xmm12 + +# CHECK: sha256rnds2 %xmm0, 291(%r28,%r29,4), %xmm12 +# CHECK: encoding: [0x62,0x1c,0x78,0x08,0xdb,0xa4,0xac,0x23,0x01,0x00,0x00] + sha256rnds2 %xmm0, 291(%r28,%r29,4), %xmm12 diff --git a/llvm/test/MC/X86/apx/sha-intel.s b/llvm/test/MC/X86/apx/sha-intel.s new file mode 100644 index 0000000000000..ceb76195adeeb --- /dev/null +++ b/llvm/test/MC/X86/apx/sha-intel.s @@ -0,0 +1,123 @@ +# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +## sha1msg1 + +# CHECK: {evex} sha1msg1 xmm12, xmm13 +# CHECK: encoding: [0x62,0x54,0x7c,0x08,0xd9,0xe5] + {evex} sha1msg1 xmm12, xmm13 + +# CHECK: {evex} sha1msg1 xmm12, xmmword ptr [rax + 4*rbx + 123] +# CHECK: encoding: [0x62,0x74,0x7c,0x08,0xd9,0x64,0x98,0x7b] + {evex} sha1msg1 xmm12, xmmword ptr [rax + 4*rbx + 123] + +# CHECK: sha1msg1 xmm12, xmm13 +# CHECK: encoding: [0x45,0x0f,0x38,0xc9,0xe5] + sha1msg1 xmm12, xmm13 + +# CHECK: sha1msg1 xmm12, xmmword ptr [r28 + 4*r29 + 291] +# CHECK: encoding: [0x62,0x1c,0x78,0x08,0xd9,0xa4,0xac,0x23,0x01,0x00,0x00] + sha1msg1 xmm12, xmmword ptr [r28 + 4*r29 + 291] + +## sha1msg2 + +# CHECK: {evex} sha1msg2 xmm12, xmm13 +# CHECK: encoding: [0x62,0x54,0x7c,0x08,0xda,0xe5] + {evex} sha1msg2 xmm12, xmm13 + +# CHECK: {evex} sha1msg2 xmm12, xmmword ptr [rax + 4*rbx + 123] +# CHECK: encoding: [0x62,0x74,0x7c,0x08,0xda,0x64,0x98,0x7b] + {evex} sha1msg2 xmm12, xmmword ptr [rax + 4*rbx + 123] + +# CHECK: sha1msg2 xmm12, xmm13 +# CHECK: encoding: [0x45,0x0f,0x38,0xca,0xe5] + sha1msg2 xmm12, xmm13 + +# CHECK: sha1msg2 xmm12, xmmword ptr [r28 + 4*r29 + 291] +# CHECK: encoding: [0x62,0x1c,0x78,0x08,0xda,0xa4,0xac,0x23,0x01,0x00,0x00] + sha1msg2 xmm12, xmmword ptr [r28 + 4*r29 + 291] + +## sha1nexte + +# CHECK: {evex} sha1nexte xmm12, xmm13 +# CHECK: encoding: [0x62,0x54,0x7c,0x08,0xd8,0xe5] + {evex} sha1nexte xmm12, xmm13 + +# CHECK: {evex} sha1nexte xmm12, xmmword ptr [rax + 4*rbx + 123] +# CHECK: encoding: [0x62,0x74,0x7c,0x08,0xd8,0x64,0x98,0x7b] + {evex} sha1nexte xmm12, xmmword ptr [rax + 4*rbx + 123] + +# CHECK: sha1nexte xmm12, xmm13 +# CHECK: encoding: [0x45,0x0f,0x38,0xc8,0xe5] + sha1nexte xmm12, xmm13 + +# CHECK: sha1nexte xmm12, xmmword ptr [r28 + 4*r29 + 291] +# CHECK: encoding: [0x62,0x1c,0x78,0x08,0xd8,0xa4,0xac,0x23,0x01,0x00,0x00] + sha1nexte xmm12, xmmword ptr [r28 + 4*r29 + 291] + +## sha1rnds4 + +# CHECK: {evex} sha1rnds4 xmm12, xmm13, 123 +# CHECK: encoding: [0x62,0x54,0x7c,0x08,0xd4,0xe5,0x7b] + {evex} sha1rnds4 xmm12, xmm13, 123 + +# CHECK: {evex} sha1rnds4 xmm12, xmmword ptr [rax + 4*rbx + 123], 123 +# CHECK: encoding: [0x62,0x74,0x7c,0x08,0xd4,0x64,0x98,0x7b,0x7b] + {evex} sha1rnds4 xmm12, xmmword ptr [rax + 4*rbx + 123], 123 + +# CHECK: sha1rnds4 xmm12, xmm13, 123 +# CHECK: encoding: [0x45,0x0f,0x3a,0xcc,0xe5,0x7b] + sha1rnds4 xmm12, xmm13, 123 + +# CHECK: sha1rnds4 xmm12, xmmword ptr [r28 + 4*r29 + 291], 123 +# CHECK: encoding: [0x62,0x1c,0x78,0x08,0xd4,0xa4,0xac,0x23,0x01,0x00,0x00,0x7b] + sha1rnds4 xmm12, xmmword ptr [r28 + 4*r29 + 291], 123 + +## sha256msg1 + +# CHECK: {evex} sha256msg1 xmm12, xmm13 +# CHECK: encoding: [0x62,0x54,0x7c,0x08,0xdc,0xe5] + {evex} sha256msg1 xmm12, xmm13 + +# CHECK: {evex} sha256msg1 xmm12, xmmword ptr [rax + 4*rbx + 123] +# CHECK: encoding: 
[0x62,0x74,0x7c,0x08,0xdc,0x64,0x98,0x7b] + {evex} sha256msg1 xmm12, xmmword ptr [rax + 4*rbx + 123] + +# CHECK: sha256msg1 xmm12, xmm13 +# CHECK: encoding: [0x45,0x0f,0x38,0xcc,0xe5] + sha256msg1 xmm12, xmm13 + +# CHECK: sha256msg1 xmm12, xmmword ptr [r28 + 4*r29 + 291] +# CHECK: encoding: [0x62,0x1c,0x78,0x08,0xdc,0xa4,0xac,0x23,0x01,0x00,0x00] + sha256msg1 xmm12, xmmword ptr [r28 + 4*r29 + 291] + +## sha256msg2 + +# CHECK: {evex} sha256msg2 xmm12, xmm13 +# CHECK: encoding: [0x62,0x54,0x7c,0x08,0xdd,0xe5] + {evex} sha256msg2 xmm12, xmm13 + +# CHECK: {evex} sha256msg2 xmm12, xmmword ptr [rax + 4*rbx + 123] +# CHECK: encoding: [0x62,0x74,0x7c,0x08,0xdd,0x64,0x98,0x7b] + {evex} sha256msg2 xmm12, xmmword ptr [rax + 4*rbx + 123] + +# CHECK: sha256msg2 xmm12, xmm13 +# CHECK: encoding: [0x45,0x0f,0x38,0xcd,0xe5] + sha256msg2 xmm12, xmm13 + +# CHECK: sha256msg2 xmm12, xmmword ptr [r28 + 4*r29 + 291] +# CHECK: encoding: [0x62,0x1c,0x78,0x08,0xdd,0xa4,0xac,0x23,0x01,0x00,0x00] + sha256msg2 xmm12, xmmword ptr [r28 + 4*r29 + 291] + +## sha256rnds2 + +# CHECK: {evex} sha256rnds2 xmm12, xmmword ptr [rax + 4*rbx + 123], xmm0 +# CHECK: encoding: [0x62,0x74,0x7c,0x08,0xdb,0x64,0x98,0x7b] + {evex} sha256rnds2 xmm12, xmmword ptr [rax + 4*rbx + 123], xmm0 + +# CHECK: sha256rnds2 xmm12, xmm13, xmm0 +# CHECK: encoding: [0x45,0x0f,0x38,0xcb,0xe5] + sha256rnds2 xmm12, xmm13, xmm0 + +# CHECK: sha256rnds2 xmm12, xmmword ptr [r28 + 4*r29 + 291], xmm0 +# CHECK: encoding: [0x62,0x1c,0x78,0x08,0xdb,0xa4,0xac,0x23,0x01,0x00,0x00] + sha256rnds2 xmm12, xmmword ptr [r28 + 4*r29 + 291], xmm0 diff --git a/llvm/test/MC/X86/apx/sha1msg1-att.s b/llvm/test/MC/X86/apx/sha1msg1-att.s deleted file mode 100644 index 900b1b703c48b..0000000000000 --- a/llvm/test/MC/X86/apx/sha1msg1-att.s +++ /dev/null @@ -1,9 +0,0 @@ -# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s - -# CHECK: sha1msg1 %xmm13, %xmm12 -# CHECK: encoding: [0x45,0x0f,0x38,0xc9,0xe5] - sha1msg1 %xmm13, %xmm12 - -# CHECK: sha1msg1 291(%r28,%r29,4), %xmm12 -# CHECK: encoding: [0x62,0x1c,0x78,0x08,0xd9,0xa4,0xac,0x23,0x01,0x00,0x00] - sha1msg1 291(%r28,%r29,4), %xmm12 diff --git a/llvm/test/MC/X86/apx/sha1msg1-intel.s b/llvm/test/MC/X86/apx/sha1msg1-intel.s deleted file mode 100644 index d347a671069f5..0000000000000 --- a/llvm/test/MC/X86/apx/sha1msg1-intel.s +++ /dev/null @@ -1,9 +0,0 @@ -# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s - -# CHECK: sha1msg1 xmm12, xmm13 -# CHECK: encoding: [0x45,0x0f,0x38,0xc9,0xe5] - sha1msg1 xmm12, xmm13 - -# CHECK: sha1msg1 xmm12, xmmword ptr [r28 + 4*r29 + 291] -# CHECK: encoding: [0x62,0x1c,0x78,0x08,0xd9,0xa4,0xac,0x23,0x01,0x00,0x00] - sha1msg1 xmm12, xmmword ptr [r28 + 4*r29 + 291] diff --git a/llvm/test/MC/X86/apx/sha1msg2-att.s b/llvm/test/MC/X86/apx/sha1msg2-att.s deleted file mode 100644 index 62557e46f8b9d..0000000000000 --- a/llvm/test/MC/X86/apx/sha1msg2-att.s +++ /dev/null @@ -1,9 +0,0 @@ -# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s - -# CHECK: sha1msg2 %xmm13, %xmm12 -# CHECK: encoding: [0x45,0x0f,0x38,0xca,0xe5] - sha1msg2 %xmm13, %xmm12 - -# CHECK: sha1msg2 291(%r28,%r29,4), %xmm12 -# CHECK: encoding: [0x62,0x1c,0x78,0x08,0xda,0xa4,0xac,0x23,0x01,0x00,0x00] - sha1msg2 291(%r28,%r29,4), %xmm12 diff --git a/llvm/test/MC/X86/apx/sha1msg2-intel.s b/llvm/test/MC/X86/apx/sha1msg2-intel.s deleted file mode 100644 index 546a56263bbe2..0000000000000 --- a/llvm/test/MC/X86/apx/sha1msg2-intel.s +++ /dev/null @@ -1,9 +0,0 @@ -# RUN: 
llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s - -# CHECK: sha1msg2 xmm12, xmm13 -# CHECK: encoding: [0x45,0x0f,0x38,0xca,0xe5] - sha1msg2 xmm12, xmm13 - -# CHECK: sha1msg2 xmm12, xmmword ptr [r28 + 4*r29 + 291] -# CHECK: encoding: [0x62,0x1c,0x78,0x08,0xda,0xa4,0xac,0x23,0x01,0x00,0x00] - sha1msg2 xmm12, xmmword ptr [r28 + 4*r29 + 291] diff --git a/llvm/test/MC/X86/apx/sha1nexte-att.s b/llvm/test/MC/X86/apx/sha1nexte-att.s deleted file mode 100644 index 70e8300cb70a7..0000000000000 --- a/llvm/test/MC/X86/apx/sha1nexte-att.s +++ /dev/null @@ -1,9 +0,0 @@ -# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s - -# CHECK: sha1nexte %xmm13, %xmm12 -# CHECK: encoding: [0x45,0x0f,0x38,0xc8,0xe5] - sha1nexte %xmm13, %xmm12 - -# CHECK: sha1nexte 291(%r28,%r29,4), %xmm12 -# CHECK: encoding: [0x62,0x1c,0x78,0x08,0xd8,0xa4,0xac,0x23,0x01,0x00,0x00] - sha1nexte 291(%r28,%r29,4), %xmm12 diff --git a/llvm/test/MC/X86/apx/sha1nexte-intel.s b/llvm/test/MC/X86/apx/sha1nexte-intel.s deleted file mode 100644 index 1c890c3cda44a..0000000000000 --- a/llvm/test/MC/X86/apx/sha1nexte-intel.s +++ /dev/null @@ -1,9 +0,0 @@ -# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s - -# CHECK: sha1nexte xmm12, xmm13 -# CHECK: encoding: [0x45,0x0f,0x38,0xc8,0xe5] - sha1nexte xmm12, xmm13 - -# CHECK: sha1nexte xmm12, xmmword ptr [r28 + 4*r29 + 291] -# CHECK: encoding: [0x62,0x1c,0x78,0x08,0xd8,0xa4,0xac,0x23,0x01,0x00,0x00] - sha1nexte xmm12, xmmword ptr [r28 + 4*r29 + 291] diff --git a/llvm/test/MC/X86/apx/sha1rnds4-att.s b/llvm/test/MC/X86/apx/sha1rnds4-att.s deleted file mode 100644 index 1d24c83a0b30e..0000000000000 --- a/llvm/test/MC/X86/apx/sha1rnds4-att.s +++ /dev/null @@ -1,9 +0,0 @@ -# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s - -# CHECK: sha1rnds4 $123, %xmm13, %xmm12 -# CHECK: encoding: [0x45,0x0f,0x3a,0xcc,0xe5,0x7b] - sha1rnds4 $123, %xmm13, %xmm12 - -# CHECK: sha1rnds4 $123, 291(%r28,%r29,4), %xmm12 -# CHECK: encoding: [0x62,0x1c,0x78,0x08,0xd4,0xa4,0xac,0x23,0x01,0x00,0x00,0x7b] - sha1rnds4 $123, 291(%r28,%r29,4), %xmm12 diff --git a/llvm/test/MC/X86/apx/sha1rnds4-intel.s b/llvm/test/MC/X86/apx/sha1rnds4-intel.s deleted file mode 100644 index 53620856bbf0f..0000000000000 --- a/llvm/test/MC/X86/apx/sha1rnds4-intel.s +++ /dev/null @@ -1,9 +0,0 @@ -# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s - -# CHECK: sha1rnds4 xmm12, xmm13, 123 -# CHECK: encoding: [0x45,0x0f,0x3a,0xcc,0xe5,0x7b] - sha1rnds4 xmm12, xmm13, 123 - -# CHECK: sha1rnds4 xmm12, xmmword ptr [r28 + 4*r29 + 291], 123 -# CHECK: encoding: [0x62,0x1c,0x78,0x08,0xd4,0xa4,0xac,0x23,0x01,0x00,0x00,0x7b] - sha1rnds4 xmm12, xmmword ptr [r28 + 4*r29 + 291], 123 diff --git a/llvm/test/MC/X86/apx/sha256msg1-att.s b/llvm/test/MC/X86/apx/sha256msg1-att.s deleted file mode 100644 index c6d833dc78039..0000000000000 --- a/llvm/test/MC/X86/apx/sha256msg1-att.s +++ /dev/null @@ -1,9 +0,0 @@ -# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s - -# CHECK: sha256msg1 %xmm13, %xmm12 -# CHECK: encoding: [0x45,0x0f,0x38,0xcc,0xe5] - sha256msg1 %xmm13, %xmm12 - -# CHECK: sha256msg1 291(%r28,%r29,4), %xmm12 -# CHECK: encoding: [0x62,0x1c,0x78,0x08,0xdc,0xa4,0xac,0x23,0x01,0x00,0x00] - sha256msg1 291(%r28,%r29,4), %xmm12 diff --git a/llvm/test/MC/X86/apx/sha256msg1-intel.s b/llvm/test/MC/X86/apx/sha256msg1-intel.s deleted file mode 100644 index 
e3e96f9e2f7d5..0000000000000 --- a/llvm/test/MC/X86/apx/sha256msg1-intel.s +++ /dev/null @@ -1,9 +0,0 @@ -# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s - -# CHECK: sha256msg1 xmm12, xmm13 -# CHECK: encoding: [0x45,0x0f,0x38,0xcc,0xe5] - sha256msg1 xmm12, xmm13 - -# CHECK: sha256msg1 xmm12, xmmword ptr [r28 + 4*r29 + 291] -# CHECK: encoding: [0x62,0x1c,0x78,0x08,0xdc,0xa4,0xac,0x23,0x01,0x00,0x00] - sha256msg1 xmm12, xmmword ptr [r28 + 4*r29 + 291] diff --git a/llvm/test/MC/X86/apx/sha256msg2-att.s b/llvm/test/MC/X86/apx/sha256msg2-att.s deleted file mode 100644 index 96528d90c3f4a..0000000000000 --- a/llvm/test/MC/X86/apx/sha256msg2-att.s +++ /dev/null @@ -1,9 +0,0 @@ -# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s - -# CHECK: sha256msg2 %xmm13, %xmm12 -# CHECK: encoding: [0x45,0x0f,0x38,0xcd,0xe5] - sha256msg2 %xmm13, %xmm12 - -# CHECK: sha256msg2 291(%r28,%r29,4), %xmm12 -# CHECK: encoding: [0x62,0x1c,0x78,0x08,0xdd,0xa4,0xac,0x23,0x01,0x00,0x00] - sha256msg2 291(%r28,%r29,4), %xmm12 diff --git a/llvm/test/MC/X86/apx/sha256msg2-intel.s b/llvm/test/MC/X86/apx/sha256msg2-intel.s deleted file mode 100644 index 043633de1c041..0000000000000 --- a/llvm/test/MC/X86/apx/sha256msg2-intel.s +++ /dev/null @@ -1,9 +0,0 @@ -# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s - -# CHECK: sha256msg2 xmm12, xmm13 -# CHECK: encoding: [0x45,0x0f,0x38,0xcd,0xe5] - sha256msg2 xmm12, xmm13 - -# CHECK: sha256msg2 xmm12, xmmword ptr [r28 + 4*r29 + 291] -# CHECK: encoding: [0x62,0x1c,0x78,0x08,0xdd,0xa4,0xac,0x23,0x01,0x00,0x00] - sha256msg2 xmm12, xmmword ptr [r28 + 4*r29 + 291] diff --git a/llvm/test/MC/X86/apx/sha256rnds2-att.s b/llvm/test/MC/X86/apx/sha256rnds2-att.s deleted file mode 100644 index 3071d40babf5b..0000000000000 --- a/llvm/test/MC/X86/apx/sha256rnds2-att.s +++ /dev/null @@ -1,13 +0,0 @@ -# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s - -# CHECK: sha256rnds2 %xmm0, %xmm13, %xmm12 -# CHECK: encoding: [0x45,0x0f,0x38,0xcb,0xe5] - sha256rnds2 %xmm0, %xmm13, %xmm12 - -# CHECK: sha256rnds2 %xmm0, 291(%r28,%r29,4), %xmm12 -# CHECK: encoding: [0x62,0x1c,0x78,0x08,0xdb,0xa4,0xac,0x23,0x01,0x00,0x00] - sha256rnds2 %xmm0, 291(%r28,%r29,4), %xmm12 - -# CHECK: sha256rnds2 %xmm0, 291(%r28,%r29,4), %xmm12 -# CHECK: encoding: [0x62,0x1c,0x78,0x08,0xdb,0xa4,0xac,0x23,0x01,0x00,0x00] - sha256rnds2 291(%r28,%r29,4), %xmm12 diff --git a/llvm/test/MC/X86/apx/sha256rnds2-intel.s b/llvm/test/MC/X86/apx/sha256rnds2-intel.s deleted file mode 100644 index 7630f3c2cd22f..0000000000000 --- a/llvm/test/MC/X86/apx/sha256rnds2-intel.s +++ /dev/null @@ -1,14 +0,0 @@ - -# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s - -# CHECK: sha256rnds2 xmm12, xmm13, xmm0 -# CHECK: encoding: [0x45,0x0f,0x38,0xcb,0xe5] - sha256rnds2 xmm12, xmm13, xmm0 - -# CHECK: sha256rnds2 xmm12, xmmword ptr [r28 + 4*r29 + 291], xmm0 -# CHECK: encoding: [0x62,0x1c,0x78,0x08,0xdb,0xa4,0xac,0x23,0x01,0x00,0x00] - sha256rnds2 xmm12, xmmword ptr [r28 + 4*r29 + 291], xmm0 - -# CHECK: sha256rnds2 xmm12, xmmword ptr [r28 + 4*r29 + 291], xmm0 -# CHECK: encoding: [0x62,0x1c,0x78,0x08,0xdb,0xa4,0xac,0x23,0x01,0x00,0x00] - sha256rnds2 xmm12, xmmword ptr [r28 + 4*r29 + 291] diff --git a/llvm/test/MC/X86/apx/shlx-att.s b/llvm/test/MC/X86/apx/shlx-att.s deleted file mode 100644 index 4e28119f08305..0000000000000 --- a/llvm/test/MC/X86/apx/shlx-att.s +++ 
/dev/null @@ -1,20 +0,0 @@ -# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s -# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR - -# ERROR-COUNT-4: error: -# ERROR-NOT: error: -# CHECK: shlxl %r18d, %r22d, %r26d -# CHECK: encoding: [0x62,0x6a,0x6d,0x00,0xf7,0xd6] - shlxl %r18d, %r22d, %r26d - -# CHECK: shlxl %r18d, 291(%r28,%r29,4), %r22d -# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xf7,0xb4,0xac,0x23,0x01,0x00,0x00] - shlxl %r18d, 291(%r28,%r29,4), %r22d - -# CHECK: shlxq %r19, %r23, %r27 -# CHECK: encoding: [0x62,0x6a,0xe5,0x00,0xf7,0xdf] - shlxq %r19, %r23, %r27 - -# CHECK: shlxq %r19, 291(%r28,%r29,4), %r23 -# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xf7,0xbc,0xac,0x23,0x01,0x00,0x00] - shlxq %r19, 291(%r28,%r29,4), %r23 diff --git a/llvm/test/MC/X86/apx/shlx-intel.s b/llvm/test/MC/X86/apx/shlx-intel.s deleted file mode 100644 index 9f16918a712dc..0000000000000 --- a/llvm/test/MC/X86/apx/shlx-intel.s +++ /dev/null @@ -1,17 +0,0 @@ -# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s - -# CHECK: shlx r26d, r22d, r18d -# CHECK: encoding: [0x62,0x6a,0x6d,0x00,0xf7,0xd6] - shlx r26d, r22d, r18d - -# CHECK: shlx r22d, dword ptr [r28 + 4*r29 + 291], r18d -# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xf7,0xb4,0xac,0x23,0x01,0x00,0x00] - shlx r22d, dword ptr [r28 + 4*r29 + 291], r18d - -# CHECK: shlx r27, r23, r19 -# CHECK: encoding: [0x62,0x6a,0xe5,0x00,0xf7,0xdf] - shlx r27, r23, r19 - -# CHECK: shlx r23, qword ptr [r28 + 4*r29 + 291], r19 -# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xf7,0xbc,0xac,0x23,0x01,0x00,0x00] - shlx r23, qword ptr [r28 + 4*r29 + 291], r19 diff --git a/llvm/test/MC/X86/apx/shrx-att.s b/llvm/test/MC/X86/apx/shrx-att.s deleted file mode 100644 index d9bb5f84af73d..0000000000000 --- a/llvm/test/MC/X86/apx/shrx-att.s +++ /dev/null @@ -1,20 +0,0 @@ -# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s -# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR - -# ERROR-COUNT-4: error: -# ERROR-NOT: error: -# CHECK: shrxl %r18d, %r22d, %r26d -# CHECK: encoding: [0x62,0x6a,0x6f,0x00,0xf7,0xd6] - shrxl %r18d, %r22d, %r26d - -# CHECK: shrxl %r18d, 291(%r28,%r29,4), %r22d -# CHECK: encoding: [0x62,0x8a,0x6b,0x00,0xf7,0xb4,0xac,0x23,0x01,0x00,0x00] - shrxl %r18d, 291(%r28,%r29,4), %r22d - -# CHECK: shrxq %r19, %r23, %r27 -# CHECK: encoding: [0x62,0x6a,0xe7,0x00,0xf7,0xdf] - shrxq %r19, %r23, %r27 - -# CHECK: shrxq %r19, 291(%r28,%r29,4), %r23 -# CHECK: encoding: [0x62,0x8a,0xe3,0x00,0xf7,0xbc,0xac,0x23,0x01,0x00,0x00] - shrxq %r19, 291(%r28,%r29,4), %r23 diff --git a/llvm/test/MC/X86/apx/shrx-intel.s b/llvm/test/MC/X86/apx/shrx-intel.s deleted file mode 100644 index 385c530a1108b..0000000000000 --- a/llvm/test/MC/X86/apx/shrx-intel.s +++ /dev/null @@ -1,17 +0,0 @@ -# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s - -# CHECK: shrx r26d, r22d, r18d -# CHECK: encoding: [0x62,0x6a,0x6f,0x00,0xf7,0xd6] - shrx r26d, r22d, r18d - -# CHECK: shrx r22d, dword ptr [r28 + 4*r29 + 291], r18d -# CHECK: encoding: [0x62,0x8a,0x6b,0x00,0xf7,0xb4,0xac,0x23,0x01,0x00,0x00] - shrx r22d, dword ptr [r28 + 4*r29 + 291], r18d - -# CHECK: shrx r27, r23, r19 -# CHECK: encoding: [0x62,0x6a,0xe7,0x00,0xf7,0xdf] - shrx r27, r23, r19 - -# CHECK: shrx r23, qword ptr [r28 + 4*r29 + 291], r19 -# CHECK: encoding: [0x62,0x8a,0xe3,0x00,0xf7,0xbc,0xac,0x23,0x01,0x00,0x00] - shrx r23, qword ptr [r28 + 4*r29 + 291], 
r19 diff --git a/llvm/test/MC/X86/apx/vmx-att.s b/llvm/test/MC/X86/apx/vmx-att.s new file mode 100644 index 0000000000000..a2a497b9b9bc5 --- /dev/null +++ b/llvm/test/MC/X86/apx/vmx-att.s @@ -0,0 +1,25 @@ +# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s +# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR + +# ERROR-COUNT-4: error: +# ERROR-NOT: error: + +## invept + +# CHECK: invept 291(%r28,%r29,4), %r19 +# CHECK: encoding: [0x62,0x8c,0x7a,0x08,0xf0,0x9c,0xac,0x23,0x01,0x00,0x00] + invept 291(%r28,%r29,4), %r19 + +# CHECK: {evex} invept 123(%rax,%rbx,4), %r9 +# CHECK: encoding: [0x62,0x74,0x7e,0x08,0xf0,0x4c,0x98,0x7b] + {evex} invept 123(%rax,%rbx,4), %r9 + +## invvpid + +# CHECK: invvpid 291(%r28,%r29,4), %r19 +# CHECK: encoding: [0x62,0x8c,0x7a,0x08,0xf1,0x9c,0xac,0x23,0x01,0x00,0x00] + invvpid 291(%r28,%r29,4), %r19 + +# CHECK: {evex} invvpid 123(%rax,%rbx,4), %r9 +# CHECK: encoding: [0x62,0x74,0x7e,0x08,0xf1,0x4c,0x98,0x7b] + {evex} invvpid 123(%rax,%rbx,4), %r9 diff --git a/llvm/test/MC/X86/apx/vmx-intel.s b/llvm/test/MC/X86/apx/vmx-intel.s new file mode 100644 index 0000000000000..068e5f10aee5f --- /dev/null +++ b/llvm/test/MC/X86/apx/vmx-intel.s @@ -0,0 +1,21 @@ +# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +## invept + +# CHECK: invept r19, xmmword ptr [r28 + 4*r29 + 291] +# CHECK: encoding: [0x62,0x8c,0x7a,0x08,0xf0,0x9c,0xac,0x23,0x01,0x00,0x00] + invept r19, xmmword ptr [r28 + 4*r29 + 291] + +# CHECK: {evex} invept r9, xmmword ptr [rax + 4*rbx + 123] +# CHECK: encoding: [0x62,0x74,0x7e,0x08,0xf0,0x4c,0x98,0x7b] + {evex} invept r9, xmmword ptr [rax + 4*rbx + 123] + +## invvpid + +# CHECK: invvpid r19, xmmword ptr [r28 + 4*r29 + 291] +# CHECK: encoding: [0x62,0x8c,0x7a,0x08,0xf1,0x9c,0xac,0x23,0x01,0x00,0x00] + invvpid r19, xmmword ptr [r28 + 4*r29 + 291] + +# CHECK: {evex} invvpid r9, xmmword ptr [rax + 4*rbx + 123] +# CHECK: encoding: [0x62,0x74,0x7e,0x08,0xf1,0x4c,0x98,0x7b] + {evex} invvpid r9, xmmword ptr [rax + 4*rbx + 123] diff --git a/llvm/test/MC/X86/apx/wrssd-att.s b/llvm/test/MC/X86/apx/wrssd-att.s deleted file mode 100644 index 409b3010f5c76..0000000000000 --- a/llvm/test/MC/X86/apx/wrssd-att.s +++ /dev/null @@ -1,8 +0,0 @@ -# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s -# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR - -# ERROR-COUNT-1: error: -# ERROR-NOT: error: -# CHECK: wrssd %r18d, 291(%r28,%r29,4) -# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x66,0x94,0xac,0x23,0x01,0x00,0x00] - wrssd %r18d, 291(%r28,%r29,4) diff --git a/llvm/test/MC/X86/apx/wrssd-intel.s b/llvm/test/MC/X86/apx/wrssd-intel.s deleted file mode 100644 index 1d402f2c51776..0000000000000 --- a/llvm/test/MC/X86/apx/wrssd-intel.s +++ /dev/null @@ -1,5 +0,0 @@ -# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s - -# CHECK: wrssd dword ptr [r28 + 4*r29 + 291], r18d -# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x66,0x94,0xac,0x23,0x01,0x00,0x00] - wrssd dword ptr [r28 + 4*r29 + 291], r18d diff --git a/llvm/test/MC/X86/apx/wrssq-att.s b/llvm/test/MC/X86/apx/wrssq-att.s deleted file mode 100644 index 1f616ac2e4e47..0000000000000 --- a/llvm/test/MC/X86/apx/wrssq-att.s +++ /dev/null @@ -1,8 +0,0 @@ -# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s -# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR - -# ERROR-COUNT-1: error: 
-# ERROR-NOT: error: -# CHECK: wrssq %r19, 291(%r28,%r29,4) -# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x66,0x9c,0xac,0x23,0x01,0x00,0x00] - wrssq %r19, 291(%r28,%r29,4) diff --git a/llvm/test/MC/X86/apx/wrssq-intel.s b/llvm/test/MC/X86/apx/wrssq-intel.s deleted file mode 100644 index d31dca55ca4a4..0000000000000 --- a/llvm/test/MC/X86/apx/wrssq-intel.s +++ /dev/null @@ -1,5 +0,0 @@ -# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s - -# CHECK: wrssq qword ptr [r28 + 4*r29 + 291], r19 -# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x66,0x9c,0xac,0x23,0x01,0x00,0x00] - wrssq qword ptr [r28 + 4*r29 + 291], r19 diff --git a/llvm/test/MC/X86/apx/wrussd-att.s b/llvm/test/MC/X86/apx/wrussd-att.s deleted file mode 100644 index 269d9a8aa8586..0000000000000 --- a/llvm/test/MC/X86/apx/wrussd-att.s +++ /dev/null @@ -1,8 +0,0 @@ -# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s -# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR - -# ERROR-COUNT-1: error: -# ERROR-NOT: error: -# CHECK: wrussd %r18d, 291(%r28,%r29,4) -# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x65,0x94,0xac,0x23,0x01,0x00,0x00] - wrussd %r18d, 291(%r28,%r29,4) diff --git a/llvm/test/MC/X86/apx/wrussd-intel.s b/llvm/test/MC/X86/apx/wrussd-intel.s deleted file mode 100644 index fed6eb10d4add..0000000000000 --- a/llvm/test/MC/X86/apx/wrussd-intel.s +++ /dev/null @@ -1,5 +0,0 @@ -# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s - -# CHECK: wrussd dword ptr [r28 + 4*r29 + 291], r18d -# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x65,0x94,0xac,0x23,0x01,0x00,0x00] - wrussd dword ptr [r28 + 4*r29 + 291], r18d diff --git a/llvm/test/MC/X86/apx/wrussq-att.s b/llvm/test/MC/X86/apx/wrussq-att.s deleted file mode 100644 index b41360cd9db04..0000000000000 --- a/llvm/test/MC/X86/apx/wrussq-att.s +++ /dev/null @@ -1,8 +0,0 @@ -# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s -# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR - -# ERROR-COUNT-1: error: -# ERROR-NOT: error: -# CHECK: wrussq %r19, 291(%r28,%r29,4) -# CHECK: encoding: [0x62,0x8c,0xf9,0x08,0x65,0x9c,0xac,0x23,0x01,0x00,0x00] - wrussq %r19, 291(%r28,%r29,4) diff --git a/llvm/test/MC/X86/apx/wrussq-intel.s b/llvm/test/MC/X86/apx/wrussq-intel.s deleted file mode 100644 index a9a96da9d3d1d..0000000000000 --- a/llvm/test/MC/X86/apx/wrussq-intel.s +++ /dev/null @@ -1,5 +0,0 @@ -# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s - -# CHECK: wrussq qword ptr [r28 + 4*r29 + 291], r19 -# CHECK: encoding: [0x62,0x8c,0xf9,0x08,0x65,0x9c,0xac,0x23,0x01,0x00,0x00] - wrussq qword ptr [r28 + 4*r29 + 291], r19 From 53edf12e526704cc251b6a6917319c7cb7a653a0 Mon Sep 17 00:00:00 2001 From: Jerry Wu Date: Thu, 4 Jan 2024 03:34:19 +0000 Subject: [PATCH 187/313] [mlir] Add `res()` method to `linalg::ContractionOpInterface` (#76539) In addition to `lhs()` and `rhs()` to return left and right operands, add `res()` to return the result value. 
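For illustration only (this snippet is not part of the patch, and the helper name is made up), a user of the interface can now query all three values symmetrically:

```cpp
// Hypothetical helper showing the three accessors side by side; res() is the
// accessor added by this patch, lhs()/rhs() already existed on the interface.
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "llvm/Support/raw_ostream.h"

static void printContractionValues(mlir::linalg::ContractionOpInterface op) {
  mlir::Value lhs = op.lhs();     // left operand
  mlir::Value rhs = op.rhs();     // right operand
  mlir::OpResult res = op.res();  // result value (new in this patch)
  llvm::errs() << "lhs: " << lhs << "\nrhs: " << rhs << "\nres: " << res
               << "\n";
}
```

The unit test added below exercises exactly this mapping: it builds a `linalg.matmul` and checks that `lhs()`, `rhs()`, and `res()` correspond to operand 0, operand 1, and result 0.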
--- .../Dialect/Linalg/IR/LinalgInterfaces.td | 8 ++++ mlir/unittests/Dialect/CMakeLists.txt | 1 + mlir/unittests/Dialect/Linalg/CMakeLists.txt | 8 ++++ .../Dialect/Linalg/LinalgInterfacesTest.cpp | 43 +++++++++++++++++++ 4 files changed, 60 insertions(+) create mode 100644 mlir/unittests/Dialect/Linalg/CMakeLists.txt create mode 100644 mlir/unittests/Dialect/Linalg/LinalgInterfacesTest.cpp diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td index fbf3f19cde0e9..777d7cfd558d2 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td @@ -54,6 +54,14 @@ def LinalgContractionOpInterface : OpInterface<"ContractionOpInterface"> { return $_op.getOperation()->getOperand(1); }]>, InterfaceMethod< + /*desc=*/"Returns the result value.", + /*retTy=*/"OpResult", + /*methodName=*/"res", + /*args=*/(ins), + /*methodBody=*/[{ + return $_op.getOperation()->getResult(0); + }]>, + InterfaceMethod< /*desc=*/[{ Returns whether the given op has indexing maps that correspond to a row-major matmul operation. diff --git a/mlir/unittests/Dialect/CMakeLists.txt b/mlir/unittests/Dialect/CMakeLists.txt index 2dec4ba3c001e..76b698d1d1a7b 100644 --- a/mlir/unittests/Dialect/CMakeLists.txt +++ b/mlir/unittests/Dialect/CMakeLists.txt @@ -8,6 +8,7 @@ target_link_libraries(MLIRDialectTests add_subdirectory(ArmSME) add_subdirectory(Index) +add_subdirectory(Linalg) add_subdirectory(LLVMIR) add_subdirectory(MemRef) add_subdirectory(SCF) diff --git a/mlir/unittests/Dialect/Linalg/CMakeLists.txt b/mlir/unittests/Dialect/Linalg/CMakeLists.txt new file mode 100644 index 0000000000000..080caab8d075e --- /dev/null +++ b/mlir/unittests/Dialect/Linalg/CMakeLists.txt @@ -0,0 +1,8 @@ +add_mlir_unittest(MLIRLinalgTests + LinalgInterfacesTest.cpp +) +target_link_libraries(MLIRLinalgTests + PRIVATE + MLIRLinalgDialect + ) + diff --git a/mlir/unittests/Dialect/Linalg/LinalgInterfacesTest.cpp b/mlir/unittests/Dialect/Linalg/LinalgInterfacesTest.cpp new file mode 100644 index 0000000000000..8cc4a5e37c452 --- /dev/null +++ b/mlir/unittests/Dialect/Linalg/LinalgInterfacesTest.cpp @@ -0,0 +1,43 @@ +//===- LinalgInterfacesTest.cpp - LinalgInterfaces unit tests ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" + +#include "gtest/gtest.h" + +using namespace mlir; + +class LinalgInterfacesTest : public ::testing::Test { +protected: + LinalgInterfacesTest() { + context.getOrLoadDialect(); + } + + mlir::MLIRContext context; +}; + +TEST_F(LinalgInterfacesTest, ContractionOpOperandResultAccessor) { + OpBuilder b(&context); + SmallVector lhsShape = {1, 2}; + SmallVector rhsShape = {2, 4}; + SmallVector resShape = {1, 4}; + auto lhs = b.create(UnknownLoc::get(&context), lhsShape, + b.getF32Type()); + auto rhs = b.create(UnknownLoc::get(&context), rhsShape, + b.getF32Type()); + auto out = b.create(UnknownLoc::get(&context), resShape, + b.getF32Type()); + Operation *op = b.create( + UnknownLoc::get(&context), ValueRange{lhs, rhs}, ValueRange{out}); + auto contractOp = llvm::cast(op); + + EXPECT_EQ(contractOp.lhs(), op->getOperand(0)); + EXPECT_EQ(contractOp.rhs(), op->getOperand(1)); + EXPECT_EQ(contractOp.res(), op->getResult(0)); +} From cda388c4407e0fe42faa82c015ee77da160ebd25 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Wed, 3 Jan 2024 19:36:51 -0800 Subject: [PATCH 188/313] [mlgo] Fix test post PR #76697 Opcode values changed, trivial fix. --- llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll b/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll index c989d2bca6510..98cf234d83a20 100644 --- a/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll +++ b/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll @@ -26,7 +26,7 @@ ; Also, the first eviction problem is significantly less than 300 instructions. Check ; that there is a zero value. ; Note: we're regex-ing some of the opcodes to avoid test flakyness. -; CHECK: instructions: 19,{{([0-9]{4})}},16{{([0-9]{2})}},16{{([0-9]{2})}},{{.*}},0, +; CHECK: instructions: 19,{{([0-9]{4})}},17{{([0-9]{2})}},17{{([0-9]{2})}},{{.*}},0, ; Only the candidate virtreg and the 10th LR are included in this problem. Make ; sure the other LRs have values of zero. There are 2700 0s followed by some 1s. ; There's a limit to how many repetitions can be matched. From 6ae7f66ff5169ddc5a7b9ab545707042c77e036c Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Wed, 3 Jan 2024 20:37:19 -0800 Subject: [PATCH 189/313] [mlir] Add config for PDL (#69927) Make it so that PDL in pattern rewrites can be optionally disabled. PDL is still enabled by default and not optional bazel. So this should be a NOP for most folks, while enabling other to disable. This only works with tests disabled. With tests enabled this still compiles but tests fail as there is no lit config to disable tests that depend on PDL rewrites yet. 
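As an illustrative sketch only (not part of this patch; the function, loader, and pattern names below are hypothetical), downstream code that mixes C++ and PDL patterns could guard its PDL-specific registration behind the new config macro so that it keeps building when PDL support is disabled:

```cpp
// Hypothetical downstream pattern population, guarded on the new
// MLIR_ENABLE_PDL_IN_PATTERNMATCH configuration macro added by this patch.
#include "mlir/Config/mlir-config.h"
#include "mlir/IR/PatternMatch.h"

void populateMyPatterns(mlir::RewritePatternSet &patterns,
                        mlir::MLIRContext *ctx) {
#if MLIR_ENABLE_PDL_IN_PATTERNMATCH
  // PDL-based patterns would only be registered when the feature is enabled,
  // e.g. patterns.add(loadMyPdlModule(ctx));  // hypothetical loader
#endif
  // C++ rewrite patterns are unaffected either way, e.g.
  // patterns.add<MyCppPattern>(ctx);          // hypothetical pattern
  (void)patterns;
  (void)ctx;
}
```

On the build side the same switch is the `MLIR_ENABLE_PDL_IN_PATTERNMATCH` CMake cache variable introduced in the CMakeLists.txt change below; the README update shows passing `-DMLIR_ENABLE_PDL_IN_PATTERNMATCH=OFF` to disable it.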
--- mlir/CMakeLists.txt | 5 +- mlir/examples/minimal-opt/README.md | 9 +- mlir/include/mlir/Config/mlir-config.h.cmake | 3 + .../Conversion/LLVMCommon/TypeConverter.h | 1 + .../mlir/Dialect/Vector/IR/VectorOps.h | 1 + mlir/include/mlir/IR/PDLPatternMatch.h.inc | 995 ++++++++++++++++++ mlir/include/mlir/IR/PatternMatch.h | 935 +--------------- .../mlir/Transforms/DialectConversion.h | 15 + .../Conversion/ComplexToLibm/CMakeLists.txt | 3 +- mlir/lib/Conversion/MathToLibm/CMakeLists.txt | 1 + .../Bufferization/TransformOps/CMakeLists.txt | 1 - mlir/lib/IR/CMakeLists.txt | 8 + mlir/lib/IR/PDL/PDLPatternMatch.cpp | 133 +++ mlir/lib/IR/PatternMatch.cpp | 119 +-- mlir/lib/Rewrite/ByteCode.h | 36 + mlir/lib/Rewrite/CMakeLists.txt | 32 +- mlir/lib/Rewrite/FrozenRewritePatternSet.cpp | 10 +- mlir/lib/Rewrite/PatternApplicator.cpp | 4 +- .../Transforms/Utils/DialectConversion.cpp | 3 + mlir/test/CMakeLists.txt | 12 +- mlir/test/lib/Rewrite/CMakeLists.txt | 27 +- mlir/test/lib/Tools/PDLL/CMakeLists.txt | 2 + mlir/test/lib/Transforms/CMakeLists.txt | 14 +- mlir/tools/mlir-lsp-server/CMakeLists.txt | 13 +- mlir/tools/mlir-opt/CMakeLists.txt | 14 +- mlir/tools/mlir-opt/mlir-opt.cpp | 18 +- .../llvm-project-overlay/mlir/BUILD.bazel | 4 + 27 files changed, 1327 insertions(+), 1091 deletions(-) create mode 100644 mlir/include/mlir/IR/PDLPatternMatch.h.inc create mode 100644 mlir/lib/IR/PDL/PDLPatternMatch.cpp diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt index 16ff950089734..2d9f78e03ba76 100644 --- a/mlir/CMakeLists.txt +++ b/mlir/CMakeLists.txt @@ -133,6 +133,8 @@ set(MLIR_ENABLE_NVPTXCOMPILER 0 CACHE BOOL "Statically link the nvptxlibrary instead of calling ptxas as a subprocess \ for compiling PTX to cubin") +set(MLIR_ENABLE_PDL_IN_PATTERNMATCH 1 CACHE BOOL "Enable PDL in PatternMatch") + option(MLIR_INCLUDE_TESTS "Generate build targets for the MLIR unit tests." ${LLVM_INCLUDE_TESTS}) @@ -178,10 +180,9 @@ include_directories( ${MLIR_INCLUDE_DIR}) # Adding tools/mlir-tblgen here as calling add_tablegen sets some variables like # MLIR_TABLEGEN_EXE in PARENT_SCOPE which gets lost if that folder is included # from another directory like tools -add_subdirectory(tools/mlir-tblgen) add_subdirectory(tools/mlir-linalg-ods-gen) add_subdirectory(tools/mlir-pdll) - +add_subdirectory(tools/mlir-tblgen) set(MLIR_TABLEGEN_EXE "${MLIR_TABLEGEN_EXE}" CACHE INTERNAL "") set(MLIR_TABLEGEN_TARGET "${MLIR_TABLEGEN_TARGET}" CACHE INTERNAL "") set(MLIR_PDLL_TABLEGEN_EXE "${MLIR_PDLL_TABLEGEN_EXE}" CACHE INTERNAL "") diff --git a/mlir/examples/minimal-opt/README.md b/mlir/examples/minimal-opt/README.md index b8a455f7a7966..1bc54b8367cc5 100644 --- a/mlir/examples/minimal-opt/README.md +++ b/mlir/examples/minimal-opt/README.md @@ -14,10 +14,10 @@ Below are some example measurements taken at the time of the LLVM 17 release, using clang-14 on a X86 Ubuntu and [bloaty](https://github.com/google/bloaty). 
| | Base | Os | Oz | Os LTO | Oz LTO | -| :-----------------------------: | ------ | ------ | ------ | ------ | ------ | -| `mlir-cat` | 1018kB | 836KB | 879KB | 697KB | 649KB | -| `mlir-minimal-opt` | 1.54MB | 1.25MB | 1.29MB | 1.10MB | 1.00MB | -| `mlir-minimal-opt-canonicalize` | 2.24MB | 1.81MB | 1.86MB | 1.62MB | 1.48MB | +| :------------------------------: | ------ | ------ | ------ | ------ | ------ | +| `mlir-cat` | 1024KB | 840KB | 885KB | 706KB | 657KB | +| `mlir-minimal-opt` | 1.62MB | 1.32MB | 1.36MB | 1.17MB | 1.07MB | +| `mlir-minimal-opt-canonicalize` | 1.83MB | 1.40MB | 1.45MB | 1.25MB | 1.14MB | Base configuration: @@ -32,6 +32,7 @@ cmake ../llvm/ -G Ninja \ -DCMAKE_CXX_COMPILER=clang++ \ -DLLVM_ENABLE_LLD=ON \ -DLLVM_ENABLE_BACKTRACES=OFF \ + -DMLIR_ENABLE_PDL_IN_PATTERNMATCH=OFF \ -DCMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO=-Wl,-icf=all ``` diff --git a/mlir/include/mlir/Config/mlir-config.h.cmake b/mlir/include/mlir/Config/mlir-config.h.cmake index efa77b2e5ce5d..e152a36c0ce0c 100644 --- a/mlir/include/mlir/Config/mlir-config.h.cmake +++ b/mlir/include/mlir/Config/mlir-config.h.cmake @@ -26,4 +26,7 @@ numeric seed that is passed to the random number generator. */ #cmakedefine MLIR_GREEDY_REWRITE_RANDOMIZER_SEED ${MLIR_GREEDY_REWRITE_RANDOMIZER_SEED} +/* If set, enables PDL usage. */ +#cmakedefine01 MLIR_ENABLE_PDL_IN_PATTERNMATCH + #endif diff --git a/mlir/include/mlir/Conversion/LLVMCommon/TypeConverter.h b/mlir/include/mlir/Conversion/LLVMCommon/TypeConverter.h index 74f9c977b7028..e228229302cff 100644 --- a/mlir/include/mlir/Conversion/LLVMCommon/TypeConverter.h +++ b/mlir/include/mlir/Conversion/LLVMCommon/TypeConverter.h @@ -15,6 +15,7 @@ #define MLIR_CONVERSION_LLVMCOMMON_TYPECONVERTER_H #include "mlir/Conversion/LLVMCommon/LoweringOptions.h" +#include "mlir/IR/BuiltinTypes.h" #include "mlir/Transforms/DialectConversion.h" namespace mlir { diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.h b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.h index a28b27e4e1581..4603953cb40fa 100644 --- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.h +++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.h @@ -29,6 +29,7 @@ #include "mlir/Interfaces/SideEffectInterfaces.h" #include "mlir/Interfaces/VectorInterfaces.h" #include "mlir/Interfaces/ViewLikeInterface.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringExtras.h" // Pull in all enum type definitions and utility function declarations. diff --git a/mlir/include/mlir/IR/PDLPatternMatch.h.inc b/mlir/include/mlir/IR/PDLPatternMatch.h.inc new file mode 100644 index 0000000000000..a215da8cb6431 --- /dev/null +++ b/mlir/include/mlir/IR/PDLPatternMatch.h.inc @@ -0,0 +1,995 @@ +//===- PDLPatternMatch.h - PDLPatternMatcher classes -------==---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_IR_PDLPATTERNMATCH_H +#define MLIR_IR_PDLPATTERNMATCH_H + +#include "mlir/Config/mlir-config.h" + +#if MLIR_ENABLE_PDL_IN_PATTERNMATCH +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinOps.h" + +namespace mlir { +//===----------------------------------------------------------------------===// +// PDL Patterns +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// PDLValue + +/// Storage type of byte-code interpreter values. These are passed to constraint +/// functions as arguments. +class PDLValue { +public: + /// The underlying kind of a PDL value. + enum class Kind { Attribute, Operation, Type, TypeRange, Value, ValueRange }; + + /// Construct a new PDL value. + PDLValue(const PDLValue &other) = default; + PDLValue(std::nullptr_t = nullptr) {} + PDLValue(Attribute value) + : value(value.getAsOpaquePointer()), kind(Kind::Attribute) {} + PDLValue(Operation *value) : value(value), kind(Kind::Operation) {} + PDLValue(Type value) : value(value.getAsOpaquePointer()), kind(Kind::Type) {} + PDLValue(TypeRange *value) : value(value), kind(Kind::TypeRange) {} + PDLValue(Value value) + : value(value.getAsOpaquePointer()), kind(Kind::Value) {} + PDLValue(ValueRange *value) : value(value), kind(Kind::ValueRange) {} + + /// Returns true if the type of the held value is `T`. + template + bool isa() const { + assert(value && "isa<> used on a null value"); + return kind == getKindOf(); + } + + /// Attempt to dynamically cast this value to type `T`, returns null if this + /// value is not an instance of `T`. + template ::value, T, std::optional>> + ResultT dyn_cast() const { + return isa() ? castImpl() : ResultT(); + } + + /// Cast this value to type `T`, asserts if this value is not an instance of + /// `T`. + template + T cast() const { + assert(isa() && "expected value to be of type `T`"); + return castImpl(); + } + + /// Get an opaque pointer to the value. + const void *getAsOpaquePointer() const { return value; } + + /// Return if this value is null or not. + explicit operator bool() const { return value; } + + /// Return the kind of this value. + Kind getKind() const { return kind; } + + /// Print this value to the provided output stream. + void print(raw_ostream &os) const; + + /// Print the specified value kind to an output stream. + static void print(raw_ostream &os, Kind kind); + +private: + /// Find the index of a given type in a range of other types. + template + struct index_of_t; + template + struct index_of_t : std::integral_constant {}; + template + struct index_of_t + : std::integral_constant::value> {}; + + /// Return the kind used for the given T. + template + static Kind getKindOf() { + return static_cast(index_of_t::value); + } + + /// The internal implementation of `cast`, that returns the underlying value + /// as the given type `T`. + template + std::enable_if_t::value, T> + castImpl() const { + return T::getFromOpaquePointer(value); + } + template + std::enable_if_t::value, T> + castImpl() const { + return *reinterpret_cast(const_cast(value)); + } + template + std::enable_if_t::value, T> castImpl() const { + return reinterpret_cast(const_cast(value)); + } + + /// The internal opaque representation of a PDLValue. + const void *value{nullptr}; + /// The kind of the opaque value. 
+ Kind kind{Kind::Attribute}; +}; + +inline raw_ostream &operator<<(raw_ostream &os, PDLValue value) { + value.print(os); + return os; +} + +inline raw_ostream &operator<<(raw_ostream &os, PDLValue::Kind kind) { + PDLValue::print(os, kind); + return os; +} + +//===----------------------------------------------------------------------===// +// PDLResultList + +/// The class represents a list of PDL results, returned by a native rewrite +/// method. It provides the mechanism with which to pass PDLValues back to the +/// PDL bytecode. +class PDLResultList { +public: + /// Push a new Attribute value onto the result list. + void push_back(Attribute value) { results.push_back(value); } + + /// Push a new Operation onto the result list. + void push_back(Operation *value) { results.push_back(value); } + + /// Push a new Type onto the result list. + void push_back(Type value) { results.push_back(value); } + + /// Push a new TypeRange onto the result list. + void push_back(TypeRange value) { + // The lifetime of a TypeRange can't be guaranteed, so we'll need to + // allocate a storage for it. + llvm::OwningArrayRef storage(value.size()); + llvm::copy(value, storage.begin()); + allocatedTypeRanges.emplace_back(std::move(storage)); + typeRanges.push_back(allocatedTypeRanges.back()); + results.push_back(&typeRanges.back()); + } + void push_back(ValueTypeRange value) { + typeRanges.push_back(value); + results.push_back(&typeRanges.back()); + } + void push_back(ValueTypeRange value) { + typeRanges.push_back(value); + results.push_back(&typeRanges.back()); + } + + /// Push a new Value onto the result list. + void push_back(Value value) { results.push_back(value); } + + /// Push a new ValueRange onto the result list. + void push_back(ValueRange value) { + // The lifetime of a ValueRange can't be guaranteed, so we'll need to + // allocate a storage for it. + llvm::OwningArrayRef storage(value.size()); + llvm::copy(value, storage.begin()); + allocatedValueRanges.emplace_back(std::move(storage)); + valueRanges.push_back(allocatedValueRanges.back()); + results.push_back(&valueRanges.back()); + } + void push_back(OperandRange value) { + valueRanges.push_back(value); + results.push_back(&valueRanges.back()); + } + void push_back(ResultRange value) { + valueRanges.push_back(value); + results.push_back(&valueRanges.back()); + } + +protected: + /// Create a new result list with the expected number of results. + PDLResultList(unsigned maxNumResults) { + // For now just reserve enough space for all of the results. We could do + // separate counts per range type, but it isn't really worth it unless there + // are a "large" number of results. + typeRanges.reserve(maxNumResults); + valueRanges.reserve(maxNumResults); + } + + /// The PDL results held by this list. + SmallVector results; + /// Memory used to store ranges held by the list. + SmallVector typeRanges; + SmallVector valueRanges; + /// Memory allocated to store ranges in the result list whose lifetime was + /// generated in the native function. + SmallVector> allocatedTypeRanges; + SmallVector> allocatedValueRanges; +}; + +//===----------------------------------------------------------------------===// +// PDLPatternConfig + +/// An individual configuration for a pattern, which can be accessed by native +/// functions via the PDLPatternConfigSet. This allows for injecting additional +/// configuration into PDL patterns that is specific to certain compilation +/// flows. 
+class PDLPatternConfig { +public: + virtual ~PDLPatternConfig() = default; + + /// Hooks that are invoked at the beginning and end of a rewrite of a matched + /// pattern. These can be used to setup any specific state necessary for the + /// rewrite. + virtual void notifyRewriteBegin(PatternRewriter &rewriter) {} + virtual void notifyRewriteEnd(PatternRewriter &rewriter) {} + + /// Return the TypeID that represents this configuration. + TypeID getTypeID() const { return id; } + +protected: + PDLPatternConfig(TypeID id) : id(id) {} + +private: + TypeID id; +}; + +/// This class provides a base class for users implementing a type of pattern +/// configuration. +template +class PDLPatternConfigBase : public PDLPatternConfig { +public: + /// Support LLVM style casting. + static bool classof(const PDLPatternConfig *config) { + return config->getTypeID() == getConfigID(); + } + + /// Return the type id used for this configuration. + static TypeID getConfigID() { return TypeID::get(); } + +protected: + PDLPatternConfigBase() : PDLPatternConfig(getConfigID()) {} +}; + +/// This class contains a set of configurations for a specific pattern. +/// Configurations are uniqued by TypeID, meaning that only one configuration of +/// each type is allowed. +class PDLPatternConfigSet { +public: + PDLPatternConfigSet() = default; + + /// Construct a set with the given configurations. + template + PDLPatternConfigSet(ConfigsT &&...configs) { + (addConfig(std::forward(configs)), ...); + } + + /// Get the configuration defined by the given type. Asserts that the + /// configuration of the provided type exists. + template + const T &get() const { + const T *config = tryGet(); + assert(config && "configuration not found"); + return *config; + } + + /// Get the configuration defined by the given type, returns nullptr if the + /// configuration does not exist. + template + const T *tryGet() const { + for (const auto &configIt : configs) + if (const T *config = dyn_cast(configIt.get())) + return config; + return nullptr; + } + + /// Notify the configurations within this set at the beginning or end of a + /// rewrite of a matched pattern. + void notifyRewriteBegin(PatternRewriter &rewriter) { + for (const auto &config : configs) + config->notifyRewriteBegin(rewriter); + } + void notifyRewriteEnd(PatternRewriter &rewriter) { + for (const auto &config : configs) + config->notifyRewriteEnd(rewriter); + } + +protected: + /// Add a configuration to the set. + template + void addConfig(T &&config) { + assert(!tryGet>() && "configuration already exists"); + configs.emplace_back( + std::make_unique>(std::forward(config))); + } + + /// The set of configurations for this pattern. This uses a vector instead of + /// a map with the expectation that the number of configurations per set is + /// small (<= 1). + SmallVector> configs; +}; + +//===----------------------------------------------------------------------===// +// PDLPatternModule + +/// A generic PDL pattern constraint function. This function applies a +/// constraint to a given set of opaque PDLValue entities. Returns success if +/// the constraint successfully held, failure otherwise. +using PDLConstraintFunction = + std::function)>; +/// A native PDL rewrite function. This function performs a rewrite on the +/// given set of values. Any results from this rewrite that should be passed +/// back to PDL should be added to the provided result list. This method is only +/// invoked when the corresponding match was successful. 
Returns failure if an +/// invariant of the rewrite was broken (certain rewriters may recover from +/// partial pattern application). +using PDLRewriteFunction = std::function)>; + +namespace detail { +namespace pdl_function_builder { +/// A utility variable that always resolves to false. This is useful for static +/// asserts that are always false, but only should fire in certain templated +/// constructs. For example, if a templated function should never be called, the +/// function could be defined as: +/// +/// template +/// void foo() { +/// static_assert(always_false, "This function should never be called"); +/// } +/// +template +constexpr bool always_false = false; + +//===----------------------------------------------------------------------===// +// PDL Function Builder: Type Processing +//===----------------------------------------------------------------------===// + +/// This struct provides a convenient way to determine how to process a given +/// type as either a PDL parameter, or a result value. This allows for +/// supporting complex types in constraint and rewrite functions, without +/// requiring the user to hand-write the necessary glue code themselves. +/// Specializations of this class should implement the following methods to +/// enable support as a PDL argument or result type: +/// +/// static LogicalResult verifyAsArg( +/// function_ref errorFn, PDLValue pdlValue, +/// size_t argIdx); +/// +/// * This method verifies that the given PDLValue is valid for use as a +/// value of `T`. +/// +/// static T processAsArg(PDLValue pdlValue); +/// +/// * This method processes the given PDLValue as a value of `T`. +/// +/// static void processAsResult(PatternRewriter &, PDLResultList &results, +/// const T &value); +/// +/// * This method processes the given value of `T` as the result of a +/// function invocation. The method should package the value into an +/// appropriate form and append it to the given result list. +/// +/// If the type `T` is based on a higher order value, consider using +/// `ProcessPDLValueBasedOn` as a base class of the specialization to simplify +/// the implementation. +/// +template +struct ProcessPDLValue; + +/// This struct provides a simplified model for processing types that are based +/// on another type, e.g. APInt is based on the handling for IntegerAttr. This +/// allows for building the necessary processing functions on top of the base +/// value instead of a PDLValue. Derived users should implement the following +/// (which subsume the ProcessPDLValue variants): +/// +/// static LogicalResult verifyAsArg( +/// function_ref errorFn, +/// const BaseT &baseValue, size_t argIdx); +/// +/// * This method verifies that the given PDLValue is valid for use as a +/// value of `T`. +/// +/// static T processAsArg(BaseT baseValue); +/// +/// * This method processes the given base value as a value of `T`. +/// +template +struct ProcessPDLValueBasedOn { + static LogicalResult + verifyAsArg(function_ref errorFn, + PDLValue pdlValue, size_t argIdx) { + // Verify the base class before continuing. 
+ if (failed(ProcessPDLValue::verifyAsArg(errorFn, pdlValue, argIdx))) + return failure(); + return ProcessPDLValue::verifyAsArg( + errorFn, ProcessPDLValue::processAsArg(pdlValue), argIdx); + } + static T processAsArg(PDLValue pdlValue) { + return ProcessPDLValue::processAsArg( + ProcessPDLValue::processAsArg(pdlValue)); + } + + /// Explicitly add the expected parent API to ensure the parent class + /// implements the necessary API (and doesn't implicitly inherit it from + /// somewhere else). + static LogicalResult + verifyAsArg(function_ref errorFn, BaseT value, + size_t argIdx) { + return success(); + } + static T processAsArg(BaseT baseValue); +}; + +/// This struct provides a simplified model for processing types that have +/// "builtin" PDLValue support: +/// * Attribute, Operation *, Type, TypeRange, ValueRange +template +struct ProcessBuiltinPDLValue { + static LogicalResult + verifyAsArg(function_ref errorFn, + PDLValue pdlValue, size_t argIdx) { + if (pdlValue) + return success(); + return errorFn("expected a non-null value for argument " + Twine(argIdx) + + " of type: " + llvm::getTypeName()); + } + + static T processAsArg(PDLValue pdlValue) { return pdlValue.cast(); } + static void processAsResult(PatternRewriter &, PDLResultList &results, + T value) { + results.push_back(value); + } +}; + +/// This struct provides a simplified model for processing types that inherit +/// from builtin PDLValue types. For example, derived attributes like +/// IntegerAttr, derived types like IntegerType, derived operations like +/// ModuleOp, Interfaces, etc. +template +struct ProcessDerivedPDLValue : public ProcessPDLValueBasedOn { + static LogicalResult + verifyAsArg(function_ref errorFn, + BaseT baseValue, size_t argIdx) { + return TypeSwitch(baseValue) + .Case([&](T) { return success(); }) + .Default([&](BaseT) { + return errorFn("expected argument " + Twine(argIdx) + + " to be of type: " + llvm::getTypeName()); + }); + } + using ProcessPDLValueBasedOn::verifyAsArg; + + static T processAsArg(BaseT baseValue) { + return baseValue.template cast(); + } + using ProcessPDLValueBasedOn::processAsArg; + + static void processAsResult(PatternRewriter &, PDLResultList &results, + T value) { + results.push_back(value); + } +}; + +//===----------------------------------------------------------------------===// +// Attribute + +template <> +struct ProcessPDLValue : public ProcessBuiltinPDLValue {}; +template +struct ProcessPDLValue::value>> + : public ProcessDerivedPDLValue {}; + +/// Handling for various Attribute value types. 
+template <> +struct ProcessPDLValue + : public ProcessPDLValueBasedOn { + static StringRef processAsArg(StringAttr value) { return value.getValue(); } + using ProcessPDLValueBasedOn::processAsArg; + + static void processAsResult(PatternRewriter &rewriter, PDLResultList &results, + StringRef value) { + results.push_back(rewriter.getStringAttr(value)); + } +}; +template <> +struct ProcessPDLValue + : public ProcessPDLValueBasedOn { + template + static std::string processAsArg(T value) { + static_assert(always_false, + "`std::string` arguments require a string copy, use " + "`StringRef` for string-like arguments instead"); + return {}; + } + static void processAsResult(PatternRewriter &rewriter, PDLResultList &results, + StringRef value) { + results.push_back(rewriter.getStringAttr(value)); + } +}; + +//===----------------------------------------------------------------------===// +// Operation + +template <> +struct ProcessPDLValue + : public ProcessBuiltinPDLValue {}; +template +struct ProcessPDLValue::value>> + : public ProcessDerivedPDLValue { + static T processAsArg(Operation *value) { return cast(value); } +}; + +//===----------------------------------------------------------------------===// +// Type + +template <> +struct ProcessPDLValue : public ProcessBuiltinPDLValue {}; +template +struct ProcessPDLValue::value>> + : public ProcessDerivedPDLValue {}; + +//===----------------------------------------------------------------------===// +// TypeRange + +template <> +struct ProcessPDLValue : public ProcessBuiltinPDLValue {}; +template <> +struct ProcessPDLValue> { + static void processAsResult(PatternRewriter &, PDLResultList &results, + ValueTypeRange types) { + results.push_back(types); + } +}; +template <> +struct ProcessPDLValue> { + static void processAsResult(PatternRewriter &, PDLResultList &results, + ValueTypeRange types) { + results.push_back(types); + } +}; +template +struct ProcessPDLValue> { + static void processAsResult(PatternRewriter &, PDLResultList &results, + SmallVector values) { + results.push_back(TypeRange(values)); + } +}; + +//===----------------------------------------------------------------------===// +// Value + +template <> +struct ProcessPDLValue : public ProcessBuiltinPDLValue {}; + +//===----------------------------------------------------------------------===// +// ValueRange + +template <> +struct ProcessPDLValue : public ProcessBuiltinPDLValue { +}; +template <> +struct ProcessPDLValue { + static void processAsResult(PatternRewriter &, PDLResultList &results, + OperandRange values) { + results.push_back(values); + } +}; +template <> +struct ProcessPDLValue { + static void processAsResult(PatternRewriter &, PDLResultList &results, + ResultRange values) { + results.push_back(values); + } +}; +template +struct ProcessPDLValue> { + static void processAsResult(PatternRewriter &, PDLResultList &results, + SmallVector values) { + results.push_back(ValueRange(values)); + } +}; + +//===----------------------------------------------------------------------===// +// PDL Function Builder: Argument Handling +//===----------------------------------------------------------------------===// + +/// Validate the given PDLValues match the constraints defined by the argument +/// types of the given function. In the case of failure, a match failure +/// diagnostic is emitted. +/// FIXME: This should be completely removed in favor of `assertArgs`, but PDL +/// does not currently preserve Constraint application ordering. 
+template +LogicalResult verifyAsArgs(PatternRewriter &rewriter, ArrayRef values, + std::index_sequence) { + using FnTraitsT = llvm::function_traits; + + auto errorFn = [&](const Twine &msg) { + return rewriter.notifyMatchFailure(rewriter.getUnknownLoc(), msg); + }; + return success( + (succeeded(ProcessPDLValue>:: + verifyAsArg(errorFn, values[I], I)) && + ...)); +} + +/// Assert that the given PDLValues match the constraints defined by the +/// arguments of the given function. In the case of failure, a fatal error +/// is emitted. +template +void assertArgs(PatternRewriter &rewriter, ArrayRef values, + std::index_sequence) { + // We only want to do verification in debug builds, same as with `assert`. +#if LLVM_ENABLE_ABI_BREAKING_CHECKS + using FnTraitsT = llvm::function_traits; + auto errorFn = [&](const Twine &msg) -> LogicalResult { + llvm::report_fatal_error(msg); + }; + (void)errorFn; + assert((succeeded(ProcessPDLValue>:: + verifyAsArg(errorFn, values[I], I)) && + ...)); +#endif + (void)values; +} + +//===----------------------------------------------------------------------===// +// PDL Function Builder: Results Handling +//===----------------------------------------------------------------------===// + +/// Store a single result within the result list. +template +static LogicalResult processResults(PatternRewriter &rewriter, + PDLResultList &results, T &&value) { + ProcessPDLValue::processAsResult(rewriter, results, + std::forward(value)); + return success(); +} + +/// Store a std::pair<> as individual results within the result list. +template +static LogicalResult processResults(PatternRewriter &rewriter, + PDLResultList &results, + std::pair &&pair) { + if (failed(processResults(rewriter, results, std::move(pair.first))) || + failed(processResults(rewriter, results, std::move(pair.second)))) + return failure(); + return success(); +} + +/// Store a std::tuple<> as individual results within the result list. +template +static LogicalResult processResults(PatternRewriter &rewriter, + PDLResultList &results, + std::tuple &&tuple) { + auto applyFn = [&](auto &&...args) { + return (succeeded(processResults(rewriter, results, std::move(args))) && + ...); + }; + return success(std::apply(applyFn, std::move(tuple))); +} + +/// Handle LogicalResult propagation. +inline LogicalResult processResults(PatternRewriter &rewriter, + PDLResultList &results, + LogicalResult &&result) { + return result; +} +template +static LogicalResult processResults(PatternRewriter &rewriter, + PDLResultList &results, + FailureOr &&result) { + if (failed(result)) + return failure(); + return processResults(rewriter, results, std::move(*result)); +} + +//===----------------------------------------------------------------------===// +// PDL Constraint Builder +//===----------------------------------------------------------------------===// + +/// Process the arguments of a native constraint and invoke it. +template > +typename FnTraitsT::result_t +processArgsAndInvokeConstraint(PDLFnT &fn, PatternRewriter &rewriter, + ArrayRef values, + std::index_sequence) { + return fn( + rewriter, + (ProcessPDLValue>::processAsArg( + values[I]))...); +} + +/// Build a constraint function from the given function `ConstraintFnT`. This +/// allows for enabling the user to define simpler, more direct constraint +/// functions without needing to handle the low-level PDL goop. +/// +/// If the constraint function is already in the correct form, we just forward +/// it directly. 
+template +std::enable_if_t< + std::is_convertible::value, + PDLConstraintFunction> +buildConstraintFn(ConstraintFnT &&constraintFn) { + return std::forward(constraintFn); +} +/// Otherwise, we generate a wrapper that will unpack the PDLValues in the form +/// we desire. +template +std::enable_if_t< + !std::is_convertible::value, + PDLConstraintFunction> +buildConstraintFn(ConstraintFnT &&constraintFn) { + return [constraintFn = std::forward(constraintFn)]( + PatternRewriter &rewriter, + ArrayRef values) -> LogicalResult { + auto argIndices = std::make_index_sequence< + llvm::function_traits::num_args - 1>(); + if (failed(verifyAsArgs(rewriter, values, argIndices))) + return failure(); + return processArgsAndInvokeConstraint(constraintFn, rewriter, values, + argIndices); + }; +} + +//===----------------------------------------------------------------------===// +// PDL Rewrite Builder +//===----------------------------------------------------------------------===// + +/// Process the arguments of a native rewrite and invoke it. +/// This overload handles the case of no return values. +template > +std::enable_if_t::value, + LogicalResult> +processArgsAndInvokeRewrite(PDLFnT &fn, PatternRewriter &rewriter, + PDLResultList &, ArrayRef values, + std::index_sequence) { + fn(rewriter, + (ProcessPDLValue>::processAsArg( + values[I]))...); + return success(); +} +/// This overload handles the case of return values, which need to be packaged +/// into the result list. +template > +std::enable_if_t::value, + LogicalResult> +processArgsAndInvokeRewrite(PDLFnT &fn, PatternRewriter &rewriter, + PDLResultList &results, ArrayRef values, + std::index_sequence) { + return processResults( + rewriter, results, + fn(rewriter, (ProcessPDLValue>:: + processAsArg(values[I]))...)); + (void)values; +} + +/// Build a rewrite function from the given function `RewriteFnT`. This +/// allows for enabling the user to define simpler, more direct rewrite +/// functions without needing to handle the low-level PDL goop. +/// +/// If the rewrite function is already in the correct form, we just forward +/// it directly. +template +std::enable_if_t::value, + PDLRewriteFunction> +buildRewriteFn(RewriteFnT &&rewriteFn) { + return std::forward(rewriteFn); +} +/// Otherwise, we generate a wrapper that will unpack the PDLValues in the form +/// we desire. +template +std::enable_if_t::value, + PDLRewriteFunction> +buildRewriteFn(RewriteFnT &&rewriteFn) { + return [rewriteFn = std::forward(rewriteFn)]( + PatternRewriter &rewriter, PDLResultList &results, + ArrayRef values) { + auto argIndices = + std::make_index_sequence::num_args - + 1>(); + assertArgs(rewriter, values, argIndices); + return processArgsAndInvokeRewrite(rewriteFn, rewriter, results, values, + argIndices); + }; +} + +} // namespace pdl_function_builder +} // namespace detail + +//===----------------------------------------------------------------------===// +// PDLPatternModule + +/// This class contains all of the necessary data for a set of PDL patterns, or +/// pattern rewrites specified in the form of the PDL dialect. This PDL module +/// contained by this pattern may contain any number of `pdl.pattern` +/// operations. +class PDLPatternModule { +public: + PDLPatternModule() = default; + + /// Construct a PDL pattern with the given module and configurations. 
+ PDLPatternModule(OwningOpRef module) + : pdlModule(std::move(module)) {} + template + PDLPatternModule(OwningOpRef module, ConfigsT &&...patternConfigs) + : PDLPatternModule(std::move(module)) { + auto configSet = std::make_unique( + std::forward(patternConfigs)...); + attachConfigToPatterns(*pdlModule, *configSet); + configs.emplace_back(std::move(configSet)); + } + + /// Merge the state in `other` into this pattern module. + void mergeIn(PDLPatternModule &&other); + + /// Return the internal PDL module of this pattern. + ModuleOp getModule() { return pdlModule.get(); } + + /// Return the MLIR context of this pattern. + MLIRContext *getContext() { return getModule()->getContext(); } + + //===--------------------------------------------------------------------===// + // Function Registry + + /// Register a constraint function with PDL. A constraint function may be + /// specified in one of two ways: + /// + /// * `LogicalResult (PatternRewriter &, ArrayRef)` + /// + /// In this overload the arguments of the constraint function are passed via + /// the low-level PDLValue form. + /// + /// * `LogicalResult (PatternRewriter &, ValueTs... values)` + /// + /// In this form the arguments of the constraint function are passed via the + /// expected high level C++ type. In this form, the framework will + /// automatically unwrap PDLValues and convert them to the expected ValueTs. + /// For example, if the constraint function accepts a `Operation *`, the + /// framework will automatically cast the input PDLValue. In the case of a + /// `StringRef`, the framework will automatically unwrap the argument as a + /// StringAttr and pass the underlying string value. To see the full list of + /// supported types, or to see how to add handling for custom types, view + /// the definition of `ProcessPDLValue` above. + void registerConstraintFunction(StringRef name, + PDLConstraintFunction constraintFn); + template + void registerConstraintFunction(StringRef name, + ConstraintFnT &&constraintFn) { + registerConstraintFunction(name, + detail::pdl_function_builder::buildConstraintFn( + std::forward(constraintFn))); + } + + /// Register a rewrite function with PDL. A rewrite function may be specified + /// in one of two ways: + /// + /// * `void (PatternRewriter &, PDLResultList &, ArrayRef)` + /// + /// In this overload the arguments of the constraint function are passed via + /// the low-level PDLValue form, and the results are manually appended to + /// the given result list. + /// + /// * `ResultT (PatternRewriter &, ValueTs... values)` + /// + /// In this form the arguments and result of the rewrite function are passed + /// via the expected high level C++ type. In this form, the framework will + /// automatically unwrap the PDLValues arguments and convert them to the + /// expected ValueTs. It will also automatically handle the processing and + /// packaging of the result value to the result list. For example, if the + /// rewrite function takes a `Operation *`, the framework will automatically + /// cast the input PDLValue. In the case of a `StringRef`, the framework + /// will automatically unwrap the argument as a StringAttr and pass the + /// underlying string value. In the reverse case, if the rewrite returns a + /// StringRef or std::string, it will automatically package this as a + /// StringAttr and append it to the result list. To see the full list of + /// supported types, or to see how to add handling for custom types, view + /// the definition of `ProcessPDLValue` above. 
+ void registerRewriteFunction(StringRef name, PDLRewriteFunction rewriteFn); + template + void registerRewriteFunction(StringRef name, RewriteFnT &&rewriteFn) { + registerRewriteFunction(name, detail::pdl_function_builder::buildRewriteFn( + std::forward(rewriteFn))); + } + + /// Return the set of the registered constraint functions. + const llvm::StringMap &getConstraintFunctions() const { + return constraintFunctions; + } + llvm::StringMap takeConstraintFunctions() { + return constraintFunctions; + } + /// Return the set of the registered rewrite functions. + const llvm::StringMap &getRewriteFunctions() const { + return rewriteFunctions; + } + llvm::StringMap takeRewriteFunctions() { + return rewriteFunctions; + } + + /// Return the set of the registered pattern configs. + SmallVector> takeConfigs() { + return std::move(configs); + } + DenseMap takeConfigMap() { + return std::move(configMap); + } + + /// Clear out the patterns and functions within this module. + void clear() { + pdlModule = nullptr; + constraintFunctions.clear(); + rewriteFunctions.clear(); + } + +private: + /// Attach the given pattern config set to the patterns defined within the + /// given module. + void attachConfigToPatterns(ModuleOp module, PDLPatternConfigSet &configSet); + + /// The module containing the `pdl.pattern` operations. + OwningOpRef pdlModule; + + /// The set of configuration sets referenced by patterns within `pdlModule`. + SmallVector> configs; + DenseMap configMap; + + /// The external functions referenced from within the PDL module. + llvm::StringMap constraintFunctions; + llvm::StringMap rewriteFunctions; +}; +} // namespace mlir + +#else + +namespace mlir { +// Stubs for when PDL in pattern rewrites is not enabled. + +class PDLValue { +public: + template + T dyn_cast() const { + return nullptr; + } +}; +class PDLResultList {}; +using PDLConstraintFunction = + std::function)>; +using PDLRewriteFunction = std::function)>; + +class PDLPatternModule { +public: + PDLPatternModule() = default; + + PDLPatternModule(OwningOpRef /*module*/) {} + MLIRContext *getContext() { + llvm_unreachable("Error: PDL for rewrites when PDL is not enabled"); + } + void mergeIn(PDLPatternModule &&other) {} + void clear() {} + template + void registerConstraintFunction(StringRef name, + ConstraintFnT &&constraintFn) {} + void registerRewriteFunction(StringRef name, PDLRewriteFunction rewriteFn) {} + template + void registerRewriteFunction(StringRef name, RewriteFnT &&rewriteFn) {} + const llvm::StringMap &getConstraintFunctions() const { + return constraintFunctions; + } + +private: + llvm::StringMap constraintFunctions; +}; + +} // namespace mlir +#endif + +#endif // MLIR_IR_PDLPATTERNMATCH_H diff --git a/mlir/include/mlir/IR/PatternMatch.h b/mlir/include/mlir/IR/PatternMatch.h index 6625ef553eba2..9b4fa65bff49e 100644 --- a/mlir/include/mlir/IR/PatternMatch.h +++ b/mlir/include/mlir/IR/PatternMatch.h @@ -735,932 +735,12 @@ class PatternRewriter : public RewriterBase { virtual bool canRecoverFromRewriteFailure() const { return false; } }; -//===----------------------------------------------------------------------===// -// PDL Patterns -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// PDLValue - -/// Storage type of byte-code interpreter values. These are passed to constraint -/// functions as arguments. -class PDLValue { -public: - /// The underlying kind of a PDL value. 
- enum class Kind { Attribute, Operation, Type, TypeRange, Value, ValueRange }; - - /// Construct a new PDL value. - PDLValue(const PDLValue &other) = default; - PDLValue(std::nullptr_t = nullptr) {} - PDLValue(Attribute value) - : value(value.getAsOpaquePointer()), kind(Kind::Attribute) {} - PDLValue(Operation *value) : value(value), kind(Kind::Operation) {} - PDLValue(Type value) : value(value.getAsOpaquePointer()), kind(Kind::Type) {} - PDLValue(TypeRange *value) : value(value), kind(Kind::TypeRange) {} - PDLValue(Value value) - : value(value.getAsOpaquePointer()), kind(Kind::Value) {} - PDLValue(ValueRange *value) : value(value), kind(Kind::ValueRange) {} - - /// Returns true if the type of the held value is `T`. - template - bool isa() const { - assert(value && "isa<> used on a null value"); - return kind == getKindOf(); - } - - /// Attempt to dynamically cast this value to type `T`, returns null if this - /// value is not an instance of `T`. - template ::value, T, std::optional>> - ResultT dyn_cast() const { - return isa() ? castImpl() : ResultT(); - } - - /// Cast this value to type `T`, asserts if this value is not an instance of - /// `T`. - template - T cast() const { - assert(isa() && "expected value to be of type `T`"); - return castImpl(); - } - - /// Get an opaque pointer to the value. - const void *getAsOpaquePointer() const { return value; } - - /// Return if this value is null or not. - explicit operator bool() const { return value; } - - /// Return the kind of this value. - Kind getKind() const { return kind; } - - /// Print this value to the provided output stream. - void print(raw_ostream &os) const; - - /// Print the specified value kind to an output stream. - static void print(raw_ostream &os, Kind kind); - -private: - /// Find the index of a given type in a range of other types. - template - struct index_of_t; - template - struct index_of_t : std::integral_constant {}; - template - struct index_of_t - : std::integral_constant::value> {}; - - /// Return the kind used for the given T. - template - static Kind getKindOf() { - return static_cast(index_of_t::value); - } - - /// The internal implementation of `cast`, that returns the underlying value - /// as the given type `T`. - template - std::enable_if_t::value, T> - castImpl() const { - return T::getFromOpaquePointer(value); - } - template - std::enable_if_t::value, T> - castImpl() const { - return *reinterpret_cast(const_cast(value)); - } - template - std::enable_if_t::value, T> castImpl() const { - return reinterpret_cast(const_cast(value)); - } - - /// The internal opaque representation of a PDLValue. - const void *value{nullptr}; - /// The kind of the opaque value. - Kind kind{Kind::Attribute}; -}; - -inline raw_ostream &operator<<(raw_ostream &os, PDLValue value) { - value.print(os); - return os; -} - -inline raw_ostream &operator<<(raw_ostream &os, PDLValue::Kind kind) { - PDLValue::print(os, kind); - return os; -} - -//===----------------------------------------------------------------------===// -// PDLResultList - -/// The class represents a list of PDL results, returned by a native rewrite -/// method. It provides the mechanism with which to pass PDLValues back to the -/// PDL bytecode. -class PDLResultList { -public: - /// Push a new Attribute value onto the result list. - void push_back(Attribute value) { results.push_back(value); } - - /// Push a new Operation onto the result list. - void push_back(Operation *value) { results.push_back(value); } - - /// Push a new Type onto the result list. 
- void push_back(Type value) { results.push_back(value); } - - /// Push a new TypeRange onto the result list. - void push_back(TypeRange value) { - // The lifetime of a TypeRange can't be guaranteed, so we'll need to - // allocate a storage for it. - llvm::OwningArrayRef storage(value.size()); - llvm::copy(value, storage.begin()); - allocatedTypeRanges.emplace_back(std::move(storage)); - typeRanges.push_back(allocatedTypeRanges.back()); - results.push_back(&typeRanges.back()); - } - void push_back(ValueTypeRange value) { - typeRanges.push_back(value); - results.push_back(&typeRanges.back()); - } - void push_back(ValueTypeRange value) { - typeRanges.push_back(value); - results.push_back(&typeRanges.back()); - } - - /// Push a new Value onto the result list. - void push_back(Value value) { results.push_back(value); } - - /// Push a new ValueRange onto the result list. - void push_back(ValueRange value) { - // The lifetime of a ValueRange can't be guaranteed, so we'll need to - // allocate a storage for it. - llvm::OwningArrayRef storage(value.size()); - llvm::copy(value, storage.begin()); - allocatedValueRanges.emplace_back(std::move(storage)); - valueRanges.push_back(allocatedValueRanges.back()); - results.push_back(&valueRanges.back()); - } - void push_back(OperandRange value) { - valueRanges.push_back(value); - results.push_back(&valueRanges.back()); - } - void push_back(ResultRange value) { - valueRanges.push_back(value); - results.push_back(&valueRanges.back()); - } - -protected: - /// Create a new result list with the expected number of results. - PDLResultList(unsigned maxNumResults) { - // For now just reserve enough space for all of the results. We could do - // separate counts per range type, but it isn't really worth it unless there - // are a "large" number of results. - typeRanges.reserve(maxNumResults); - valueRanges.reserve(maxNumResults); - } - - /// The PDL results held by this list. - SmallVector results; - /// Memory used to store ranges held by the list. - SmallVector typeRanges; - SmallVector valueRanges; - /// Memory allocated to store ranges in the result list whose lifetime was - /// generated in the native function. - SmallVector> allocatedTypeRanges; - SmallVector> allocatedValueRanges; -}; - -//===----------------------------------------------------------------------===// -// PDLPatternConfig - -/// An individual configuration for a pattern, which can be accessed by native -/// functions via the PDLPatternConfigSet. This allows for injecting additional -/// configuration into PDL patterns that is specific to certain compilation -/// flows. -class PDLPatternConfig { -public: - virtual ~PDLPatternConfig() = default; - - /// Hooks that are invoked at the beginning and end of a rewrite of a matched - /// pattern. These can be used to setup any specific state necessary for the - /// rewrite. - virtual void notifyRewriteBegin(PatternRewriter &rewriter) {} - virtual void notifyRewriteEnd(PatternRewriter &rewriter) {} - - /// Return the TypeID that represents this configuration. - TypeID getTypeID() const { return id; } - -protected: - PDLPatternConfig(TypeID id) : id(id) {} - -private: - TypeID id; -}; - -/// This class provides a base class for users implementing a type of pattern -/// configuration. -template -class PDLPatternConfigBase : public PDLPatternConfig { -public: - /// Support LLVM style casting. - static bool classof(const PDLPatternConfig *config) { - return config->getTypeID() == getConfigID(); - } - - /// Return the type id used for this configuration. 
- static TypeID getConfigID() { return TypeID::get(); } - -protected: - PDLPatternConfigBase() : PDLPatternConfig(getConfigID()) {} -}; - -/// This class contains a set of configurations for a specific pattern. -/// Configurations are uniqued by TypeID, meaning that only one configuration of -/// each type is allowed. -class PDLPatternConfigSet { -public: - PDLPatternConfigSet() = default; - - /// Construct a set with the given configurations. - template - PDLPatternConfigSet(ConfigsT &&...configs) { - (addConfig(std::forward(configs)), ...); - } - - /// Get the configuration defined by the given type. Asserts that the - /// configuration of the provided type exists. - template - const T &get() const { - const T *config = tryGet(); - assert(config && "configuration not found"); - return *config; - } - - /// Get the configuration defined by the given type, returns nullptr if the - /// configuration does not exist. - template - const T *tryGet() const { - for (const auto &configIt : configs) - if (const T *config = dyn_cast(configIt.get())) - return config; - return nullptr; - } - - /// Notify the configurations within this set at the beginning or end of a - /// rewrite of a matched pattern. - void notifyRewriteBegin(PatternRewriter &rewriter) { - for (const auto &config : configs) - config->notifyRewriteBegin(rewriter); - } - void notifyRewriteEnd(PatternRewriter &rewriter) { - for (const auto &config : configs) - config->notifyRewriteEnd(rewriter); - } - -protected: - /// Add a configuration to the set. - template - void addConfig(T &&config) { - assert(!tryGet>() && "configuration already exists"); - configs.emplace_back( - std::make_unique>(std::forward(config))); - } - - /// The set of configurations for this pattern. This uses a vector instead of - /// a map with the expectation that the number of configurations per set is - /// small (<= 1). - SmallVector> configs; -}; - -//===----------------------------------------------------------------------===// -// PDLPatternModule - -/// A generic PDL pattern constraint function. This function applies a -/// constraint to a given set of opaque PDLValue entities. Returns success if -/// the constraint successfully held, failure otherwise. -using PDLConstraintFunction = - std::function)>; -/// A native PDL rewrite function. This function performs a rewrite on the -/// given set of values. Any results from this rewrite that should be passed -/// back to PDL should be added to the provided result list. This method is only -/// invoked when the corresponding match was successful. Returns failure if an -/// invariant of the rewrite was broken (certain rewriters may recover from -/// partial pattern application). -using PDLRewriteFunction = std::function)>; - -namespace detail { -namespace pdl_function_builder { -/// A utility variable that always resolves to false. This is useful for static -/// asserts that are always false, but only should fire in certain templated -/// constructs. 
For example, if a templated function should never be called, the -/// function could be defined as: -/// -/// template -/// void foo() { -/// static_assert(always_false, "This function should never be called"); -/// } -/// -template -constexpr bool always_false = false; - -//===----------------------------------------------------------------------===// -// PDL Function Builder: Type Processing -//===----------------------------------------------------------------------===// - -/// This struct provides a convenient way to determine how to process a given -/// type as either a PDL parameter, or a result value. This allows for -/// supporting complex types in constraint and rewrite functions, without -/// requiring the user to hand-write the necessary glue code themselves. -/// Specializations of this class should implement the following methods to -/// enable support as a PDL argument or result type: -/// -/// static LogicalResult verifyAsArg( -/// function_ref errorFn, PDLValue pdlValue, -/// size_t argIdx); -/// -/// * This method verifies that the given PDLValue is valid for use as a -/// value of `T`. -/// -/// static T processAsArg(PDLValue pdlValue); -/// -/// * This method processes the given PDLValue as a value of `T`. -/// -/// static void processAsResult(PatternRewriter &, PDLResultList &results, -/// const T &value); -/// -/// * This method processes the given value of `T` as the result of a -/// function invocation. The method should package the value into an -/// appropriate form and append it to the given result list. -/// -/// If the type `T` is based on a higher order value, consider using -/// `ProcessPDLValueBasedOn` as a base class of the specialization to simplify -/// the implementation. -/// -template -struct ProcessPDLValue; - -/// This struct provides a simplified model for processing types that are based -/// on another type, e.g. APInt is based on the handling for IntegerAttr. This -/// allows for building the necessary processing functions on top of the base -/// value instead of a PDLValue. Derived users should implement the following -/// (which subsume the ProcessPDLValue variants): -/// -/// static LogicalResult verifyAsArg( -/// function_ref errorFn, -/// const BaseT &baseValue, size_t argIdx); -/// -/// * This method verifies that the given PDLValue is valid for use as a -/// value of `T`. -/// -/// static T processAsArg(BaseT baseValue); -/// -/// * This method processes the given base value as a value of `T`. -/// -template -struct ProcessPDLValueBasedOn { - static LogicalResult - verifyAsArg(function_ref errorFn, - PDLValue pdlValue, size_t argIdx) { - // Verify the base class before continuing. - if (failed(ProcessPDLValue::verifyAsArg(errorFn, pdlValue, argIdx))) - return failure(); - return ProcessPDLValue::verifyAsArg( - errorFn, ProcessPDLValue::processAsArg(pdlValue), argIdx); - } - static T processAsArg(PDLValue pdlValue) { - return ProcessPDLValue::processAsArg( - ProcessPDLValue::processAsArg(pdlValue)); - } - - /// Explicitly add the expected parent API to ensure the parent class - /// implements the necessary API (and doesn't implicitly inherit it from - /// somewhere else). 
- static LogicalResult - verifyAsArg(function_ref errorFn, BaseT value, - size_t argIdx) { - return success(); - } - static T processAsArg(BaseT baseValue); -}; - -/// This struct provides a simplified model for processing types that have -/// "builtin" PDLValue support: -/// * Attribute, Operation *, Type, TypeRange, ValueRange -template -struct ProcessBuiltinPDLValue { - static LogicalResult - verifyAsArg(function_ref errorFn, - PDLValue pdlValue, size_t argIdx) { - if (pdlValue) - return success(); - return errorFn("expected a non-null value for argument " + Twine(argIdx) + - " of type: " + llvm::getTypeName()); - } - - static T processAsArg(PDLValue pdlValue) { return pdlValue.cast(); } - static void processAsResult(PatternRewriter &, PDLResultList &results, - T value) { - results.push_back(value); - } -}; - -/// This struct provides a simplified model for processing types that inherit -/// from builtin PDLValue types. For example, derived attributes like -/// IntegerAttr, derived types like IntegerType, derived operations like -/// ModuleOp, Interfaces, etc. -template -struct ProcessDerivedPDLValue : public ProcessPDLValueBasedOn { - static LogicalResult - verifyAsArg(function_ref errorFn, - BaseT baseValue, size_t argIdx) { - return TypeSwitch(baseValue) - .Case([&](T) { return success(); }) - .Default([&](BaseT) { - return errorFn("expected argument " + Twine(argIdx) + - " to be of type: " + llvm::getTypeName()); - }); - } - using ProcessPDLValueBasedOn::verifyAsArg; - - static T processAsArg(BaseT baseValue) { - return baseValue.template cast(); - } - using ProcessPDLValueBasedOn::processAsArg; - - static void processAsResult(PatternRewriter &, PDLResultList &results, - T value) { - results.push_back(value); - } -}; - -//===----------------------------------------------------------------------===// -// Attribute - -template <> -struct ProcessPDLValue : public ProcessBuiltinPDLValue {}; -template -struct ProcessPDLValue::value>> - : public ProcessDerivedPDLValue {}; - -/// Handling for various Attribute value types. 
-template <> -struct ProcessPDLValue - : public ProcessPDLValueBasedOn { - static StringRef processAsArg(StringAttr value) { return value.getValue(); } - using ProcessPDLValueBasedOn::processAsArg; - - static void processAsResult(PatternRewriter &rewriter, PDLResultList &results, - StringRef value) { - results.push_back(rewriter.getStringAttr(value)); - } -}; -template <> -struct ProcessPDLValue - : public ProcessPDLValueBasedOn { - template - static std::string processAsArg(T value) { - static_assert(always_false, - "`std::string` arguments require a string copy, use " - "`StringRef` for string-like arguments instead"); - return {}; - } - static void processAsResult(PatternRewriter &rewriter, PDLResultList &results, - StringRef value) { - results.push_back(rewriter.getStringAttr(value)); - } -}; - -//===----------------------------------------------------------------------===// -// Operation - -template <> -struct ProcessPDLValue - : public ProcessBuiltinPDLValue {}; -template -struct ProcessPDLValue::value>> - : public ProcessDerivedPDLValue { - static T processAsArg(Operation *value) { return cast(value); } -}; - -//===----------------------------------------------------------------------===// -// Type - -template <> -struct ProcessPDLValue : public ProcessBuiltinPDLValue {}; -template -struct ProcessPDLValue::value>> - : public ProcessDerivedPDLValue {}; - -//===----------------------------------------------------------------------===// -// TypeRange - -template <> -struct ProcessPDLValue : public ProcessBuiltinPDLValue {}; -template <> -struct ProcessPDLValue> { - static void processAsResult(PatternRewriter &, PDLResultList &results, - ValueTypeRange types) { - results.push_back(types); - } -}; -template <> -struct ProcessPDLValue> { - static void processAsResult(PatternRewriter &, PDLResultList &results, - ValueTypeRange types) { - results.push_back(types); - } -}; -template -struct ProcessPDLValue> { - static void processAsResult(PatternRewriter &, PDLResultList &results, - SmallVector values) { - results.push_back(TypeRange(values)); - } -}; - -//===----------------------------------------------------------------------===// -// Value - -template <> -struct ProcessPDLValue : public ProcessBuiltinPDLValue {}; - -//===----------------------------------------------------------------------===// -// ValueRange - -template <> -struct ProcessPDLValue : public ProcessBuiltinPDLValue { -}; -template <> -struct ProcessPDLValue { - static void processAsResult(PatternRewriter &, PDLResultList &results, - OperandRange values) { - results.push_back(values); - } -}; -template <> -struct ProcessPDLValue { - static void processAsResult(PatternRewriter &, PDLResultList &results, - ResultRange values) { - results.push_back(values); - } -}; -template -struct ProcessPDLValue> { - static void processAsResult(PatternRewriter &, PDLResultList &results, - SmallVector values) { - results.push_back(ValueRange(values)); - } -}; - -//===----------------------------------------------------------------------===// -// PDL Function Builder: Argument Handling -//===----------------------------------------------------------------------===// - -/// Validate the given PDLValues match the constraints defined by the argument -/// types of the given function. In the case of failure, a match failure -/// diagnostic is emitted. -/// FIXME: This should be completely removed in favor of `assertArgs`, but PDL -/// does not currently preserve Constraint application ordering. 
-template -LogicalResult verifyAsArgs(PatternRewriter &rewriter, ArrayRef values, - std::index_sequence) { - using FnTraitsT = llvm::function_traits; - - auto errorFn = [&](const Twine &msg) { - return rewriter.notifyMatchFailure(rewriter.getUnknownLoc(), msg); - }; - return success( - (succeeded(ProcessPDLValue>:: - verifyAsArg(errorFn, values[I], I)) && - ...)); -} - -/// Assert that the given PDLValues match the constraints defined by the -/// arguments of the given function. In the case of failure, a fatal error -/// is emitted. -template -void assertArgs(PatternRewriter &rewriter, ArrayRef values, - std::index_sequence) { - // We only want to do verification in debug builds, same as with `assert`. -#if LLVM_ENABLE_ABI_BREAKING_CHECKS - using FnTraitsT = llvm::function_traits; - auto errorFn = [&](const Twine &msg) -> LogicalResult { - llvm::report_fatal_error(msg); - }; - (void)errorFn; - assert((succeeded(ProcessPDLValue>:: - verifyAsArg(errorFn, values[I], I)) && - ...)); -#endif - (void)values; -} - -//===----------------------------------------------------------------------===// -// PDL Function Builder: Results Handling -//===----------------------------------------------------------------------===// - -/// Store a single result within the result list. -template -static LogicalResult processResults(PatternRewriter &rewriter, - PDLResultList &results, T &&value) { - ProcessPDLValue::processAsResult(rewriter, results, - std::forward(value)); - return success(); -} - -/// Store a std::pair<> as individual results within the result list. -template -static LogicalResult processResults(PatternRewriter &rewriter, - PDLResultList &results, - std::pair &&pair) { - if (failed(processResults(rewriter, results, std::move(pair.first))) || - failed(processResults(rewriter, results, std::move(pair.second)))) - return failure(); - return success(); -} - -/// Store a std::tuple<> as individual results within the result list. -template -static LogicalResult processResults(PatternRewriter &rewriter, - PDLResultList &results, - std::tuple &&tuple) { - auto applyFn = [&](auto &&...args) { - return (succeeded(processResults(rewriter, results, std::move(args))) && - ...); - }; - return success(std::apply(applyFn, std::move(tuple))); -} - -/// Handle LogicalResult propagation. -inline LogicalResult processResults(PatternRewriter &rewriter, - PDLResultList &results, - LogicalResult &&result) { - return result; -} -template -static LogicalResult processResults(PatternRewriter &rewriter, - PDLResultList &results, - FailureOr &&result) { - if (failed(result)) - return failure(); - return processResults(rewriter, results, std::move(*result)); -} - -//===----------------------------------------------------------------------===// -// PDL Constraint Builder -//===----------------------------------------------------------------------===// - -/// Process the arguments of a native constraint and invoke it. -template > -typename FnTraitsT::result_t -processArgsAndInvokeConstraint(PDLFnT &fn, PatternRewriter &rewriter, - ArrayRef values, - std::index_sequence) { - return fn( - rewriter, - (ProcessPDLValue>::processAsArg( - values[I]))...); -} - -/// Build a constraint function from the given function `ConstraintFnT`. This -/// allows for enabling the user to define simpler, more direct constraint -/// functions without needing to handle the low-level PDL goop. -/// -/// If the constraint function is already in the correct form, we just forward -/// it directly. 
-template -std::enable_if_t< - std::is_convertible::value, - PDLConstraintFunction> -buildConstraintFn(ConstraintFnT &&constraintFn) { - return std::forward(constraintFn); -} -/// Otherwise, we generate a wrapper that will unpack the PDLValues in the form -/// we desire. -template -std::enable_if_t< - !std::is_convertible::value, - PDLConstraintFunction> -buildConstraintFn(ConstraintFnT &&constraintFn) { - return [constraintFn = std::forward(constraintFn)]( - PatternRewriter &rewriter, - ArrayRef values) -> LogicalResult { - auto argIndices = std::make_index_sequence< - llvm::function_traits::num_args - 1>(); - if (failed(verifyAsArgs(rewriter, values, argIndices))) - return failure(); - return processArgsAndInvokeConstraint(constraintFn, rewriter, values, - argIndices); - }; -} - -//===----------------------------------------------------------------------===// -// PDL Rewrite Builder -//===----------------------------------------------------------------------===// - -/// Process the arguments of a native rewrite and invoke it. -/// This overload handles the case of no return values. -template > -std::enable_if_t::value, - LogicalResult> -processArgsAndInvokeRewrite(PDLFnT &fn, PatternRewriter &rewriter, - PDLResultList &, ArrayRef values, - std::index_sequence) { - fn(rewriter, - (ProcessPDLValue>::processAsArg( - values[I]))...); - return success(); -} -/// This overload handles the case of return values, which need to be packaged -/// into the result list. -template > -std::enable_if_t::value, - LogicalResult> -processArgsAndInvokeRewrite(PDLFnT &fn, PatternRewriter &rewriter, - PDLResultList &results, ArrayRef values, - std::index_sequence) { - return processResults( - rewriter, results, - fn(rewriter, (ProcessPDLValue>:: - processAsArg(values[I]))...)); - (void)values; -} - -/// Build a rewrite function from the given function `RewriteFnT`. This -/// allows for enabling the user to define simpler, more direct rewrite -/// functions without needing to handle the low-level PDL goop. -/// -/// If the rewrite function is already in the correct form, we just forward -/// it directly. -template -std::enable_if_t::value, - PDLRewriteFunction> -buildRewriteFn(RewriteFnT &&rewriteFn) { - return std::forward(rewriteFn); -} -/// Otherwise, we generate a wrapper that will unpack the PDLValues in the form -/// we desire. -template -std::enable_if_t::value, - PDLRewriteFunction> -buildRewriteFn(RewriteFnT &&rewriteFn) { - return [rewriteFn = std::forward(rewriteFn)]( - PatternRewriter &rewriter, PDLResultList &results, - ArrayRef values) { - auto argIndices = - std::make_index_sequence::num_args - - 1>(); - assertArgs(rewriter, values, argIndices); - return processArgsAndInvokeRewrite(rewriteFn, rewriter, results, values, - argIndices); - }; -} - -} // namespace pdl_function_builder -} // namespace detail - -//===----------------------------------------------------------------------===// -// PDLPatternModule - -/// This class contains all of the necessary data for a set of PDL patterns, or -/// pattern rewrites specified in the form of the PDL dialect. This PDL module -/// contained by this pattern may contain any number of `pdl.pattern` -/// operations. -class PDLPatternModule { -public: - PDLPatternModule() = default; - - /// Construct a PDL pattern with the given module and configurations. 
- PDLPatternModule(OwningOpRef module) - : pdlModule(std::move(module)) {} - template - PDLPatternModule(OwningOpRef module, ConfigsT &&...patternConfigs) - : PDLPatternModule(std::move(module)) { - auto configSet = std::make_unique( - std::forward(patternConfigs)...); - attachConfigToPatterns(*pdlModule, *configSet); - configs.emplace_back(std::move(configSet)); - } - - /// Merge the state in `other` into this pattern module. - void mergeIn(PDLPatternModule &&other); - - /// Return the internal PDL module of this pattern. - ModuleOp getModule() { return pdlModule.get(); } - - //===--------------------------------------------------------------------===// - // Function Registry - - /// Register a constraint function with PDL. A constraint function may be - /// specified in one of two ways: - /// - /// * `LogicalResult (PatternRewriter &, ArrayRef)` - /// - /// In this overload the arguments of the constraint function are passed via - /// the low-level PDLValue form. - /// - /// * `LogicalResult (PatternRewriter &, ValueTs... values)` - /// - /// In this form the arguments of the constraint function are passed via the - /// expected high level C++ type. In this form, the framework will - /// automatically unwrap PDLValues and convert them to the expected ValueTs. - /// For example, if the constraint function accepts a `Operation *`, the - /// framework will automatically cast the input PDLValue. In the case of a - /// `StringRef`, the framework will automatically unwrap the argument as a - /// StringAttr and pass the underlying string value. To see the full list of - /// supported types, or to see how to add handling for custom types, view - /// the definition of `ProcessPDLValue` above. - void registerConstraintFunction(StringRef name, - PDLConstraintFunction constraintFn); - template - void registerConstraintFunction(StringRef name, - ConstraintFnT &&constraintFn) { - registerConstraintFunction(name, - detail::pdl_function_builder::buildConstraintFn( - std::forward(constraintFn))); - } - - /// Register a rewrite function with PDL. A rewrite function may be specified - /// in one of two ways: - /// - /// * `void (PatternRewriter &, PDLResultList &, ArrayRef)` - /// - /// In this overload the arguments of the constraint function are passed via - /// the low-level PDLValue form, and the results are manually appended to - /// the given result list. - /// - /// * `ResultT (PatternRewriter &, ValueTs... values)` - /// - /// In this form the arguments and result of the rewrite function are passed - /// via the expected high level C++ type. In this form, the framework will - /// automatically unwrap the PDLValues arguments and convert them to the - /// expected ValueTs. It will also automatically handle the processing and - /// packaging of the result value to the result list. For example, if the - /// rewrite function takes a `Operation *`, the framework will automatically - /// cast the input PDLValue. In the case of a `StringRef`, the framework - /// will automatically unwrap the argument as a StringAttr and pass the - /// underlying string value. In the reverse case, if the rewrite returns a - /// StringRef or std::string, it will automatically package this as a - /// StringAttr and append it to the result list. To see the full list of - /// supported types, or to see how to add handling for custom types, view - /// the definition of `ProcessPDLValue` above. 
- void registerRewriteFunction(StringRef name, PDLRewriteFunction rewriteFn); - template - void registerRewriteFunction(StringRef name, RewriteFnT &&rewriteFn) { - registerRewriteFunction(name, detail::pdl_function_builder::buildRewriteFn( - std::forward(rewriteFn))); - } - - /// Return the set of the registered constraint functions. - const llvm::StringMap &getConstraintFunctions() const { - return constraintFunctions; - } - llvm::StringMap takeConstraintFunctions() { - return constraintFunctions; - } - /// Return the set of the registered rewrite functions. - const llvm::StringMap &getRewriteFunctions() const { - return rewriteFunctions; - } - llvm::StringMap takeRewriteFunctions() { - return rewriteFunctions; - } - - /// Return the set of the registered pattern configs. - SmallVector> takeConfigs() { - return std::move(configs); - } - DenseMap takeConfigMap() { - return std::move(configMap); - } - - /// Clear out the patterns and functions within this module. - void clear() { - pdlModule = nullptr; - constraintFunctions.clear(); - rewriteFunctions.clear(); - } - -private: - /// Attach the given pattern config set to the patterns defined within the - /// given module. - void attachConfigToPatterns(ModuleOp module, PDLPatternConfigSet &configSet); - - /// The module containing the `pdl.pattern` operations. - OwningOpRef pdlModule; +} // namespace mlir - /// The set of configuration sets referenced by patterns within `pdlModule`. - SmallVector> configs; - DenseMap configMap; +// Optionally expose PDL pattern matching methods. +#include "PDLPatternMatch.h.inc" - /// The external functions referenced from within the PDL module. - llvm::StringMap constraintFunctions; - llvm::StringMap rewriteFunctions; -}; +namespace mlir { //===----------------------------------------------------------------------===// // RewritePatternSet @@ -1679,8 +759,7 @@ class RewritePatternSet { nativePatterns.emplace_back(std::move(pattern)); } RewritePatternSet(PDLPatternModule &&pattern) - : context(pattern.getModule()->getContext()), - pdlPatterns(std::move(pattern)) {} + : context(pattern.getContext()), pdlPatterns(std::move(pattern)) {} MLIRContext *getContext() const { return context; } @@ -1853,6 +932,7 @@ class RewritePatternSet { pattern->addDebugLabels(debugLabels); nativePatterns.emplace_back(std::move(pattern)); } + template std::enable_if_t::value> addImpl(ArrayRef debugLabels, Args &&...args) { @@ -1863,6 +943,9 @@ class RewritePatternSet { MLIRContext *const context; NativePatternListT nativePatterns; + + // Patterns expressed with PDL. This will compile to a stub class when PDL is + // not enabled. 
PDLPatternModule pdlPatterns; }; diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h index 6de981d35c8c3..c5725e9c85625 100644 --- a/mlir/include/mlir/Transforms/DialectConversion.h +++ b/mlir/include/mlir/Transforms/DialectConversion.h @@ -13,6 +13,7 @@ #ifndef MLIR_TRANSFORMS_DIALECTCONVERSION_H_ #define MLIR_TRANSFORMS_DIALECTCONVERSION_H_ +#include "mlir/Config/mlir-config.h" #include "mlir/Rewrite/FrozenRewritePatternSet.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/StringMap.h" @@ -1015,6 +1016,7 @@ class ConversionTarget { MLIRContext &ctx; }; +#if MLIR_ENABLE_PDL_IN_PATTERNMATCH //===----------------------------------------------------------------------===// // PDL Configuration //===----------------------------------------------------------------------===// @@ -1044,6 +1046,19 @@ class PDLConversionConfig final /// Register the dialect conversion PDL functions with the given pattern set. void registerConversionPDLFunctions(RewritePatternSet &patterns); +#else + +// Stubs for when PDL in rewriting is not enabled. + +inline void registerConversionPDLFunctions(RewritePatternSet &patterns) {} + +class PDLConversionConfig final { +public: + PDLConversionConfig(const TypeConverter * /*converter*/) {} +}; + +#endif // MLIR_ENABLE_PDL_IN_PATTERNMATCH + //===----------------------------------------------------------------------===// // Op Conversion Entry Points //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Conversion/ComplexToLibm/CMakeLists.txt b/mlir/lib/Conversion/ComplexToLibm/CMakeLists.txt index f0b994dbd114d..5465e4b9fd969 100644 --- a/mlir/lib/Conversion/ComplexToLibm/CMakeLists.txt +++ b/mlir/lib/Conversion/ComplexToLibm/CMakeLists.txt @@ -11,8 +11,9 @@ add_mlir_conversion_library(MLIRComplexToLibm Core LINK_LIBS PUBLIC + MLIRComplexDialect MLIRDialectUtils MLIRFuncDialect - MLIRComplexDialect + MLIRPass MLIRTransformUtils ) diff --git a/mlir/lib/Conversion/MathToLibm/CMakeLists.txt b/mlir/lib/Conversion/MathToLibm/CMakeLists.txt index 0e7c6f075603a..61c46e9bfe250 100644 --- a/mlir/lib/Conversion/MathToLibm/CMakeLists.txt +++ b/mlir/lib/Conversion/MathToLibm/CMakeLists.txt @@ -15,6 +15,7 @@ add_mlir_conversion_library(MLIRMathToLibm MLIRDialectUtils MLIRFuncDialect MLIRMathDialect + MLIRPass MLIRTransformUtils MLIRVectorDialect MLIRVectorUtils diff --git a/mlir/lib/Dialect/Bufferization/TransformOps/CMakeLists.txt b/mlir/lib/Dialect/Bufferization/TransformOps/CMakeLists.txt index 7f7b348b17ae6..be5eb73b91229 100644 --- a/mlir/lib/Dialect/Bufferization/TransformOps/CMakeLists.txt +++ b/mlir/lib/Dialect/Bufferization/TransformOps/CMakeLists.txt @@ -14,7 +14,6 @@ add_mlir_dialect_library(MLIRBufferizationTransformOps MLIRFunctionInterfaces MLIRLinalgDialect MLIRParser - MLIRPDLDialect MLIRSideEffectInterfaces MLIRTransformDialect ) diff --git a/mlir/lib/IR/CMakeLists.txt b/mlir/lib/IR/CMakeLists.txt index 21bba11b85117..c38ce6c058a00 100644 --- a/mlir/lib/IR/CMakeLists.txt +++ b/mlir/lib/IR/CMakeLists.txt @@ -1,3 +1,9 @@ +if(MLIR_ENABLE_PDL_IN_PATTERNMATCH) + set(pdl_src + PDL/PDLPatternMatch.cpp + ) +endif() + add_mlir_library(MLIRIR AffineExpr.cpp AffineMap.cpp @@ -36,6 +42,7 @@ add_mlir_library(MLIRIR ValueRange.cpp Verifier.cpp Visitors.cpp + ${pdl_src} ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/IR @@ -61,3 +68,4 @@ add_mlir_library(MLIRIR LINK_LIBS PUBLIC MLIRSupport ) + diff --git a/mlir/lib/IR/PDL/PDLPatternMatch.cpp 
b/mlir/lib/IR/PDL/PDLPatternMatch.cpp
new file mode 100644
index 0000000000000..da07cc462a5a1
--- /dev/null
+++ b/mlir/lib/IR/PDL/PDLPatternMatch.cpp
@@ -0,0 +1,133 @@
+//===- PDLPatternMatch.cpp - Base classes for PDL pattern match
+//------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/IR/IRMapping.h"
+#include "mlir/IR/Iterators.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/RegionKindInterface.h"
+
+using namespace mlir;
+
+//===----------------------------------------------------------------------===//
+// PDLValue
+//===----------------------------------------------------------------------===//
+
+void PDLValue::print(raw_ostream &os) const {
+  if (!value) {
+    os << "<NULL-PDLValue>";
+    return;
+  }
+  switch (kind) {
+  case Kind::Attribute:
+    os << cast<Attribute>();
+    break;
+  case Kind::Operation:
+    os << *cast<Operation *>();
+    break;
+  case Kind::Type:
+    os << cast<Type>();
+    break;
+  case Kind::TypeRange:
+    llvm::interleaveComma(cast<TypeRange>(), os);
+    break;
+  case Kind::Value:
+    os << cast<Value>();
+    break;
+  case Kind::ValueRange:
+    llvm::interleaveComma(cast<ValueRange>(), os);
+    break;
+  }
+}
+
+void PDLValue::print(raw_ostream &os, Kind kind) {
+  switch (kind) {
+  case Kind::Attribute:
+    os << "Attribute";
+    break;
+  case Kind::Operation:
+    os << "Operation";
+    break;
+  case Kind::Type:
+    os << "Type";
+    break;
+  case Kind::TypeRange:
+    os << "TypeRange";
+    break;
+  case Kind::Value:
+    os << "Value";
+    break;
+  case Kind::ValueRange:
+    os << "ValueRange";
+    break;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// PDLPatternModule
+//===----------------------------------------------------------------------===//
+
+void PDLPatternModule::mergeIn(PDLPatternModule &&other) {
+  // Ignore the other module if it has no patterns.
+  if (!other.pdlModule)
+    return;
+
+  // Steal the functions and config of the other module.
+  for (auto &it : other.constraintFunctions)
+    registerConstraintFunction(it.first(), std::move(it.second));
+  for (auto &it : other.rewriteFunctions)
+    registerRewriteFunction(it.first(), std::move(it.second));
+  for (auto &it : other.configs)
+    configs.emplace_back(std::move(it));
+  for (auto &it : other.configMap)
+    configMap.insert(it);
+
+  // Steal the other state if we have no patterns.
+  if (!pdlModule) {
+    pdlModule = std::move(other.pdlModule);
+    return;
+  }
+
+  // Merge the pattern operations from the other module into this one.
+  Block *block = pdlModule->getBody();
+  block->getOperations().splice(block->end(),
+                                other.pdlModule->getBody()->getOperations());
+}
+
+void PDLPatternModule::attachConfigToPatterns(ModuleOp module,
+                                              PDLPatternConfigSet &configSet) {
+  // Attach the configuration to the symbols within the module. We only add
+  // to symbols to avoid hardcoding any specific operation names here (given
+  // that we don't depend on any PDL dialect). We can't use
+  // cast<SymbolOpInterface> here because patterns may be optional symbols.
+ module->walk([&](Operation *op) { + if (op->hasTrait()) + configMap[op] = &configSet; + }); +} + +//===----------------------------------------------------------------------===// +// Function Registry + +void PDLPatternModule::registerConstraintFunction( + StringRef name, PDLConstraintFunction constraintFn) { + // TODO: Is it possible to diagnose when `name` is already registered to + // a function that is not equivalent to `constraintFn`? + // Allow existing mappings in the case multiple patterns depend on the same + // constraint. + constraintFunctions.try_emplace(name, std::move(constraintFn)); +} + +void PDLPatternModule::registerRewriteFunction(StringRef name, + PDLRewriteFunction rewriteFn) { + // TODO: Is it possible to diagnose when `name` is already registered to + // a function that is not equivalent to `rewriteFn`? + // Allow existing mappings in the case multiple patterns depend on the same + // rewrite. + rewriteFunctions.try_emplace(name, std::move(rewriteFn)); +} diff --git a/mlir/lib/IR/PatternMatch.cpp b/mlir/lib/IR/PatternMatch.cpp index 5e9b9b2a810a4..5e788cdb4897d 100644 --- a/mlir/lib/IR/PatternMatch.cpp +++ b/mlir/lib/IR/PatternMatch.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "mlir/IR/PatternMatch.h" +#include "mlir/Config/mlir-config.h" #include "mlir/IR/IRMapping.h" #include "mlir/IR/Iterators.h" #include "mlir/IR/RegionKindInterface.h" @@ -97,124 +98,6 @@ LogicalResult RewritePattern::match(Operation *op) const { /// Out-of-line vtable anchor. void RewritePattern::anchor() {} -//===----------------------------------------------------------------------===// -// PDLValue -//===----------------------------------------------------------------------===// - -void PDLValue::print(raw_ostream &os) const { - if (!value) { - os << ""; - return; - } - switch (kind) { - case Kind::Attribute: - os << cast(); - break; - case Kind::Operation: - os << *cast(); - break; - case Kind::Type: - os << cast(); - break; - case Kind::TypeRange: - llvm::interleaveComma(cast(), os); - break; - case Kind::Value: - os << cast(); - break; - case Kind::ValueRange: - llvm::interleaveComma(cast(), os); - break; - } -} - -void PDLValue::print(raw_ostream &os, Kind kind) { - switch (kind) { - case Kind::Attribute: - os << "Attribute"; - break; - case Kind::Operation: - os << "Operation"; - break; - case Kind::Type: - os << "Type"; - break; - case Kind::TypeRange: - os << "TypeRange"; - break; - case Kind::Value: - os << "Value"; - break; - case Kind::ValueRange: - os << "ValueRange"; - break; - } -} - -//===----------------------------------------------------------------------===// -// PDLPatternModule -//===----------------------------------------------------------------------===// - -void PDLPatternModule::mergeIn(PDLPatternModule &&other) { - // Ignore the other module if it has no patterns. - if (!other.pdlModule) - return; - - // Steal the functions and config of the other module. - for (auto &it : other.constraintFunctions) - registerConstraintFunction(it.first(), std::move(it.second)); - for (auto &it : other.rewriteFunctions) - registerRewriteFunction(it.first(), std::move(it.second)); - for (auto &it : other.configs) - configs.emplace_back(std::move(it)); - for (auto &it : other.configMap) - configMap.insert(it); - - // Steal the other state if we have no patterns. - if (!pdlModule) { - pdlModule = std::move(other.pdlModule); - return; - } - - // Merge the pattern operations from the other module into this one. 
- Block *block = pdlModule->getBody(); - block->getOperations().splice(block->end(), - other.pdlModule->getBody()->getOperations()); -} - -void PDLPatternModule::attachConfigToPatterns(ModuleOp module, - PDLPatternConfigSet &configSet) { - // Attach the configuration to the symbols within the module. We only add - // to symbols to avoid hardcoding any specific operation names here (given - // that we don't depend on any PDL dialect). We can't use - // cast here because patterns may be optional symbols. - module->walk([&](Operation *op) { - if (op->hasTrait()) - configMap[op] = &configSet; - }); -} - -//===----------------------------------------------------------------------===// -// Function Registry - -void PDLPatternModule::registerConstraintFunction( - StringRef name, PDLConstraintFunction constraintFn) { - // TODO: Is it possible to diagnose when `name` is already registered to - // a function that is not equivalent to `constraintFn`? - // Allow existing mappings in the case multiple patterns depend on the same - // constraint. - constraintFunctions.try_emplace(name, std::move(constraintFn)); -} - -void PDLPatternModule::registerRewriteFunction(StringRef name, - PDLRewriteFunction rewriteFn) { - // TODO: Is it possible to diagnose when `name` is already registered to - // a function that is not equivalent to `rewriteFn`? - // Allow existing mappings in the case multiple patterns depend on the same - // rewrite. - rewriteFunctions.try_emplace(name, std::move(rewriteFn)); -} - //===----------------------------------------------------------------------===// // RewriterBase //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Rewrite/ByteCode.h b/mlir/lib/Rewrite/ByteCode.h index 4d43fe636bd1f..4aceac7ed3a4c 100644 --- a/mlir/lib/Rewrite/ByteCode.h +++ b/mlir/lib/Rewrite/ByteCode.h @@ -16,6 +16,8 @@ #include "mlir/IR/PatternMatch.h" +#if MLIR_ENABLE_PDL_IN_PATTERNMATCH + namespace mlir { namespace pdl_interp { class RecordMatchOp; @@ -224,4 +226,38 @@ class PDLByteCode { } // namespace detail } // namespace mlir +#else + +namespace mlir::detail { + +class PDLByteCodeMutableState { +public: + void cleanupAfterMatchAndRewrite() {} + void updatePatternBenefit(unsigned patternIndex, PatternBenefit benefit) {} +}; + +class PDLByteCodePattern : public Pattern {}; + +class PDLByteCode { +public: + struct MatchResult { + const PDLByteCodePattern *pattern = nullptr; + PatternBenefit benefit; + }; + + void initializeMutableState(PDLByteCodeMutableState &state) const {} + void match(Operation *op, PatternRewriter &rewriter, + SmallVectorImpl &matches, + PDLByteCodeMutableState &state) const {} + LogicalResult rewrite(PatternRewriter &rewriter, const MatchResult &match, + PDLByteCodeMutableState &state) const { + return failure(); + } + ArrayRef getPatterns() const { return {}; } +}; + +} // namespace mlir::detail + +#endif // MLIR_ENABLE_PDL_IN_PATTERNMATCH + #endif // MLIR_REWRITE_BYTECODE_H_ diff --git a/mlir/lib/Rewrite/CMakeLists.txt b/mlir/lib/Rewrite/CMakeLists.txt index e0395be6cd6f5..a6c39406aa4b3 100644 --- a/mlir/lib/Rewrite/CMakeLists.txt +++ b/mlir/lib/Rewrite/CMakeLists.txt @@ -1,5 +1,6 @@ +set(LLVM_OPTIONAL_SOURCES ByteCode.cpp) + add_mlir_library(MLIRRewrite - ByteCode.cpp FrozenRewritePatternSet.cpp PatternApplicator.cpp @@ -11,8 +12,31 @@ add_mlir_library(MLIRRewrite LINK_LIBS PUBLIC MLIRIR - MLIRPDLDialect - MLIRPDLInterpDialect - MLIRPDLToPDLInterp MLIRSideEffectInterfaces ) + +if(MLIR_ENABLE_PDL_IN_PATTERNMATCH) + 
add_mlir_library(MLIRRewritePDL + ByteCode.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Rewrite + + DEPENDS + mlir-generic-headers + + LINK_LIBS PUBLIC + MLIRIR + MLIRPDLDialect + MLIRPDLInterpDialect + MLIRPDLToPDLInterp + MLIRSideEffectInterfaces + ) + + target_link_libraries(MLIRRewrite PUBLIC + MLIRPDLDialect + MLIRPDLInterpDialect + MLIRPDLToPDLInterp + MLIRRewritePDL) +endif() + diff --git a/mlir/lib/Rewrite/FrozenRewritePatternSet.cpp b/mlir/lib/Rewrite/FrozenRewritePatternSet.cpp index 43840d1e8cec2..17fe02df9f66c 100644 --- a/mlir/lib/Rewrite/FrozenRewritePatternSet.cpp +++ b/mlir/lib/Rewrite/FrozenRewritePatternSet.cpp @@ -8,8 +8,6 @@ #include "mlir/Rewrite/FrozenRewritePatternSet.h" #include "ByteCode.h" -#include "mlir/Conversion/PDLToPDLInterp/PDLToPDLInterp.h" -#include "mlir/Dialect/PDL/IR/PDLOps.h" #include "mlir/Interfaces/SideEffectInterfaces.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" @@ -17,6 +15,11 @@ using namespace mlir; +// Include the PDL rewrite support. +#if MLIR_ENABLE_PDL_IN_PATTERNMATCH +#include "mlir/Conversion/PDLToPDLInterp/PDLToPDLInterp.h" +#include "mlir/Dialect/PDL/IR/PDLOps.h" + static LogicalResult convertPDLToPDLInterp(ModuleOp pdlModule, DenseMap &configMap) { @@ -48,6 +51,7 @@ convertPDLToPDLInterp(ModuleOp pdlModule, pdlModule.getBody()->walk(simplifyFn); return success(); } +#endif // MLIR_ENABLE_PDL_IN_PATTERNMATCH //===----------------------------------------------------------------------===// // FrozenRewritePatternSet @@ -121,6 +125,7 @@ FrozenRewritePatternSet::FrozenRewritePatternSet( impl->nativeAnyOpPatterns.push_back(std::move(pat)); } +#if MLIR_ENABLE_PDL_IN_PATTERNMATCH // Generate the bytecode for the PDL patterns if any were provided. PDLPatternModule &pdlPatterns = patterns.getPDLPatterns(); ModuleOp pdlModule = pdlPatterns.getModule(); @@ -137,6 +142,7 @@ FrozenRewritePatternSet::FrozenRewritePatternSet( pdlModule, pdlPatterns.takeConfigs(), configMap, pdlPatterns.takeConstraintFunctions(), pdlPatterns.takeRewriteFunctions()); +#endif // MLIR_ENABLE_PDL_IN_PATTERNMATCH } FrozenRewritePatternSet::~FrozenRewritePatternSet() = default; diff --git a/mlir/lib/Rewrite/PatternApplicator.cpp b/mlir/lib/Rewrite/PatternApplicator.cpp index 08d6ee618ac69..0064eb84aba84 100644 --- a/mlir/lib/Rewrite/PatternApplicator.cpp +++ b/mlir/lib/Rewrite/PatternApplicator.cpp @@ -152,7 +152,6 @@ LogicalResult PatternApplicator::matchAndRewrite( // Find the next pattern with the highest benefit. const Pattern *bestPattern = nullptr; unsigned *bestPatternIt = &opIt; - const PDLByteCode::MatchResult *pdlMatch = nullptr; /// Operation specific patterns. if (opIt < opE) @@ -164,6 +163,8 @@ LogicalResult PatternApplicator::matchAndRewrite( bestPatternIt = &anyIt; bestPattern = anyOpPatterns[anyIt]; } + + const PDLByteCode::MatchResult *pdlMatch = nullptr; /// PDL patterns. 
if (pdlIt < pdlE && (!bestPattern || bestPattern->getBenefit() < pdlMatches[pdlIt].benefit)) { @@ -171,6 +172,7 @@ LogicalResult PatternApplicator::matchAndRewrite( pdlMatch = &pdlMatches[pdlIt]; bestPattern = pdlMatch->pattern; } + if (!bestPattern) break; diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 4d2afe462b928..85433d088dcbf 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "mlir/Transforms/DialectConversion.h" +#include "mlir/Config/mlir-config.h" #include "mlir/IR/Block.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinOps.h" @@ -3312,6 +3313,7 @@ auto ConversionTarget::getOpInfo(OperationName op) const return std::nullopt; } +#if MLIR_ENABLE_PDL_IN_PATTERNMATCH //===----------------------------------------------------------------------===// // PDL Configuration //===----------------------------------------------------------------------===// @@ -3382,6 +3384,7 @@ void mlir::registerConversionPDLFunctions(RewritePatternSet &patterns) { return std::move(remappedTypes); }); } +#endif // MLIR_ENABLE_PDL_IN_PATTERNMATCH //===----------------------------------------------------------------------===// // Op Conversion Entry Points diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt index 3f312164cb1f3..7ec4c8f0963a2 100644 --- a/mlir/test/CMakeLists.txt +++ b/mlir/test/CMakeLists.txt @@ -97,16 +97,13 @@ set(MLIR_TEST_DEPENDS mlir-capi-ir-test mlir-capi-llvm-test mlir-capi-pass-test - mlir-capi-pdl-test mlir-capi-quant-test mlir-capi-sparse-tensor-test mlir-capi-transform-test mlir-capi-translation-test mlir-linalg-ods-yaml-gen mlir-lsp-server - mlir-pdll-lsp-server mlir-opt - mlir-pdll mlir-query mlir-reduce mlir-tblgen @@ -115,6 +112,12 @@ set(MLIR_TEST_DEPENDS tblgen-to-irdl ) +set(MLIR_TEST_DEPENDS ${MLIR_TEST_DEPENDS} + mlir-capi-pdl-test + mlir-pdll-lsp-server + mlir-pdll + ) + # The native target may not be enabled, in this case we won't # run tests that involves executing on the host: do not build # useless binaries. 
@@ -159,9 +162,10 @@ if(LLVM_BUILD_EXAMPLES) toyc-ch3 toyc-ch4 toyc-ch5 + ) + list(APPEND MLIR_TEST_DEPENDS transform-opt-ch2 transform-opt-ch3 - mlir-minimal-opt ) if(MLIR_ENABLE_EXECUTION_ENGINE) list(APPEND MLIR_TEST_DEPENDS diff --git a/mlir/test/lib/Rewrite/CMakeLists.txt b/mlir/test/lib/Rewrite/CMakeLists.txt index fd5d5d5861601..f8926aa8e4870 100644 --- a/mlir/test/lib/Rewrite/CMakeLists.txt +++ b/mlir/test/lib/Rewrite/CMakeLists.txt @@ -1,16 +1,17 @@ -# Exclude tests from libMLIR.so -add_mlir_library(MLIRTestRewrite - TestPDLByteCode.cpp +if (MLIR_ENABLE_PDL_IN_PATTERNMATCH) + # Exclude tests from libMLIR.so + add_mlir_library(MLIRTestRewrite + TestPDLByteCode.cpp - EXCLUDE_FROM_LIBMLIR + EXCLUDE_FROM_LIBMLIR - ADDITIONAL_HEADER_DIRS - ${MLIR_MAIN_INCLUDE_DIR}/mlir/Rewrite - - LINK_LIBS PUBLIC - MLIRIR - MLIRPass - MLIRSupport - MLIRTransformUtils - ) + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Rewrite + LINK_LIBS PUBLIC + MLIRIR + MLIRPass + MLIRSupport + MLIRTransformUtils + ) +endif() diff --git a/mlir/test/lib/Tools/PDLL/CMakeLists.txt b/mlir/test/lib/Tools/PDLL/CMakeLists.txt index 5ad13062e2a6c..408c29ea35ec3 100644 --- a/mlir/test/lib/Tools/PDLL/CMakeLists.txt +++ b/mlir/test/lib/Tools/PDLL/CMakeLists.txt @@ -23,6 +23,8 @@ add_mlir_library(MLIRTestPDLL MLIRCastInterfaces MLIRIR MLIRPass + MLIRPDLInterpDialect + MLIRPDLDialect MLIRSupport MLIRTestDialect MLIRTransformUtils diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt index e032ce7200fbf..2a3a8608db544 100644 --- a/mlir/test/lib/Transforms/CMakeLists.txt +++ b/mlir/test/lib/Transforms/CMakeLists.txt @@ -1,3 +1,8 @@ +set(LLVM_OPTIONAL_SOURCES + TestDialectConversion.cpp) +set(MLIRTestTransformsPDLDep) +set(MLIRTestTransformsPDLSrc) +if(MLIR_ENABLE_PDL_IN_PATTERNMATCH) add_mlir_pdll_library(MLIRTestDialectConversionPDLLPatternsIncGen TestDialectConversion.pdll TestDialectConversionPDLLPatterns.h.inc @@ -6,17 +11,22 @@ add_mlir_pdll_library(MLIRTestDialectConversionPDLLPatternsIncGen ${CMAKE_CURRENT_SOURCE_DIR}/../Dialect/Test ${CMAKE_CURRENT_BINARY_DIR}/../Dialect/Test ) + set(MLIRTestTransformsPDLSrc + TestDialectConversion.cpp) + set(MLIRTestTransformsPDLDep + MLIRTestDialectConversionPDLLPatternsIncGen) +endif() # Exclude tests from libMLIR.so add_mlir_library(MLIRTestTransforms TestCommutativityUtils.cpp TestConstantFold.cpp TestControlFlowSink.cpp - TestDialectConversion.cpp TestInlining.cpp TestIntRangeInference.cpp TestMakeIsolatedFromAbove.cpp TestTopologicalSort.cpp + ${MLIRTestTransformsPDLSrc} EXCLUDE_FROM_LIBMLIR @@ -24,7 +34,7 @@ add_mlir_library(MLIRTestTransforms ${MLIR_MAIN_INCLUDE_DIR}/mlir/Transforms DEPENDS - MLIRTestDialectConversionPDLLPatternsIncGen + ${MLIRTestTransformsPDLDep} LINK_LIBS PUBLIC MLIRAnalysis diff --git a/mlir/tools/mlir-lsp-server/CMakeLists.txt b/mlir/tools/mlir-lsp-server/CMakeLists.txt index e90ccf17af17f..0134b54eef1b0 100644 --- a/mlir/tools/mlir-lsp-server/CMakeLists.txt +++ b/mlir/tools/mlir-lsp-server/CMakeLists.txt @@ -21,10 +21,17 @@ if(MLIR_INCLUDE_TESTS) MLIRTestIR MLIRTestPass MLIRTestReducer - MLIRTestRewrite - MLIRTestTransformDialect - MLIRTestTransforms ) + set(test_libs + ${test_libs} + MLIRTestTransformDialect + MLIRTestTransforms) + + if (MLIR_ENABLE_PDL_IN_PATTERNMATCH) + set(test_libs + ${test_libs} + MLIRTestRewrite) + endif() endif() set(LIBS diff --git a/mlir/tools/mlir-opt/CMakeLists.txt b/mlir/tools/mlir-opt/CMakeLists.txt index b6ada66d32188..ce2f5bf4094a5 100644 --- 
a/mlir/tools/mlir-opt/CMakeLists.txt +++ b/mlir/tools/mlir-opt/CMakeLists.txt @@ -38,16 +38,24 @@ if(MLIR_INCLUDE_TESTS) MLIRTestIR MLIRTestOneToNTypeConversionPass MLIRTestPass - MLIRTestPDLL MLIRTestReducer - MLIRTestRewrite - MLIRTestTransformDialect MLIRTestTransforms MLIRTilingInterfaceTestPasses MLIRVectorTestPasses MLIRTestVectorToSPIRV MLIRLLVMTestPasses ) + set(test_libs ${test_libs} + MLIRTestPDLL + MLIRTestTransformDialect + ) + + if (MLIR_ENABLE_PDL_IN_PATTERNMATCH) + set(test_libs ${test_libs} + MLIRTestPDLL + MLIRTestRewrite + ) + endif() endif() set(LIBS diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp index f7a5b3183b50b..3fce7a7eea9ec 100644 --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -85,7 +85,6 @@ void registerTestDataLayoutQuery(); void registerTestDeadCodeAnalysisPass(); void registerTestDecomposeCallGraphTypes(); void registerTestDiagnosticsPass(); -void registerTestDialectConversionPasses(); void registerTestDominancePass(); void registerTestDynamicPipelinePass(); void registerTestEmulateNarrowTypePass(); @@ -124,8 +123,6 @@ void registerTestNextAccessPass(); void registerTestOneToNTypeConversionPass(); void registerTestOpaqueLoc(); void registerTestPadFusion(); -void registerTestPDLByteCodePass(); -void registerTestPDLLPasses(); void registerTestPreparationPassWithAllowedMemrefResults(); void registerTestRecursiveTypesPass(); void registerTestSCFUtilsPass(); @@ -142,13 +139,18 @@ void registerTestWrittenToPass(); void registerTestVectorLowerings(); void registerTestVectorReductionToSPIRVDotProd(); void registerTestNvgpuLowerings(); +#if MLIR_ENABLE_PDL_IN_PATTERNMATCH +void registerTestDialectConversionPasses(); +void registerTestPDLByteCodePass(); +void registerTestPDLLPasses(); +#endif } // namespace test } // namespace mlir namespace test { void registerTestDialect(DialectRegistry &); -void registerTestTransformDialectExtension(DialectRegistry &); void registerTestDynDialect(DialectRegistry &); +void registerTestTransformDialectExtension(DialectRegistry &); } // namespace test #ifdef MLIR_INCLUDE_TESTS @@ -202,7 +204,6 @@ void registerTestPasses() { mlir::test::registerTestConstantFold(); mlir::test::registerTestControlFlowSink(); mlir::test::registerTestDiagnosticsPass(); - mlir::test::registerTestDialectConversionPasses(); mlir::test::registerTestDecomposeCallGraphTypes(); mlir::test::registerTestDataLayoutPropagation(); mlir::test::registerTestDataLayoutQuery(); @@ -243,8 +244,6 @@ void registerTestPasses() { mlir::test::registerTestOneToNTypeConversionPass(); mlir::test::registerTestOpaqueLoc(); mlir::test::registerTestPadFusion(); - mlir::test::registerTestPDLByteCodePass(); - mlir::test::registerTestPDLLPasses(); mlir::test::registerTestRecursiveTypesPass(); mlir::test::registerTestSCFUtilsPass(); mlir::test::registerTestSCFWhileOpBuilderPass(); @@ -260,6 +259,11 @@ void registerTestPasses() { mlir::test::registerTestVectorReductionToSPIRVDotProd(); mlir::test::registerTestNvgpuLowerings(); mlir::test::registerTestWrittenToPass(); +#if MLIR_ENABLE_PDL_IN_PATTERNMATCH + mlir::test::registerTestDialectConversionPasses(); + mlir::test::registerTestPDLByteCodePass(); + mlir::test::registerTestPDLLPasses(); +#endif } #endif diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 2a56b2d6f0373..2a72bf965e544 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ 
-35,6 +35,7 @@ expand_template( substitutions = { "#cmakedefine01 MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS": "#define MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS 0", "#cmakedefine MLIR_GREEDY_REWRITE_RANDOMIZER_SEED ${MLIR_GREEDY_REWRITE_RANDOMIZER_SEED}": "/* #undef MLIR_GREEDY_REWRITE_RANDOMIZER_SEED */", + "#cmakedefine01 MLIR_ENABLE_PDL_IN_PATTERNMATCH": "#define MLIR_ENABLE_PDL_IN_PATTERNMATCH 1", }, template = "include/mlir/Config/mlir-config.h.cmake", ) @@ -318,11 +319,13 @@ cc_library( srcs = glob([ "lib/IR/*.cpp", "lib/IR/*.h", + "lib/IR/PDL/*.cpp", "lib/Bytecode/Reader/*.h", "lib/Bytecode/Writer/*.h", "lib/Bytecode/*.h", ]) + [ "lib/Bytecode/BytecodeOpInterface.cpp", + "include/mlir/IR/PDLPatternMatch.h.inc", ], hdrs = glob([ "include/mlir/IR/*.h", @@ -345,6 +348,7 @@ cc_library( ":BuiltinTypesIncGen", ":BytecodeOpInterfaceIncGen", ":CallOpInterfacesIncGen", + ":config", ":DataLayoutInterfacesIncGen", ":InferTypeOpInterfaceIncGen", ":OpAsmInterfaceIncGen", From 42ec976184acd40436acd7104ad715c60ca3e7ed Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Wed, 3 Jan 2024 21:12:28 -0800 Subject: [PATCH 190/313] [clang-format] Optimize processing .clang-format-ignore files (#76733) Reuse the patterns governing the previous input file being formatted if the current input file is from the same directory. --- clang/docs/ClangFormat.rst | 6 ++- clang/test/Format/clang-format-ignore.cpp | 25 ++++++--- clang/tools/clang-format/ClangFormat.cpp | 65 ++++++++++++++++------- 3 files changed, 69 insertions(+), 27 deletions(-) diff --git a/clang/docs/ClangFormat.rst b/clang/docs/ClangFormat.rst index 8d4017b29fb8e..819d9ee9f9cde 100644 --- a/clang/docs/ClangFormat.rst +++ b/clang/docs/ClangFormat.rst @@ -131,6 +131,9 @@ An easy way to create the ``.clang-format`` file is: Available style options are described in :doc:`ClangFormatStyleOptions`. +.clang-format-ignore +==================== + You can create ``.clang-format-ignore`` files to make ``clang-format`` ignore certain files. A ``.clang-format-ignore`` file consists of patterns of file path names. It has the following format: @@ -141,7 +144,8 @@ names. It has the following format: * A non-comment line is a single pattern. * The slash (``/``) is used as the directory separator. * A pattern is relative to the directory of the ``.clang-format-ignore`` file - (or the root directory if the pattern starts with a slash). + (or the root directory if the pattern starts with a slash). Patterns + containing drive names (e.g. ``C:``) are not supported. * Patterns follow the rules specified in `POSIX 2.13.1, 2.13.2, and Rule 1 of 2.13.3 `_. diff --git a/clang/test/Format/clang-format-ignore.cpp b/clang/test/Format/clang-format-ignore.cpp index 0d6396a64a668..5a2267b302d22 100644 --- a/clang/test/Format/clang-format-ignore.cpp +++ b/clang/test/Format/clang-format-ignore.cpp @@ -21,13 +21,26 @@ // RUN: touch .clang-format-ignore // RUN: clang-format -verbose foo.c foo.js 2> %t.stderr -// RUN: grep "Formatting \[1/2] foo.c" %t.stderr -// RUN: grep "Formatting \[2/2] foo.js" %t.stderr +// RUN: grep -Fx "Formatting [1/2] foo.c" %t.stderr +// RUN: grep -Fx "Formatting [2/2] foo.js" %t.stderr // RUN: echo "*.js" > .clang-format-ignore // RUN: clang-format -verbose foo.c foo.js 2> %t.stderr -// RUN: grep "Formatting \[1/2] foo.c" %t.stderr -// RUN: not grep "Formatting \[2/2] foo.js" %t.stderr +// RUN: grep -Fx "Formatting [1/2] foo.c" %t.stderr +// RUN: not grep -F foo.js %t.stderr -// RUN: cd ../../.. -// RUN: rm -rf %t.dir +// RUN: cd ../.. 
+// RUN: clang-format -verbose *.cc level1/*.c* level1/level2/foo.* 2> %t.stderr +// RUN: grep -x "Formatting \[1/5] .*foo\.c" %t.stderr +// RUN: not grep -F foo.js %t.stderr + +// RUN: rm .clang-format-ignore +// RUN: clang-format -verbose *.cc level1/*.c* level1/level2/foo.* 2> %t.stderr +// RUN: grep -x "Formatting \[1/5] .*foo\.cc" %t.stderr +// RUN: grep -x "Formatting \[2/5] .*bar\.cc" %t.stderr +// RUN: grep -x "Formatting \[3/5] .*baz\.c" %t.stderr +// RUN: grep -x "Formatting \[4/5] .*foo\.c" %t.stderr +// RUN: not grep -F foo.js %t.stderr + +// RUN: cd .. +// RUN: rm -r %t.dir diff --git a/clang/tools/clang-format/ClangFormat.cpp b/clang/tools/clang-format/ClangFormat.cpp index be34dbbe886a1..787e56a6eccc0 100644 --- a/clang/tools/clang-format/ClangFormat.cpp +++ b/clang/tools/clang-format/ClangFormat.cpp @@ -571,6 +571,11 @@ static int dumpConfig(bool IsSTDIN) { return 0; } +using String = SmallString<128>; +static String IgnoreDir; // Directory of .clang-format-ignore file. +static StringRef PrevDir; // Directory of previous `FilePath`. +static SmallVector Patterns; // Patterns in .clang-format-ignore file. + // Check whether `FilePath` is ignored according to the nearest // .clang-format-ignore file based on the rules below: // - A blank line is skipped. @@ -586,33 +591,50 @@ static bool isIgnored(StringRef FilePath) { if (!is_regular_file(FilePath)) return false; - using namespace llvm::sys::path; - SmallString<128> Path, AbsPath{FilePath}; + String Path; + String AbsPath{FilePath}; + using namespace llvm::sys::path; make_absolute(AbsPath); remove_dots(AbsPath, /*remove_dot_dot=*/true); - StringRef IgnoreDir{AbsPath}; - do { - IgnoreDir = parent_path(IgnoreDir); - if (IgnoreDir.empty()) + if (StringRef Dir{parent_path(AbsPath)}; PrevDir != Dir) { + PrevDir = Dir; + + for (;;) { + Path = Dir; + append(Path, ".clang-format-ignore"); + if (is_regular_file(Path)) + break; + Dir = parent_path(Dir); + if (Dir.empty()) + return false; + } + + IgnoreDir = convert_to_slash(Dir); + + std::ifstream IgnoreFile{Path.c_str()}; + if (!IgnoreFile.good()) return false; - Path = IgnoreDir; - append(Path, ".clang-format-ignore"); - } while (!is_regular_file(Path)); + Patterns.clear(); - std::ifstream IgnoreFile{Path.c_str()}; - if (!IgnoreFile.good()) - return false; + for (std::string Line; std::getline(IgnoreFile, Line);) { + if (const auto Pattern{StringRef{Line}.trim()}; + // Skip empty and comment lines. + !Pattern.empty() && Pattern[0] != '#') { + Patterns.push_back(Pattern); + } + } + } - const auto Pathname = convert_to_slash(AbsPath); - for (std::string Line; std::getline(IgnoreFile, Line);) { - auto Pattern = StringRef(Line).trim(); - if (Pattern.empty() || Pattern[0] == '#') - continue; + if (IgnoreDir.empty()) + return false; - const bool IsNegated = Pattern[0] == '!'; + const auto Pathname{convert_to_slash(AbsPath)}; + for (const auto &Pat : Patterns) { + const bool IsNegated = Pat[0] == '!'; + StringRef Pattern{Pat}; if (IsNegated) Pattern = Pattern.drop_front(); @@ -620,11 +642,14 @@ static bool isIgnored(StringRef FilePath) { continue; Pattern = Pattern.ltrim(); + + // `Pattern` is relative to `IgnoreDir` unless it starts with a slash. + // This doesn't support patterns containing drive names (e.g. `C:`). 
if (Pattern[0] != '/') { - Path = convert_to_slash(IgnoreDir); + Path = IgnoreDir; append(Path, Style::posix, Pattern); remove_dots(Path, /*remove_dot_dot=*/true, Style::posix); - Pattern = Path.str(); + Pattern = Path; } if (clang::format::matchFilePath(Pattern, Pathname) == !IsNegated) From 80889ae0297453476f600d6a8cf776f272dac461 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 3 Jan 2024 21:31:07 -0800 Subject: [PATCH 191/313] [RISCV] Remove RISCVISD::VSELECT_VL. (#76866) We can use RISCVISD::VMERGE_VL with an undef passthru operand. I had to rewrite the FMA patterns to handle both undef and non-undef cases so we can get the tail policy. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 66 +++++------ llvm/lib/Target/RISCV/RISCVISelLowering.h | 3 +- .../Target/RISCV/RISCVInstrInfoVVLPatterns.td | 103 ++++-------------- 3 files changed, 58 insertions(+), 114 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 27bb69dc9868c..c8a94adcd91c6 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2811,8 +2811,8 @@ static SDValue lowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG, SDValue SplatZero = DAG.getNode( RISCVISD::VMV_V_X_VL, DL, DstContainerVT, DAG.getUNDEF(DstContainerVT), DAG.getConstant(0, DL, Subtarget.getXLenVT()), VL); - Res = DAG.getNode(RISCVISD::VSELECT_VL, DL, DstContainerVT, IsNan, SplatZero, - Res, VL); + Res = DAG.getNode(RISCVISD::VMERGE_VL, DL, DstContainerVT, IsNan, SplatZero, + Res, DAG.getUNDEF(DstContainerVT), VL); if (DstVT.isFixedLengthVector()) Res = convertFromScalableVector(DstVT, Res, DAG, Subtarget); @@ -5401,8 +5401,8 @@ static SDValue lowerFMAXIMUM_FMINIMUM(SDValue Op, SelectionDAG &DAG, SDValue XIsNonNan = DAG.getNode(RISCVISD::SETCC_VL, DL, Mask.getValueType(), {X, X, DAG.getCondCode(ISD::SETOEQ), DAG.getUNDEF(ContainerVT), Mask, VL}); - NewY = - DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, XIsNonNan, Y, X, VL); + NewY = DAG.getNode(RISCVISD::VMERGE_VL, DL, ContainerVT, XIsNonNan, Y, X, + DAG.getUNDEF(ContainerVT), VL); } SDValue NewX = X; @@ -5410,8 +5410,8 @@ static SDValue lowerFMAXIMUM_FMINIMUM(SDValue Op, SelectionDAG &DAG, SDValue YIsNonNan = DAG.getNode(RISCVISD::SETCC_VL, DL, Mask.getValueType(), {Y, Y, DAG.getCondCode(ISD::SETOEQ), DAG.getUNDEF(ContainerVT), Mask, VL}); - NewX = - DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, YIsNonNan, X, Y, VL); + NewX = DAG.getNode(RISCVISD::VMERGE_VL, DL, ContainerVT, YIsNonNan, X, Y, + DAG.getUNDEF(ContainerVT), VL); } unsigned Opc = @@ -5528,7 +5528,6 @@ static unsigned getRISCVVLOp(SDValue Op) { return RISCVISD::VMXOR_VL; return RISCVISD::XOR_VL; case ISD::VP_SELECT: - return RISCVISD::VSELECT_VL; case ISD::VP_MERGE: return RISCVISD::VMERGE_VL; case ISD::VP_ASHR: @@ -5563,7 +5562,7 @@ static bool hasMergeOp(unsigned Opcode) { Opcode <= RISCVISD::LAST_RISCV_STRICTFP_OPCODE && "not a RISC-V target specific op"); static_assert(RISCVISD::LAST_VL_VECTOR_OP - RISCVISD::FIRST_VL_VECTOR_OP == - 125 && + 124 && RISCVISD::LAST_RISCV_STRICTFP_OPCODE - ISD::FIRST_TARGET_STRICTFP_OPCODE == 21 && @@ -5589,7 +5588,7 @@ static bool hasMaskOp(unsigned Opcode) { Opcode <= RISCVISD::LAST_RISCV_STRICTFP_OPCODE && "not a RISC-V target specific op"); static_assert(RISCVISD::LAST_VL_VECTOR_OP - RISCVISD::FIRST_VL_VECTOR_OP == - 125 && + 124 && RISCVISD::LAST_RISCV_STRICTFP_OPCODE - ISD::FIRST_TARGET_STRICTFP_OPCODE == 21 && @@ -7456,8 +7455,9 @@ SDValue 
RISCVTargetLowering::lowerVectorMaskExt(SDValue Op, SelectionDAG &DAG, DAG.getUNDEF(ContainerVT), SplatZero, VL); SplatTrueVal = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, DAG.getUNDEF(ContainerVT), SplatTrueVal, VL); - SDValue Select = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, CC, - SplatTrueVal, SplatZero, VL); + SDValue Select = + DAG.getNode(RISCVISD::VMERGE_VL, DL, ContainerVT, CC, SplatTrueVal, + SplatZero, DAG.getUNDEF(ContainerVT), VL); return convertFromScalableVector(VecVT, Select, DAG, Subtarget); } @@ -8240,8 +8240,8 @@ static SDValue lowerVectorIntrinsicScalars(SDValue Op, SelectionDAG &DAG, return Vec; // TAMU if (Policy == RISCVII::TAIL_AGNOSTIC) - return DAG.getNode(RISCVISD::VSELECT_VL, DL, VT, Mask, Vec, MaskedOff, - AVL); + return DAG.getNode(RISCVISD::VMERGE_VL, DL, VT, Mask, Vec, MaskedOff, + DAG.getUNDEF(VT), AVL); // TUMA or TUMU: Currently we always emit tumu policy regardless of tuma. // It's fine because vmerge does not care mask policy. return DAG.getNode(RISCVISD::VMERGE_VL, DL, VT, Mask, Vec, MaskedOff, @@ -8489,8 +8489,8 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, DAG.getNode(RISCVISD::SETCC_VL, DL, MaskVT, {VID, SplattedIdx, DAG.getCondCode(ISD::SETEQ), DAG.getUNDEF(MaskVT), Mask, VL}); - return DAG.getNode(RISCVISD::VSELECT_VL, DL, VT, SelectCond, SplattedVal, - Vec, VL); + return DAG.getNode(RISCVISD::VMERGE_VL, DL, VT, SelectCond, SplattedVal, + Vec, DAG.getUNDEF(VT), VL); } // EGS * EEW >= 128 bits case Intrinsic::riscv_vaesdf_vv: @@ -10243,8 +10243,8 @@ SDValue RISCVTargetLowering::lowerFixedLengthVectorSelectToRVV( SDLoc DL(Op); SDValue VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second; - SDValue Select = - DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, CC, Op1, Op2, VL); + SDValue Select = DAG.getNode(RISCVISD::VMERGE_VL, DL, ContainerVT, CC, Op1, + Op2, DAG.getUNDEF(ContainerVT), VL); return convertFromScalableVector(VT, Select, DAG, Subtarget); } @@ -10327,9 +10327,14 @@ SDValue RISCVTargetLowering::lowerVPOp(SDValue Op, SelectionDAG &DAG) const { Ops.push_back(DAG.getUNDEF(ContainerVT)); } else if (ISD::getVPExplicitVectorLengthIdx(Op.getOpcode()) == OpIdx.index()) { - // For VP_MERGE, copy the false operand instead of an undef value. - assert(Op.getOpcode() == ISD::VP_MERGE); - Ops.push_back(Ops.back()); + if (Op.getOpcode() == ISD::VP_MERGE) { + // For VP_MERGE, copy the false operand instead of an undef value. + Ops.push_back(Ops.back()); + } else { + assert(Op.getOpcode() == ISD::VP_SELECT); + // For VP_SELECT, add an undef value. + Ops.push_back(DAG.getUNDEF(ContainerVT)); + } } } // Pass through operands which aren't fixed-length vectors. @@ -10379,8 +10384,8 @@ SDValue RISCVTargetLowering::lowerVPExtMaskOp(SDValue Op, SDValue Splat = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, DAG.getUNDEF(ContainerVT), SplatValue, VL); - SDValue Result = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, Src, - Splat, ZeroSplat, VL); + SDValue Result = DAG.getNode(RISCVISD::VMERGE_VL, DL, ContainerVT, Src, Splat, + ZeroSplat, DAG.getUNDEF(ContainerVT), VL); if (!VT.isFixedLengthVector()) return Result; return convertFromScalableVector(VT, Result, DAG, Subtarget); @@ -10508,8 +10513,8 @@ SDValue RISCVTargetLowering::lowerVPFPIntConvOp(SDValue Op, RISCVISDExtOpc == RISCVISD::VZEXT_VL ? 
1 : -1, DL, XLenVT); SDValue OneSplat = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, IntVT, DAG.getUNDEF(IntVT), One, VL); - Src = DAG.getNode(RISCVISD::VSELECT_VL, DL, IntVT, Src, OneSplat, - ZeroSplat, VL); + Src = DAG.getNode(RISCVISD::VMERGE_VL, DL, IntVT, Src, OneSplat, + ZeroSplat, DAG.getUNDEF(IntVT), VL); } else if (DstEltSize > (2 * SrcEltSize)) { // Widen before converting. MVT IntVT = MVT::getVectorVT(MVT::getIntegerVT(DstEltSize / 2), @@ -10633,8 +10638,8 @@ RISCVTargetLowering::lowerVPSpliceExperimental(SDValue Op, SDValue SplatZeroOp1 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, DAG.getUNDEF(ContainerVT), DAG.getConstant(0, DL, XLenVT), EVL1); - Op1 = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, Op1, SplatOneOp1, - SplatZeroOp1, EVL1); + Op1 = DAG.getNode(RISCVISD::VMERGE_VL, DL, ContainerVT, Op1, SplatOneOp1, + SplatZeroOp1, DAG.getUNDEF(ContainerVT), EVL1); SDValue SplatOneOp2 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, DAG.getUNDEF(ContainerVT), @@ -10642,8 +10647,8 @@ RISCVTargetLowering::lowerVPSpliceExperimental(SDValue Op, SDValue SplatZeroOp2 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, DAG.getUNDEF(ContainerVT), DAG.getConstant(0, DL, XLenVT), EVL2); - Op2 = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, Op2, SplatOneOp2, - SplatZeroOp2, EVL2); + Op2 = DAG.getNode(RISCVISD::VMERGE_VL, DL, ContainerVT, Op2, SplatOneOp2, + SplatZeroOp2, DAG.getUNDEF(ContainerVT), EVL2); } int64_t ImmValue = cast(Offset)->getSExtValue(); @@ -10713,8 +10718,8 @@ RISCVTargetLowering::lowerVPReverseExperimental(SDValue Op, SDValue SplatZero = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, IndicesVT, DAG.getUNDEF(IndicesVT), DAG.getConstant(0, DL, XLenVT), EVL); - Op1 = DAG.getNode(RISCVISD::VSELECT_VL, DL, IndicesVT, Op1, SplatOne, - SplatZero, EVL); + Op1 = DAG.getNode(RISCVISD::VMERGE_VL, DL, IndicesVT, Op1, SplatOne, + SplatZero, DAG.getUNDEF(IndicesVT), EVL); } unsigned EltSize = GatherVT.getScalarSizeInBits(); @@ -18683,7 +18688,6 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(VWMACCSU_VL) NODE_NAME_CASE(VNSRL_VL) NODE_NAME_CASE(SETCC_VL) - NODE_NAME_CASE(VSELECT_VL) NODE_NAME_CASE(VMERGE_VL) NODE_NAME_CASE(VMAND_VL) NODE_NAME_CASE(VMOR_VL) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 58ed611efc83d..eacae8a700584 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -330,9 +330,8 @@ enum NodeType : unsigned { // operand is VL. SETCC_VL, - // Vector select with an additional VL operand. This operation is unmasked. - VSELECT_VL, // General vmerge node with mask, true, false, passthru, and vl operands. + // Tail agnostic vselect can be implemented by setting passthru to undef. VMERGE_VL, // Mask binary operators. 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index 5b50a4a78c018..ca9e37b9144b7 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -338,13 +338,6 @@ def riscv_vrgatherei16_vv_vl : SDNode<"RISCVISD::VRGATHEREI16_VV_VL", SDTCisSameNumEltsAs<0, 4>, SDTCisVT<5, XLenVT>]>>; -def SDT_RISCVSelect_VL : SDTypeProfile<1, 4, [ - SDTCisVec<0>, SDTCisVec<1>, SDTCisSameNumEltsAs<0, 1>, SDTCVecEltisVT<1, i1>, - SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>, SDTCisVT<4, XLenVT> -]>; - -def riscv_vselect_vl : SDNode<"RISCVISD::VSELECT_VL", SDT_RISCVSelect_VL>; - def SDT_RISCVVMERGE_VL : SDTypeProfile<1, 5, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisSameNumEltsAs<0, 1>, SDTCVecEltisVT<1, i1>, SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>, SDTCisSameAs<0, 4>, @@ -1722,21 +1715,21 @@ multiclass VPatMultiplyAccVL_VV_VX { (!cast(instruction_name#"_VX_"# suffix #"_MASK") vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW, TU_MU)>; - def : Pat<(riscv_vselect_vl (vti.Mask V0), + def : Pat<(riscv_vmerge_vl (vti.Mask V0), (vti.Vector (op vti.RegClass:$rd, (riscv_mul_vl_oneuse vti.RegClass:$rs1, vti.RegClass:$rs2, srcvalue, (vti.Mask true_mask), VLOpFrag), srcvalue, (vti.Mask true_mask), VLOpFrag)), - vti.RegClass:$rd, VLOpFrag), + vti.RegClass:$rd, undef, VLOpFrag), (!cast(instruction_name#"_VV_"# suffix #"_MASK") vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; - def : Pat<(riscv_vselect_vl (vti.Mask V0), + def : Pat<(riscv_vmerge_vl (vti.Mask V0), (vti.Vector (op vti.RegClass:$rd, (riscv_mul_vl_oneuse (SplatPat XLenVT:$rs1), vti.RegClass:$rs2, srcvalue, (vti.Mask true_mask), VLOpFrag), srcvalue, (vti.Mask true_mask), VLOpFrag)), - vti.RegClass:$rd, VLOpFrag), + vti.RegClass:$rd, undef, VLOpFrag), (!cast(instruction_name#"_VX_"# suffix #"_MASK") vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; @@ -1861,17 +1854,17 @@ multiclass VPatFPMulAccVL_VV_VF { (!cast(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK") vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW, TU_MU)>; - def : Pat<(riscv_vselect_vl (vti.Mask V0), + def : Pat<(riscv_vmerge_vl (vti.Mask V0), (vti.Vector (vop vti.RegClass:$rs1, vti.RegClass:$rs2, vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)), - vti.RegClass:$rd, VLOpFrag), + vti.RegClass:$rd, undef, VLOpFrag), (!cast(instruction_name#"_VV_"# suffix #"_MASK") vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; - def : Pat<(riscv_vselect_vl (vti.Mask V0), + def : Pat<(riscv_vmerge_vl (vti.Mask V0), (vti.Vector (vop (SplatFPOp vti.ScalarRegClass:$rs1), vti.RegClass:$rs2, vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)), - vti.RegClass:$rd, VLOpFrag), + vti.RegClass:$rd, undef, VLOpFrag), (!cast(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK") vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; @@ -1905,10 +1898,10 @@ multiclass VPatFPMulAccVL_VV_VF_RM { // RISCVInsertReadWriteCSR FRM_DYN, GPR:$vl, vti.Log2SEW, TU_MU)>; - def : Pat<(riscv_vselect_vl (vti.Mask V0), + def : Pat<(riscv_vmerge_vl (vti.Mask V0), (vti.Vector (vop vti.RegClass:$rs1, vti.RegClass:$rs2, vti.RegClass:$rd, (vti.Mask true_mask), 
VLOpFrag)), - vti.RegClass:$rd, VLOpFrag), + vti.RegClass:$rd, undef, VLOpFrag), (!cast(instruction_name#"_VV_"# suffix #"_MASK") vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, (vti.Mask V0), @@ -1916,10 +1909,10 @@ multiclass VPatFPMulAccVL_VV_VF_RM { // RISCVInsertReadWriteCSR FRM_DYN, GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; - def : Pat<(riscv_vselect_vl (vti.Mask V0), + def : Pat<(riscv_vmerge_vl (vti.Mask V0), (vti.Vector (vop (SplatFPOp vti.ScalarRegClass:$rs1), vti.RegClass:$rs2, vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)), - vti.RegClass:$rd, VLOpFrag), + vti.RegClass:$rd, undef, VLOpFrag), (!cast(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK") vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, (vti.Mask V0), @@ -2255,31 +2248,6 @@ foreach vtiTowti = AllWidenableIntVectors in { // 11.15. Vector Integer Merge Instructions foreach vti = AllIntegerVectors in { let Predicates = GetVTypePredicates.Predicates in { - def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask V0), - vti.RegClass:$rs1, - vti.RegClass:$rs2, - VLOpFrag)), - (!cast("PseudoVMERGE_VVM_"#vti.LMul.MX) - (vti.Vector (IMPLICIT_DEF)), - vti.RegClass:$rs2, vti.RegClass:$rs1, (vti.Mask V0), - GPR:$vl, vti.Log2SEW)>; - - def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask V0), - (SplatPat XLenVT:$rs1), - vti.RegClass:$rs2, - VLOpFrag)), - (!cast("PseudoVMERGE_VXM_"#vti.LMul.MX) - (vti.Vector (IMPLICIT_DEF)), - vti.RegClass:$rs2, GPR:$rs1, (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; - - def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask V0), - (SplatPat_simm5 simm5:$rs1), - vti.RegClass:$rs2, - VLOpFrag)), - (!cast("PseudoVMERGE_VIM_"#vti.LMul.MX) - (vti.Vector (IMPLICIT_DEF)), - vti.RegClass:$rs2, simm5:$rs1, (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; - def : Pat<(vti.Vector (riscv_vmerge_vl (vti.Mask V0), vti.RegClass:$rs1, vti.RegClass:$rs2, @@ -2534,33 +2502,6 @@ foreach fvti = AllFloatVectors in { // 13.15. 
Vector Floating-Point Merge Instruction defvar ivti = GetIntVTypeInfo.Vti; let Predicates = GetVTypePredicates.Predicates in { - def : Pat<(fvti.Vector (riscv_vselect_vl (fvti.Mask V0), - fvti.RegClass:$rs1, - fvti.RegClass:$rs2, - VLOpFrag)), - (!cast("PseudoVMERGE_VVM_"#fvti.LMul.MX) - (fvti.Vector (IMPLICIT_DEF)), - fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask V0), - GPR:$vl, fvti.Log2SEW)>; - - def : Pat<(fvti.Vector (riscv_vselect_vl (fvti.Mask V0), - (SplatFPOp (SelectFPImm (XLenVT GPR:$imm))), - fvti.RegClass:$rs2, - VLOpFrag)), - (!cast("PseudoVMERGE_VXM_"#fvti.LMul.MX) - (fvti.Vector (IMPLICIT_DEF)), - fvti.RegClass:$rs2, - GPR:$imm, - (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>; - - def : Pat<(fvti.Vector (riscv_vselect_vl (fvti.Mask V0), - (SplatFPOp (fvti.Scalar fpimm0)), - fvti.RegClass:$rs2, - VLOpFrag)), - (!cast("PseudoVMERGE_VIM_"#fvti.LMul.MX) - (fvti.Vector (IMPLICIT_DEF)), - fvti.RegClass:$rs2, 0, (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>; - def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0), fvti.RegClass:$rs1, fvti.RegClass:$rs2, @@ -2570,6 +2511,16 @@ foreach fvti = AllFloatVectors in { fvti.RegClass:$merge, fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>; + def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0), + (SplatFPOp (SelectFPImm (XLenVT GPR:$imm))), + fvti.RegClass:$rs2, + fvti.RegClass:$merge, + VLOpFrag)), + (!cast("PseudoVMERGE_VXM_"#fvti.LMul.MX) + fvti.RegClass:$merge, fvti.RegClass:$rs2, GPR:$imm, (fvti.Mask V0), + GPR:$vl, fvti.Log2SEW)>; + + def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0), (SplatFPOp (fvti.Scalar fpimm0)), fvti.RegClass:$rs2, @@ -2581,16 +2532,6 @@ foreach fvti = AllFloatVectors in { } let Predicates = GetVTypePredicates.Predicates in { - def : Pat<(fvti.Vector (riscv_vselect_vl (fvti.Mask V0), - (SplatFPOp fvti.ScalarRegClass:$rs1), - fvti.RegClass:$rs2, - VLOpFrag)), - (!cast("PseudoVFMERGE_V"#fvti.ScalarSuffix#"M_"#fvti.LMul.MX) - (fvti.Vector (IMPLICIT_DEF)), - fvti.RegClass:$rs2, - (fvti.Scalar fvti.ScalarRegClass:$rs1), - (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>; - def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0), (SplatFPOp fvti.ScalarRegClass:$rs1), fvti.RegClass:$rs2, From 0ce193708cb682f3ad742ad909d8a5346a505ac7 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Thu, 4 Jan 2024 15:11:28 +0800 Subject: [PATCH 192/313] [InstCombine] Refactor folding of commutative binops over select/phi/minmax (#76692) This patch cleans up the duplicate code for folding commutative binops over `select/phi/minmax`. 
Related commits: + select support: https://github.com/llvm/llvm-project/commit/88cc35b27e6c7966ab2463fa06d3dd970e88df64 + phi support: https://github.com/llvm/llvm-project/commit/8674a023bcacb677ce48b8831e2ae35b5aa2d8ef + minmax support: https://github.com/llvm/llvm-project/commit/624973806c5644ccfa84805319b5852edb68d48d --- .../InstCombine/InstCombineAddSub.cpp | 7 -- .../InstCombine/InstCombineCalls.cpp | 46 ++--------- .../InstCombine/InstCombineInternal.h | 23 ++---- .../InstCombine/InstCombineMulDivRem.cpp | 7 -- .../InstCombine/InstructionCombining.cpp | 77 +++++++++++-------- .../InstCombine/minmax-of-minmax.ll | 4 +- 6 files changed, 58 insertions(+), 106 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 556fde37efeb2..96b612254ca50 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -1666,13 +1666,6 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) { if (Instruction *Ashr = foldAddToAshr(I)) return Ashr; - // min(A, B) + max(A, B) => A + B. - if (match(&I, m_CombineOr(m_c_Add(m_SMax(m_Value(A), m_Value(B)), - m_c_SMin(m_Deferred(A), m_Deferred(B))), - m_c_Add(m_UMax(m_Value(A), m_Value(B)), - m_c_UMin(m_Deferred(A), m_Deferred(B)))))) - return BinaryOperator::CreateWithCopiedFlags(Instruction::Add, A, B, &I); - // (~X) + (~Y) --> -2 - (X + Y) { // To ensure we can save instructions we need to ensure that we consume both diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 43d4496571be5..3da9b89a6409c 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1536,11 +1536,11 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { } if (II->isCommutative()) { - if (Instruction *I = foldCommutativeIntrinsicOverSelects(*II)) - return I; - - if (Instruction *I = foldCommutativeIntrinsicOverPhis(*II)) - return I; + if (auto Pair = matchSymmetricPair(II->getOperand(0), II->getOperand(1))) { + replaceOperand(*II, 0, Pair->first); + replaceOperand(*II, 1, Pair->second); + return II; + } if (CallInst *NewCall = canonicalizeConstantArg0ToArg1(CI)) return NewCall; @@ -4246,39 +4246,3 @@ InstCombinerImpl::transformCallThroughTrampoline(CallBase &Call, Call.setCalledFunction(FTy, NestF); return &Call; } - -// op(select(%v, %x, %y), select(%v, %y, %x)) --> op(%x, %y) -Instruction * -InstCombinerImpl::foldCommutativeIntrinsicOverSelects(IntrinsicInst &II) { - assert(II.isCommutative()); - - Value *A, *B, *C; - if (match(II.getOperand(0), m_Select(m_Value(A), m_Value(B), m_Value(C))) && - match(II.getOperand(1), - m_Select(m_Specific(A), m_Specific(C), m_Specific(B)))) { - replaceOperand(II, 0, B); - replaceOperand(II, 1, C); - return &II; - } - - return nullptr; -} - -Instruction * -InstCombinerImpl::foldCommutativeIntrinsicOverPhis(IntrinsicInst &II) { - assert(II.isCommutative() && "Instruction should be commutative"); - - PHINode *LHS = dyn_cast(II.getOperand(0)); - PHINode *RHS = dyn_cast(II.getOperand(1)); - - if (!LHS || !RHS) - return nullptr; - - if (auto P = matchSymmetricPhiNodesPair(LHS, RHS)) { - replaceOperand(II, 0, P->first); - replaceOperand(II, 1, P->second); - return &II; - } - - return nullptr; -} diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 
bdaf7550b4b42..21c61bd990184 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -276,17 +276,15 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final bool transformConstExprCastCall(CallBase &Call); Instruction *transformCallThroughTrampoline(CallBase &Call, IntrinsicInst &Tramp); - Instruction *foldCommutativeIntrinsicOverSelects(IntrinsicInst &II); - // Match a pair of Phi Nodes like - // phi [a, BB0], [b, BB1] & phi [b, BB0], [a, BB1] - // Return the matched two operands. - std::optional> - matchSymmetricPhiNodesPair(PHINode *LHS, PHINode *RHS); - - // Tries to fold (op phi(a, b) phi(b, a)) -> (op a, b) - // while op is a commutative intrinsic call. - Instruction *foldCommutativeIntrinsicOverPhis(IntrinsicInst &II); + // Return (a, b) if (LHS, RHS) is known to be (a, b) or (b, a). + // Otherwise, return std::nullopt + // Currently it matches: + // - LHS = (select c, a, b), RHS = (select c, b, a) + // - LHS = (phi [a, BB0], [b, BB1]), RHS = (phi [b, BB0], [a, BB1]) + // - LHS = min(a, b), RHS = max(a, b) + std::optional> matchSymmetricPair(Value *LHS, + Value *RHS); Value *simplifyMaskedLoad(IntrinsicInst &II); Instruction *simplifyMaskedStore(IntrinsicInst &II); @@ -502,11 +500,6 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final /// X % (C0 * C1) Value *SimplifyAddWithRemainder(BinaryOperator &I); - // Tries to fold (Binop phi(a, b) phi(b, a)) -> (Binop a, b) - // while Binop is commutative. - Value *SimplifyPhiCommutativeBinaryOp(BinaryOperator &I, Value *LHS, - Value *RHS); - // Binary Op helper for select operations where the expression can be // efficiently reorganized. Value *SimplifySelectsFeedingBinaryOp(BinaryOperator &I, Value *LHS, diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index f0ea3d9fcad5d..e7f983a00e304 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -487,13 +487,6 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) { if (Instruction *Res = foldBinOpOfSelectAndCastOfSelectCondition(I)) return Res; - // min(X, Y) * max(X, Y) => X * Y. 
- if (match(&I, m_CombineOr(m_c_Mul(m_SMax(m_Value(X), m_Value(Y)), - m_c_SMin(m_Deferred(X), m_Deferred(Y))), - m_c_Mul(m_UMax(m_Value(X), m_Value(Y)), - m_c_UMin(m_Deferred(X), m_Deferred(Y)))))) - return BinaryOperator::CreateWithCopiedFlags(Instruction::Mul, X, Y, &I); - // (mul Op0 Op1): // if Log2(Op0) folds away -> // (shl Op1, Log2(Op0)) diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 351fc3b0174fc..f3181dc14792c 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -411,6 +411,14 @@ bool InstCombinerImpl::SimplifyAssociativeOrCommutative(BinaryOperator &I) { getComplexity(I.getOperand(1))) Changed = !I.swapOperands(); + if (I.isCommutative()) { + if (auto Pair = matchSymmetricPair(I.getOperand(0), I.getOperand(1))) { + replaceOperand(I, 0, Pair->first); + replaceOperand(I, 1, Pair->second); + Changed = true; + } + } + BinaryOperator *Op0 = dyn_cast(I.getOperand(0)); BinaryOperator *Op1 = dyn_cast(I.getOperand(1)); @@ -1096,8 +1104,8 @@ Value *InstCombinerImpl::foldUsingDistributiveLaws(BinaryOperator &I) { return SimplifySelectsFeedingBinaryOp(I, LHS, RHS); } -std::optional> -InstCombinerImpl::matchSymmetricPhiNodesPair(PHINode *LHS, PHINode *RHS) { +static std::optional> +matchSymmetricPhiNodesPair(PHINode *LHS, PHINode *RHS) { if (LHS->getParent() != RHS->getParent()) return std::nullopt; @@ -1123,25 +1131,41 @@ InstCombinerImpl::matchSymmetricPhiNodesPair(PHINode *LHS, PHINode *RHS) { return std::optional(std::pair(L0, R0)); } -Value *InstCombinerImpl::SimplifyPhiCommutativeBinaryOp(BinaryOperator &I, - Value *Op0, - Value *Op1) { - assert(I.isCommutative() && "Instruction should be commutative"); - - PHINode *LHS = dyn_cast(Op0); - PHINode *RHS = dyn_cast(Op1); - - if (!LHS || !RHS) - return nullptr; - - if (auto P = matchSymmetricPhiNodesPair(LHS, RHS)) { - Value *BI = Builder.CreateBinOp(I.getOpcode(), P->first, P->second); - if (auto *BO = dyn_cast(BI)) - BO->copyIRFlags(&I); - return BI; +std::optional> +InstCombinerImpl::matchSymmetricPair(Value *LHS, Value *RHS) { + Instruction *LHSInst = dyn_cast(LHS); + Instruction *RHSInst = dyn_cast(RHS); + if (!LHSInst || !RHSInst || LHSInst->getOpcode() != RHSInst->getOpcode()) + return std::nullopt; + switch (LHSInst->getOpcode()) { + case Instruction::PHI: + return matchSymmetricPhiNodesPair(cast(LHS), cast(RHS)); + case Instruction::Select: { + Value *Cond = LHSInst->getOperand(0); + Value *TrueVal = LHSInst->getOperand(1); + Value *FalseVal = LHSInst->getOperand(2); + if (Cond == RHSInst->getOperand(0) && TrueVal == RHSInst->getOperand(2) && + FalseVal == RHSInst->getOperand(1)) + return std::pair(TrueVal, FalseVal); + return std::nullopt; + } + case Instruction::Call: { + // Match min(a, b) and max(a, b) + MinMaxIntrinsic *LHSMinMax = dyn_cast(LHSInst); + MinMaxIntrinsic *RHSMinMax = dyn_cast(RHSInst); + if (LHSMinMax && RHSMinMax && + LHSMinMax->getPredicate() == + ICmpInst::getSwappedPredicate(RHSMinMax->getPredicate()) && + ((LHSMinMax->getLHS() == RHSMinMax->getLHS() && + LHSMinMax->getRHS() == RHSMinMax->getRHS()) || + (LHSMinMax->getLHS() == RHSMinMax->getRHS() && + LHSMinMax->getRHS() == RHSMinMax->getLHS()))) + return std::pair(LHSMinMax->getLHS(), LHSMinMax->getRHS()); + return std::nullopt; + } + default: + return std::nullopt; } - - return nullptr; } Value *InstCombinerImpl::SimplifySelectsFeedingBinaryOp(BinaryOperator &I, @@ -1187,14 +1211,6 @@ 
Value *InstCombinerImpl::SimplifySelectsFeedingBinaryOp(BinaryOperator &I, }; if (LHSIsSelect && RHSIsSelect && A == D) { - // op(select(%v, %x, %y), select(%v, %y, %x)) --> op(%x, %y) - if (I.isCommutative() && B == F && C == E) { - Value *BI = Builder.CreateBinOp(I.getOpcode(), B, E); - if (auto *BO = dyn_cast(BI)) - BO->copyIRFlags(&I); - return BI; - } - // (A ? B : C) op (A ? E : F) -> A ? (B op E) : (C op F) Cond = A; True = simplifyBinOp(Opcode, B, E, FMF, Q); @@ -1577,11 +1593,6 @@ Instruction *InstCombinerImpl::foldBinopWithPhiOperands(BinaryOperator &BO) { BO.getParent() != Phi1->getParent()) return nullptr; - if (BO.isCommutative()) { - if (Value *V = SimplifyPhiCommutativeBinaryOp(BO, Phi0, Phi1)) - return replaceInstUsesWith(BO, V); - } - // Fold if there is at least one specific constant value in phi0 or phi1's // incoming values that comes from the same block and this specific constant // value can be used to do optimization for specific binary operator. diff --git a/llvm/test/Transforms/InstCombine/minmax-of-minmax.ll b/llvm/test/Transforms/InstCombine/minmax-of-minmax.ll index 097bb365a416a..e04e3c146924b 100644 --- a/llvm/test/Transforms/InstCombine/minmax-of-minmax.ll +++ b/llvm/test/Transforms/InstCombine/minmax-of-minmax.ll @@ -245,9 +245,7 @@ define i32 @umin_of_smin_umax_wrong_pattern(i32 %x, i32 %y) { define i32 @smin_of_umin_umax_wrong_pattern2(i32 %x, i32 %y) { ; CHECK-LABEL: @smin_of_umin_umax_wrong_pattern2( -; CHECK-NEXT: [[MAX:%.*]] = call i32 @llvm.umax.i32(i32 [[X:%.*]], i32 [[Y:%.*]]) -; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 [[Y]]) -; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.smin.i32(i32 [[MAX]], i32 [[MIN]]) +; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.smin.i32(i32 [[X:%.*]], i32 [[Y:%.*]]) ; CHECK-NEXT: ret i32 [[R]] ; %cmp1 = icmp ult i32 %x, %y From cd28da390f8b8dedd00f9a2a383ec81e90436841 Mon Sep 17 00:00:00 2001 From: Nilanjana Basu Date: Thu, 4 Jan 2024 12:45:22 +0530 Subject: [PATCH 193/313] [LV] Change loops' interleave count computation (#73766) [LV] Change loops' interleave count computation A set of microbenchmarks in llvm-test-suite (https://github.com/llvm/llvm-test-suite/pull/56), when tested on a AArch64 platform, demonstrates that loop interleaving is beneficial when the vector loop runs at least twice or when the epilogue loop trip count (TC) is minimal. Therefore, we choose interleaving count (IC) between TC/VF & TC/2*VF (VF = vectorization factor), such that remainder TC for the epilogue loop is minimum while the IC is maximum in case the remainder TC is same for both. The initial tests for this change were submitted in PRs: https://github.com/llvm/llvm-project/pull/70272 and https://github.com/llvm/llvm-project/pull/74689. 
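For illustration, a minimal standalone sketch of the selection rule described above, using
hypothetical numbers (known TC = 100, VF = 16, target maximum IC = 8) that are not taken from
this patch; it mirrors the logic added to selectInterleaveCount() below but is not the
implementation itself:

  // ic_example.cpp - illustrative sketch only; assumes the trip count is known exactly.
  #include <algorithm>
  #include <bit>
  #include <cstdio>

  int main() {
    unsigned TC = 100, VF = 16, MaxIC = 8;  // hypothetical inputs, not from the patch
    // Aggressive count: capped so the vector loop runs at least once.
    unsigned ICUB = std::bit_floor(std::max(1u, std::min(TC / VF, MaxIC)));        // 4
    // Conservative count: capped so the vector loop runs at least twice.
    unsigned ICLB = std::bit_floor(std::max(1u, std::min(TC / (2 * VF), MaxIC)));  // 2
    unsigned IC = ICLB;
    // Prefer the larger count only if it leaves the same scalar epilogue trip count.
    if (ICUB != ICLB && TC % (VF * ICUB) == TC % (VF * ICLB))
      IC = ICUB;
    std::printf("IC = %u, epilogue TC = %u\n", IC, TC % (VF * IC));  // IC = 2, epilogue TC = 4
    return 0;
  }

With these numbers the conservative count is kept: IC = 4 would leave a 36-iteration scalar
epilogue, while IC = 2 leaves only 4. This matches the updated "interleaved count: 2"
expectation for the trip-count-100 tests changed below.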
--- .../Transforms/Vectorize/LoopVectorize.cpp | 54 +- .../interleave_count_for_estimated_tc.ll | 42 +- .../AArch64/interleave_count_for_known_tc.ll | 20 +- .../LoopVectorize/PowerPC/large-loop-rdx.ll | 4 - .../PowerPC/optimal-epilog-vectorization.ll | 871 ++++++++---------- .../LoopVectorize/PowerPC/reg-usage.ll | 4 +- .../LoopVectorize/PowerPC/small-loop-rdx.ll | 4 - .../LoopVectorize/X86/interleave_short_tc.ll | 6 +- 8 files changed, 465 insertions(+), 540 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f5f04615eedee..961d3d3bb1931 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5579,21 +5579,45 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; } - // If trip count is known or estimated compile time constant, limit the - // interleave count to be less than the trip count divided by VF, provided it - // is at least 1. - // - // For scalable vectors we can't know if interleaving is beneficial. It may - // not be beneficial for small loops if none of the lanes in the second vector - // iterations is enabled. However, for larger loops, there is likely to be a - // similar benefit as for fixed-width vectors. For now, we choose to leave - // the InterleaveCount as if vscale is '1', although if some information about - // the vector is known (e.g. min vector size), we can make a better decision. - if (BestKnownTC) { - MaxInterleaveCount = - std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); - // Make sure MaxInterleaveCount is greater than 0. - MaxInterleaveCount = std::max(1u, MaxInterleaveCount); + unsigned EstimatedVF = VF.getKnownMinValue(); + if (VF.isScalable()) { + if (std::optional VScale = getVScaleForTuning(TheLoop, TTI)) + EstimatedVF *= *VScale; + } + assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1"); + + unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop); + if (KnownTC) { + // If trip count is known we select between two prospective ICs, where + // 1) the aggressive IC is capped by the trip count divided by VF + // 2) the conservative IC is capped by the trip count divided by (VF * 2) + // The final IC is selected in a way that the epilogue loop trip count is + // minimized while maximizing the IC itself, so that we either run the + // vector loop at least once if it generates a small epilogue loop, or else + // we run the vector loop at least twice. + + unsigned InterleaveCountUB = bit_floor( + std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount))); + unsigned InterleaveCountLB = bit_floor(std::max( + 1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount))); + MaxInterleaveCount = InterleaveCountLB; + + if (InterleaveCountUB != InterleaveCountLB) { + unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB)); + unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB)); + // If both produce same scalar tail, maximize the IC to do the same work + // in fewer vector loop iterations + if (TailTripCountUB == TailTripCountLB) + MaxInterleaveCount = InterleaveCountUB; + } + } else if (BestKnownTC) { + // If trip count is an estimated compile time constant, limit the + // IC to be capped by the trip count divided by VF * 2, such that the vector + // loop runs at least twice to make interleaving seem profitable when there + // is an epilogue loop present. 
Since exact Trip count is not known we + // choose to be conservative in our IC estimate. + MaxInterleaveCount = bit_floor(std::max( + 1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount))); } assert(MaxInterleaveCount > 0 && diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll index 6d49d7159998c..5552f9dd70c95 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll @@ -5,9 +5,9 @@ target triple = "aarch64-linux-gnu" %pair = type { i8, i8 } -; TODO: For a loop with a profile-guided estimated TC of 32, when the auto-vectorizer chooses VF 16, +; For a loop with a profile-guided estimated TC of 32, when the auto-vectorizer chooses VF 16, ; it should conservatively choose IC 1 so that the vector loop runs twice at least -; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 2) +; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 1) define void @loop_with_profile_tc_32(ptr noalias %p, ptr noalias %q, i64 %n) { entry: br label %for.body @@ -29,9 +29,9 @@ for.end: ret void } -; TODO: For a loop with a profile-guided estimated TC of 33, when the auto-vectorizer chooses VF 16, +; For a loop with a profile-guided estimated TC of 33, when the auto-vectorizer chooses VF 16, ; it should conservatively choose IC 1 so that the vector loop runs twice at least -; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 2) +; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 1) define void @loop_with_profile_tc_33(ptr noalias %p, ptr noalias %q, i64 %n) { entry: br label %for.body @@ -53,9 +53,9 @@ for.end: ret void } -; TODO: For a loop with a profile-guided estimated TC of 48, when the auto-vectorizer chooses VF 16, +; For a loop with a profile-guided estimated TC of 48, when the auto-vectorizer chooses VF 16, ; it should conservatively choose IC 1 so that the vector loop runs twice at least -; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 3) +; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 1) define void @loop_with_profile_tc_48(ptr noalias %p, ptr noalias %q, i64 %n) { entry: br label %for.body @@ -77,9 +77,9 @@ for.end: ret void } -; TODO: For a loop with a profile-guided estimated TC of 63, when the auto-vectorizer chooses VF 16, +; For a loop with a profile-guided estimated TC of 63, when the auto-vectorizer chooses VF 16, ; it should conservatively choose IC 1 so that the vector loop runs twice at least -; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 3) +; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 1) define void @loop_with_profile_tc_63(ptr noalias %p, ptr noalias %q, i64 %n) { entry: br label %for.body @@ -101,9 +101,9 @@ for.end: ret void } -; TODO: For a loop with a profile-guided estimated TC of 64, when the auto-vectorizer chooses VF 16, +; For a loop with a profile-guided estimated TC of 64, when the auto-vectorizer chooses VF 16, ; it should choose conservatively IC 2 so that the vector loop runs twice at least -; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 4) +; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved 
count: 2) define void @loop_with_profile_tc_64(ptr noalias %p, ptr noalias %q, i64 %n) { entry: br label %for.body @@ -125,9 +125,9 @@ for.end: ret void } -; TODO: For a loop with a profile-guided estimated TC of 100, when the auto-vectorizer chooses VF 16, +; For a loop with a profile-guided estimated TC of 100, when the auto-vectorizer chooses VF 16, ; it should choose conservatively IC 2 so that the vector loop runs twice at least -; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 6) +; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 2) define void @loop_with_profile_tc_100(ptr noalias %p, ptr noalias %q, i64 %n) { entry: br label %for.body @@ -149,9 +149,9 @@ for.end: ret void } -; TODO: For a loop with a profile-guided estimated TC of 128, when the auto-vectorizer chooses VF 16, +; For a loop with a profile-guided estimated TC of 128, when the auto-vectorizer chooses VF 16, ; it should choose conservatively IC 4 so that the vector loop runs twice at least -; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 8) +; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 4) define void @loop_with_profile_tc_128(ptr noalias %p, ptr noalias %q, i64 %n) { entry: br label %for.body @@ -173,9 +173,9 @@ for.end: ret void } -; TODO: For a loop with a profile-guided estimated TC of 129, when the auto-vectorizer chooses VF 16, +; For a loop with a profile-guided estimated TC of 129, when the auto-vectorizer chooses VF 16, ; it should choose conservatively IC 4 so that the vector loop runs twice at least -; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 8) +; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 4) define void @loop_with_profile_tc_129(ptr noalias %p, ptr noalias %q, i64 %n) { entry: br label %for.body @@ -197,9 +197,9 @@ for.end: ret void } -; TODO: For a loop with a profile-guided estimated TC of 180, when the auto-vectorizer chooses VF 16, +; For a loop with a profile-guided estimated TC of 180, when the auto-vectorizer chooses VF 16, ; it should choose conservatively IC 4 so that the vector loop runs twice at least -; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 8) +; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 4) define void @loop_with_profile_tc_180(ptr noalias %p, ptr noalias %q, i64 %n) { entry: br label %for.body @@ -221,9 +221,9 @@ for.end: ret void } -; TODO: For a loop with a profile-guided estimated TC of 193, when the auto-vectorizer chooses VF 16, +; For a loop with a profile-guided estimated TC of 193, when the auto-vectorizer chooses VF 16, ; it should choose conservatively IC 4 so that the vector loop runs twice at least -; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 8) +; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 4) define void @loop_with_profile_tc_193(ptr noalias %p, ptr noalias %q, i64 %n) { entry: br label %for.body @@ -245,7 +245,7 @@ for.end: ret void } -; TODO: For a loop with a profile-guided estimated TC of 1000, when the auto-vectorizer chooses VF 16, +; For a loop with a profile-guided estimated TC of 1000, when the auto-vectorizer chooses VF 16, ; the IC will be capped by the target-specific maximum interleave count ; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 8) define void 
@loop_with_profile_tc_1000(ptr noalias %p, ptr noalias %q, i64 %n) { diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll index 828cbe76489a3..0569bfb2ae4e0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll @@ -77,9 +77,9 @@ for.end: ret void } -; TODO: For this loop with known TC of 48, when the auto-vectorizer chooses VF 16, it should choose +; For this loop with known TC of 48, when the auto-vectorizer chooses VF 16, it should choose ; IC 1 since there will be no remainder loop that needs to run after the vector loop. -; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 3) +; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 1) define void @loop_with_tc_48(ptr noalias %p, ptr noalias %q) { entry: br label %for.body @@ -101,9 +101,9 @@ for.end: ret void } -; TODO: For this loop with known TC of 49, when the auto-vectorizer chooses VF 16, it should choose +; For this loop with known TC of 49, when the auto-vectorizer chooses VF 16, it should choose ; IC 1 since a remainder loop TC of 1 is more efficient than remainder loop TC of 17 with IC 2 -; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 3) +; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 1) define void @loop_with_tc_49(ptr noalias %p, ptr noalias %q) { entry: br label %for.body @@ -125,9 +125,9 @@ for.end: ret void } -; TODO: For this loop with known TC of 55, when the auto-vectorizer chooses VF 16, it should choose +; For this loop with known TC of 55, when the auto-vectorizer chooses VF 16, it should choose ; IC 1 since a remainder loop TC of 7 is more efficient than remainder loop TC of 23 with IC 2 -; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 3) +; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 1) define void @loop_with_tc_55(ptr noalias %p, ptr noalias %q) { entry: br label %for.body @@ -149,9 +149,9 @@ for.end: ret void } -; TODO: For this loop with known TC of 100, when the auto-vectorizer chooses VF 16, it should choose +; For this loop with known TC of 100, when the auto-vectorizer chooses VF 16, it should choose ; IC 2 since a remainder loop TC of 4 is more efficient than remainder loop TC of 36 with IC 4 -; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 6) +; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 2) define void @loop_with_tc_100(ptr noalias %p, ptr noalias %q) { entry: br label %for.body @@ -245,9 +245,9 @@ for.end: ret void } -; TODO: For this loop with known TC of 193, when the auto-vectorizer chooses VF 16, it should choose +; For this loop with known TC of 193, when the auto-vectorizer chooses VF 16, it should choose ; IC 4 since a remainder loop TC of 1 is more efficient than remainder loop TC of 65 with IC 8 -; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 8) +; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 4) define void @loop_with_tc_193(ptr noalias %p, ptr noalias %q) { entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll 
index a83bf7d41569f..ba073dc1590d1 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll @@ -8,10 +8,6 @@ ; CHECK-NEXT: fadd ; CHECK-NEXT: fadd ; CHECK-NEXT: fadd -; CHECK-NEXT: fadd -; CHECK-NEXT: fadd -; CHECK-NEXT: fadd -; CHECK-NEXT: fadd ; CHECK-NEXT: = ; CHECK-NOT: fadd ; CHECK-SAME: > diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll index c358c835597dd..0bee21a481d06 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 ; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 -S | FileCheck %s --check-prefix VF-TWO-CHECK ; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -epilogue-vectorization-force-VF=4 -S | FileCheck %s --check-prefix VF-FOUR-CHECK @@ -6,19 +7,20 @@ target triple = "powerpc64le-unknown-linux-gnu" ; Function Attrs: nounwind define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32 signext %N) #0 { -; VF-TWO-CHECK-LABEL: @f1( +; VF-TWO-CHECK-LABEL: define dso_local void @f1( +; VF-TWO-CHECK-SAME: ptr noalias [[AA:%.*]], ptr noalias [[BB:%.*]], ptr noalias [[CC:%.*]], i32 signext [[N:%.*]]) #[[ATTR0:[0-9]+]] { ; VF-TWO-CHECK-NEXT: entry: -; VF-TWO-CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; VF-TWO-CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N]], 0 ; VF-TWO-CHECK-NEXT: br i1 [[CMP1]], label [[ITER_CHECK:%.*]], label [[FOR_END:%.*]] ; VF-TWO-CHECK: iter.check: ; VF-TWO-CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 ; VF-TWO-CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 2 ; VF-TWO-CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; VF-TWO-CHECK: vector.main.loop.iter.check: -; VF-TWO-CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 48 +; VF-TWO-CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 32 ; VF-TWO-CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; VF-TWO-CHECK: vector.ph: -; VF-TWO-CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 48 +; VF-TWO-CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 32 ; VF-TWO-CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; VF-TWO-CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; VF-TWO-CHECK: vector.body: @@ -31,133 +33,89 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32 ; VF-TWO-CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 20 ; VF-TWO-CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 24 ; VF-TWO-CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 28 -; VF-TWO-CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 32 -; VF-TWO-CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 36 -; VF-TWO-CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 40 -; VF-TWO-CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 44 -; VF-TWO-CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[BB:%.*]], i64 [[TMP0]] -; VF-TWO-CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP1]] -; VF-TWO-CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[BB]], 
i64 [[TMP2]] -; VF-TWO-CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP3]] -; VF-TWO-CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP4]] -; VF-TWO-CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP5]] -; VF-TWO-CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP6]] -; VF-TWO-CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP7]] -; VF-TWO-CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP8]] -; VF-TWO-CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP9]] -; VF-TWO-CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP10]] -; VF-TWO-CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP11]] -; VF-TWO-CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 -; VF-TWO-CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 4 -; VF-TWO-CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 8 -; VF-TWO-CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 12 -; VF-TWO-CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 16 -; VF-TWO-CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 20 -; VF-TWO-CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 24 -; VF-TWO-CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 28 -; VF-TWO-CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 32 -; VF-TWO-CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 36 -; VF-TWO-CHECK-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 40 -; VF-TWO-CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 44 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP24]], align 4 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP26]], align 4 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, ptr [[TMP28]], align 4 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP30]], align 4 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP32]], align 4 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP34]], align 4 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP36]], align 4 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP38]], align 4 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP40]], align 4 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP42]], align 4 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP44]], align 4 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP46]], align 4 -; VF-TWO-CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[CC:%.*]], i64 [[TMP0]] -; VF-TWO-CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP1]] -; VF-TWO-CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP2]] -; VF-TWO-CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP3]] -; VF-TWO-CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP4]] -; VF-TWO-CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP5]] -; VF-TWO-CHECK-NEXT: [[TMP54:%.*]] = getelementptr 
inbounds float, ptr [[CC]], i64 [[TMP6]] -; VF-TWO-CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP7]] -; VF-TWO-CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP8]] -; VF-TWO-CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP9]] -; VF-TWO-CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP10]] -; VF-TWO-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP11]] -; VF-TWO-CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 0 -; VF-TWO-CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 4 -; VF-TWO-CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 8 -; VF-TWO-CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 12 -; VF-TWO-CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 16 -; VF-TWO-CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 20 -; VF-TWO-CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 24 -; VF-TWO-CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 28 -; VF-TWO-CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 32 -; VF-TWO-CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 36 -; VF-TWO-CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 40 -; VF-TWO-CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 44 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP60]], align 4 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP62]], align 4 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, ptr [[TMP64]], align 4 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x float>, ptr [[TMP66]], align 4 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <4 x float>, ptr [[TMP68]], align 4 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP70]], align 4 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <4 x float>, ptr [[TMP72]], align 4 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x float>, ptr [[TMP74]], align 4 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD21:%.*]] = load <4 x float>, ptr [[TMP76]], align 4 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x float>, ptr [[TMP78]], align 4 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD23:%.*]] = load <4 x float>, ptr [[TMP80]], align 4 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x float>, ptr [[TMP82]], align 4 -; VF-TWO-CHECK-NEXT: [[TMP84:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD13]] -; VF-TWO-CHECK-NEXT: [[TMP85:%.*]] = fadd fast <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD14]] -; VF-TWO-CHECK-NEXT: [[TMP86:%.*]] = fadd fast <4 x float> [[WIDE_LOAD3]], [[WIDE_LOAD15]] -; VF-TWO-CHECK-NEXT: [[TMP87:%.*]] = fadd fast <4 x float> [[WIDE_LOAD4]], [[WIDE_LOAD16]] -; VF-TWO-CHECK-NEXT: [[TMP88:%.*]] = fadd fast <4 x float> [[WIDE_LOAD5]], [[WIDE_LOAD17]] -; VF-TWO-CHECK-NEXT: [[TMP89:%.*]] = fadd fast <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD18]] -; VF-TWO-CHECK-NEXT: [[TMP90:%.*]] = fadd fast <4 x float> [[WIDE_LOAD7]], [[WIDE_LOAD19]] -; VF-TWO-CHECK-NEXT: [[TMP91:%.*]] = fadd fast <4 x float> [[WIDE_LOAD8]], [[WIDE_LOAD20]] -; VF-TWO-CHECK-NEXT: [[TMP92:%.*]] = fadd fast <4 x float> [[WIDE_LOAD9]], [[WIDE_LOAD21]] -; VF-TWO-CHECK-NEXT: [[TMP93:%.*]] = fadd fast <4 x float> [[WIDE_LOAD10]], [[WIDE_LOAD22]] -; VF-TWO-CHECK-NEXT: 
[[TMP94:%.*]] = fadd fast <4 x float> [[WIDE_LOAD11]], [[WIDE_LOAD23]] -; VF-TWO-CHECK-NEXT: [[TMP95:%.*]] = fadd fast <4 x float> [[WIDE_LOAD12]], [[WIDE_LOAD24]] -; VF-TWO-CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds float, ptr [[AA:%.*]], i64 [[TMP0]] -; VF-TWO-CHECK-NEXT: [[TMP97:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP1]] -; VF-TWO-CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP2]] -; VF-TWO-CHECK-NEXT: [[TMP99:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP3]] -; VF-TWO-CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP4]] -; VF-TWO-CHECK-NEXT: [[TMP101:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP5]] -; VF-TWO-CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP6]] -; VF-TWO-CHECK-NEXT: [[TMP103:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP7]] -; VF-TWO-CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP8]] -; VF-TWO-CHECK-NEXT: [[TMP105:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP9]] -; VF-TWO-CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP10]] -; VF-TWO-CHECK-NEXT: [[TMP107:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP11]] -; VF-TWO-CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 0 -; VF-TWO-CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 4 -; VF-TWO-CHECK-NEXT: [[TMP112:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 8 -; VF-TWO-CHECK-NEXT: [[TMP114:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 12 -; VF-TWO-CHECK-NEXT: [[TMP116:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 16 -; VF-TWO-CHECK-NEXT: [[TMP118:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 20 -; VF-TWO-CHECK-NEXT: [[TMP120:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 24 -; VF-TWO-CHECK-NEXT: [[TMP122:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 28 -; VF-TWO-CHECK-NEXT: [[TMP124:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 32 -; VF-TWO-CHECK-NEXT: [[TMP126:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 36 -; VF-TWO-CHECK-NEXT: [[TMP128:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 40 -; VF-TWO-CHECK-NEXT: [[TMP130:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 44 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP84]], ptr [[TMP108]], align 4 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP85]], ptr [[TMP110]], align 4 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP86]], ptr [[TMP112]], align 4 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP87]], ptr [[TMP114]], align 4 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP88]], ptr [[TMP116]], align 4 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP89]], ptr [[TMP118]], align 4 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP90]], ptr [[TMP120]], align 4 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP91]], ptr [[TMP122]], align 4 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP92]], ptr [[TMP124]], align 4 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP93]], ptr [[TMP126]], align 4 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP94]], ptr [[TMP128]], align 4 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP95]], ptr [[TMP130]], align 4 -; VF-TWO-CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 48 -; VF-TWO-CHECK-NEXT: [[TMP132:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VF-TWO-CHECK-NEXT: br i1 [[TMP132]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOPID_MV:![0-9]+]] +; 
VF-TWO-CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP0]] +; VF-TWO-CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP1]] +; VF-TWO-CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP2]] +; VF-TWO-CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP3]] +; VF-TWO-CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP4]] +; VF-TWO-CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP5]] +; VF-TWO-CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP6]] +; VF-TWO-CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP7]] +; VF-TWO-CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0 +; VF-TWO-CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 4 +; VF-TWO-CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 8 +; VF-TWO-CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 12 +; VF-TWO-CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 16 +; VF-TWO-CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 20 +; VF-TWO-CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 24 +; VF-TWO-CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 28 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP16]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP17]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, ptr [[TMP18]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP19]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP20]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP21]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP22]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP23]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP0]] +; VF-TWO-CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP1]] +; VF-TWO-CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP2]] +; VF-TWO-CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP3]] +; VF-TWO-CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP4]] +; VF-TWO-CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP5]] +; VF-TWO-CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP6]] +; VF-TWO-CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP7]] +; VF-TWO-CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i32 0 +; VF-TWO-CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i32 4 +; VF-TWO-CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i32 8 +; VF-TWO-CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i32 12 +; VF-TWO-CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i32 16 +; VF-TWO-CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i32 20 +; VF-TWO-CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i32 24 +; VF-TWO-CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds 
float, ptr [[TMP24]], i32 28 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP32]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP33]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP34]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP35]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP36]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP37]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, ptr [[TMP38]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x float>, ptr [[TMP39]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP40:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD9]] +; VF-TWO-CHECK-NEXT: [[TMP41:%.*]] = fadd fast <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD10]] +; VF-TWO-CHECK-NEXT: [[TMP42:%.*]] = fadd fast <4 x float> [[WIDE_LOAD3]], [[WIDE_LOAD11]] +; VF-TWO-CHECK-NEXT: [[TMP43:%.*]] = fadd fast <4 x float> [[WIDE_LOAD4]], [[WIDE_LOAD12]] +; VF-TWO-CHECK-NEXT: [[TMP44:%.*]] = fadd fast <4 x float> [[WIDE_LOAD5]], [[WIDE_LOAD13]] +; VF-TWO-CHECK-NEXT: [[TMP45:%.*]] = fadd fast <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD14]] +; VF-TWO-CHECK-NEXT: [[TMP46:%.*]] = fadd fast <4 x float> [[WIDE_LOAD7]], [[WIDE_LOAD15]] +; VF-TWO-CHECK-NEXT: [[TMP47:%.*]] = fadd fast <4 x float> [[WIDE_LOAD8]], [[WIDE_LOAD16]] +; VF-TWO-CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP0]] +; VF-TWO-CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP1]] +; VF-TWO-CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP2]] +; VF-TWO-CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP3]] +; VF-TWO-CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP4]] +; VF-TWO-CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP5]] +; VF-TWO-CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP6]] +; VF-TWO-CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP7]] +; VF-TWO-CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 0 +; VF-TWO-CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 4 +; VF-TWO-CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 8 +; VF-TWO-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 12 +; VF-TWO-CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 16 +; VF-TWO-CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 20 +; VF-TWO-CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 24 +; VF-TWO-CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 28 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP40]], ptr [[TMP56]], align 4 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP41]], ptr [[TMP57]], align 4 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP42]], ptr [[TMP58]], align 4 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP43]], ptr [[TMP59]], align 4 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP44]], ptr [[TMP60]], align 4 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP45]], ptr [[TMP61]], align 4 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP46]], ptr [[TMP62]], align 4 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP47]], ptr [[TMP63]], align 4 +; VF-TWO-CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 
[[INDEX]], 32 +; VF-TWO-CHECK-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF-TWO-CHECK-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VF-TWO-CHECK: middle.block: ; VF-TWO-CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; VF-TWO-CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] @@ -167,38 +125,38 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32 ; VF-TWO-CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; VF-TWO-CHECK: vec.epilog.ph: ; VF-TWO-CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; VF-TWO-CHECK-NEXT: [[N_MOD_VF25:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 2 -; VF-TWO-CHECK-NEXT: [[N_VEC26:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF25]] +; VF-TWO-CHECK-NEXT: [[N_MOD_VF17:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 2 +; VF-TWO-CHECK-NEXT: [[N_VEC18:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF17]] ; VF-TWO-CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; VF-TWO-CHECK: vec.epilog.vector.body: -; VF-TWO-CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT31:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; VF-TWO-CHECK-NEXT: [[TMP133:%.*]] = add i64 [[OFFSET_IDX]], 0 -; VF-TWO-CHECK-NEXT: [[TMP134:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP133]] -; VF-TWO-CHECK-NEXT: [[TMP135:%.*]] = getelementptr inbounds float, ptr [[TMP134]], i32 0 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD29:%.*]] = load <2 x float>, ptr [[TMP135]], align 4 -; VF-TWO-CHECK-NEXT: [[TMP137:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP133]] -; VF-TWO-CHECK-NEXT: [[TMP138:%.*]] = getelementptr inbounds float, ptr [[TMP137]], i32 0 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <2 x float>, ptr [[TMP138]], align 4 -; VF-TWO-CHECK-NEXT: [[TMP140:%.*]] = fadd fast <2 x float> [[WIDE_LOAD29]], [[WIDE_LOAD30]] -; VF-TWO-CHECK-NEXT: [[TMP141:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP133]] -; VF-TWO-CHECK-NEXT: [[TMP142:%.*]] = getelementptr inbounds float, ptr [[TMP141]], i32 0 -; VF-TWO-CHECK-NEXT: store <2 x float> [[TMP140]], ptr [[TMP142]], align 4 -; VF-TWO-CHECK-NEXT: [[INDEX_NEXT31]] = add nuw i64 [[OFFSET_IDX]], 2 -; VF-TWO-CHECK-NEXT: [[TMP144:%.*]] = icmp eq i64 [[INDEX_NEXT31]], [[N_VEC26]] -; VF-TWO-CHECK-NEXT: br i1 [[TMP144]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOPID_EV:![0-9]+]] +; VF-TWO-CHECK-NEXT: [[INDEX20:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT23:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; VF-TWO-CHECK-NEXT: [[TMP65:%.*]] = add i64 [[INDEX20]], 0 +; VF-TWO-CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP65]] +; VF-TWO-CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds float, ptr [[TMP66]], i32 0 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD21:%.*]] = load <2 x float>, ptr [[TMP67]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP65]] +; VF-TWO-CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, ptr [[TMP68]], i32 0 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD22:%.*]] = load <2 x float>, ptr [[TMP69]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP70:%.*]] = fadd fast <2 x float> [[WIDE_LOAD21]], [[WIDE_LOAD22]] +; VF-TWO-CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds 
float, ptr [[AA]], i64 [[TMP65]] +; VF-TWO-CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, ptr [[TMP71]], i32 0 +; VF-TWO-CHECK-NEXT: store <2 x float> [[TMP70]], ptr [[TMP72]], align 4 +; VF-TWO-CHECK-NEXT: [[INDEX_NEXT23]] = add nuw i64 [[INDEX20]], 2 +; VF-TWO-CHECK-NEXT: [[TMP73:%.*]] = icmp eq i64 [[INDEX_NEXT23]], [[N_VEC18]] +; VF-TWO-CHECK-NEXT: br i1 [[TMP73]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; VF-TWO-CHECK: vec.epilog.middle.block: -; VF-TWO-CHECK-NEXT: [[CMP_N27:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC26]] -; VF-TWO-CHECK-NEXT: br i1 [[CMP_N27]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; VF-TWO-CHECK-NEXT: [[CMP_N19:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC18]] +; VF-TWO-CHECK-NEXT: br i1 [[CMP_N19]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; VF-TWO-CHECK: vec.epilog.scalar.ph: -; VF-TWO-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC26]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] +; VF-TWO-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC18]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; VF-TWO-CHECK-NEXT: br label [[FOR_BODY:%.*]] ; VF-TWO-CHECK: for.body: ; VF-TWO-CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; VF-TWO-CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[INDVARS_IV]] -; VF-TWO-CHECK-NEXT: [[TMP145:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP74:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; VF-TWO-CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[INDVARS_IV]] -; VF-TWO-CHECK-NEXT: [[TMP146:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; VF-TWO-CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP145]], [[TMP146]] +; VF-TWO-CHECK-NEXT: [[TMP75:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; VF-TWO-CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP74]], [[TMP75]] ; VF-TWO-CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[INDVARS_IV]] ; VF-TWO-CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX4]], align 4 ; VF-TWO-CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 @@ -209,19 +167,20 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32 ; VF-TWO-CHECK: for.end: ; VF-TWO-CHECK-NEXT: ret void ; -; VF-FOUR-CHECK-LABEL: @f1( +; VF-FOUR-CHECK-LABEL: define dso_local void @f1( +; VF-FOUR-CHECK-SAME: ptr noalias [[AA:%.*]], ptr noalias [[BB:%.*]], ptr noalias [[CC:%.*]], i32 signext [[N:%.*]]) #[[ATTR0:[0-9]+]] { ; VF-FOUR-CHECK-NEXT: entry: -; VF-FOUR-CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; VF-FOUR-CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N]], 0 ; VF-FOUR-CHECK-NEXT: br i1 [[CMP1]], label [[ITER_CHECK:%.*]], label [[FOR_END:%.*]] ; VF-FOUR-CHECK: iter.check: ; VF-FOUR-CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 ; VF-FOUR-CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 ; VF-FOUR-CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; VF-FOUR-CHECK: vector.main.loop.iter.check: -; VF-FOUR-CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 48 +; VF-FOUR-CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 32 ; VF-FOUR-CHECK-NEXT: br 
i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; VF-FOUR-CHECK: vector.ph: -; VF-FOUR-CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 48 +; VF-FOUR-CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 32 ; VF-FOUR-CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; VF-FOUR-CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; VF-FOUR-CHECK: vector.body: @@ -234,133 +193,89 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32 ; VF-FOUR-CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 20 ; VF-FOUR-CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 24 ; VF-FOUR-CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 28 -; VF-FOUR-CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 32 -; VF-FOUR-CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 36 -; VF-FOUR-CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 40 -; VF-FOUR-CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 44 -; VF-FOUR-CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[BB:%.*]], i64 [[TMP0]] -; VF-FOUR-CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP1]] -; VF-FOUR-CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP2]] -; VF-FOUR-CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP3]] -; VF-FOUR-CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP4]] -; VF-FOUR-CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP5]] -; VF-FOUR-CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP6]] -; VF-FOUR-CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP7]] -; VF-FOUR-CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP8]] -; VF-FOUR-CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP9]] -; VF-FOUR-CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP10]] -; VF-FOUR-CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP11]] -; VF-FOUR-CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 -; VF-FOUR-CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 4 -; VF-FOUR-CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 8 -; VF-FOUR-CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 12 -; VF-FOUR-CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 16 -; VF-FOUR-CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 20 -; VF-FOUR-CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 24 -; VF-FOUR-CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 28 -; VF-FOUR-CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 32 -; VF-FOUR-CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 36 -; VF-FOUR-CHECK-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 40 -; VF-FOUR-CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 44 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP24]], align 4 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP26]], align 4 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, ptr [[TMP28]], align 4 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP30]], align 4 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 
x float>, ptr [[TMP32]], align 4 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP34]], align 4 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP36]], align 4 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP38]], align 4 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP40]], align 4 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP42]], align 4 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP44]], align 4 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP46]], align 4 -; VF-FOUR-CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[CC:%.*]], i64 [[TMP0]] -; VF-FOUR-CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP1]] -; VF-FOUR-CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP2]] -; VF-FOUR-CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP3]] -; VF-FOUR-CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP4]] -; VF-FOUR-CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP5]] -; VF-FOUR-CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP6]] -; VF-FOUR-CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP7]] -; VF-FOUR-CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP8]] -; VF-FOUR-CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP9]] -; VF-FOUR-CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP10]] -; VF-FOUR-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP11]] -; VF-FOUR-CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 0 -; VF-FOUR-CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 4 -; VF-FOUR-CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 8 -; VF-FOUR-CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 12 -; VF-FOUR-CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 16 -; VF-FOUR-CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 20 -; VF-FOUR-CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 24 -; VF-FOUR-CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 28 -; VF-FOUR-CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 32 -; VF-FOUR-CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 36 -; VF-FOUR-CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 40 -; VF-FOUR-CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 44 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP60]], align 4 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP62]], align 4 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, ptr [[TMP64]], align 4 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x float>, ptr [[TMP66]], align 4 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <4 x float>, ptr [[TMP68]], align 4 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP70]], align 4 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <4 x float>, ptr [[TMP72]], align 4 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x float>, ptr [[TMP74]], align 4 -; 
VF-FOUR-CHECK-NEXT: [[WIDE_LOAD21:%.*]] = load <4 x float>, ptr [[TMP76]], align 4 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x float>, ptr [[TMP78]], align 4 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD23:%.*]] = load <4 x float>, ptr [[TMP80]], align 4 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x float>, ptr [[TMP82]], align 4 -; VF-FOUR-CHECK-NEXT: [[TMP84:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD13]] -; VF-FOUR-CHECK-NEXT: [[TMP85:%.*]] = fadd fast <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD14]] -; VF-FOUR-CHECK-NEXT: [[TMP86:%.*]] = fadd fast <4 x float> [[WIDE_LOAD3]], [[WIDE_LOAD15]] -; VF-FOUR-CHECK-NEXT: [[TMP87:%.*]] = fadd fast <4 x float> [[WIDE_LOAD4]], [[WIDE_LOAD16]] -; VF-FOUR-CHECK-NEXT: [[TMP88:%.*]] = fadd fast <4 x float> [[WIDE_LOAD5]], [[WIDE_LOAD17]] -; VF-FOUR-CHECK-NEXT: [[TMP89:%.*]] = fadd fast <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD18]] -; VF-FOUR-CHECK-NEXT: [[TMP90:%.*]] = fadd fast <4 x float> [[WIDE_LOAD7]], [[WIDE_LOAD19]] -; VF-FOUR-CHECK-NEXT: [[TMP91:%.*]] = fadd fast <4 x float> [[WIDE_LOAD8]], [[WIDE_LOAD20]] -; VF-FOUR-CHECK-NEXT: [[TMP92:%.*]] = fadd fast <4 x float> [[WIDE_LOAD9]], [[WIDE_LOAD21]] -; VF-FOUR-CHECK-NEXT: [[TMP93:%.*]] = fadd fast <4 x float> [[WIDE_LOAD10]], [[WIDE_LOAD22]] -; VF-FOUR-CHECK-NEXT: [[TMP94:%.*]] = fadd fast <4 x float> [[WIDE_LOAD11]], [[WIDE_LOAD23]] -; VF-FOUR-CHECK-NEXT: [[TMP95:%.*]] = fadd fast <4 x float> [[WIDE_LOAD12]], [[WIDE_LOAD24]] -; VF-FOUR-CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds float, ptr [[AA:%.*]], i64 [[TMP0]] -; VF-FOUR-CHECK-NEXT: [[TMP97:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP1]] -; VF-FOUR-CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP2]] -; VF-FOUR-CHECK-NEXT: [[TMP99:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP3]] -; VF-FOUR-CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP4]] -; VF-FOUR-CHECK-NEXT: [[TMP101:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP5]] -; VF-FOUR-CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP6]] -; VF-FOUR-CHECK-NEXT: [[TMP103:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP7]] -; VF-FOUR-CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP8]] -; VF-FOUR-CHECK-NEXT: [[TMP105:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP9]] -; VF-FOUR-CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP10]] -; VF-FOUR-CHECK-NEXT: [[TMP107:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP11]] -; VF-FOUR-CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 0 -; VF-FOUR-CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 4 -; VF-FOUR-CHECK-NEXT: [[TMP112:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 8 -; VF-FOUR-CHECK-NEXT: [[TMP114:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 12 -; VF-FOUR-CHECK-NEXT: [[TMP116:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 16 -; VF-FOUR-CHECK-NEXT: [[TMP118:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 20 -; VF-FOUR-CHECK-NEXT: [[TMP120:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 24 -; VF-FOUR-CHECK-NEXT: [[TMP122:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 28 -; VF-FOUR-CHECK-NEXT: [[TMP124:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 32 -; VF-FOUR-CHECK-NEXT: [[TMP126:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 36 -; VF-FOUR-CHECK-NEXT: 
[[TMP128:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 40 -; VF-FOUR-CHECK-NEXT: [[TMP130:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 44 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP84]], ptr [[TMP108]], align 4 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP85]], ptr [[TMP110]], align 4 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP86]], ptr [[TMP112]], align 4 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP87]], ptr [[TMP114]], align 4 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP88]], ptr [[TMP116]], align 4 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP89]], ptr [[TMP118]], align 4 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP90]], ptr [[TMP120]], align 4 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP91]], ptr [[TMP122]], align 4 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP92]], ptr [[TMP124]], align 4 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP93]], ptr [[TMP126]], align 4 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP94]], ptr [[TMP128]], align 4 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP95]], ptr [[TMP130]], align 4 -; VF-FOUR-CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 48 -; VF-FOUR-CHECK-NEXT: [[TMP132:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VF-FOUR-CHECK-NEXT: br i1 [[TMP132]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF-FOUR-CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP0]] +; VF-FOUR-CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP1]] +; VF-FOUR-CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP2]] +; VF-FOUR-CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP3]] +; VF-FOUR-CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP4]] +; VF-FOUR-CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP5]] +; VF-FOUR-CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP6]] +; VF-FOUR-CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP7]] +; VF-FOUR-CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0 +; VF-FOUR-CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 4 +; VF-FOUR-CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 8 +; VF-FOUR-CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 12 +; VF-FOUR-CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 16 +; VF-FOUR-CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 20 +; VF-FOUR-CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 24 +; VF-FOUR-CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 28 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP16]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP17]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, ptr [[TMP18]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP19]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP20]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP21]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP22]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP23]], align 4 +; VF-FOUR-CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, 
ptr [[CC]], i64 [[TMP0]] +; VF-FOUR-CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP1]] +; VF-FOUR-CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP2]] +; VF-FOUR-CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP3]] +; VF-FOUR-CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP4]] +; VF-FOUR-CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP5]] +; VF-FOUR-CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP6]] +; VF-FOUR-CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP7]] +; VF-FOUR-CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i32 0 +; VF-FOUR-CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i32 4 +; VF-FOUR-CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i32 8 +; VF-FOUR-CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i32 12 +; VF-FOUR-CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i32 16 +; VF-FOUR-CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i32 20 +; VF-FOUR-CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i32 24 +; VF-FOUR-CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i32 28 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP32]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP33]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP34]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP35]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP36]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP37]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, ptr [[TMP38]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x float>, ptr [[TMP39]], align 4 +; VF-FOUR-CHECK-NEXT: [[TMP40:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD9]] +; VF-FOUR-CHECK-NEXT: [[TMP41:%.*]] = fadd fast <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD10]] +; VF-FOUR-CHECK-NEXT: [[TMP42:%.*]] = fadd fast <4 x float> [[WIDE_LOAD3]], [[WIDE_LOAD11]] +; VF-FOUR-CHECK-NEXT: [[TMP43:%.*]] = fadd fast <4 x float> [[WIDE_LOAD4]], [[WIDE_LOAD12]] +; VF-FOUR-CHECK-NEXT: [[TMP44:%.*]] = fadd fast <4 x float> [[WIDE_LOAD5]], [[WIDE_LOAD13]] +; VF-FOUR-CHECK-NEXT: [[TMP45:%.*]] = fadd fast <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD14]] +; VF-FOUR-CHECK-NEXT: [[TMP46:%.*]] = fadd fast <4 x float> [[WIDE_LOAD7]], [[WIDE_LOAD15]] +; VF-FOUR-CHECK-NEXT: [[TMP47:%.*]] = fadd fast <4 x float> [[WIDE_LOAD8]], [[WIDE_LOAD16]] +; VF-FOUR-CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP0]] +; VF-FOUR-CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP1]] +; VF-FOUR-CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP2]] +; VF-FOUR-CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP3]] +; VF-FOUR-CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP4]] +; VF-FOUR-CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP5]] +; VF-FOUR-CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP6]] +; VF-FOUR-CHECK-NEXT: [[TMP55:%.*]] = getelementptr 
inbounds float, ptr [[AA]], i64 [[TMP7]] +; VF-FOUR-CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 0 +; VF-FOUR-CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 4 +; VF-FOUR-CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 8 +; VF-FOUR-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 12 +; VF-FOUR-CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 16 +; VF-FOUR-CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 20 +; VF-FOUR-CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 24 +; VF-FOUR-CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 28 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP40]], ptr [[TMP56]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP41]], ptr [[TMP57]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP42]], ptr [[TMP58]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP43]], ptr [[TMP59]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP44]], ptr [[TMP60]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP45]], ptr [[TMP61]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP46]], ptr [[TMP62]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP47]], ptr [[TMP63]], align 4 +; VF-FOUR-CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; VF-FOUR-CHECK-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF-FOUR-CHECK-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VF-FOUR-CHECK: middle.block: ; VF-FOUR-CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; VF-FOUR-CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] @@ -370,38 +285,38 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32 ; VF-FOUR-CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; VF-FOUR-CHECK: vec.epilog.ph: ; VF-FOUR-CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; VF-FOUR-CHECK-NEXT: [[N_MOD_VF25:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 -; VF-FOUR-CHECK-NEXT: [[N_VEC26:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF25]] +; VF-FOUR-CHECK-NEXT: [[N_MOD_VF17:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; VF-FOUR-CHECK-NEXT: [[N_VEC18:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF17]] ; VF-FOUR-CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; VF-FOUR-CHECK: vec.epilog.vector.body: -; VF-FOUR-CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT31:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; VF-FOUR-CHECK-NEXT: [[TMP133:%.*]] = add i64 [[OFFSET_IDX]], 0 -; VF-FOUR-CHECK-NEXT: [[TMP134:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP133]] -; VF-FOUR-CHECK-NEXT: [[TMP135:%.*]] = getelementptr inbounds float, ptr [[TMP134]], i32 0 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD29:%.*]] = load <4 x float>, ptr [[TMP135]], align 4 -; VF-FOUR-CHECK-NEXT: [[TMP137:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP133]] -; VF-FOUR-CHECK-NEXT: [[TMP138:%.*]] = getelementptr inbounds float, ptr [[TMP137]], i32 0 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <4 x float>, ptr [[TMP138]], align 4 -; VF-FOUR-CHECK-NEXT: [[TMP140:%.*]] = fadd fast <4 x float> [[WIDE_LOAD29]], [[WIDE_LOAD30]] -; VF-FOUR-CHECK-NEXT: 
[[TMP141:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP133]] -; VF-FOUR-CHECK-NEXT: [[TMP142:%.*]] = getelementptr inbounds float, ptr [[TMP141]], i32 0 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP140]], ptr [[TMP142]], align 4 -; VF-FOUR-CHECK-NEXT: [[INDEX_NEXT31]] = add nuw i64 [[OFFSET_IDX]], 4 -; VF-FOUR-CHECK-NEXT: [[TMP144:%.*]] = icmp eq i64 [[INDEX_NEXT31]], [[N_VEC26]] -; VF-FOUR-CHECK-NEXT: br i1 [[TMP144]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; VF-FOUR-CHECK-NEXT: [[INDEX20:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT23:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; VF-FOUR-CHECK-NEXT: [[TMP65:%.*]] = add i64 [[INDEX20]], 0 +; VF-FOUR-CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP65]] +; VF-FOUR-CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds float, ptr [[TMP66]], i32 0 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD21:%.*]] = load <4 x float>, ptr [[TMP67]], align 4 +; VF-FOUR-CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP65]] +; VF-FOUR-CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, ptr [[TMP68]], i32 0 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x float>, ptr [[TMP69]], align 4 +; VF-FOUR-CHECK-NEXT: [[TMP70:%.*]] = fadd fast <4 x float> [[WIDE_LOAD21]], [[WIDE_LOAD22]] +; VF-FOUR-CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP65]] +; VF-FOUR-CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, ptr [[TMP71]], i32 0 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP70]], ptr [[TMP72]], align 4 +; VF-FOUR-CHECK-NEXT: [[INDEX_NEXT23]] = add nuw i64 [[INDEX20]], 4 +; VF-FOUR-CHECK-NEXT: [[TMP73:%.*]] = icmp eq i64 [[INDEX_NEXT23]], [[N_VEC18]] +; VF-FOUR-CHECK-NEXT: br i1 [[TMP73]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; VF-FOUR-CHECK: vec.epilog.middle.block: -; VF-FOUR-CHECK-NEXT: [[CMP_N27:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC26]] -; VF-FOUR-CHECK-NEXT: br i1 [[CMP_N27]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; VF-FOUR-CHECK-NEXT: [[CMP_N19:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC18]] +; VF-FOUR-CHECK-NEXT: br i1 [[CMP_N19]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; VF-FOUR-CHECK: vec.epilog.scalar.ph: -; VF-FOUR-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC26]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] +; VF-FOUR-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC18]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; VF-FOUR-CHECK-NEXT: br label [[FOR_BODY:%.*]] ; VF-FOUR-CHECK: for.body: ; VF-FOUR-CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; VF-FOUR-CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[INDVARS_IV]] -; VF-FOUR-CHECK-NEXT: [[TMP145:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; VF-FOUR-CHECK-NEXT: [[TMP74:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; VF-FOUR-CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[INDVARS_IV]] -; VF-FOUR-CHECK-NEXT: [[TMP146:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; VF-FOUR-CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP145]], [[TMP146]] +; VF-FOUR-CHECK-NEXT: [[TMP75:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; 
VF-FOUR-CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP74]], [[TMP75]] ; VF-FOUR-CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[INDVARS_IV]] ; VF-FOUR-CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX4]], align 4 ; VF-FOUR-CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 @@ -443,9 +358,10 @@ for.end: ; preds = %for.end.loopexit, % } define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n) #0 { -; VF-TWO-CHECK-LABEL: @f2( +; VF-TWO-CHECK-LABEL: define dso_local signext i32 @f2( +; VF-TWO-CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 signext [[N:%.*]]) #[[ATTR0]] { ; VF-TWO-CHECK-NEXT: entry: -; VF-TWO-CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 1 +; VF-TWO-CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N]], 1 ; VF-TWO-CHECK-NEXT: br i1 [[CMP1]], label [[ITER_CHECK:%.*]], label [[FOR_END:%.*]] ; VF-TWO-CHECK: iter.check: ; VF-TWO-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 @@ -514,7 +430,7 @@ define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n) ; VF-TWO-CHECK-NEXT: [[TMP45:%.*]] = sext i32 [[TMP37]] to i64 ; VF-TWO-CHECK-NEXT: [[TMP46:%.*]] = sext i32 [[TMP38]] to i64 ; VF-TWO-CHECK-NEXT: [[TMP47:%.*]] = sext i32 [[TMP39]] to i64 -; VF-TWO-CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP40]] +; VF-TWO-CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP40]] ; VF-TWO-CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP41]] ; VF-TWO-CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP42]] ; VF-TWO-CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP43]] @@ -524,76 +440,76 @@ define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n) ; VF-TWO-CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP47]] ; VF-TWO-CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 0 ; VF-TWO-CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[TMP56]], i32 -3 -; VF-TWO-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -4 -; VF-TWO-CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP59]], i32 -3 -; VF-TWO-CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -8 +; VF-TWO-CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -4 +; VF-TWO-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, ptr [[TMP58]], i32 -3 +; VF-TWO-CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -8 +; VF-TWO-CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, ptr [[TMP60]], i32 -3 +; VF-TWO-CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -12 ; VF-TWO-CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, ptr [[TMP62]], i32 -3 -; VF-TWO-CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -12 -; VF-TWO-CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, ptr [[TMP65]], i32 -3 -; VF-TWO-CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -16 +; VF-TWO-CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -16 +; VF-TWO-CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, ptr [[TMP64]], i32 -3 +; VF-TWO-CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -20 +; VF-TWO-CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds float, ptr [[TMP66]], i32 -3 +; 
VF-TWO-CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -24 ; VF-TWO-CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, ptr [[TMP68]], i32 -3 -; VF-TWO-CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -20 -; VF-TWO-CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, ptr [[TMP71]], i32 -3 -; VF-TWO-CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -24 -; VF-TWO-CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds float, ptr [[TMP74]], i32 -3 -; VF-TWO-CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -28 -; VF-TWO-CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds float, ptr [[TMP77]], i32 -3 +; VF-TWO-CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -28 +; VF-TWO-CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, ptr [[TMP70]], i32 -3 ; VF-TWO-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP57]], align 4 ; VF-TWO-CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> poison, <4 x i32> -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP60]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP59]], align 4 ; VF-TWO-CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x float> [[WIDE_LOAD2]], <4 x float> poison, <4 x i32> -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP63]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP61]], align 4 ; VF-TWO-CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x float> [[WIDE_LOAD4]], <4 x float> poison, <4 x i32> -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP66]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP63]], align 4 ; VF-TWO-CHECK-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x float> [[WIDE_LOAD6]], <4 x float> poison, <4 x i32> -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP69]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP65]], align 4 ; VF-TWO-CHECK-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x float> [[WIDE_LOAD8]], <4 x float> poison, <4 x i32> -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP72]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP67]], align 4 ; VF-TWO-CHECK-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x float> [[WIDE_LOAD10]], <4 x float> poison, <4 x i32> -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP75]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP69]], align 4 ; VF-TWO-CHECK-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x float> [[WIDE_LOAD12]], <4 x float> poison, <4 x i32> -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP78]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP71]], align 4 ; VF-TWO-CHECK-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x float> [[WIDE_LOAD14]], <4 x float> poison, <4 x i32> -; VF-TWO-CHECK-NEXT: [[TMP80:%.*]] = fadd fast <4 x float> [[REVERSE]], -; VF-TWO-CHECK-NEXT: [[TMP81:%.*]] = fadd fast <4 x float> [[REVERSE3]], -; VF-TWO-CHECK-NEXT: [[TMP82:%.*]] = fadd fast <4 x float> [[REVERSE5]], -; VF-TWO-CHECK-NEXT: [[TMP83:%.*]] = fadd fast <4 x float> [[REVERSE7]], -; VF-TWO-CHECK-NEXT: [[TMP84:%.*]] = fadd fast <4 x float> [[REVERSE9]], -; VF-TWO-CHECK-NEXT: [[TMP85:%.*]] = fadd fast <4 x float> [[REVERSE11]], -; VF-TWO-CHECK-NEXT: [[TMP86:%.*]] = fadd 
fast <4 x float> [[REVERSE13]], -; VF-TWO-CHECK-NEXT: [[TMP87:%.*]] = fadd fast <4 x float> [[REVERSE15]], -; VF-TWO-CHECK-NEXT: [[TMP88:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP16]] -; VF-TWO-CHECK-NEXT: [[TMP89:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP17]] -; VF-TWO-CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP18]] -; VF-TWO-CHECK-NEXT: [[TMP91:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]] -; VF-TWO-CHECK-NEXT: [[TMP92:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP20]] -; VF-TWO-CHECK-NEXT: [[TMP93:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP21]] -; VF-TWO-CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP22]] -; VF-TWO-CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP23]] -; VF-TWO-CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 0 -; VF-TWO-CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 4 -; VF-TWO-CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 8 -; VF-TWO-CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 12 -; VF-TWO-CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 16 -; VF-TWO-CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 20 -; VF-TWO-CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 24 -; VF-TWO-CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 28 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP80]], ptr [[TMP96]], align 4 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP81]], ptr [[TMP98]], align 4 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP82]], ptr [[TMP100]], align 4 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP83]], ptr [[TMP102]], align 4 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP84]], ptr [[TMP104]], align 4 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP85]], ptr [[TMP106]], align 4 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP86]], ptr [[TMP108]], align 4 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP87]], ptr [[TMP110]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP72:%.*]] = fadd fast <4 x float> [[REVERSE]], +; VF-TWO-CHECK-NEXT: [[TMP73:%.*]] = fadd fast <4 x float> [[REVERSE3]], +; VF-TWO-CHECK-NEXT: [[TMP74:%.*]] = fadd fast <4 x float> [[REVERSE5]], +; VF-TWO-CHECK-NEXT: [[TMP75:%.*]] = fadd fast <4 x float> [[REVERSE7]], +; VF-TWO-CHECK-NEXT: [[TMP76:%.*]] = fadd fast <4 x float> [[REVERSE9]], +; VF-TWO-CHECK-NEXT: [[TMP77:%.*]] = fadd fast <4 x float> [[REVERSE11]], +; VF-TWO-CHECK-NEXT: [[TMP78:%.*]] = fadd fast <4 x float> [[REVERSE13]], +; VF-TWO-CHECK-NEXT: [[TMP79:%.*]] = fadd fast <4 x float> [[REVERSE15]], +; VF-TWO-CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP16]] +; VF-TWO-CHECK-NEXT: [[TMP81:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP17]] +; VF-TWO-CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP18]] +; VF-TWO-CHECK-NEXT: [[TMP83:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]] +; VF-TWO-CHECK-NEXT: [[TMP84:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP20]] +; VF-TWO-CHECK-NEXT: [[TMP85:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP21]] +; VF-TWO-CHECK-NEXT: [[TMP86:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP22]] +; VF-TWO-CHECK-NEXT: [[TMP87:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP23]] +; VF-TWO-CHECK-NEXT: [[TMP88:%.*]] = 
getelementptr inbounds float, ptr [[TMP80]], i32 0 +; VF-TWO-CHECK-NEXT: [[TMP89:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i32 4 +; VF-TWO-CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i32 8 +; VF-TWO-CHECK-NEXT: [[TMP91:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i32 12 +; VF-TWO-CHECK-NEXT: [[TMP92:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i32 16 +; VF-TWO-CHECK-NEXT: [[TMP93:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i32 20 +; VF-TWO-CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i32 24 +; VF-TWO-CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i32 28 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP72]], ptr [[TMP88]], align 4 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP73]], ptr [[TMP89]], align 4 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP74]], ptr [[TMP90]], align 4 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP75]], ptr [[TMP91]], align 4 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP76]], ptr [[TMP92]], align 4 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP77]], ptr [[TMP93]], align 4 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP78]], ptr [[TMP94]], align 4 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP79]], ptr [[TMP95]], align 4 ; VF-TWO-CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; VF-TWO-CHECK-NEXT: [[TMP112:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VF-TWO-CHECK-NEXT: br i1 [[TMP112]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; VF-TWO-CHECK-NEXT: [[TMP96:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF-TWO-CHECK-NEXT: br i1 [[TMP96]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; VF-TWO-CHECK: middle.block: ; VF-TWO-CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; VF-TWO-CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; VF-TWO-CHECK: vec.epilog.iter.check: -; VF-TWO-CHECK-NEXT: [[IND_END19:%.*]] = trunc i64 [[N_VEC]] to i32 +; VF-TWO-CHECK-NEXT: [[IND_END18:%.*]] = trunc i64 [[N_VEC]] to i32 ; VF-TWO-CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; VF-TWO-CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2 ; VF-TWO-CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] @@ -604,55 +520,56 @@ define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n) ; VF-TWO-CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC17]] to i32 ; VF-TWO-CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; VF-TWO-CHECK: vec.epilog.vector.body: -; VF-TWO-CHECK-NEXT: [[OFFSET_IDX23:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT26:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; VF-TWO-CHECK-NEXT: [[OFFSET_IDX22:%.*]] = trunc i64 [[OFFSET_IDX23]] to i32 -; VF-TWO-CHECK-NEXT: [[TMP113:%.*]] = add i32 [[OFFSET_IDX22]], 0 -; VF-TWO-CHECK-NEXT: [[TMP114:%.*]] = add i64 [[OFFSET_IDX23]], 0 -; VF-TWO-CHECK-NEXT: [[TMP115:%.*]] = xor i32 [[TMP113]], -1 -; VF-TWO-CHECK-NEXT: [[TMP116:%.*]] = add i32 [[TMP115]], [[N]] -; VF-TWO-CHECK-NEXT: [[TMP117:%.*]] = sext i32 [[TMP116]] to i64 -; VF-TWO-CHECK-NEXT: [[TMP118:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP117]] -; VF-TWO-CHECK-NEXT: [[TMP119:%.*]] = getelementptr inbounds float, ptr [[TMP118]], i32 0 -; VF-TWO-CHECK-NEXT: [[TMP120:%.*]] = getelementptr inbounds float, ptr [[TMP119]], i32 -1 -; VF-TWO-CHECK-NEXT: 
[[WIDE_LOAD24:%.*]] = load <2 x float>, ptr [[TMP120]], align 4 -; VF-TWO-CHECK-NEXT: [[REVERSE25:%.*]] = shufflevector <2 x float> [[WIDE_LOAD24]], <2 x float> poison, <2 x i32> -; VF-TWO-CHECK-NEXT: [[TMP122:%.*]] = fadd fast <2 x float> [[REVERSE25]], -; VF-TWO-CHECK-NEXT: [[TMP123:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP114]] -; VF-TWO-CHECK-NEXT: [[TMP124:%.*]] = getelementptr inbounds float, ptr [[TMP123]], i32 0 -; VF-TWO-CHECK-NEXT: store <2 x float> [[TMP122]], ptr [[TMP124]], align 4 -; VF-TWO-CHECK-NEXT: [[INDEX_NEXT26]] = add nuw i64 [[OFFSET_IDX23]], 2 -; VF-TWO-CHECK-NEXT: [[TMP126:%.*]] = icmp eq i64 [[INDEX_NEXT26]], [[N_VEC17]] -; VF-TWO-CHECK-NEXT: br i1 [[TMP126]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VF-TWO-CHECK-NEXT: [[INDEX21:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT25:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; VF-TWO-CHECK-NEXT: [[OFFSET_IDX22:%.*]] = trunc i64 [[INDEX21]] to i32 +; VF-TWO-CHECK-NEXT: [[TMP97:%.*]] = add i32 [[OFFSET_IDX22]], 0 +; VF-TWO-CHECK-NEXT: [[TMP98:%.*]] = add i64 [[INDEX21]], 0 +; VF-TWO-CHECK-NEXT: [[TMP99:%.*]] = xor i32 [[TMP97]], -1 +; VF-TWO-CHECK-NEXT: [[TMP100:%.*]] = add i32 [[TMP99]], [[N]] +; VF-TWO-CHECK-NEXT: [[TMP101:%.*]] = sext i32 [[TMP100]] to i64 +; VF-TWO-CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP101]] +; VF-TWO-CHECK-NEXT: [[TMP103:%.*]] = getelementptr inbounds float, ptr [[TMP102]], i32 0 +; VF-TWO-CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds float, ptr [[TMP103]], i32 -1 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD23:%.*]] = load <2 x float>, ptr [[TMP104]], align 4 +; VF-TWO-CHECK-NEXT: [[REVERSE24:%.*]] = shufflevector <2 x float> [[WIDE_LOAD23]], <2 x float> poison, <2 x i32> +; VF-TWO-CHECK-NEXT: [[TMP105:%.*]] = fadd fast <2 x float> [[REVERSE24]], +; VF-TWO-CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP98]] +; VF-TWO-CHECK-NEXT: [[TMP107:%.*]] = getelementptr inbounds float, ptr [[TMP106]], i32 0 +; VF-TWO-CHECK-NEXT: store <2 x float> [[TMP105]], ptr [[TMP107]], align 4 +; VF-TWO-CHECK-NEXT: [[INDEX_NEXT25]] = add nuw i64 [[INDEX21]], 2 +; VF-TWO-CHECK-NEXT: [[TMP108:%.*]] = icmp eq i64 [[INDEX_NEXT25]], [[N_VEC17]] +; VF-TWO-CHECK-NEXT: br i1 [[TMP108]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; VF-TWO-CHECK: vec.epilog.middle.block: ; VF-TWO-CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC17]] ; VF-TWO-CHECK-NEXT: br i1 [[CMP_N20]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; VF-TWO-CHECK: vec.epilog.scalar.ph: ; VF-TWO-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ] -; VF-TWO-CHECK-NEXT: [[BC_RESUME_VAL18:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END19]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ] +; VF-TWO-CHECK-NEXT: [[BC_RESUME_VAL19:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END18]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ] ; VF-TWO-CHECK-NEXT: br label [[FOR_BODY:%.*]] ; VF-TWO-CHECK: for.body: ; VF-TWO-CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; VF-TWO-CHECK-NEXT: 
[[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL18]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; VF-TWO-CHECK-NEXT: [[TMP127:%.*]] = xor i32 [[I_014]], -1 -; VF-TWO-CHECK-NEXT: [[SUB2:%.*]] = add i32 [[TMP127]], [[N]] +; VF-TWO-CHECK-NEXT: [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL19]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; VF-TWO-CHECK-NEXT: [[TMP109:%.*]] = xor i32 [[I_014]], -1 +; VF-TWO-CHECK-NEXT: [[SUB2:%.*]] = add i32 [[TMP109]], [[N]] ; VF-TWO-CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[SUB2]] to i64 ; VF-TWO-CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IDXPROM]] -; VF-TWO-CHECK-NEXT: [[TMP128:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; VF-TWO-CHECK-NEXT: [[CONV3:%.*]] = fadd fast float [[TMP128]], 1.000000e+00 +; VF-TWO-CHECK-NEXT: [[TMP110:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; VF-TWO-CHECK-NEXT: [[CONV3:%.*]] = fadd fast float [[TMP110]], 1.000000e+00 ; VF-TWO-CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] ; VF-TWO-CHECK-NEXT: store float [[CONV3]], ptr [[ARRAYIDX5]], align 4 ; VF-TWO-CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VF-TWO-CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_014]], 1 ; VF-TWO-CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; VF-TWO-CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOPID_MS_CM:![0-9]+]] +; VF-TWO-CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]] ; VF-TWO-CHECK: for.end.loopexit: ; VF-TWO-CHECK-NEXT: br label [[FOR_END]] ; VF-TWO-CHECK: for.end: ; VF-TWO-CHECK-NEXT: ret i32 0 ; -; VF-FOUR-CHECK-LABEL: @f2( +; VF-FOUR-CHECK-LABEL: define dso_local signext i32 @f2( +; VF-FOUR-CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 signext [[N:%.*]]) #[[ATTR0]] { ; VF-FOUR-CHECK-NEXT: entry: -; VF-FOUR-CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 1 +; VF-FOUR-CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N]], 1 ; VF-FOUR-CHECK-NEXT: br i1 [[CMP1]], label [[ITER_CHECK:%.*]], label [[FOR_END:%.*]] ; VF-FOUR-CHECK: iter.check: ; VF-FOUR-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 @@ -721,7 +638,7 @@ define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n) ; VF-FOUR-CHECK-NEXT: [[TMP45:%.*]] = sext i32 [[TMP37]] to i64 ; VF-FOUR-CHECK-NEXT: [[TMP46:%.*]] = sext i32 [[TMP38]] to i64 ; VF-FOUR-CHECK-NEXT: [[TMP47:%.*]] = sext i32 [[TMP39]] to i64 -; VF-FOUR-CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP40]] +; VF-FOUR-CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP40]] ; VF-FOUR-CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP41]] ; VF-FOUR-CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP42]] ; VF-FOUR-CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP43]] @@ -731,76 +648,76 @@ define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n) ; VF-FOUR-CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP47]] ; VF-FOUR-CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 0 ; VF-FOUR-CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[TMP56]], i32 -3 -; VF-FOUR-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -4 -; VF-FOUR-CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, ptr 
[[TMP59]], i32 -3 -; VF-FOUR-CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -8 +; VF-FOUR-CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -4 +; VF-FOUR-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, ptr [[TMP58]], i32 -3 +; VF-FOUR-CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -8 +; VF-FOUR-CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, ptr [[TMP60]], i32 -3 +; VF-FOUR-CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -12 ; VF-FOUR-CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, ptr [[TMP62]], i32 -3 -; VF-FOUR-CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -12 -; VF-FOUR-CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, ptr [[TMP65]], i32 -3 -; VF-FOUR-CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -16 +; VF-FOUR-CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -16 +; VF-FOUR-CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, ptr [[TMP64]], i32 -3 +; VF-FOUR-CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -20 +; VF-FOUR-CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds float, ptr [[TMP66]], i32 -3 +; VF-FOUR-CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -24 ; VF-FOUR-CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, ptr [[TMP68]], i32 -3 -; VF-FOUR-CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -20 -; VF-FOUR-CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, ptr [[TMP71]], i32 -3 -; VF-FOUR-CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -24 -; VF-FOUR-CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds float, ptr [[TMP74]], i32 -3 -; VF-FOUR-CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -28 -; VF-FOUR-CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds float, ptr [[TMP77]], i32 -3 +; VF-FOUR-CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -28 +; VF-FOUR-CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, ptr [[TMP70]], i32 -3 ; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP57]], align 4 ; VF-FOUR-CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> poison, <4 x i32> -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP60]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP59]], align 4 ; VF-FOUR-CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x float> [[WIDE_LOAD2]], <4 x float> poison, <4 x i32> -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP63]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP61]], align 4 ; VF-FOUR-CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x float> [[WIDE_LOAD4]], <4 x float> poison, <4 x i32> -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP66]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP63]], align 4 ; VF-FOUR-CHECK-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x float> [[WIDE_LOAD6]], <4 x float> poison, <4 x i32> -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP69]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP65]], align 4 ; VF-FOUR-CHECK-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x float> [[WIDE_LOAD8]], <4 x float> poison, <4 x i32> -; 
VF-FOUR-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP72]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP67]], align 4 ; VF-FOUR-CHECK-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x float> [[WIDE_LOAD10]], <4 x float> poison, <4 x i32> -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP75]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP69]], align 4 ; VF-FOUR-CHECK-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x float> [[WIDE_LOAD12]], <4 x float> poison, <4 x i32> -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP78]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP71]], align 4 ; VF-FOUR-CHECK-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x float> [[WIDE_LOAD14]], <4 x float> poison, <4 x i32> -; VF-FOUR-CHECK-NEXT: [[TMP80:%.*]] = fadd fast <4 x float> [[REVERSE]], -; VF-FOUR-CHECK-NEXT: [[TMP81:%.*]] = fadd fast <4 x float> [[REVERSE3]], -; VF-FOUR-CHECK-NEXT: [[TMP82:%.*]] = fadd fast <4 x float> [[REVERSE5]], -; VF-FOUR-CHECK-NEXT: [[TMP83:%.*]] = fadd fast <4 x float> [[REVERSE7]], -; VF-FOUR-CHECK-NEXT: [[TMP84:%.*]] = fadd fast <4 x float> [[REVERSE9]], -; VF-FOUR-CHECK-NEXT: [[TMP85:%.*]] = fadd fast <4 x float> [[REVERSE11]], -; VF-FOUR-CHECK-NEXT: [[TMP86:%.*]] = fadd fast <4 x float> [[REVERSE13]], -; VF-FOUR-CHECK-NEXT: [[TMP87:%.*]] = fadd fast <4 x float> [[REVERSE15]], -; VF-FOUR-CHECK-NEXT: [[TMP88:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP16]] -; VF-FOUR-CHECK-NEXT: [[TMP89:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP17]] -; VF-FOUR-CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP18]] -; VF-FOUR-CHECK-NEXT: [[TMP91:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]] -; VF-FOUR-CHECK-NEXT: [[TMP92:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP20]] -; VF-FOUR-CHECK-NEXT: [[TMP93:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP21]] -; VF-FOUR-CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP22]] -; VF-FOUR-CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP23]] -; VF-FOUR-CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 0 -; VF-FOUR-CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 4 -; VF-FOUR-CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 8 -; VF-FOUR-CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 12 -; VF-FOUR-CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 16 -; VF-FOUR-CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 20 -; VF-FOUR-CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 24 -; VF-FOUR-CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 28 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP80]], ptr [[TMP96]], align 4 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP81]], ptr [[TMP98]], align 4 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP82]], ptr [[TMP100]], align 4 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP83]], ptr [[TMP102]], align 4 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP84]], ptr [[TMP104]], align 4 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP85]], ptr [[TMP106]], align 4 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP86]], ptr [[TMP108]], align 4 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP87]], ptr [[TMP110]], align 4 
+; VF-FOUR-CHECK-NEXT: [[TMP72:%.*]] = fadd fast <4 x float> [[REVERSE]], +; VF-FOUR-CHECK-NEXT: [[TMP73:%.*]] = fadd fast <4 x float> [[REVERSE3]], +; VF-FOUR-CHECK-NEXT: [[TMP74:%.*]] = fadd fast <4 x float> [[REVERSE5]], +; VF-FOUR-CHECK-NEXT: [[TMP75:%.*]] = fadd fast <4 x float> [[REVERSE7]], +; VF-FOUR-CHECK-NEXT: [[TMP76:%.*]] = fadd fast <4 x float> [[REVERSE9]], +; VF-FOUR-CHECK-NEXT: [[TMP77:%.*]] = fadd fast <4 x float> [[REVERSE11]], +; VF-FOUR-CHECK-NEXT: [[TMP78:%.*]] = fadd fast <4 x float> [[REVERSE13]], +; VF-FOUR-CHECK-NEXT: [[TMP79:%.*]] = fadd fast <4 x float> [[REVERSE15]], +; VF-FOUR-CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP16]] +; VF-FOUR-CHECK-NEXT: [[TMP81:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP17]] +; VF-FOUR-CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP18]] +; VF-FOUR-CHECK-NEXT: [[TMP83:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]] +; VF-FOUR-CHECK-NEXT: [[TMP84:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP20]] +; VF-FOUR-CHECK-NEXT: [[TMP85:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP21]] +; VF-FOUR-CHECK-NEXT: [[TMP86:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP22]] +; VF-FOUR-CHECK-NEXT: [[TMP87:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP23]] +; VF-FOUR-CHECK-NEXT: [[TMP88:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i32 0 +; VF-FOUR-CHECK-NEXT: [[TMP89:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i32 4 +; VF-FOUR-CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i32 8 +; VF-FOUR-CHECK-NEXT: [[TMP91:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i32 12 +; VF-FOUR-CHECK-NEXT: [[TMP92:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i32 16 +; VF-FOUR-CHECK-NEXT: [[TMP93:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i32 20 +; VF-FOUR-CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i32 24 +; VF-FOUR-CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i32 28 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP72]], ptr [[TMP88]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP73]], ptr [[TMP89]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP74]], ptr [[TMP90]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP75]], ptr [[TMP91]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP76]], ptr [[TMP92]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP77]], ptr [[TMP93]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP78]], ptr [[TMP94]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP79]], ptr [[TMP95]], align 4 ; VF-FOUR-CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; VF-FOUR-CHECK-NEXT: [[TMP112:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VF-FOUR-CHECK-NEXT: br i1 [[TMP112]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOPID_MV_CM:![0-9]+]] +; VF-FOUR-CHECK-NEXT: [[TMP96:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF-FOUR-CHECK-NEXT: br i1 [[TMP96]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; VF-FOUR-CHECK: middle.block: ; VF-FOUR-CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; VF-FOUR-CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; VF-FOUR-CHECK: vec.epilog.iter.check: -; VF-FOUR-CHECK-NEXT: [[IND_END19:%.*]] = trunc i64 [[N_VEC]] to i32 +; VF-FOUR-CHECK-NEXT: [[IND_END18:%.*]] = trunc i64 [[N_VEC]] to i32 ; 
VF-FOUR-CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; VF-FOUR-CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 ; VF-FOUR-CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] @@ -811,41 +728,41 @@ define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n) ; VF-FOUR-CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC17]] to i32 ; VF-FOUR-CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; VF-FOUR-CHECK: vec.epilog.vector.body: -; VF-FOUR-CHECK-NEXT: [[OFFSET_IDX23:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT26:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; VF-FOUR-CHECK-NEXT: [[OFFSET_IDX22:%.*]] = trunc i64 [[OFFSET_IDX23]] to i32 -; VF-FOUR-CHECK-NEXT: [[TMP113:%.*]] = add i32 [[OFFSET_IDX22]], 0 -; VF-FOUR-CHECK-NEXT: [[TMP114:%.*]] = add i64 [[OFFSET_IDX23]], 0 -; VF-FOUR-CHECK-NEXT: [[TMP115:%.*]] = xor i32 [[TMP113]], -1 -; VF-FOUR-CHECK-NEXT: [[TMP116:%.*]] = add i32 [[TMP115]], [[N]] -; VF-FOUR-CHECK-NEXT: [[TMP117:%.*]] = sext i32 [[TMP116]] to i64 -; VF-FOUR-CHECK-NEXT: [[TMP118:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP117]] -; VF-FOUR-CHECK-NEXT: [[TMP119:%.*]] = getelementptr inbounds float, ptr [[TMP118]], i32 0 -; VF-FOUR-CHECK-NEXT: [[TMP120:%.*]] = getelementptr inbounds float, ptr [[TMP119]], i32 -3 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x float>, ptr [[TMP120]], align 4 -; VF-FOUR-CHECK-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x float> [[WIDE_LOAD24]], <4 x float> poison, <4 x i32> -; VF-FOUR-CHECK-NEXT: [[TMP122:%.*]] = fadd fast <4 x float> [[REVERSE25]], -; VF-FOUR-CHECK-NEXT: [[TMP123:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP114]] -; VF-FOUR-CHECK-NEXT: [[TMP124:%.*]] = getelementptr inbounds float, ptr [[TMP123]], i32 0 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP122]], ptr [[TMP124]], align 4 -; VF-FOUR-CHECK-NEXT: [[INDEX_NEXT26]] = add nuw i64 [[OFFSET_IDX23]], 4 -; VF-FOUR-CHECK-NEXT: [[TMP126:%.*]] = icmp eq i64 [[INDEX_NEXT26]], [[N_VEC17]] -; VF-FOUR-CHECK-NEXT: br i1 [[TMP126]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOPID_EV_CM:![0-9]+]] +; VF-FOUR-CHECK-NEXT: [[INDEX21:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT25:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; VF-FOUR-CHECK-NEXT: [[OFFSET_IDX22:%.*]] = trunc i64 [[INDEX21]] to i32 +; VF-FOUR-CHECK-NEXT: [[TMP97:%.*]] = add i32 [[OFFSET_IDX22]], 0 +; VF-FOUR-CHECK-NEXT: [[TMP98:%.*]] = add i64 [[INDEX21]], 0 +; VF-FOUR-CHECK-NEXT: [[TMP99:%.*]] = xor i32 [[TMP97]], -1 +; VF-FOUR-CHECK-NEXT: [[TMP100:%.*]] = add i32 [[TMP99]], [[N]] +; VF-FOUR-CHECK-NEXT: [[TMP101:%.*]] = sext i32 [[TMP100]] to i64 +; VF-FOUR-CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP101]] +; VF-FOUR-CHECK-NEXT: [[TMP103:%.*]] = getelementptr inbounds float, ptr [[TMP102]], i32 0 +; VF-FOUR-CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds float, ptr [[TMP103]], i32 -3 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD23:%.*]] = load <4 x float>, ptr [[TMP104]], align 4 +; VF-FOUR-CHECK-NEXT: [[REVERSE24:%.*]] = shufflevector <4 x float> [[WIDE_LOAD23]], <4 x float> poison, <4 x i32> +; VF-FOUR-CHECK-NEXT: [[TMP105:%.*]] = fadd fast <4 x float> [[REVERSE24]], +; VF-FOUR-CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP98]] +; VF-FOUR-CHECK-NEXT: [[TMP107:%.*]] = getelementptr inbounds float, ptr [[TMP106]], 
i32 0 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP105]], ptr [[TMP107]], align 4 +; VF-FOUR-CHECK-NEXT: [[INDEX_NEXT25]] = add nuw i64 [[INDEX21]], 4 +; VF-FOUR-CHECK-NEXT: [[TMP108:%.*]] = icmp eq i64 [[INDEX_NEXT25]], [[N_VEC17]] +; VF-FOUR-CHECK-NEXT: br i1 [[TMP108]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; VF-FOUR-CHECK: vec.epilog.middle.block: ; VF-FOUR-CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC17]] ; VF-FOUR-CHECK-NEXT: br i1 [[CMP_N20]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; VF-FOUR-CHECK: vec.epilog.scalar.ph: ; VF-FOUR-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ] -; VF-FOUR-CHECK-NEXT: [[BC_RESUME_VAL18:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END19]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ] +; VF-FOUR-CHECK-NEXT: [[BC_RESUME_VAL19:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END18]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ] ; VF-FOUR-CHECK-NEXT: br label [[FOR_BODY:%.*]] ; VF-FOUR-CHECK: for.body: ; VF-FOUR-CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; VF-FOUR-CHECK-NEXT: [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL18]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; VF-FOUR-CHECK-NEXT: [[TMP127:%.*]] = xor i32 [[I_014]], -1 -; VF-FOUR-CHECK-NEXT: [[SUB2:%.*]] = add i32 [[TMP127]], [[N]] +; VF-FOUR-CHECK-NEXT: [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL19]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; VF-FOUR-CHECK-NEXT: [[TMP109:%.*]] = xor i32 [[I_014]], -1 +; VF-FOUR-CHECK-NEXT: [[SUB2:%.*]] = add i32 [[TMP109]], [[N]] ; VF-FOUR-CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[SUB2]] to i64 ; VF-FOUR-CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IDXPROM]] -; VF-FOUR-CHECK-NEXT: [[TMP128:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; VF-FOUR-CHECK-NEXT: [[CONV3:%.*]] = fadd fast float [[TMP128]], 1.000000e+00 +; VF-FOUR-CHECK-NEXT: [[TMP110:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; VF-FOUR-CHECK-NEXT: [[CONV3:%.*]] = fadd fast float [[TMP110]], 1.000000e+00 ; VF-FOUR-CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] ; VF-FOUR-CHECK-NEXT: store float [[CONV3]], ptr [[ARRAYIDX5]], align 4 ; VF-FOUR-CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 @@ -889,14 +806,6 @@ for.end: ; preds = %for.end.loopexit, % ret i32 0 } -; VF-TWO-CHECK-DAG: [[LOOPID_MV]] = distinct !{[[LOOPID_MV]], [[LOOPID_DISABLE_VECT:!.*]], [[LOOPID_DISABLE_UNROLL:!.*]]} -; VF-TWO-CHECK-DAG: [[LOOPID_EV]] = distinct !{[[LOOPID_EV]], [[LOOPID_DISABLE_VECT]], [[LOOPID_DISABLE_UNROLL]]} -; VF-TWO-CHECK-DAG: [[LOOPID_DISABLE_VECT]] = [[DISABLE_VECT_STR:!{!"llvm.loop.isvectorized".*}.*]] -; VF-TWO-CHECK-DAG: [[LOOPID_DISABLE_UNROLL]] = [[DISABLE_UNROLL_STR:!{!"llvm.loop.unroll.runtime.disable"}.*]] ; -; VF-FOUR-CHECK-DAG: [[LOOPID_MV_CM]] = distinct !{[[LOOPID_MV_CM]], [[LOOPID_DISABLE_VECT_CM:!.*]], [[LOOPID_DISABLE_UNROLL_CM:!.*]]} -; VF-FOUR-CHECK-DAG: [[LOOPID_EV_CM]] = distinct !{[[LOOPID_EV_CM]], [[LOOPID_DISABLE_VECT_CM]], [[LOOPID_DISABLE_UNROLL_CM]]} -; VF-FOUR-CHECK-DAG: [[LOOPID_DISABLE_VECT_CM]] = 
[[DISABLE_VECT_STR_CM:!{!"llvm.loop.isvectorized".*}.*]] -; VF-FOUR-CHECK-DAG: [[LOOPID_DISABLE_UNROLL_CM]] = [[DISABLE_UNROLL_STR_CM:!{!"llvm.loop.unroll.runtime.disable"}.*]] ; attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-spe" "unsafe-fp-math"="true" "use-soft-float"="false" } diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll index f9d512e74e1b4..aac9aff3d391f 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll @@ -79,7 +79,7 @@ for.body: ; preds = %for.body, %entry define i64 @bar(ptr nocapture %a) { ; CHECK-LABEL: bar -; CHECK: Executing best plan with VF=2, UF=12 +; CHECK: Executing best plan with VF=2, UF=8 entry: br label %for.body @@ -107,7 +107,7 @@ for.body: define void @hoo(i32 %n) { ; CHECK-LABEL: hoo -; CHECK: Executing best plan with VF=1, UF=12 +; CHECK: Executing best plan with VF=1, UF=8 entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll index a567045c53c6e..ce83ab460a6c2 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll @@ -9,10 +9,6 @@ ; CHECK-NEXT: fadd ; CHECK-NEXT: fadd ; CHECK-NEXT: fadd -; CHECK-NEXT: fadd -; CHECK-NEXT: fadd -; CHECK-NEXT: fadd -; CHECK-NEXT: fadd ; CHECK-NOT: fadd ; CHECK: middle.block diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave_short_tc.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave_short_tc.ll index d3604b51a82f9..4857d27df6331 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleave_short_tc.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleave_short_tc.ll @@ -1,4 +1,4 @@ -; Check that we won't interleave by more than "best known" estimated trip count. +; Check that we won't interleave by more than half the "best known" estimated trip count. ; The loop is expected to be vectorized by 4 and interleaving suppresed due to ; short trip count which is controled by "tiny-trip-count-interleave-threshold". @@ -12,7 +12,7 @@ ; Thus the resulting step is 4. ; RUN: opt -passes=loop-vectorize -force-vector-width=2 -vectorizer-min-trip-count=4 -tiny-trip-count-interleave-threshold=4 -S < %s | FileCheck %s -; Check that we won't interleave by more than "best known" estimated trip count. +; Check that we won't interleave by more than half the "best known" estimated trip count. 
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -56,4 +56,4 @@ for.body: ; preds = %for.body, %for.body br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !prof !1 } -!1 = !{!"branch_weights", i32 1, i32 5} +!1 = !{!"branch_weights", i32 1, i32 9} From 55395f5c8375d3fce1ccbf0ab75f3539c56d61c7 Mon Sep 17 00:00:00 2001 From: sstipanovic <146831748+sstipanovic@users.noreply.github.com> Date: Thu, 4 Jan 2024 08:22:05 +0100 Subject: [PATCH 194/313] [AMDGPU] Remove `nosync` from image atomic intrinsics. (#76814) Remove `nosync` as discussed in https://github.com/llvm/llvm-project/pull/73613 --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 7 +- .../CodeGen/AMDGPU/image-atomic-attributes.ll | 356 ++++++++++++++++++ 2 files changed, 360 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/image-atomic-attributes.ll diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 531b111235452..e5596258847f9 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -837,7 +837,7 @@ class AMDGPUImageDimIntrinsicEval { // All dimension-aware intrinsics are derived from this class. class AMDGPUImageDimIntrinsic props, - list sdnodeprops> : DefaultAttrsIntrinsic< + list sdnodeprops> : Intrinsic< P_.RetTypes, // vdata(VGPR) -- for load/atomic-with-return !listconcat( !foreach(arg, P_.DataArgs, arg.Type), // vdata(VGPR) -- for store/atomic @@ -851,11 +851,12 @@ class AMDGPUImageDimIntrinsic.DmaskArgIndex>>]), !if(P_.IsSample, [ImmArg.UnormArgIndex>>], []), [ImmArg.TexFailCtrlArgIndex>>, - ImmArg.CachePolicyArgIndex>>]), + ImmArg.CachePolicyArgIndex>>], + !if(P_.IsAtomic, [], [IntrNoSync])), "", sdnodeprops>, diff --git a/llvm/test/CodeGen/AMDGPU/image-atomic-attributes.ll b/llvm/test/CodeGen/AMDGPU/image-atomic-attributes.ll new file mode 100644 index 0000000000000..dd60d9d702716 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/image-atomic-attributes.ll @@ -0,0 +1,356 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 4 +; RUN: opt -S -mtriple=amdgcn-unknown-unknown < %s | FileCheck -check-prefixes=CHECK %s + +define amdgpu_ps float @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; CHECK-LABEL: define amdgpu_ps float @atomic_swap_1d( +; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) { +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float +; CHECK-NEXT: ret float [[OUT]] +; +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps <2 x float> @atomic_swap_1d_i64(<8 x i32> inreg %rsrc, i64 %data, i32 %s) { +; CHECK-LABEL: define amdgpu_ps <2 x float> @atomic_swap_1d_i64( +; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i64 [[DATA:%.*]], i32 [[S:%.*]]) { +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[V:%.*]] = call i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: [[OUT:%.*]] = bitcast i64 [[V]] to <2 x float> +; CHECK-NEXT: ret <2 x float> [[OUT]] +; +main_body: + %v = call i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64 
%data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i64 %v to <2 x float> + ret <2 x float> %out +} + +define amdgpu_ps float @atomic_add_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; CHECK-LABEL: define amdgpu_ps float @atomic_add_1d( +; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) { +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float +; CHECK-NEXT: ret float [[OUT]] +; +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_sub_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; CHECK-LABEL: define amdgpu_ps float @atomic_sub_1d( +; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) { +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float +; CHECK-NEXT: ret float [[OUT]] +; +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_smin_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; CHECK-LABEL: define amdgpu_ps float @atomic_smin_1d( +; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) { +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float +; CHECK-NEXT: ret float [[OUT]] +; +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_umin_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; CHECK-LABEL: define amdgpu_ps float @atomic_umin_1d( +; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) { +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float +; CHECK-NEXT: ret float [[OUT]] +; +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_smax_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; CHECK-LABEL: define amdgpu_ps float @atomic_smax_1d( +; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) { +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float +; CHECK-NEXT: ret float [[OUT]] +; +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_umax_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; CHECK-LABEL: define amdgpu_ps float @atomic_umax_1d( +; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 
[[DATA:%.*]], i32 [[S:%.*]]) { +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float +; CHECK-NEXT: ret float [[OUT]] +; +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_and_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; CHECK-LABEL: define amdgpu_ps float @atomic_and_1d( +; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) { +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float +; CHECK-NEXT: ret float [[OUT]] +; +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_or_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; CHECK-LABEL: define amdgpu_ps float @atomic_or_1d( +; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) { +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float +; CHECK-NEXT: ret float [[OUT]] +; +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_xor_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; CHECK-LABEL: define amdgpu_ps float @atomic_xor_1d( +; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) { +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float +; CHECK-NEXT: ret float [[OUT]] +; +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_inc_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; CHECK-LABEL: define amdgpu_ps float @atomic_inc_1d( +; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) { +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float +; CHECK-NEXT: ret float [[OUT]] +; +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_dec_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; CHECK-LABEL: define amdgpu_ps float @atomic_dec_1d( +; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) { +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float +; CHECK-NEXT: ret float [[OUT]] +; +main_body: + %v = call i32 
@llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_cmpswap_1d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %swap, i32 %s) { +; CHECK-LABEL: define amdgpu_ps float @atomic_cmpswap_1d( +; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[CMP:%.*]], i32 [[SWAP:%.*]], i32 [[S:%.*]]) { +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 [[CMP]], i32 [[SWAP]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float +; CHECK-NEXT: ret float [[OUT]] +; +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps <2 x float> @atomic_cmpswap_1d_64(<8 x i32> inreg %rsrc, i64 %cmp, i64 %swap, i32 %s) { +; CHECK-LABEL: define amdgpu_ps <2 x float> @atomic_cmpswap_1d_64( +; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i64 [[CMP:%.*]], i64 [[SWAP:%.*]], i32 [[S:%.*]]) { +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[V:%.*]] = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64 [[CMP]], i64 [[SWAP]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: [[OUT:%.*]] = bitcast i64 [[V]] to <2 x float> +; CHECK-NEXT: ret <2 x float> [[OUT]] +; +main_body: + %v = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64 %cmp, i64 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i64 %v to <2 x float> + ret <2 x float> %out +} + +define amdgpu_ps float @atomic_add_2d(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t) { +; CHECK-LABEL: define amdgpu_ps float @atomic_add_2d( +; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) { +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 [[DATA]], i32 [[S]], i32 [[T]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float +; CHECK-NEXT: ret float [[OUT]] +; +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_add_3d(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %r) { +; CHECK-LABEL: define amdgpu_ps float @atomic_add_3d( +; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[R:%.*]]) { +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.add.3d.i32.i32(i32 [[DATA]], i32 [[S]], i32 [[T]], i32 [[R]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float +; CHECK-NEXT: ret float [[OUT]] +; +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.add.3d.i32.i32(i32 %data, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_add_cube(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %face) { +; CHECK-LABEL: define amdgpu_ps float @atomic_add_cube( +; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[FACE:%.*]]) { +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.add.cube.i32.i32(i32 [[DATA]], i32 [[S]], i32 [[T]], i32 [[FACE]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float +; 
CHECK-NEXT: ret float [[OUT]] +; +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.add.cube.i32.i32(i32 %data, i32 %s, i32 %t, i32 %face, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_add_1darray(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %slice) { +; CHECK-LABEL: define amdgpu_ps float @atomic_add_1darray( +; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]], i32 [[SLICE:%.*]]) { +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.add.1darray.i32.i32(i32 [[DATA]], i32 [[S]], i32 [[SLICE]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float +; CHECK-NEXT: ret float [[OUT]] +; +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.add.1darray.i32.i32(i32 %data, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_add_2darray(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %slice) { +; CHECK-LABEL: define amdgpu_ps float @atomic_add_2darray( +; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[SLICE:%.*]]) { +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.add.2darray.i32.i32(i32 [[DATA]], i32 [[S]], i32 [[T]], i32 [[SLICE]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float +; CHECK-NEXT: ret float [[OUT]] +; +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.add.2darray.i32.i32(i32 %data, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_add_2dmsaa(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %fragid) { +; CHECK-LABEL: define amdgpu_ps float @atomic_add_2dmsaa( +; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[FRAGID:%.*]]) { +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.add.2dmsaa.i32.i32(i32 [[DATA]], i32 [[S]], i32 [[T]], i32 [[FRAGID]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float +; CHECK-NEXT: ret float [[OUT]] +; +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.add.2dmsaa.i32.i32(i32 %data, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_add_2darraymsaa(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %slice, i32 %fragid) { +; CHECK-LABEL: define amdgpu_ps float @atomic_add_2darraymsaa( +; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[SLICE:%.*]], i32 [[FRAGID:%.*]]) { +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.add.2darraymsaa.i32.i32(i32 [[DATA]], i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 [[FRAGID]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float +; CHECK-NEXT: ret float [[OUT]] +; +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.add.2darraymsaa.i32.i32(i32 %data, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_add_1d_slc(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; CHECK-LABEL: define amdgpu_ps float @atomic_add_1d_slc( +; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) { 
+; CHECK-NEXT: main_body: +; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 2) +; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float +; CHECK-NEXT: ret float [[OUT]] +; +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 2) + %out = bitcast i32 %v to float + ret float %out +} + +declare i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0 + +declare i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64, i32, <8 x i32>, i32, i32) #0 +declare i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64, i64, i32, <8 x i32>, i32, i32) #0 + +declare i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.add.3d.i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.add.cube.i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.add.1darray.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.add.2darray.i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.add.2dmsaa.i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.add.2darraymsaa.i32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } +attributes #2 = { nounwind readnone } +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn } +;. From 5550e9c841465c54c6f28e246d835daf3520a2ca Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 4 Jan 2024 07:26:23 +0000 Subject: [PATCH 195/313] [GlobalISel][AArch64] Add libcall lowering for fpowi. (#67114) This adds legalization, notably libcall lowering for fpowi. It is a little different to other methods as the function takes both a float and integer register. Otherwise all vectors get scalarized and fp16 is promoted to fp32. 
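
For illustration only (this mirrors the powi_f16 and powi_v2f64 cases in the
added fpowi.ll test rather than describing anything beyond the patch), a
half-precision powi call such as:

    ; Promoted to f32 and lowered to a __powisf2(float, i32) libcall.
    define half @powi_f16(half %a, i32 %b) {
      %c = call half @llvm.powi.f16.i32(half %a, i32 %b)
      ret half %c
    }
    declare half @llvm.powi.f16.i32(half, i32)

is widened to f32 and lowered to a single __powisf2 libcall (fcvt s0, h0;
bl __powisf2; fcvt h0, s0), while a <2 x double> powi is scalarized into two
__powidf2(double, i32) calls whose scalar results are inserted back into the
vector, as shown in the generated checks below.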
--- .../CodeGen/GlobalISel/LegalizerHelper.cpp | 25 + .../AArch64/GISel/AArch64LegalizerInfo.cpp | 4 + .../GlobalISel/legalizer-info-validation.mir | 4 +- llvm/test/CodeGen/AArch64/fpowi.ll | 1424 +++++++++++++++++ 4 files changed, 1455 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/fpowi.ll diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 7522353ab966f..d2c41ebae5734 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -532,6 +532,8 @@ static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { RTLIBCASE(REM_F); case TargetOpcode::G_FPOW: RTLIBCASE(POW_F); + case TargetOpcode::G_FPOWI: + RTLIBCASE(POWI_F); case TargetOpcode::G_FMA: RTLIBCASE(FMA_F); case TargetOpcode::G_FSIN: @@ -1014,6 +1016,27 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) { return Status; break; } + case TargetOpcode::G_FPOWI: { + LLT LLTy = MRI.getType(MI.getOperand(0).getReg()); + unsigned Size = LLTy.getSizeInBits(); + Type *HLTy = getFloatTypeForLLT(Ctx, LLTy); + Type *ITy = IntegerType::get( + Ctx, MRI.getType(MI.getOperand(2).getReg()).getSizeInBits()); + if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) { + LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n"); + return UnableToLegalize; + } + auto Libcall = getRTLibDesc(MI.getOpcode(), Size); + std::initializer_list Args = { + {MI.getOperand(1).getReg(), HLTy, 0}, + {MI.getOperand(2).getReg(), ITy, 1}}; + LegalizeResult Status = + createLibcall(MIRBuilder, Libcall, {MI.getOperand(0).getReg(), HLTy, 0}, + Args, LocObserver, &MI); + if (Status != Legalized) + return Status; + break; + } case TargetOpcode::G_FPEXT: case TargetOpcode::G_FPTRUNC: { Type *FromTy = getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(1).getReg())); @@ -4557,6 +4580,8 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy); case G_SHUFFLE_VECTOR: return fewerElementsVectorShuffle(MI, TypeIdx, NarrowTy); + case G_FPOWI: + return fewerElementsVectorMultiEltType(GMI, NumElts, {2 /*pow*/}); default: return UnableToLegalize; } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index bb5f1f69c79c5..149119c657272 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -282,6 +282,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) // Regardless of FP16 support, widen 16-bit elements to 32-bits. .minScalar(0, s32) .libcallFor({s32, s64}); + getActionDefinitionsBuilder(G_FPOWI) + .scalarize(0) + .minScalar(0, s32) + .libcallFor({{s32, s32}, {s64, s32}}); getActionDefinitionsBuilder(G_INSERT) .legalIf(all(typeInSet(0, {s32, s64, p0}), diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index 866fe6fa8cb3f..9940f439dd714 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -463,8 +463,8 @@ # DEBUG-NEXT: .. the first uncovered type index: 1, OK # DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_FPOWI (opcode {{[0-9]+}}): 2 type indices, 0 imm indices -# DEBUG-NEXT:.. 
type index coverage check SKIPPED: no rules defined -# DEBUG-NEXT:.. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. the first uncovered type index: 2, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_FEXP (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} # DEBUG-NEXT: .. the first uncovered type index: 1, OK diff --git a/llvm/test/CodeGen/AArch64/fpowi.ll b/llvm/test/CodeGen/AArch64/fpowi.ll new file mode 100644 index 0000000000000..677d2e0416aec --- /dev/null +++ b/llvm/test/CodeGen/AArch64/fpowi.ll @@ -0,0 +1,1424 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +define double @powi_f64(double %a, i32 %b) { +; CHECK-LABEL: powi_f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: b __powidf2 +entry: + %c = call double @llvm.powi.f64.i32(double %a, i32 %b) + ret double %c +} + +define float @powi_f32(float %a, i32 %b) { +; CHECK-LABEL: powi_f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: b __powisf2 +entry: + %c = call float @llvm.powi.f32.i32(float %a, i32 %b) + ret float %c +} + +define half @powi_f16(half %a, i32 %b) { +; CHECK-LABEL: powi_f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: bl __powisf2 +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %c = call half @llvm.powi.f16.i32(half %a, i32 %b) + ret half %c +} + +define <2 x double> @powi_v2f64(<2 x double> %a, i32 %b) { +; CHECK-SD-LABEL: powi_v2f64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #48 +; CHECK-SD-NEXT: stp x30, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: mov d0, v0.d[1] +; CHECK-SD-NEXT: mov w19, w0 +; CHECK-SD-NEXT: bl __powidf2 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: bl __powidf2 +; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] +; CHECK-SD-NEXT: add sp, sp, #48 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: powi_v2f64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #48 +; CHECK-GI-NEXT: str d8, [sp, #16] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x30, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 48 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: .cfi_offset b8, -32 +; CHECK-GI-NEXT: mov d8, v0.d[1] +; CHECK-GI-NEXT: mov w19, w0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: bl __powidf2 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str q0, [sp] // 
16-byte Folded Spill +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: fmov d0, d8 +; CHECK-GI-NEXT: bl __powidf2 +; CHECK-GI-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr d8, [sp, #16] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.d[1], v0.d[0] +; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: add sp, sp, #48 +; CHECK-GI-NEXT: ret +entry: + %c = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> %a, i32 %b) + ret <2 x double> %c +} + +define <3 x double> @powi_v3f64(<3 x double> %a, i32 %b) { +; CHECK-SD-LABEL: powi_v3f64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str d10, [sp, #-48]! // 8-byte Folded Spill +; CHECK-SD-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x30, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: .cfi_offset b8, -24 +; CHECK-SD-NEXT: .cfi_offset b9, -32 +; CHECK-SD-NEXT: .cfi_offset b10, -48 +; CHECK-SD-NEXT: mov w19, w0 +; CHECK-SD-NEXT: fmov d8, d2 +; CHECK-SD-NEXT: fmov d9, d1 +; CHECK-SD-NEXT: bl __powidf2 +; CHECK-SD-NEXT: fmov d10, d0 +; CHECK-SD-NEXT: fmov d0, d9 +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: bl __powidf2 +; CHECK-SD-NEXT: fmov d9, d0 +; CHECK-SD-NEXT: fmov d0, d8 +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: bl __powidf2 +; CHECK-SD-NEXT: fmov d1, d9 +; CHECK-SD-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: fmov d2, d0 +; CHECK-SD-NEXT: fmov d0, d10 +; CHECK-SD-NEXT: ldr d10, [sp], #48 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: powi_v3f64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: str d10, [sp, #-48]! 
// 8-byte Folded Spill +; CHECK-GI-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x30, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 48 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: .cfi_offset b8, -24 +; CHECK-GI-NEXT: .cfi_offset b9, -32 +; CHECK-GI-NEXT: .cfi_offset b10, -48 +; CHECK-GI-NEXT: fmov d8, d1 +; CHECK-GI-NEXT: fmov d9, d2 +; CHECK-GI-NEXT: mov w19, w0 +; CHECK-GI-NEXT: bl __powidf2 +; CHECK-GI-NEXT: fmov d10, d0 +; CHECK-GI-NEXT: fmov d0, d8 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: bl __powidf2 +; CHECK-GI-NEXT: fmov d8, d0 +; CHECK-GI-NEXT: fmov d0, d9 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: bl __powidf2 +; CHECK-GI-NEXT: fmov d1, d8 +; CHECK-GI-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: fmov d2, d0 +; CHECK-GI-NEXT: fmov d0, d10 +; CHECK-GI-NEXT: ldr d10, [sp], #48 // 8-byte Folded Reload +; CHECK-GI-NEXT: ret +entry: + %c = call <3 x double> @llvm.powi.v3f64.i32(<3 x double> %a, i32 %b) + ret <3 x double> %c +} + +define <4 x double> @powi_v4f64(<4 x double> %a, i32 %b) { +; CHECK-SD-LABEL: powi_v4f64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #64 +; CHECK-SD-NEXT: stp x30, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 64 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: mov d0, v0.d[1] +; CHECK-SD-NEXT: mov w19, w0 +; CHECK-SD-NEXT: str q1, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __powidf2 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: bl __powidf2 +; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov d0, v0.d[1] +; CHECK-SD-NEXT: bl __powidf2 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: bl __powidf2 +; CHECK-SD-NEXT: fmov d1, d0 +; CHECK-SD-NEXT: ldp q2, q0, [sp] // 32-byte Folded Reload +; CHECK-SD-NEXT: ldp x30, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v1.d[1], v2.d[0] +; CHECK-SD-NEXT: add sp, sp, #64 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: powi_v4f64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #80 +; CHECK-GI-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 80 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: .cfi_offset b8, -24 +; CHECK-GI-NEXT: .cfi_offset b9, -32 +; CHECK-GI-NEXT: str q1, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: mov d8, v0.d[1] +; CHECK-GI-NEXT: mov d9, v1.d[1] +; CHECK-GI-NEXT: mov w19, w0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; 
CHECK-GI-NEXT: bl __powidf2 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: fmov d0, d8 +; CHECK-GI-NEXT: bl __powidf2 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: bl __powidf2 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: fmov d0, d9 +; CHECK-GI-NEXT: bl __powidf2 +; CHECK-GI-NEXT: ldp q1, q2, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v2.d[1], v1.d[0] +; CHECK-GI-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.d[1], v0.d[0] +; CHECK-GI-NEXT: mov v0.16b, v2.16b +; CHECK-GI-NEXT: add sp, sp, #80 +; CHECK-GI-NEXT: ret +entry: + %c = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> %a, i32 %b) + ret <4 x double> %c +} + +define <2 x float> @powi_v2f32(<2 x float> %a, i32 %b) { +; CHECK-SD-LABEL: powi_v2f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #48 +; CHECK-SD-NEXT: stp x30, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: mov w19, w0 +; CHECK-SD-NEXT: mov s0, v0.s[1] +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v0.s[1], v1.s[0] +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: add sp, sp, #48 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: powi_v2f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #48 +; CHECK-GI-NEXT: str d8, [sp, #16] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x30, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 48 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: .cfi_offset b8, -32 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov s8, v0.s[1] +; CHECK-GI-NEXT: mov w19, w0 +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: fmov s0, s8 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr d8, [sp, #16] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.s[1], v0.s[0] +; CHECK-GI-NEXT: 
fmov d0, d1 +; CHECK-GI-NEXT: add sp, sp, #48 +; CHECK-GI-NEXT: ret +entry: + %c = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> %a, i32 %b) + ret <2 x float> %c +} + +define <3 x float> @powi_v3f32(<3 x float> %a, i32 %b) { +; CHECK-SD-LABEL: powi_v3f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #48 +; CHECK-SD-NEXT: stp x30, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: mov s0, v0.s[1] +; CHECK-SD-NEXT: mov w19, w0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v0.s[1], v1.s[0] +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov s0, v0.s[2] +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v1.s[2], v0.s[0] +; CHECK-SD-NEXT: mov v0.16b, v1.16b +; CHECK-SD-NEXT: add sp, sp, #48 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: powi_v3f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #64 +; CHECK-GI-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x30, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: .cfi_offset b8, -24 +; CHECK-GI-NEXT: .cfi_offset b9, -32 +; CHECK-GI-NEXT: mov s8, v0.s[1] +; CHECK-GI-NEXT: mov s9, v0.s[2] +; CHECK-GI-NEXT: mov w19, w0 +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: fmov s0, s8 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: fmov s0, s9 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: ldp q2, q1, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: ldp x30, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[3], v0.s[0] +; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: add sp, sp, #64 +; CHECK-GI-NEXT: ret +entry: + %c = call <3 x float> @llvm.powi.v3f32.i32(<3 x float> %a, i32 %b) + ret <3 x float> %c +} + +define <4 x float> @powi_v4f32(<4 x float> %a, i32 %b) { +; CHECK-SD-LABEL: powi_v4f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #48 +; CHECK-SD-NEXT: stp x30, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; 
CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: mov s0, v0.s[1] +; CHECK-SD-NEXT: mov w19, w0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v0.s[1], v1.s[0] +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov s0, v0.s[2] +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v1.s[2], v0.s[0] +; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov s0, v0.s[3] +; CHECK-SD-NEXT: str q1, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v1.s[3], v0.s[0] +; CHECK-SD-NEXT: mov v0.16b, v1.16b +; CHECK-SD-NEXT: add sp, sp, #48 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: powi_v4f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #96 +; CHECK-GI-NEXT: str d10, [sp, #48] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 96 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: .cfi_offset b8, -24 +; CHECK-GI-NEXT: .cfi_offset b9, -32 +; CHECK-GI-NEXT: .cfi_offset b10, -48 +; CHECK-GI-NEXT: mov s8, v0.s[1] +; CHECK-GI-NEXT: mov s9, v0.s[2] +; CHECK-GI-NEXT: mov w19, w0 +; CHECK-GI-NEXT: mov s10, v0.s[3] +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: fmov s0, s8 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: fmov s0, s9 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: fmov s0, s10 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] +; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.s[2], v2.s[0] +; CHECK-GI-NEXT: mov v1.s[3], v0.s[0] +; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: add sp, sp, #96 +; CHECK-GI-NEXT: ret +entry: + %c = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> %a, i32 %b) + ret <4 x float> %c +} + +define <8 x float> @powi_v8f32(<8 x float> %a, 
i32 %b) { +; CHECK-SD-LABEL: powi_v8f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #64 +; CHECK-SD-NEXT: stp x30, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 64 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill +; CHECK-SD-NEXT: mov s0, v0.s[1] +; CHECK-SD-NEXT: mov w19, w0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: str q0, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v0.s[1], v1.s[0] +; CHECK-SD-NEXT: str q0, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov s0, v0.s[2] +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v1.s[2], v0.s[0] +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov s0, v0.s[3] +; CHECK-SD-NEXT: str q1, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v1.s[3], v0.s[0] +; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov s0, v0.s[1] +; CHECK-SD-NEXT: str q1, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v0.s[1], v1.s[0] +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov s0, v0.s[2] +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v1.s[2], v0.s[0] +; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov s0, v0.s[3] +; CHECK-SD-NEXT: str q1, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fmov s2, s0 +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x30, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v1.s[3], v2.s[0] +; CHECK-SD-NEXT: add sp, sp, #64 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: powi_v8f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #176 +; CHECK-GI-NEXT: stp d13, d12, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp d11, d10, [sp, #128] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp d9, d8, [sp, #144] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x30, x19, [sp, #160] // 16-byte Folded Spill +; CHECK-GI-NEXT: 
.cfi_def_cfa_offset 176 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: .cfi_offset b8, -24 +; CHECK-GI-NEXT: .cfi_offset b9, -32 +; CHECK-GI-NEXT: .cfi_offset b10, -40 +; CHECK-GI-NEXT: .cfi_offset b11, -48 +; CHECK-GI-NEXT: .cfi_offset b12, -56 +; CHECK-GI-NEXT: .cfi_offset b13, -64 +; CHECK-GI-NEXT: str q1, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: mov s8, v0.s[1] +; CHECK-GI-NEXT: mov s9, v0.s[2] +; CHECK-GI-NEXT: mov s10, v0.s[3] +; CHECK-GI-NEXT: mov s11, v1.s[1] +; CHECK-GI-NEXT: mov w19, w0 +; CHECK-GI-NEXT: mov s12, v1.s[2] +; CHECK-GI-NEXT: mov s13, v1.s[3] +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: fmov s0, s8 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: fmov s0, s9 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: fmov s0, s10 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: fmov s0, s11 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: fmov s0, s12 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: fmov s0, s13 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: ldp q3, q2, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x30, x19, [sp, #160] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] +; CHECK-GI-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp d11, d10, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.s[1], v2.s[0] +; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp d13, d12, [sp, #112] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.s[2], v2.s[0] +; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.s[2], v2.s[0] +; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.s[3], v2.s[0] +; CHECK-GI-NEXT: mov v3.s[3], v0.s[0] +; CHECK-GI-NEXT: mov v2.16b, v1.16b +; CHECK-GI-NEXT: mov v1.16b, v3.16b +; CHECK-GI-NEXT: mov v0.16b, v2.16b +; CHECK-GI-NEXT: add sp, sp, #176 +; CHECK-GI-NEXT: ret +entry: + %c = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> %a, i32 %b) + ret <8 x float> %c +} + +define <7 x half> @powi_v7f16(<7 x half> %a, i32 %b) { +; CHECK-SD-LABEL: powi_v7f16: +; CHECK-SD: // %bb.0: // %entry 
+; CHECK-SD-NEXT: sub sp, sp, #48 +; CHECK-SD-NEXT: stp x30, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: mov h1, v0.h[1] +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: mov w19, w0 +; CHECK-SD-NEXT: fcvt s0, h1 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v0.h[1], v1.h[0] +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov h0, v0.h[2] +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v1.h[2], v0.h[0] +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov h0, v0.h[3] +; CHECK-SD-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v1.h[3], v0.h[0] +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov h0, v0.h[4] +; CHECK-SD-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v1.h[4], v0.h[0] +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov h0, v0.h[5] +; CHECK-SD-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v1.h[5], v0.h[0] +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov h0, v0.h[6] +; CHECK-SD-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v1.h[6], v0.h[0] +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov h0, v0.h[7] +; CHECK-SD-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fcvt h1, s0 +; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v0.h[7], v1.h[0] +; CHECK-SD-NEXT: add sp, sp, #48 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: powi_v7f16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #160 +; CHECK-GI-NEXT: stp d13, d12, [sp, #96] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp d11, d10, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp d9, d8, [sp, #128] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x30, x19, [sp, #144] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 160 +; CHECK-GI-NEXT: .cfi_offset 
w19, -8 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: .cfi_offset b8, -24 +; CHECK-GI-NEXT: .cfi_offset b9, -32 +; CHECK-GI-NEXT: .cfi_offset b10, -40 +; CHECK-GI-NEXT: .cfi_offset b11, -48 +; CHECK-GI-NEXT: .cfi_offset b12, -56 +; CHECK-GI-NEXT: .cfi_offset b13, -64 +; CHECK-GI-NEXT: mov h8, v0.h[1] +; CHECK-GI-NEXT: mov h9, v0.h[2] +; CHECK-GI-NEXT: mov w19, w0 +; CHECK-GI-NEXT: mov h10, v0.h[3] +; CHECK-GI-NEXT: mov h11, v0.h[4] +; CHECK-GI-NEXT: mov h12, v0.h[5] +; CHECK-GI-NEXT: mov h13, v0.h[6] +; CHECK-GI-NEXT: fcvt s0, h0 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: fcvt s1, h8 +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: fcvt s1, h9 +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: fcvt s1, h10 +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: fcvt s1, h11 +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: fcvt s1, h12 +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: fcvt s1, h13 +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: ldp q2, q1, [sp, #64] // 32-byte Folded Reload +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldp x30, x19, [sp, #144] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp d9, d8, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp d11, d10, [sp, #112] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp d13, d12, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[4], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] +; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[7], v0.h[0] +; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: add sp, sp, #160 +; CHECK-GI-NEXT: ret +entry: + %c = call <7 x half> @llvm.powi.v7f16.i32(<7 x half> %a, i32 %b) + ret <7 x half> %c +} + +define <4 x half> @powi_v4f16(<4 x half> %a, i32 %b) { +; CHECK-SD-LABEL: powi_v4f16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #48 +; CHECK-SD-NEXT: stp x30, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: mov h1, v0.h[1] +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: mov w19, w0 +; CHECK-SD-NEXT: fcvt s0, h1 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: fcvt s1, h1 +; CHECK-SD-NEXT: str q0, 
[sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: fmov s0, s1 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: fcvt h2, s0 +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov h1, v1.h[2] +; CHECK-SD-NEXT: fcvt s0, h1 +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v2.h[1], v1.h[0] +; CHECK-SD-NEXT: str q2, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: fcvt h2, s0 +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov h1, v1.h[3] +; CHECK-SD-NEXT: fcvt s0, h1 +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v1.h[2], v2.h[0] +; CHECK-SD-NEXT: str q1, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fcvt h1, s0 +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v0.h[3], v1.h[0] +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: add sp, sp, #48 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: powi_v4f16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #96 +; CHECK-GI-NEXT: str d10, [sp, #48] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 96 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: .cfi_offset b8, -24 +; CHECK-GI-NEXT: .cfi_offset b9, -32 +; CHECK-GI-NEXT: .cfi_offset b10, -48 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov h8, v0.h[1] +; CHECK-GI-NEXT: mov h9, v0.h[2] +; CHECK-GI-NEXT: mov w19, w0 +; CHECK-GI-NEXT: mov h10, v0.h[3] +; CHECK-GI-NEXT: fcvt s0, h0 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: fcvt s1, h8 +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: fcvt s1, h9 +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: fcvt s1, h10 +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] +; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: add sp, sp, #96 +; CHECK-GI-NEXT: ret +entry: + %c = call <4 x half> @llvm.powi.v4f16.i32(<4 x half> %a, i32 %b) + ret <4 x half> %c +} + +define <8 x half> @powi_v8f16(<8 x half> %a, i32 %b) { +; CHECK-SD-LABEL: powi_v8f16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #48 +; CHECK-SD-NEXT: stp x30, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; 
CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: mov h1, v0.h[1] +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: mov w19, w0 +; CHECK-SD-NEXT: fcvt s0, h1 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v0.h[1], v1.h[0] +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov h0, v0.h[2] +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v1.h[2], v0.h[0] +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov h0, v0.h[3] +; CHECK-SD-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v1.h[3], v0.h[0] +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov h0, v0.h[4] +; CHECK-SD-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v1.h[4], v0.h[0] +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov h0, v0.h[5] +; CHECK-SD-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v1.h[5], v0.h[0] +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov h0, v0.h[6] +; CHECK-SD-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v1.h[6], v0.h[0] +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov h0, v0.h[7] +; CHECK-SD-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fcvt h1, s0 +; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v0.h[7], v1.h[0] +; CHECK-SD-NEXT: add sp, sp, #48 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: powi_v8f16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #192 +; CHECK-GI-NEXT: str d14, [sp, #112] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp d13, d12, [sp, #128] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp d11, d10, [sp, #144] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp d9, d8, [sp, #160] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x30, x19, [sp, #176] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 192 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: .cfi_offset b8, -24 +; CHECK-GI-NEXT: .cfi_offset b9, 
-32 +; CHECK-GI-NEXT: .cfi_offset b10, -40 +; CHECK-GI-NEXT: .cfi_offset b11, -48 +; CHECK-GI-NEXT: .cfi_offset b12, -56 +; CHECK-GI-NEXT: .cfi_offset b13, -64 +; CHECK-GI-NEXT: .cfi_offset b14, -80 +; CHECK-GI-NEXT: mov h8, v0.h[1] +; CHECK-GI-NEXT: mov h9, v0.h[2] +; CHECK-GI-NEXT: mov w19, w0 +; CHECK-GI-NEXT: mov h10, v0.h[3] +; CHECK-GI-NEXT: mov h11, v0.h[4] +; CHECK-GI-NEXT: mov h12, v0.h[5] +; CHECK-GI-NEXT: mov h13, v0.h[6] +; CHECK-GI-NEXT: mov h14, v0.h[7] +; CHECK-GI-NEXT: fcvt s0, h0 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: fcvt s1, h8 +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: fcvt s1, h9 +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: fcvt s1, h10 +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: fcvt s1, h11 +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: fcvt s1, h12 +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: fcvt s1, h13 +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: fcvt s1, h14 +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldp x30, x19, [sp, #176] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr d14, [sp, #112] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp d9, d8, [sp, #160] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp, #48] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp d11, d10, [sp, #144] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp d13, d12, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[4], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[6], v2.h[0] +; CHECK-GI-NEXT: mov v1.h[7], v0.h[0] +; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: add sp, sp, #192 +; CHECK-GI-NEXT: ret +entry: + %c = call <8 x half> @llvm.powi.v8f16.i32(<8 x half> %a, i32 %b) + ret <8 x half> %c +} + +define <16 x half> @powi_v16f16(<16 x half> %a, i32 %b) { +; CHECK-SD-LABEL: powi_v16f16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #64 +; CHECK-SD-NEXT: stp x30, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 64 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: stp q1, q0, [sp] // 32-byte Folded Spill +; CHECK-SD-NEXT: mov h1, v0.h[1] +; CHECK-SD-NEXT: mov w19, w0 
+; CHECK-SD-NEXT: fcvt s0, h1 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: str q0, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v0.h[1], v1.h[0] +; CHECK-SD-NEXT: str q0, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov h0, v0.h[2] +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v1.h[2], v0.h[0] +; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov h0, v0.h[3] +; CHECK-SD-NEXT: str q1, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v1.h[3], v0.h[0] +; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov h0, v0.h[4] +; CHECK-SD-NEXT: str q1, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v1.h[4], v0.h[0] +; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov h0, v0.h[5] +; CHECK-SD-NEXT: str q1, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v1.h[5], v0.h[0] +; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov h0, v0.h[6] +; CHECK-SD-NEXT: str q1, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v1.h[6], v0.h[0] +; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov h0, v0.h[7] +; CHECK-SD-NEXT: str q1, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v1.h[7], v0.h[0] +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov h0, v0.h[1] +; CHECK-SD-NEXT: str q1, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v0.h[1], v1.h[0] +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov h0, v0.h[2] +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bl __powisf2 +; 
CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v1.h[2], v0.h[0] +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov h0, v0.h[3] +; CHECK-SD-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v1.h[3], v0.h[0] +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov h0, v0.h[4] +; CHECK-SD-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v1.h[4], v0.h[0] +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov h0, v0.h[5] +; CHECK-SD-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v1.h[5], v0.h[0] +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov h0, v0.h[6] +; CHECK-SD-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: mov v1.h[6], v0.h[0] +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov h0, v0.h[7] +; CHECK-SD-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bl __powisf2 +; CHECK-SD-NEXT: fmov s1, s0 +; CHECK-SD-NEXT: ldp x30, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-SD-NEXT: fcvt h2, s1 +; CHECK-SD-NEXT: ldp q1, q0, [sp, #16] // 32-byte Folded Reload +; CHECK-SD-NEXT: mov v1.h[7], v2.h[0] +; CHECK-SD-NEXT: add sp, sp, #64 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: powi_v16f16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #336 +; CHECK-GI-NEXT: stp d15, d14, [sp, #240] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp d13, d12, [sp, #256] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp d11, d10, [sp, #272] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp d9, d8, [sp, #288] // 16-byte Folded Spill +; CHECK-GI-NEXT: str x29, [sp, #304] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x30, x19, [sp, #320] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 336 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: .cfi_offset w29, -32 +; CHECK-GI-NEXT: .cfi_offset b8, -40 +; CHECK-GI-NEXT: .cfi_offset b9, -48 +; CHECK-GI-NEXT: .cfi_offset b10, -56 +; CHECK-GI-NEXT: .cfi_offset b11, -64 +; CHECK-GI-NEXT: .cfi_offset b12, -72 +; CHECK-GI-NEXT: .cfi_offset b13, -80 +; CHECK-GI-NEXT: .cfi_offset b14, -88 +; CHECK-GI-NEXT: .cfi_offset b15, -96 +; CHECK-GI-NEXT: mov v2.16b, v1.16b +; CHECK-GI-NEXT: str q1, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: mov h14, v1.h[1] +; CHECK-GI-NEXT: mov h1, v1.h[2] +; CHECK-GI-NEXT: mov h8, v0.h[1] +; CHECK-GI-NEXT: mov w19, w0 +; CHECK-GI-NEXT: mov h9, v0.h[2] +; CHECK-GI-NEXT: mov h10, v0.h[3] +; CHECK-GI-NEXT: mov h11, v0.h[4] +; CHECK-GI-NEXT: mov h12, v0.h[5] +; CHECK-GI-NEXT: mov h13, v0.h[6] +; CHECK-GI-NEXT: mov h15, v0.h[7] +; 
CHECK-GI-NEXT: fcvt s0, h0 +; CHECK-GI-NEXT: str h1, [sp, #16] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h1, v2.h[3] +; CHECK-GI-NEXT: str h1, [sp, #32] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h1, v2.h[4] +; CHECK-GI-NEXT: str h1, [sp, #48] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h1, v2.h[5] +; CHECK-GI-NEXT: str h1, [sp, #80] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h1, v2.h[6] +; CHECK-GI-NEXT: str h1, [sp, #112] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h1, v2.h[7] +; CHECK-GI-NEXT: str h1, [sp, #176] // 2-byte Folded Spill +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: fcvt s1, h8 +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: str q0, [sp, #192] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: fcvt s1, h9 +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: str q0, [sp, #144] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: fcvt s1, h10 +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: str q0, [sp, #224] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: fcvt s1, h11 +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: str q0, [sp, #208] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: fcvt s1, h12 +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: str q0, [sp, #160] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: fcvt s1, h13 +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: fcvt s1, h15 +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: fcvt s1, h14 +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: ldr h1, [sp, #16] // 2-byte Folded Reload +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: ldr h1, [sp, #32] // 2-byte Folded Reload +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: ldr h1, [sp, #80] // 2-byte Folded Reload +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: str q0, [sp, #80] // 
16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: ldr h1, [sp, #112] // 2-byte Folded Reload +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: ldr h1, [sp, #176] // 2-byte Folded Reload +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: str q0, [sp, #176] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: bl __powisf2 +; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #144] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x29, [sp, #304] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp x30, x19, [sp, #320] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] +; CHECK-GI-NEXT: ldp q1, q2, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp d9, d8, [sp, #288] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp d11, d10, [sp, #272] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #224] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp d13, d12, [sp, #256] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[2], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp d15, d14, [sp, #240] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #208] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[3], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #160] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[4], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q4, [sp, #112] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[5], v4.h[0] +; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] +; CHECK-GI-NEXT: fcvt h2, s0 +; CHECK-GI-NEXT: ldr q0, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] +; CHECK-GI-NEXT: ldr q0, [sp, #176] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] +; CHECK-GI-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[7], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[7], v2.h[0] +; CHECK-GI-NEXT: mov v0.16b, v3.16b +; CHECK-GI-NEXT: add sp, sp, #336 +; CHECK-GI-NEXT: ret +entry: + %c = call <16 x half> @llvm.powi.v16f16.i32(<16 x half> %a, i32 %b) + ret <16 x half> %c +} + +declare <16 x half> @llvm.powi.v16f16.i32(<16 x half>, i32) +declare <2 x double> @llvm.powi.v2f64.i32(<2 x double>, i32) +declare <2 x float> @llvm.powi.v2f32.i32(<2 x float>, i32) +declare <3 x double> @llvm.powi.v3f64.i32(<3 x double>, i32) +declare <3 x float> @llvm.powi.v3f32.i32(<3 x float>, i32) +declare <4 x double> @llvm.powi.v4f64.i32(<4 x double>, i32) +declare <4 x float> @llvm.powi.v4f32.i32(<4 x float>, i32) +declare <4 x half> @llvm.powi.v4f16.i32(<4 x half>, i32) +declare <7 x half> @llvm.powi.v7f16.i32(<7 x half>, i32) +declare <8 x float> @llvm.powi.v8f32.i32(<8 x float>, i32) +declare <8 x half> @llvm.powi.v8f16.i32(<8 x half>, i32) +declare double @llvm.powi.f64.i32(double, i32) +declare float @llvm.powi.f32.i32(float, i32) +declare half @llvm.powi.f16.i32(half, i32) From 176c341198cbfa05debc3554e958ea90e0ef3cc9 Mon Sep 17 00:00:00 2001 From: Phoebe Wang Date: Thu, 4 Jan 2024 15:29:42 +0800 Subject: [PATCH 196/313] 
[X86][BF16] Add 32-bit tests to show ABI problem, NFC --- llvm/test/CodeGen/X86/bfloat.ll | 500 ++++++++++++++++++++++++++++++++ 1 file changed, 500 insertions(+) diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll index 9c65310f79d7e..7ef362619d5fd 100644 --- a/llvm/test/CodeGen/X86/bfloat.ll +++ b/llvm/test/CodeGen/X86/bfloat.ll @@ -1,10 +1,35 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-linux-gnu -mattr=avx512bf16,avx512fp16,avx512vl | FileCheck %s --check-prefixes=X86 ; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=CHECK,SSE2 ; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,F16,BF16 ; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,F16,FP16 ; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avxneconvert,f16c | FileCheck %s --check-prefixes=CHECK,AVX,AVXNC define void @add(ptr %pa, ptr %pb, ptr %pc) nounwind { +; X86-LABEL: add: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: shll $16, %ecx +; X86-NEXT: vmovd %ecx, %xmm0 +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: vmovd %eax, %xmm1 +; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: calll __truncsfbf2 +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: movw %ax, (%esi) +; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %esi +; X86-NEXT: retl +; ; SSE2-LABEL: add: ; SSE2: # %bb.0: ; SSE2-NEXT: pushq %rbx @@ -46,6 +71,21 @@ define void @add(ptr %pa, ptr %pb, ptr %pc) nounwind { } define bfloat @add2(bfloat %a, bfloat %b) nounwind { +; X86-LABEL: add2: +; X86: # %bb.0: +; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: vmovd %eax, %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: vmovd %eax, %xmm1 +; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: calll __truncsfbf2 +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +; ; SSE2-LABEL: add2: ; SSE2: # %bb.0: ; SSE2-NEXT: pushq %rax @@ -78,6 +118,47 @@ define bfloat @add2(bfloat %a, bfloat %b) nounwind { } define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind { +; X86-LABEL: add_double: +; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $32, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: vmovsd %xmm0, (%esp) +; X86-NEXT: calll __truncdfbf2 +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovd %xmm0, %edi +; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: vmovsd %xmm0, (%esp) +; X86-NEXT: calll __truncdfbf2 +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: vmovd %eax, %xmm0 +; X86-NEXT: shll $16, %edi +; X86-NEXT: vmovd %edi, %xmm1 +; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmovss %xmm0, (%esp) +; 
X86-NEXT: calll __truncsfbf2 +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: vmovd %eax, %xmm0 +; X86-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; X86-NEXT: vmovsd %xmm0, (%esi) +; X86-NEXT: addl $32, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: retl +; ; SSE2-LABEL: add_double: ; SSE2: # %bb.0: ; SSE2-NEXT: pushq %rbp @@ -146,6 +227,41 @@ define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind { } define double @add_double2(double %da, double %db) nounwind { +; X86-LABEL: add_double2: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: subl $40, %esp +; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: vmovsd %xmm0, (%esp) +; X86-NEXT: calll __truncdfbf2 +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovd %xmm0, %esi +; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: vmovsd %xmm0, (%esp) +; X86-NEXT: calll __truncdfbf2 +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: vmovd %eax, %xmm0 +; X86-NEXT: shll $16, %esi +; X86-NEXT: vmovd %esi, %xmm1 +; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: calll __truncsfbf2 +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: vmovd %eax, %xmm0 +; X86-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; X86-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) +; X86-NEXT: fldl {{[0-9]+}}(%esp) +; X86-NEXT: addl $40, %esp +; X86-NEXT: popl %esi +; X86-NEXT: retl +; ; SSE2-LABEL: add_double2: ; SSE2: # %bb.0: ; SSE2-NEXT: pushq %rbx @@ -203,6 +319,26 @@ define double @add_double2(double %da, double %db) nounwind { } define void @add_constant(ptr %pa, ptr %pc) nounwind { +; X86-LABEL: add_constant: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: vmovd %eax, %xmm0 +; X86-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: calll __truncsfbf2 +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: movw %ax, (%esi) +; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %esi +; X86-NEXT: retl +; ; SSE2-LABEL: add_constant: ; SSE2: # %bb.0: ; SSE2-NEXT: pushq %rbx @@ -237,6 +373,18 @@ define void @add_constant(ptr %pa, ptr %pc) nounwind { } define bfloat @add_constant2(bfloat %a) nounwind { +; X86-LABEL: add_constant2: +; X86: # %bb.0: +; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: vmovd %eax, %xmm0 +; X86-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: calll __truncsfbf2 +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +; ; SSE2-LABEL: add_constant2: ; SSE2: # %bb.0: ; SSE2-NEXT: pushq %rax @@ -263,6 +411,12 @@ define bfloat @add_constant2(bfloat %a) nounwind { } define void @store_constant(ptr %pc) nounwind { +; X86-LABEL: store_constant: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movw $16256, (%eax) # imm = 0x3F80 +; X86-NEXT: retl +; ; CHECK-LABEL: store_constant: ; 
CHECK: # %bb.0: ; CHECK-NEXT: movw $16256, (%rdi) # imm = 0x3F80 @@ -272,6 +426,14 @@ define void @store_constant(ptr %pc) nounwind { } define void @fold_ext_trunc(ptr %pa, ptr %pc) nounwind { +; X86-LABEL: fold_ext_trunc: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: movw %cx, (%eax) +; X86-NEXT: retl +; ; CHECK-LABEL: fold_ext_trunc: ; CHECK: # %bb.0: ; CHECK-NEXT: movzwl (%rdi), %eax @@ -285,6 +447,11 @@ define void @fold_ext_trunc(ptr %pa, ptr %pc) nounwind { } define bfloat @fold_ext_trunc2(bfloat %a) nounwind { +; X86-LABEL: fold_ext_trunc2: +; X86: # %bb.0: +; X86-NEXT: flds {{[0-9]+}}(%esp) +; X86-NEXT: retl +; ; CHECK-LABEL: fold_ext_trunc2: ; CHECK: # %bb.0: ; CHECK-NEXT: retq @@ -294,6 +461,165 @@ define bfloat @fold_ext_trunc2(bfloat %a) nounwind { } define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind { +; X86-LABEL: addv: +; X86: # %bb.0: +; X86-NEXT: subl $172, %esp +; X86-NEXT: vpextrw $1, %xmm1, %eax +; X86-NEXT: vmovdqa %xmm1, %xmm4 +; X86-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: shll $16, %eax +; X86-NEXT: vmovd %eax, %xmm2 +; X86-NEXT: vpextrw $1, %xmm0, %eax +; X86-NEXT: vmovdqa %xmm0, %xmm3 +; X86-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: shll $16, %eax +; X86-NEXT: vmovd %eax, %xmm1 +; X86-NEXT: vaddss %xmm2, %xmm1, %xmm0 +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: vpextrw $2, %xmm4, %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: vmovd %eax, %xmm0 +; X86-NEXT: vpextrw $2, %xmm3, %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: vmovd %eax, %xmm1 +; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmovss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: calll __truncsfbf2 +; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; X86-NEXT: vmovss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload +; X86-NEXT: # xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vpextrw $3, %xmm0, %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: vmovd %eax, %xmm0 +; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-NEXT: vpextrw $3, %xmm1, %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: vmovd %eax, %xmm1 +; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmovss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: calll __truncsfbf2 +; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; X86-NEXT: vmovss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload +; X86-NEXT: # xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vpextrw $4, %xmm0, %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: vmovd %eax, %xmm0 +; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-NEXT: vpextrw $4, %xmm1, %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: vmovd %eax, %xmm1 +; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmovss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: calll __truncsfbf2 +; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; X86-NEXT: vmovss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload +; X86-NEXT: # xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vpextrw $5, %xmm0, %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: vmovd %eax, 
%xmm0 +; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-NEXT: vpextrw $5, %xmm1, %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: vmovd %eax, %xmm1 +; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmovss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: calll __truncsfbf2 +; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; X86-NEXT: vmovss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload +; X86-NEXT: # xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vpextrw $6, %xmm0, %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: vmovd %eax, %xmm0 +; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-NEXT: vpextrw $6, %xmm1, %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: vmovd %eax, %xmm1 +; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmovss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: calll __truncsfbf2 +; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; X86-NEXT: vmovss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload +; X86-NEXT: # xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vpextrw $7, %xmm0, %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: vmovd %eax, %xmm0 +; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-NEXT: vpextrw $7, %xmm1, %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: vmovd %eax, %xmm1 +; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmovss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: calll __truncsfbf2 +; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; X86-NEXT: vmovss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload +; X86-NEXT: # xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vmovw %xmm0, %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: vmovd %eax, %xmm0 +; X86-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-NEXT: vmovw %xmm1, %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: vmovd %eax, %xmm1 +; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmovss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: calll __truncsfbf2 +; X86-NEXT: vmovss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload +; X86-NEXT: # xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: calll __truncsfbf2 +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: vmovd %xmm1, %eax +; X86-NEXT: vmovd %eax, %xmm1 +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; X86-NEXT: vmovd %xmm0, 
%eax +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 +; X86-NEXT: addl $172, %esp +; X86-NEXT: retl +; ; SSE2-LABEL: addv: ; SSE2: # %bb.0: ; SSE2-NEXT: pushq %rbp @@ -776,6 +1102,12 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind { } define <2 x bfloat> @pr62997(bfloat %a, bfloat %b) { +; X86-LABEL: pr62997: +; X86: # %bb.0: +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: retl +; ; SSE2-LABEL: pr62997: ; SSE2: # %bb.0: ; SSE2-NEXT: movd %xmm0, %eax @@ -798,6 +1130,11 @@ define <2 x bfloat> @pr62997(bfloat %a, bfloat %b) { } define <32 x bfloat> @pr63017() { +; X86-LABEL: pr63017: +; X86: # %bb.0: +; X86-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X86-NEXT: retl +; ; SSE2-LABEL: pr63017: ; SSE2: # %bb.0: ; SSE2-NEXT: xorps %xmm0, %xmm0 @@ -820,6 +1157,12 @@ define <32 x bfloat> @pr63017() { } define <32 x bfloat> @pr63017_2() nounwind { +; X86-LABEL: pr63017_2: +; X86: # %bb.0: +; X86-NEXT: vpbroadcastw {{.*#+}} zmm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024] +; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} +; X86-NEXT: retl +; ; SSE2-LABEL: pr63017_2: ; SSE2: # %bb.0: ; SSE2-NEXT: pushq %r14 @@ -1644,6 +1987,12 @@ define <32 x bfloat> @pr63017_2() nounwind { } define <32 x bfloat> @pr62997_3(<32 x bfloat> %0, bfloat %1) { +; X86-LABEL: pr62997_3: +; X86: # %bb.0: +; X86-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm0, %xmm1 +; X86-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; X86-NEXT: retl +; ; SSE2-LABEL: pr62997_3: ; SSE2: # %bb.0: ; SSE2-NEXT: movq %xmm0, %rax @@ -1678,6 +2027,12 @@ define <32 x bfloat> @pr62997_3(<32 x bfloat> %0, bfloat %1) { declare <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr, i32, <32 x i1>, <32 x bfloat>) define <4 x float> @pr64460_1(<4 x bfloat> %a) { +; X86-LABEL: pr64460_1: +; X86: # %bb.0: +; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X86-NEXT: retl +; ; SSE2-LABEL: pr64460_1: ; SSE2: # %bb.0: ; SSE2-NEXT: pextrw $1, %xmm0, %eax @@ -1709,6 +2064,12 @@ define <4 x float> @pr64460_1(<4 x bfloat> %a) { } define <8 x float> @pr64460_2(<8 x bfloat> %a) { +; X86-LABEL: pr64460_2: +; X86: # %bb.0: +; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X86-NEXT: vpslld $16, %ymm0, %ymm0 +; X86-NEXT: retl +; ; SSE2-LABEL: pr64460_2: ; SSE2: # %bb.0: ; SSE2-NEXT: movq %xmm0, %rdx @@ -1758,6 +2119,12 @@ define <8 x float> @pr64460_2(<8 x bfloat> %a) { } define <16 x float> @pr64460_3(<16 x bfloat> %a) { +; X86-LABEL: pr64460_3: +; X86: # %bb.0: +; X86-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; X86-NEXT: vpslld $16, %zmm0, %zmm0 +; X86-NEXT: retl +; ; SSE2-LABEL: pr64460_3: ; SSE2: # %bb.0: ; SSE2-NEXT: movq %xmm1, %rdi @@ -1852,6 +2219,13 @@ define <16 x float> @pr64460_3(<16 x bfloat> %a) { } define <8 x double> @pr64460_4(<8 x bfloat> %a) { +; X86-LABEL: pr64460_4: +; X86: # %bb.0: +; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X86-NEXT: vpslld $16, %ymm0, %ymm0 +; X86-NEXT: vcvtps2pd %ymm0, %zmm0 +; X86-NEXT: retl +; ; SSE2-LABEL: pr64460_4: ; SSE2: # %bb.0: ; SSE2-NEXT: movq %xmm0, %rsi @@ -1951,6 +2325,13 @@ define <8 x double> @pr64460_4(<8 x bfloat> %a) { } define <4 x bfloat> @fptrunc_v4f32(<4 x float> %a) nounwind { +; X86-LABEL: fptrunc_v4f32: +; X86: # %bb.0: +; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-NEXT: vcvtneps2bf16 %ymm0, %xmm0 +; X86-NEXT: vzeroupper +; X86-NEXT: retl +; ; SSE2-LABEL: fptrunc_v4f32: ; SSE2: # %bb.0: ; SSE2-NEXT: pushq %rbp @@ -2009,6 +2390,12 @@ define <4 x bfloat> @fptrunc_v4f32(<4 x float> %a) nounwind { } define <8 x bfloat> @fptrunc_v8f32(<8 x float> %a) nounwind { +; X86-LABEL: fptrunc_v8f32: +; X86: # %bb.0: +; X86-NEXT: vcvtneps2bf16 %ymm0, %xmm0 +; X86-NEXT: vzeroupper +; X86-NEXT: retl +; ; SSE2-LABEL: fptrunc_v8f32: ; SSE2: # %bb.0: ; SSE2-NEXT: pushq %rbp @@ -2087,6 +2474,11 @@ define <8 x bfloat> @fptrunc_v8f32(<8 x float> %a) nounwind { } define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind { +; X86-LABEL: fptrunc_v16f32: +; X86: # %bb.0: +; X86-NEXT: vcvtneps2bf16 %zmm0, %ymm0 +; X86-NEXT: retl +; ; SSE2-LABEL: fptrunc_v16f32: ; SSE2: # %bb.0: ; SSE2-NEXT: pushq %rbp @@ -2222,6 +2614,91 @@ define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind { } define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind { +; X86-LABEL: fptrunc_v8f64: +; X86: # %bb.0: +; X86-NEXT: subl $220, %esp +; X86-NEXT: vmovups %zmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 64-byte Spill +; X86-NEXT: vmovhps %xmm0, (%esp) +; X86-NEXT: vzeroupper +; X86-NEXT: calll __truncdfbf2 +; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Reload +; X86-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: vmovlps %xmm0, (%esp) +; X86-NEXT: vzeroupper +; X86-NEXT: calll __truncdfbf2 +; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; X86-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vmovhps %xmm0, (%esp) +; X86-NEXT: calll __truncdfbf2 +; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Reload +; X86-NEXT: vextractf32x4 $2, %zmm0, %xmm0 +; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: vmovlps %xmm0, (%esp) +; X86-NEXT: vzeroupper +; X86-NEXT: calll __truncdfbf2 +; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; X86-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vmovhps %xmm0, (%esp) +; X86-NEXT: calll __truncdfbf2 +; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Reload +; X86-NEXT: vextractf32x4 $3, %zmm0, %xmm0 
+; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: vmovlps %xmm0, (%esp) +; X86-NEXT: vzeroupper +; X86-NEXT: calll __truncdfbf2 +; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; X86-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vmovhps %xmm0, (%esp) +; X86-NEXT: calll __truncdfbf2 +; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Reload +; X86-NEXT: vmovlps %xmm0, (%esp) +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: vzeroupper +; X86-NEXT: calll __truncdfbf2 +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: vmovd %xmm1, %eax +; X86-NEXT: vmovd %eax, %xmm1 +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 +; X86-NEXT: addl $220, %esp +; X86-NEXT: retl +; ; SSE2-LABEL: fptrunc_v8f64: ; SSE2: # %bb.0: ; SSE2-NEXT: pushq %rbp @@ -2456,6 +2933,12 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind { } define <32 x bfloat> @test_v8bf16_v32bf16(ptr %0) { +; X86-LABEL: test_v8bf16_v32bf16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; X86-NEXT: retl +; ; SSE2-LABEL: test_v8bf16_v32bf16: ; SSE2: # %bb.0: ; SSE2-NEXT: movaps (%rdi), %xmm0 @@ -2480,6 +2963,12 @@ define <32 x bfloat> @test_v8bf16_v32bf16(ptr %0) { } define <16 x bfloat> @concat_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) { +; X86-LABEL: concat_v8bf16: +; X86: # %bb.0: +; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X86-NEXT: retl +; ; SSE2-LABEL: concat_v8bf16: ; SSE2: # %bb.0: ; SSE2-NEXT: retq @@ -2494,6 +2983,12 @@ define <16 x bfloat> @concat_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) { } define <8 x bfloat> @extract_v32bf16_v8bf16(<32 x bfloat> %x) { +; X86-LABEL: extract_v32bf16_v8bf16: +; X86: # %bb.0: +; X86-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X86-NEXT: vzeroupper +; X86-NEXT: retl +; ; SSE2-LABEL: extract_v32bf16_v8bf16: ; SSE2: # %bb.0: ; SSE2-NEXT: 
pextrw $0, %xmm1, %eax @@ -2531,6 +3026,11 @@ define <8 x bfloat> @extract_v32bf16_v8bf16(<32 x bfloat> %x) { } define <16 x bfloat> @concat_zero_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) { +; X86-LABEL: concat_zero_v8bf16: +; X86: # %bb.0: +; X86-NEXT: vmovaps %xmm0, %xmm0 +; X86-NEXT: retl +; ; SSE2-LABEL: concat_zero_v8bf16: ; SSE2: # %bb.0: ; SSE2-NEXT: xorps %xmm1, %xmm1 From 0cdaadf15aaa6609f93e3508417b47baa3891996 Mon Sep 17 00:00:00 2001 From: Dominik Adamski Date: Thu, 4 Jan 2024 08:45:34 +0100 Subject: [PATCH 197/313] [libomptarget][flang] Explicitly pass the OpenMP device libraries to tests (#76796) This pull request is a follow-up of patch: https://github.com/llvm/llvm-project/pull/68225 and it explicitly specifies OpenMP device libraries for Fortran OpenMP tests. --- openmp/libomptarget/test/lit.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg index 19c5e5c457222..9078561e3198e 100644 --- a/openmp/libomptarget/test/lit.cfg +++ b/openmp/libomptarget/test/lit.cfg @@ -247,7 +247,7 @@ for libomptarget_target in config.libomptarget_all_targets: "%clang-" + libomptarget_target + add_libraries(" %s -o %t"))) config.substitutions.append(("%libomptarget-compile-fortran-" + \ libomptarget_target, \ - "%flang-" + libomptarget_target + " %s -o %t")) + "%flang-" + libomptarget_target + add_libraries(" %s -o %t"))) config.substitutions.append(("%libomptarget-compileoptxx-run-and-check-" + \ libomptarget_target, \ "%libomptarget-compileoptxx-and-run-" + libomptarget_target + \ From 75365b2e189a54b96a907ebe327c5898dd7aad14 Mon Sep 17 00:00:00 2001 From: Ben Shi <2283975856@qq.com> Date: Thu, 4 Jan 2024 15:51:57 +0800 Subject: [PATCH 198/313] [clang][AVR] Restrict range of assembly constraint 'G' (#76561) According to https://www.nongnu.org/avr-libc/user-manual/inline_asm.html, 'G' only represents floating point constant '0.0'. And avr-gcc also rejects other non-zero FP values. --- clang/lib/Basic/Targets/AVR.h | 4 +++- clang/test/CodeGen/avr/avr-inline-asm-constraints.c | 4 ++-- .../test/CodeGen/avr/avr-unsupported-inline-asm-constraints.c | 1 + 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/clang/lib/Basic/Targets/AVR.h b/clang/lib/Basic/Targets/AVR.h index 854a51d78c393..9376c46cd98ca 100644 --- a/clang/lib/Basic/Targets/AVR.h +++ b/clang/lib/Basic/Targets/AVR.h @@ -146,7 +146,9 @@ class LLVM_LIBRARY_VISIBILITY AVRTargetInfo : public TargetInfo { case 'R': // Integer constant (Range: -6 to 5) Info.setRequiresImmediate(-6, 5); return true; - case 'G': // Floating point constant + case 'G': // Floating point constant 0.0 + Info.setRequiresImmediate(0); + return true; case 'Q': // A memory address based on Y or Z pointer with displacement. 
return true; } diff --git a/clang/test/CodeGen/avr/avr-inline-asm-constraints.c b/clang/test/CodeGen/avr/avr-inline-asm-constraints.c index 96774861feb22..3a956de8db48f 100644 --- a/clang/test/CodeGen/avr/avr-inline-asm-constraints.c +++ b/clang/test/CodeGen/avr/avr-inline-asm-constraints.c @@ -109,8 +109,8 @@ void R() { } void G() { - // CHECK: call addrspace(0) void asm sideeffect "subi r30, $0", "G"(i16 50) - asm("subi r30, %0" :: "G"(50)); + // CHECK: call addrspace(0) void asm sideeffect "subi r30, $0", "G"(i16 0) + asm("subi r30, %0" :: "G"(0)); } void Q() { diff --git a/clang/test/CodeGen/avr/avr-unsupported-inline-asm-constraints.c b/clang/test/CodeGen/avr/avr-unsupported-inline-asm-constraints.c index ceea59229f736..29f0b69285fa8 100644 --- a/clang/test/CodeGen/avr/avr-unsupported-inline-asm-constraints.c +++ b/clang/test/CodeGen/avr/avr-unsupported-inline-asm-constraints.c @@ -5,4 +5,5 @@ const unsigned char val = 0; int foo(void) { __asm__ volatile("foo %0, 1" : : "fo" (val)); // expected-error {{invalid input constraint 'fo' in asm}} __asm__ volatile("foo %0, 1" : : "Nd" (val)); // expected-error {{invalid input constraint 'Nd' in asm}} + __asm__ volatile("subi r30, %0" : : "G" (1)); // expected-error {{value '1' out of range for constraint 'G'}} } From 18c0f59b3eeed880384aa37c7feca2064b994f37 Mon Sep 17 00:00:00 2001 From: Ben Shi <2283975856@qq.com> Date: Thu, 4 Jan 2024 15:52:52 +0800 Subject: [PATCH 199/313] [clang][analyzer] Support 'fdopen' in the StreamChecker (#76776) --- clang/docs/ReleaseNotes.rst | 5 +++-- clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp | 1 + .../test/Analysis/Inputs/system-header-simulator.h | 1 + clang/test/Analysis/stream-error.c | 13 +++++++++++-- clang/test/Analysis/stream-non-posix-function.c | 7 ++++++- clang/test/Analysis/stream-note.c | 10 ++++++++++ clang/test/Analysis/stream-stdlibraryfunctionargs.c | 8 ++++++++ clang/test/Analysis/stream.c | 7 +++++++ 8 files changed, 47 insertions(+), 5 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index a3107c4a69532..30cfe66703f5a 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1162,8 +1162,9 @@ Improvements `0954dc3fb921 `_) - Improved the ``alpha.unix.Stream`` checker by modeling more functions like, - ``fflush``, ``fputs``, ``fgetc``, ``fputc``, ``fopen``, ``fopen``, ``fgets``. - (`#74296 `_, + ``fflush``, ``fputs``, ``fgetc``, ``fputc``, ``fopen``, ``fdopen``, ``fgets``, ``tmpfile``. 
+ (`#76776 `_, + `#74296 `_, `#73335 `_, `#72627 `_, `#71518 `_, diff --git a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp index 254b36ed03968..25da3c18e8519 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp @@ -239,6 +239,7 @@ class StreamChecker : public Checker FnDescriptions = { {{{"fopen"}, 2}, {nullptr, &StreamChecker::evalFopen, ArgNone}}, + {{{"fdopen"}, 2}, {nullptr, &StreamChecker::evalFopen, ArgNone}}, {{{"freopen"}, 3}, {&StreamChecker::preFreopen, &StreamChecker::evalFreopen, 2}}, {{{"tmpfile"}, 0}, {nullptr, &StreamChecker::evalFopen, ArgNone}}, diff --git a/clang/test/Analysis/Inputs/system-header-simulator.h b/clang/test/Analysis/Inputs/system-header-simulator.h index 409a969a0d4cc..8c43c48c6a3e4 100644 --- a/clang/test/Analysis/Inputs/system-header-simulator.h +++ b/clang/test/Analysis/Inputs/system-header-simulator.h @@ -43,6 +43,7 @@ FILE *funopen(const void *, int (*)(void *)); FILE *fopen(const char *restrict path, const char *restrict mode); +FILE *fdopen(int fd, const char *mode); FILE *tmpfile(void); FILE *freopen(const char *restrict pathname, const char *restrict mode, FILE *restrict stream); int fclose(FILE *fp); diff --git a/clang/test/Analysis/stream-error.c b/clang/test/Analysis/stream-error.c index 37e1e54dfc89d..13c6684b5840a 100644 --- a/clang/test/Analysis/stream-error.c +++ b/clang/test/Analysis/stream-error.c @@ -21,6 +21,15 @@ void error_fopen(void) { fclose(F); } +void error_fdopen(int fd) { + FILE *F = fdopen(fd, "r"); + if (!F) + return; + clang_analyzer_eval(feof(F)); // expected-warning {{FALSE}} + clang_analyzer_eval(ferror(F)); // expected-warning {{FALSE}} + fclose(F); +} + void error_freopen(void) { FILE *F = fopen("file", "r"); if (!F) @@ -146,8 +155,8 @@ void error_fgets(void) { fgets(Buf, sizeof(Buf), F); // expected-warning {{Stream might be already closed}} } -void error_fputc(void) { - FILE *F = tmpfile(); +void error_fputc(int fd) { + FILE *F = fdopen(fd, "w"); if (!F) return; int Ret = fputc('X', F); diff --git a/clang/test/Analysis/stream-non-posix-function.c b/clang/test/Analysis/stream-non-posix-function.c index c6dc65afe788b..091d95a573ddf 100644 --- a/clang/test/Analysis/stream-non-posix-function.c +++ b/clang/test/Analysis/stream-non-posix-function.c @@ -8,11 +8,16 @@ typedef struct _FILE FILE; // These functions are not standard C library functions. FILE *tmpfile(const char *restrict path); // Real 'tmpfile' should have exactly 0 formal parameters. FILE *fopen(const char *restrict path); // Real 'fopen' should have exactly 2 formal parameters. +FILE *fdopen(int fd); // Real 'fdopen' should have exactly 2 formal parameters. void test_fopen_non_posix(void) { FILE *fp = fopen("file"); // no-leak: This isn't the standard POSIX `fopen`, we don't know the semantics of this call. } void test_tmpfile_non_posix(void) { - FILE *fp = tmpfile("file"); // // no-leak: This isn't the standard POSIX `tmpfile`, we don't know the semantics of this call. + FILE *fp = tmpfile("file"); // no-leak: This isn't the standard POSIX `tmpfile`, we don't know the semantics of this call. +} + +void test_fdopen_non_posix(int fd) { + FILE *fp = fdopen(fd); // no-leak: This isn't the standard POSIX `fdopen`, we don't know the semantics of this call. 
} diff --git a/clang/test/Analysis/stream-note.c b/clang/test/Analysis/stream-note.c index b9fdc16b19e55..e412015eb6839 100644 --- a/clang/test/Analysis/stream-note.c +++ b/clang/test/Analysis/stream-note.c @@ -54,6 +54,16 @@ void check_note_freopen(void) { // expected-warning@-1 {{Opened stream never closed. Potential resource leak}} // expected-note@-2 {{Opened stream never closed. Potential resource leak}} +void check_note_fdopen(int fd) { + FILE *F = fdopen(fd, "r"); // expected-note {{Stream opened here}} + if (!F) + // expected-note@-1 {{'F' is non-null}} + // expected-note@-2 {{Taking false branch}} + return; +} +// expected-warning@-1 {{Opened stream never closed. Potential resource leak}} +// expected-note@-2 {{Opened stream never closed. Potential resource leak}} + void check_note_leak_2(int c) { FILE *F1 = fopen("foo1.c", "r"); // expected-note {{Stream opened here}} // stdargs-note@-1 {{'fopen' is successful}} diff --git a/clang/test/Analysis/stream-stdlibraryfunctionargs.c b/clang/test/Analysis/stream-stdlibraryfunctionargs.c index 938901ec08829..0053510163efc 100644 --- a/clang/test/Analysis/stream-stdlibraryfunctionargs.c +++ b/clang/test/Analysis/stream-stdlibraryfunctionargs.c @@ -23,6 +23,14 @@ void test_fopen(void) { // stdfunc-warning{{should not be NULL}} } +void test_fdopen(int fd) { + FILE *fp = fdopen(fd, "r"); + clang_analyzer_eval(fp != NULL); // any-warning{{TRUE}} any-warning{{FALSE}} + fclose(fp); // \ + // stream-warning{{Stream pointer might be NULL}} \ + // stdfunc-warning{{should not be NULL}} +} + void test_tmpfile(void) { FILE *fp = tmpfile(); clang_analyzer_eval(fp != NULL); // any-warning{{TRUE}} any-warning{{FALSE}} diff --git a/clang/test/Analysis/stream.c b/clang/test/Analysis/stream.c index 0b655dbdc571f..060d561c1fe1c 100644 --- a/clang/test/Analysis/stream.c +++ b/clang/test/Analysis/stream.c @@ -102,6 +102,13 @@ void f_open(void) { fclose(p); } +void f_dopen(int fd) { + FILE *F = fdopen(fd, "r"); + char buf[1024]; + fread(buf, 1, 1, F); // expected-warning {{Stream pointer might be NULL}} + fclose(F); +} + void f_seek(void) { FILE *p = fopen("foo", "r"); if (!p) From 8c72ff716b3e4b298695fa3faf6add860c6dbcb2 Mon Sep 17 00:00:00 2001 From: bgra8 <32298056+bgra8@users.noreply.github.com> Date: Thu, 4 Jan 2024 08:25:57 +0000 Subject: [PATCH 200/313] [NFC] Renames a template parameter to avoid clashes with userspace names. 
(#76829) Co-authored-by: Bogdan Graur --- libcxx/include/__format/format_arg_store.h | 8 ++++---- libcxx/test/libcxx/system_reserved_names.gen.py | 3 +++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/libcxx/include/__format/format_arg_store.h b/libcxx/include/__format/format_arg_store.h index c481992d2d719..066cd369eb891 100644 --- a/libcxx/include/__format/format_arg_store.h +++ b/libcxx/include/__format/format_arg_store.h @@ -228,15 +228,15 @@ _LIBCPP_HIDE_FROM_ABI void __store_basic_format_arg(basic_format_arg<_Context>* ([&] { *__data++ = __format::__create_format_arg<_Context>(__args); }(), ...); } -template +template struct __packed_format_arg_store { - __basic_format_arg_value<_Context> __values_[N]; + __basic_format_arg_value<_Context> __values_[_Np]; uint64_t __types_ = 0; }; -template +template struct __unpacked_format_arg_store { - basic_format_arg<_Context> __args_[N]; + basic_format_arg<_Context> __args_[_Np]; }; } // namespace __format diff --git a/libcxx/test/libcxx/system_reserved_names.gen.py b/libcxx/test/libcxx/system_reserved_names.gen.py index 8c4be97897f65..8ddd035ff2d25 100644 --- a/libcxx/test/libcxx/system_reserved_names.gen.py +++ b/libcxx/test/libcxx/system_reserved_names.gen.py @@ -131,6 +131,9 @@ #define E SYSTEM_RESERVED_NAME #define Ep SYSTEM_RESERVED_NAME #define Es SYSTEM_RESERVED_NAME +#define N SYSTEM_RESERVED_NAME +#define Np SYSTEM_RESERVED_NAME +#define Ns SYSTEM_RESERVED_NAME #define R SYSTEM_RESERVED_NAME #define Rp SYSTEM_RESERVED_NAME #define Rs SYSTEM_RESERVED_NAME From f3f4387e02b0ed637b5d843e8937116334329a65 Mon Sep 17 00:00:00 2001 From: gmh <13917777+gmh5225@users.noreply.github.com> Date: Thu, 4 Jan 2024 16:39:50 +0800 Subject: [PATCH 201/313] [lldb][NFC] Fix compilation issue on windows (#76453) --- .../Plugins/Process/Windows/Common/TargetThreadWindows.cpp | 4 ++-- lldb/unittests/Thread/ThreadTest.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lldb/source/Plugins/Process/Windows/Common/TargetThreadWindows.cpp b/lldb/source/Plugins/Process/Windows/Common/TargetThreadWindows.cpp index ad67e764fe10f..a69c10081ff19 100644 --- a/lldb/source/Plugins/Process/Windows/Common/TargetThreadWindows.cpp +++ b/lldb/source/Plugins/Process/Windows/Common/TargetThreadWindows.cpp @@ -29,8 +29,8 @@ using namespace lldb; using namespace lldb_private; -using GetThreadDescriptionFunctionPtr = HRESULT -WINAPI (*)(HANDLE hThread, PWSTR *ppszThreadDescription); +using GetThreadDescriptionFunctionPtr = + HRESULT(WINAPI *)(HANDLE hThread, PWSTR *ppszThreadDescription); TargetThreadWindows::TargetThreadWindows(ProcessWindows &process, const HostThread &thread) diff --git a/lldb/unittests/Thread/ThreadTest.cpp b/lldb/unittests/Thread/ThreadTest.cpp index 4c660e9815c3e..542585969c07b 100644 --- a/lldb/unittests/Thread/ThreadTest.cpp +++ b/lldb/unittests/Thread/ThreadTest.cpp @@ -34,8 +34,8 @@ using namespace lldb; namespace { #ifdef _WIN32 -using SetThreadDescriptionFunctionPtr = HRESULT -WINAPI (*)(HANDLE hThread, PCWSTR lpThreadDescription); +using SetThreadDescriptionFunctionPtr = + HRESULT(WINAPI *)(HANDLE hThread, PCWSTR lpThreadDescription); static SetThreadDescriptionFunctionPtr SetThreadName; #endif From 232be5129b205b0eefee52b7ff930d597298bbfc Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Thu, 4 Jan 2024 00:39:49 -0800 Subject: [PATCH 202/313] [NFC][llvm-exegesis] Fix comment text I'm reasonably confident iff wasn't an abbreviation for if and only if here and it was starting to bug me quite a bit. 
--- llvm/tools/llvm-exegesis/llvm-exegesis.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp index 1b35fde815f11..ffbf94ce0fcb2 100644 --- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp +++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp @@ -437,7 +437,7 @@ static void runBenchmarkConfigurations( ArrayRef(AllResults).drop_front()) { llvm::append_range(Result.AssembledSnippet, OtherResult.AssembledSnippet); - // Aggregate measurements, but only iff all measurements succeeded. + // Aggregate measurements, but only if all measurements succeeded. if (Result.Measurements.empty()) continue; assert(OtherResult.Measurements.size() == Result.Measurements.size() && From f8c034140b577c81ddaff3eec9e4af0db1c6c355 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Thu, 4 Jan 2024 08:48:44 +0000 Subject: [PATCH 203/313] [mlir][docs] Update TD tutorial - Ch0 (#76858) Updates `generic` as `linalg.generic` (for consistency and to avoid ambiguity) and a few other fixes. --- mlir/docs/Tutorials/transform/Ch0.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mlir/docs/Tutorials/transform/Ch0.md b/mlir/docs/Tutorials/transform/Ch0.md index eb3272cced288..43bcdf96d92b5 100644 --- a/mlir/docs/Tutorials/transform/Ch0.md +++ b/mlir/docs/Tutorials/transform/Ch0.md @@ -56,7 +56,7 @@ Even when special instructions are available, it may still be desirable to use t ## Contraction -Contraction is a generalization of reduction that multiplies elements from two vectors before adding them up. A simple “add” reduction can be thought of as a contraction where one of the vectors contains `1.0`, the neutral element of multiplication. Contractions offer even more flexibility to the compiler, and are represented as by a dedicated operation in MLIR: +Contraction is a generalization of reduction that multiplies elements from two vectors before adding them up. A simple “add” reduction can be thought of as a contraction where one of the vectors contains `1.0`, the neutral element of multiplication. Contractions offer even more flexibility to the compiler, and are represented by a dedicated operation in MLIR: ```mlir // Neutral initializer for the addition. @@ -72,14 +72,14 @@ Contraction is a generalization of reduction that multiplies elements from two v } %0, %ones, %init : vector<8xf32>, vector<8xf32> into f32 ``` -Note the `affine_map` expressions indicating how vector elements are indexed. Their meaning is perhaps most evident when writing the loop form pseudo-code equivalent to this contraction: +Note the `affine_map` expressions indicating how vector elements are indexed. Their meaning is perhaps most evident when writing the loop form in pseudo-code equivalent to this contraction: ```mlir for i in 0 to 8: init += p0[i] * ones[i] ``` -where both `%0` and `%ones` use the loop induction variable `i`, as noted on the right-hand side of the corresponding affine map, `(i) -> (i)`, and the `%init` does not, as reflected on the right-hand side of its affine map, `(i) -> ()`. +where both `%0` and `%ones` use the loop induction variable `i`, as noted on the right-hand side of the corresponding affine map, `(i) -> (i)`, and `%init` does not, as reflected on the right-hand side of its affine map, `(i) -> ()`. Similarly to uniform elementwise extension, MLIR vector contractions are not limited to 1D cases. 
In the 2D+ case, one can additionally specify which of the vector dimensions are being reduced and which ones are being preserved. This can be achieved by using the `iterator_types` attribute that specifies, for each dimension, whether it is being reduced (`"reduction"`) or preserved (`"parallel"`). Consider the following 3D contraction that encodes a matrix-matrix multiplication: @@ -134,7 +134,7 @@ Furthermore, the operation now contains a region that explicitly specifies the m ## “Loop” Fusion -Since the region of the `generic` operation can contain arbitrarily many operations, we can use it to express “fusion” of the implicit loops by simply having more operations chained in the region. For example, the common machine learning rectified linear unit layer (ReLU), which can be defined as `relu(x) = max(0, x)`, can be defined be expressed using the “compare-and-select” idiom in one `generic` operation, without the temporary buffer for the comparison result and without repeating the outer operation: +Since the region of the `linalg.generic` operation can contain arbitrarily many operations, we can use it to express “fusion” of the implicit loops by simply having more operations chained in the region. For example, the common machine learning rectified linear unit layer (ReLU), which can be defined as `relu(x) = max(0, x)`, can be defined be expressed using the “compare-and-select” idiom in one `linalg.generic` operation, without the temporary buffer for the comparison result and without repeating the outer operation: ```mlir linalg.generic { @@ -155,7 +155,7 @@ Such operations can be converted to loops or lowered into vector forms after spl Let us take one last step up on the abstraction ladder. MLIR provides a tensor abstraction that makes it easy for the compiler to reason about multidimensional yet regular data without having to solve complex problems such as alias analysis and dependency satisfaction, which would be necessary on multidimensional buffers. The tensor abstraction is very similar to the vector abstraction (major differences include the availability of unranked tensors, tensor layouts, and vectors being usable as elemental types of tensors but not of other vectors). Tensors are read-only, and operations updating a tensor produce a new tensor. -The `generic` operation from above can lifted to operate on tensors instead of buffers: +The `linalg.generic` operation from above can lifted to operate on tensors instead of buffers: ```mlir %result = linalg.generic { @@ -174,15 +174,15 @@ The `generic` operation from above can lifted to operate on tensors instead of b As you can notice, most components of this operation remain identical to its buffer version. It has been specifically designed this way. The main difference, beside the operand types, is that the operation now produces a new result instead of updating the `out` buffer. The `out` operand is used only as the initialization value. -If the `generic` operation had existed on vectors, it would have had the exact same structure. +If the `linalg.generic` operation had existed on vectors, it would have had the exact same structure. ## Tiling and Loop Materialization At this level of abstraction, it becomes easy for the compiler to perform more advanced transformations usually required for high-performance code generation, such as [tiling](https://en.wikipedia.org/wiki/Loop_nest_optimization). 
Tiling, in general, can be seen as partitioning the iteration space into smaller parts, or tiles, so that the data required by each part fits into a level of cache for example. The order in which tiles are executed must preserve the original data dependencies. -In the case of `generic` operations, the iteration space is implicit and is defined by the shape of the operands. Therefore, a tile can be expressed by performing the _same_ operation on a subset (slice) of the original data. Since the order in which the body of `generic` is applied to different tuples of the input elements is unspecified, tiles can be executed in any order, without the need for dependence analysis. In order to control the execution of different tiles, the implementation of tiling produces loops. Thus tiling `generic` operations can also be seen as materializing the loops that have been implicit until now. +In the case of `linalg.generic` operations, the iteration space is implicit and is defined by the shape of the operands. Therefore, a tile can be expressed by performing the _same_ operation on a subset (slice) of the original data. Since the order in which the body of `linalg.generic` is applied to different tuples of the input elements is unspecified, tiles can be executed in any order, without the need for dependence analysis. In order to control the execution of different tiles, the implementation of tiling produces loops. Thus tiling `linalg.generic` operations can also be seen as materializing the loops that have been implicit until now. -For example, tiling the matrix multiplication presented above with tile sizes `(2, 8)`, we obtain a loop nest around a `generic` expressing the same operation on a `2x8` tensor. +For example, tiling the matrix multiplication presented above with tile sizes `(2, 8)`, we obtain a loop nest around a `linalg.generic` expressing the same operation on a `2x8` tensor. ```mlir // A special "multi-for" loop that supports tensor-insertion semantics From 3737712daeef9581e6388fad24f4fb56d89ce032 Mon Sep 17 00:00:00 2001 From: Sergei Lebedev <185856+superbobry@users.noreply.github.com> Date: Thu, 4 Jan 2024 08:49:57 +0000 Subject: [PATCH 204/313] Slightly improved ir.pyi type annotations (#76728) * Replaced `Any` with static types where appropriate * Removed undocumented `__str__` and `__repr__` -- these are always defined via `object` --- mlir/python/mlir/_mlir_libs/_mlir/ir.pyi | 84 +++--------------------- 1 file changed, 10 insertions(+), 74 deletions(-) diff --git a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi index fa591e5f142d9..57a85990f9bcf 100644 --- a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi +++ b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi @@ -460,11 +460,9 @@ class AffineExpr: @overload def __mul__(self, arg0: int) -> AffineMulExpr: ... def __radd__(self, arg0: int) -> AffineAddExpr: ... - def __repr__(self) -> str: ... def __rmod__(self, arg0: int) -> AffineModExpr: ... def __rmul__(self, arg0: int) -> AffineMulExpr: ... def __rsub__(self, arg0: int) -> AffineAddExpr: ... - def __str__(self) -> str: ... @overload def __sub__(self, arg0: AffineExpr) -> AffineAddExpr: ... @overload @@ -495,7 +493,6 @@ class Attribute: """ Casts the passed attribute to the generic Attribute """ - def __repr__(self) -> str: ... def __str__(self) -> str: """ Returns the assembly form of the Attribute. @@ -541,7 +538,6 @@ class Type: """ Casts the passed type to the generic Type """ - def __repr__(self) -> str: ... 
def __str__(self) -> str: """ Returns the assembly form of the type. @@ -710,8 +706,6 @@ class AffineMap: @overload def __eq__(self, arg0: object) -> bool: ... def __hash__(self) -> int: ... - def __repr__(self) -> str: ... - def __str__(self) -> str: ... def dump(self) -> None: """ Dumps a debug representation of the object to stderr. @@ -756,7 +750,6 @@ class AffineMapAttr(Attribute): @staticmethod def isinstance(other: Attribute) -> bool: ... def __init__(self, cast_from_attr: Attribute) -> None: ... - def __repr__(self) -> str: ... @property def type(self) -> Type: ... @property @@ -801,7 +794,6 @@ class ArrayAttr(Attribute): self, ) -> ArrayAttributeIterator: ... def __len__(self) -> int: ... - def __repr__(self) -> str: ... @property def type(self) -> Type: ... @property @@ -840,7 +832,6 @@ class BF16Type(Type): @staticmethod def isinstance(other: Type) -> bool: ... def __init__(self, cast_from_type: Type) -> None: ... - def __repr__(self) -> str: ... @property def typeid(self) -> TypeID: ... @@ -951,7 +942,6 @@ class BoolAttr(Attribute): Converts the value of the bool attribute to a Python bool """ def __init__(self, cast_from_attr: Attribute) -> None: ... - def __repr__(self) -> str: ... @property def static_typeid(self) -> TypeID: ... @property @@ -974,7 +964,6 @@ class ComplexType(Type): @staticmethod def isinstance(other: Type) -> bool: ... def __init__(self, cast_from_type: Type) -> None: ... - def __repr__(self) -> str: ... @property def element_type(self) -> Type: """ @@ -989,7 +978,7 @@ class Context: @staticmethod def _get_live_count() -> int: ... def _CAPICreate(self) -> object: ... - def __enter__(self) -> Any: ... + def __enter__(self) -> Context: ... def __exit__(self, arg0: Any, arg1: Any, arg2: Any) -> None: ... def __init__(self) -> None: ... def _clear_live_operations(self) -> int: ... @@ -1040,7 +1029,6 @@ class DenseBoolArrayAttr(Attribute): self, ) -> DenseBoolArrayIterator: ... def __len__(self) -> int: ... - def __repr__(self) -> str: ... @property def static_typeid(self) -> TypeID: ... @property @@ -1111,7 +1099,6 @@ class DenseElementsAttr(Attribute): def isinstance(other: Attribute) -> bool: ... def __init__(self, cast_from_attr: Attribute) -> None: ... def __len__(self) -> int: ... - def __repr__(self) -> str: ... def get_splat_value(self) -> Attribute: ... @property def is_splat(self) -> bool: ... @@ -1139,7 +1126,6 @@ class DenseF32ArrayAttr(Attribute): self, ) -> DenseF32ArrayIterator: ... def __len__(self) -> int: ... - def __repr__(self) -> str: ... @property def static_typeid(self) -> TypeID: ... @property @@ -1168,7 +1154,6 @@ class DenseF64ArrayAttr(Attribute): self, ) -> DenseF64ArrayIterator: ... def __len__(self) -> int: ... - def __repr__(self) -> str: ... @property def static_typeid(self) -> TypeID: ... @property @@ -1185,7 +1170,6 @@ class DenseFPElementsAttr(DenseElementsAttr): def isinstance(other: Attribute) -> bool: ... def __getitem__(self, arg0: int) -> float: ... def __init__(self, cast_from_attr: Attribute) -> None: ... - def __repr__(self) -> str: ... @property def static_typeid(self) -> TypeID: ... @property @@ -1208,7 +1192,6 @@ class DenseI16ArrayAttr(Attribute): self, ) -> DenseI16ArrayIterator: ... def __len__(self) -> int: ... - def __repr__(self) -> str: ... @property def static_typeid(self) -> TypeID: ... @property @@ -1235,7 +1218,6 @@ class DenseI32ArrayAttr(Attribute): self, ) -> DenseI32ArrayIterator: ... def __len__(self) -> int: ... - def __repr__(self) -> str: ... @property def static_typeid(self) -> TypeID: ... 
@property @@ -1262,7 +1244,6 @@ class DenseI64ArrayAttr(Attribute): self, ) -> DenseI16ArrayIterator: ... def __len__(self) -> int: ... - def __repr__(self) -> str: ... @property def static_typeid(self) -> TypeID: ... @property @@ -1289,7 +1270,6 @@ class DenseI8ArrayAttr(Attribute): self, ) -> DenseI8ArrayIterator: ... def __len__(self) -> int: ... - def __repr__(self) -> str: ... @property def static_typeid(self) -> TypeID: ... @property @@ -1306,7 +1286,6 @@ class DenseIntElementsAttr(DenseElementsAttr): def isinstance(other: Attribute) -> bool: ... def __getitem__(self, arg0: int) -> int: ... def __init__(self, cast_from_attr: Attribute) -> None: ... - def __repr__(self) -> str: ... @property def static_typeid(self) -> TypeID: ... @property @@ -1352,7 +1331,6 @@ class DenseResourceElementsAttr(Attribute): @staticmethod def isinstance(other: Attribute) -> bool: ... def __init__(self, cast_from_attr: Attribute) -> None: ... - def __repr__(self) -> str: ... @property def static_typeid(self) -> TypeID: ... @property @@ -1361,7 +1339,6 @@ class DenseResourceElementsAttr(Attribute): def typeid(self) -> TypeID: ... class Diagnostic: - def __str__(self) -> str: ... @property def location(self) -> Location: ... @property @@ -1382,7 +1359,6 @@ class DiagnosticHandler: class DiagnosticInfo: def __init__(self, arg0: Diagnostic) -> None: ... - def __str__(self) -> str: ... @property def location(self) -> Location: ... @property @@ -1419,9 +1395,7 @@ class DiagnosticSeverity: def __init__(self, value: int) -> None: ... def __int__(self) -> int: ... def __ne__(self, other: Any) -> bool: ... - def __repr__(self) -> str: ... def __setstate__(self, state: int) -> None: ... - def __str__(self) -> str: ... @property def name(self) -> str: ... @property @@ -1429,12 +1403,10 @@ class DiagnosticSeverity: class Dialect: def __init__(self, descriptor: DialectDescriptor) -> None: ... - def __repr__(self) -> Any: ... @property def descriptor(self) -> DialectDescriptor: ... class DialectDescriptor: - def __repr__(self) -> str: ... @property def namespace(self) -> str: ... @@ -1445,8 +1417,8 @@ class DialectRegistry: def _CAPIPtr(self) -> object: ... class Dialects: - def __getattr__(self, arg0: str) -> Any: ... - def __getitem__(self, arg0: str) -> Any: ... + def __getattr__(self, arg0: str) -> Dialect: ... + def __getitem__(self, arg0: str) -> Dialect: ... class DictAttr(Attribute): static_typeid: ClassVar[TypeID] # value = @@ -1464,7 +1436,6 @@ class DictAttr(Attribute): def __getitem__(self, arg0: int) -> NamedAttribute: ... def __init__(self, cast_from_attr: Attribute) -> None: ... def __len__(self) -> int: ... - def __repr__(self) -> str: ... @property def type(self) -> Type: ... @property @@ -1480,7 +1451,6 @@ class F16Type(Type): @staticmethod def isinstance(other: Type) -> bool: ... def __init__(self, cast_from_type: Type) -> None: ... - def __repr__(self) -> str: ... @property def typeid(self) -> TypeID: ... @@ -1494,7 +1464,6 @@ class F32Type(Type): @staticmethod def isinstance(other: Type) -> bool: ... def __init__(self, cast_from_type: Type) -> None: ... - def __repr__(self) -> str: ... @property def typeid(self) -> TypeID: ... @@ -1508,7 +1477,6 @@ class F64Type(Type): @staticmethod def isinstance(other: Type) -> bool: ... def __init__(self, cast_from_type: Type) -> None: ... - def __repr__(self) -> str: ... @property def typeid(self) -> TypeID: ... @@ -1521,7 +1489,6 @@ class FlatSymbolRefAttr(Attribute): @staticmethod def isinstance(other: Attribute) -> bool: ... 
def __init__(self, cast_from_attr: Attribute) -> None: ... - def __repr__(self) -> str: ... @property def static_typeid(self) -> TypeID: ... @property @@ -1544,7 +1511,6 @@ class Float8E4M3B11FNUZType(Type): @staticmethod def isinstance(other: Type) -> bool: ... def __init__(self, cast_from_type: Type) -> None: ... - def __repr__(self) -> str: ... @property def typeid(self) -> TypeID: ... @@ -1558,7 +1524,6 @@ class Float8E4M3FNType(Type): @staticmethod def isinstance(other: Type) -> bool: ... def __init__(self, cast_from_type: Type) -> None: ... - def __repr__(self) -> str: ... @property def typeid(self) -> TypeID: ... @@ -1572,7 +1537,6 @@ class Float8E4M3FNUZType(Type): @staticmethod def isinstance(other: Type) -> bool: ... def __init__(self, cast_from_type: Type) -> None: ... - def __repr__(self) -> str: ... @property def typeid(self) -> TypeID: ... @@ -1586,7 +1550,6 @@ class Float8E5M2FNUZType(Type): @staticmethod def isinstance(other: Type) -> bool: ... def __init__(self, cast_from_type: Type) -> None: ... - def __repr__(self) -> str: ... @property def typeid(self) -> TypeID: ... @@ -1600,7 +1563,6 @@ class Float8E5M2Type(Type): @staticmethod def isinstance(other: Type) -> bool: ... def __init__(self, cast_from_type: Type) -> None: ... - def __repr__(self) -> str: ... @property def typeid(self) -> TypeID: ... @@ -1628,7 +1590,6 @@ class FloatAttr(Attribute): Converts the value of the float attribute to a Python float """ def __init__(self, cast_from_attr: Attribute) -> None: ... - def __repr__(self) -> str: ... @property def type(self) -> Type: ... @property @@ -1649,7 +1610,6 @@ class FloatTF32Type(Type): @staticmethod def isinstance(other: Type) -> bool: ... def __init__(self, cast_from_type: Type) -> None: ... - def __repr__(self) -> str: ... @property def typeid(self) -> TypeID: ... @@ -1665,7 +1625,6 @@ class FunctionType(Type): @staticmethod def isinstance(other: Type) -> bool: ... def __init__(self, cast_from_type: Type) -> None: ... - def __repr__(self) -> str: ... @property def inputs(self) -> List: """ @@ -1689,7 +1648,6 @@ class IndexType(Type): @staticmethod def isinstance(other: Type) -> bool: ... def __init__(self, cast_from_type: Type) -> None: ... - def __repr__(self) -> str: ... @property def typeid(self) -> TypeID: ... @@ -1769,7 +1727,7 @@ class InsertionPoint: """ Inserts before the block terminator. """ - def __enter__(self) -> Any: ... + def __enter__(self) -> InsertionPoint: ... def __exit__(self, arg0: Any, arg1: Any, arg2: Any) -> None: ... @overload def __init__(self, block: Block) -> None: @@ -1810,7 +1768,6 @@ class IntegerAttr(Attribute): """ Converts the value of the integer attribute to a Python int """ - def __repr__(self) -> str: ... @property def type(self) -> Type: ... @property @@ -1840,8 +1797,6 @@ class IntegerSet: @overload def __eq__(self, arg0: object) -> bool: ... def __hash__(self) -> int: ... - def __repr__(self) -> str: ... - def __str__(self) -> str: ... def dump(self) -> None: """ Dumps a debug representation of the object to stderr. @@ -1908,7 +1863,6 @@ class IntegerType(Type): @staticmethod def isinstance(other: Type) -> bool: ... def __init__(self, cast_from_type: Type) -> None: ... - def __repr__(self) -> str: ... @property def is_signed(self) -> bool: """ @@ -1984,7 +1938,6 @@ class Location: @overload def __eq__(self, arg0: Location) -> bool: ... def __exit__(self, arg0: object, arg1: object, arg2: object) -> None: ... - def __repr__(self) -> str: ... 
def emit_error(self, message: str) -> None: """ Emits an error at this location @@ -2018,7 +1971,6 @@ class MemRefType(ShapedType): @staticmethod def isinstance(other: Type) -> bool: ... def __init__(self, cast_from_type: Type) -> None: ... - def __repr__(self) -> str: ... @property def affine_map(self) -> AffineMap: """ @@ -2039,12 +1991,12 @@ class MemRefType(ShapedType): class Module: @staticmethod - def create(loc: Optional[Location] = None) -> Any: + def create(loc: Optional[Location] = None) -> Module: """ Creates an empty module """ @staticmethod - def parse(asm: str, context: Optional[Context] = None) -> Any: + def parse(asm: str, context: Optional[Context] = None) -> Module: """ Parses a module's assembly format from a string. @@ -2053,7 +2005,7 @@ class Module: See also: https://mlir.llvm.org/docs/LangRef/ """ def _CAPICreate(self) -> Any: ... - def __str__(self) -> Any: + def __str__(self) -> str: """ Gets the assembly form of the operation with default options. @@ -2078,7 +2030,7 @@ class Module: Context that created the Module """ @property - def operation(self) -> Any: + def operation(self) -> Operation: """ Accesses the module as an operation """ @@ -2089,7 +2041,6 @@ class MLIRError(Exception): ) -> None: ... class NamedAttribute: - def __repr__(self) -> str: ... @property def attr(self) -> Attribute: """ @@ -2111,7 +2062,6 @@ class NoneType(Type): @staticmethod def isinstance(other: Type) -> bool: ... def __init__(self, cast_from_type: Type) -> None: ... - def __repr__(self) -> str: ... @property def typeid(self) -> TypeID: ... @@ -2202,7 +2152,6 @@ class OpView(_OperationBase): Parses a specific, generated OpView based on class level attributes """ def __init__(self, operation: _OperationBase) -> None: ... - def __str__(self) -> str: ... @property def operation(self) -> _OperationBase: ... @property @@ -2228,7 +2177,6 @@ class OpaqueAttr(Attribute): @staticmethod def isinstance(other: Attribute) -> bool: ... def __init__(self, cast_from_attr: Attribute) -> None: ... - def __repr__(self) -> str: ... @property def data(self) -> bytes: """ @@ -2256,7 +2204,6 @@ class OpaqueType(Type): @staticmethod def isinstance(other: Type) -> bool: ... def __init__(self, cast_from_type: Type) -> None: ... - def __repr__(self) -> str: ... @property def data(self) -> str: """ @@ -2305,7 +2252,7 @@ class Operation(_OperationBase): @staticmethod def parse( source: str, *, source_name: str = "", context: Optional[Context] = None - ) -> Any: + ) -> Operation: """ Parses an operation. Supports both text assembly format and binary bytecode format. """ @@ -2327,7 +2274,7 @@ class OperationIterator: def __next__(self) -> OpView: ... class OperationList: - def __getitem__(self, arg0: int) -> Any: ... + def __getitem__(self, arg0: int) -> OpView: ... def __iter__(self) -> OperationIterator: ... def __len__(self) -> int: ... @@ -2346,7 +2293,6 @@ class RankedTensorType(ShapedType): @staticmethod def isinstance(other: Type) -> bool: ... def __init__(self, cast_from_type: Type) -> None: ... - def __repr__(self) -> str: ... @property def encoding(self) -> Optional[Attribute]: ... @property @@ -2401,7 +2347,6 @@ class ShapedType(Type): @staticmethod def isinstance(other: Type) -> bool: ... def __init__(self, cast_from_type: Type) -> None: ... - def __repr__(self) -> str: ... def get_dim_size(self, dim: int) -> int: """ Returns the dim-th dimension of the given ranked shaped type. @@ -2505,7 +2450,6 @@ class StridedLayoutAttr(Attribute): @staticmethod def isinstance(other: Attribute) -> bool: ... 
def __init__(self, cast_from_attr: Attribute) -> None: ... - def __repr__(self) -> str: ... @property def offset(self) -> int: """ @@ -2536,7 +2480,6 @@ class StringAttr(Attribute): @staticmethod def isinstance(other: Attribute) -> bool: ... def __init__(self, cast_from_attr: Attribute) -> None: ... - def __repr__(self) -> str: ... @property def type(self) -> Type: ... @property @@ -2561,7 +2504,6 @@ class SymbolRefAttr(Attribute): @staticmethod def isinstance(other: Attribute) -> bool: ... def __init__(self, cast_from_attr: Attribute) -> None: ... - def __repr__(self) -> str: ... @property def static_typeid(self) -> TypeID: ... @property @@ -2610,7 +2552,6 @@ class TupleType(Type): @staticmethod def isinstance(other: Type) -> bool: ... def __init__(self, cast_from_type: Type) -> None: ... - def __repr__(self) -> str: ... def get_type(self, pos: int) -> Type: """ Returns the pos-th type in the Tuple type. @@ -2633,7 +2574,6 @@ class TypeAttr(Attribute): @staticmethod def isinstance(other: Attribute) -> bool: ... def __init__(self, cast_from_attr: Attribute) -> None: ... - def __repr__(self) -> str: ... @property def type(self) -> Type: ... @property @@ -2661,7 +2601,6 @@ class UnitAttr(Attribute): @staticmethod def isinstance(other: Attribute) -> bool: ... def __init__(self, cast_from_attr: Attribute) -> None: ... - def __repr__(self) -> str: ... @property def type(self) -> Type: ... @property @@ -2679,7 +2618,6 @@ class UnrankedMemRefType(ShapedType): @staticmethod def isinstance(other: Type) -> bool: ... def __init__(self, cast_from_type: Type) -> None: ... - def __repr__(self) -> str: ... @property def memory_space(self) -> Optional[Attribute]: """ @@ -2698,7 +2636,6 @@ class UnrankedTensorType(ShapedType): @staticmethod def isinstance(other: Type) -> bool: ... def __init__(self, cast_from_type: Type) -> None: ... - def __repr__(self) -> str: ... @property def typeid(self) -> TypeID: ... @@ -2719,7 +2656,6 @@ class VectorType(ShapedType): @staticmethod def isinstance(other: Type) -> bool: ... def __init__(self, cast_from_type: Type) -> None: ... - def __repr__(self) -> str: ... @property def scalable(self) -> bool: ... 
@property From dd4dc2111e2a788046b0db81eb274f53591313ff Mon Sep 17 00:00:00 2001 From: Chen Zheng Date: Thu, 4 Jan 2024 03:55:47 -0500 Subject: [PATCH 205/313] nfc add cases for pr47156 and pr47155 --- llvm/test/CodeGen/PowerPC/pr47155-47156.ll | 39 ++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 llvm/test/CodeGen/PowerPC/pr47155-47156.ll diff --git a/llvm/test/CodeGen/PowerPC/pr47155-47156.ll b/llvm/test/CodeGen/PowerPC/pr47155-47156.ll new file mode 100644 index 0000000000000..26aa92e83f7af --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/pr47155-47156.ll @@ -0,0 +1,39 @@ +; REQUIRES: asserts + +; RUN: llc < %s -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -stop-after=postmisched -debug-only=machine-scheduler 2>&1 >/dev/null | FileCheck %s + +define void @pr47155() { +; CHECK: *** Final schedule for %bb.0 *** +; CHECK: ********** MI Scheduling ********** +; CHECK-NEXT: pr47155:%bb.0 entry +; CHECK: SU(0): INLINEASM &"mtlr 31"{{.*}}implicit-def early-clobber $lr +; CHECK: Successors: +; CHECK-NEXT: SU(1): Ord Latency=0 Barrier +; CHECK-NEXT: SU(1): INLINEASM &"mtlr 31"{{.*}}implicit-def early-clobber $lr8 +; CHECK: Predecessors: +; CHECK-NEXT: SU(0): Ord Latency=0 Barrier +; CHECK-NEXT: ExitSU: +entry: + call void asm sideeffect "mtlr 31", "~{lr}"() + call void asm sideeffect "mtlr 31", "~{lr8}"() + ret void +} + +define void @pr47156(ptr %fn) { +; CHECK: *** Final schedule for %bb.0 *** +; CHECK: ********** MI Scheduling ********** +; CHECK-NEXT: pr47156:%bb.0 entry +; CHECK: SU(0): INLINEASM &"mtctr 31"{{.*}}implicit-def early-clobber $ctr +; CHECK-NOT: Successors: +; CHECK-NOT: Predecessors: +; CHECK: SU(1): MTCTR8 renamable $x3, implicit-def $ctr8 +; CHECK: Successors: +; CHECK-NEXT: ExitSU: +; CHECK-NEXT: SU(2): +entry: + call void asm sideeffect "mtctr 31", "~{ctr}"() + tail call void %fn() + ret void +} + From 87f1cf04cde146634f060167fb57fedf63b99ce5 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 4 Jan 2024 10:03:40 +0100 Subject: [PATCH 206/313] [ConstraintElim] Add tests for int phi with non-one step (NFC) --- .../monotonic-int-phis-multiples.ll | 290 ++++++++++++++++++ 1 file changed, 290 insertions(+) create mode 100644 llvm/test/Transforms/ConstraintElimination/monotonic-int-phis-multiples.ll diff --git a/llvm/test/Transforms/ConstraintElimination/monotonic-int-phis-multiples.ll b/llvm/test/Transforms/ConstraintElimination/monotonic-int-phis-multiples.ll new file mode 100644 index 0000000000000..035cea4d73246 --- /dev/null +++ b/llvm/test/Transforms/ConstraintElimination/monotonic-int-phis-multiples.ll @@ -0,0 +1,290 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes=constraint-elimination < %s | FileCheck %s + +define void @multiple_pow2(i64 %count) { +; CHECK-LABEL: define void @multiple_pow2( +; CHECK-SAME: i64 [[COUNT:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[END:%.*]] = shl i64 [[COUNT]], 2 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 4 +; CHECK-NEXT: [[CMP_I_NOT:%.*]] = icmp eq i64 [[IV]], [[END]] +; CHECK-NEXT: br i1 [[CMP_I_NOT]], label [[EXIT:%.*]], label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[CMP2_I_I:%.*]] = icmp ult i64 [[IV]], [[END]] +; CHECK-NEXT: br i1 [[CMP2_I_I]], label [[LOOP]], label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %end = shl i64 
%count, 2 + br label %loop + +loop: + %iv = phi i64 [ %iv.next, %loop.latch ], [ 0, %entry ] + %iv.next = add i64 %iv, 4 + %cmp.i.not = icmp eq i64 %iv, %end + br i1 %cmp.i.not, label %exit, label %loop.latch + +loop.latch: + %cmp2.i.i = icmp ult i64 %iv, %end + br i1 %cmp2.i.i, label %loop, label %exit + +exit: + ret void +} + +define void @multiple_pow2_larger_than_needed(i64 %count) { +; CHECK-LABEL: define void @multiple_pow2_larger_than_needed( +; CHECK-SAME: i64 [[COUNT:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[END:%.*]] = shl i64 [[COUNT]], 3 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 4 +; CHECK-NEXT: [[CMP_I_NOT:%.*]] = icmp eq i64 [[IV]], [[END]] +; CHECK-NEXT: br i1 [[CMP_I_NOT]], label [[EXIT:%.*]], label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[CMP2_I_I:%.*]] = icmp ult i64 [[IV]], [[END]] +; CHECK-NEXT: br i1 [[CMP2_I_I]], label [[LOOP]], label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %end = shl i64 %count, 3 + br label %loop + +loop: + %iv = phi i64 [ %iv.next, %loop.latch ], [ 0, %entry ] + %iv.next = add i64 %iv, 4 + %cmp.i.not = icmp eq i64 %iv, %end + br i1 %cmp.i.not, label %exit, label %loop.latch + +loop.latch: + %cmp2.i.i = icmp ult i64 %iv, %end + br i1 %cmp2.i.i, label %loop, label %exit + +exit: + ret void +} + +define void @multiple_pow2_too_small(i64 %count) { +; CHECK-LABEL: define void @multiple_pow2_too_small( +; CHECK-SAME: i64 [[COUNT:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[END:%.*]] = shl i64 [[COUNT]], 1 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 4 +; CHECK-NEXT: [[CMP_I_NOT:%.*]] = icmp eq i64 [[IV]], [[END]] +; CHECK-NEXT: br i1 [[CMP_I_NOT]], label [[EXIT:%.*]], label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[CMP2_I_I:%.*]] = icmp ult i64 [[IV]], [[END]] +; CHECK-NEXT: br i1 [[CMP2_I_I]], label [[LOOP]], label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %end = shl i64 %count, 1 + br label %loop + +loop: + %iv = phi i64 [ %iv.next, %loop.latch ], [ 0, %entry ] + %iv.next = add i64 %iv, 4 + %cmp.i.not = icmp eq i64 %iv, %end + br i1 %cmp.i.not, label %exit, label %loop.latch + +loop.latch: + %cmp2.i.i = icmp ult i64 %iv, %end + br i1 %cmp2.i.i, label %loop, label %exit + +exit: + ret void +} + +define void @multiple_pow2_start_offset(i64 %count) { +; CHECK-LABEL: define void @multiple_pow2_start_offset( +; CHECK-SAME: i64 [[COUNT:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[END:%.*]] = shl i64 [[COUNT]], 2 +; CHECK-NEXT: [[PRECOND:%.*]] = icmp ugt i64 [[END]], 4 +; CHECK-NEXT: br i1 [[PRECOND]], label [[LOOP:%.*]], label [[EXIT:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ 4, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 4 +; CHECK-NEXT: [[CMP_I_NOT:%.*]] = icmp eq i64 [[IV]], [[END]] +; CHECK-NEXT: br i1 [[CMP_I_NOT]], label [[EXIT]], label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[CMP2_I_I:%.*]] = icmp ult i64 [[IV]], [[END]] +; CHECK-NEXT: br i1 [[CMP2_I_I]], label [[LOOP]], label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %end = shl i64 %count, 2 + %precond = icmp ugt i64 %end, 4 + br i1 %precond, label %loop, label %exit + +loop: + %iv = phi i64 [ %iv.next, %loop.latch ], 
[ 4, %entry ] + %iv.next = add i64 %iv, 4 + %cmp.i.not = icmp eq i64 %iv, %end + br i1 %cmp.i.not, label %exit, label %loop.latch + +loop.latch: + %cmp2.i.i = icmp ult i64 %iv, %end + br i1 %cmp2.i.i, label %loop, label %exit + +exit: + ret void +} + +define void @multiple_pow2_wrong_start_offset(i64 %count) { +; CHECK-LABEL: define void @multiple_pow2_wrong_start_offset( +; CHECK-SAME: i64 [[COUNT:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[END:%.*]] = shl i64 [[COUNT]], 2 +; CHECK-NEXT: [[PRECOND:%.*]] = icmp ugt i64 [[END]], 1 +; CHECK-NEXT: br i1 [[PRECOND]], label [[LOOP:%.*]], label [[EXIT:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ 1, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 4 +; CHECK-NEXT: [[CMP_I_NOT:%.*]] = icmp eq i64 [[IV]], [[END]] +; CHECK-NEXT: br i1 [[CMP_I_NOT]], label [[EXIT]], label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[CMP2_I_I:%.*]] = icmp ult i64 [[IV]], [[END]] +; CHECK-NEXT: br i1 [[CMP2_I_I]], label [[LOOP]], label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %end = shl i64 %count, 2 + %precond = icmp ugt i64 %end, 1 + br i1 %precond, label %loop, label %exit + +loop: + %iv = phi i64 [ %iv.next, %loop.latch ], [ 1, %entry ] + %iv.next = add i64 %iv, 4 + %cmp.i.not = icmp eq i64 %iv, %end + br i1 %cmp.i.not, label %exit, label %loop.latch + +loop.latch: + %cmp2.i.i = icmp ult i64 %iv, %end + br i1 %cmp2.i.i, label %loop, label %exit + +exit: + ret void +} + +define void @multiple_pow2_start_offset_dynamic(i64 %count) { +; CHECK-LABEL: define void @multiple_pow2_start_offset_dynamic( +; CHECK-SAME: i64 [[COUNT:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SHL:%.*]] = shl i64 [[COUNT]], 2 +; CHECK-NEXT: [[END:%.*]] = add i64 [[SHL]], 1 +; CHECK-NEXT: [[PRECOND:%.*]] = icmp ne i64 [[END]], 0 +; CHECK-NEXT: br i1 [[PRECOND]], label [[LOOP:%.*]], label [[EXIT:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ 1, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 4 +; CHECK-NEXT: [[CMP_I_NOT:%.*]] = icmp eq i64 [[IV]], [[END]] +; CHECK-NEXT: br i1 [[CMP_I_NOT]], label [[EXIT]], label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[CMP2_I_I:%.*]] = icmp ult i64 [[IV]], [[END]] +; CHECK-NEXT: br i1 [[CMP2_I_I]], label [[LOOP]], label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %shl = shl i64 %count, 2 + %end = add i64 %shl, 1 + %precond = icmp ne i64 %end, 0 + br i1 %precond, label %loop, label %exit + +loop: + %iv = phi i64 [ %iv.next, %loop.latch ], [ 1, %entry ] + %iv.next = add i64 %iv, 4 + %cmp.i.not = icmp eq i64 %iv, %end + br i1 %cmp.i.not, label %exit, label %loop.latch + +loop.latch: + %cmp2.i.i = icmp ult i64 %iv, %end + br i1 %cmp2.i.i, label %loop, label %exit + +exit: + ret void +} + +define void @multiple_non_pow2_nuw(i64 %count) { +; CHECK-LABEL: define void @multiple_non_pow2_nuw( +; CHECK-SAME: i64 [[COUNT:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[END:%.*]] = mul nuw i64 [[COUNT]], 3 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 3 +; CHECK-NEXT: [[CMP_I_NOT:%.*]] = icmp eq i64 [[IV]], [[END]] +; CHECK-NEXT: br i1 [[CMP_I_NOT]], label [[EXIT:%.*]], label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[CMP2_I_I:%.*]] = icmp ult i64 [[IV]], [[END]] +; CHECK-NEXT: br i1 [[CMP2_I_I]], label [[LOOP]], 
label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %end = mul nuw i64 %count, 3 + br label %loop + +loop: + %iv = phi i64 [ %iv.next, %loop.latch ], [ 0, %entry ] + %iv.next = add i64 %iv, 3 + %cmp.i.not = icmp eq i64 %iv, %end + br i1 %cmp.i.not, label %exit, label %loop.latch + +loop.latch: + %cmp2.i.i = icmp ult i64 %iv, %end + br i1 %cmp2.i.i, label %loop, label %exit + +exit: + ret void +} + +define void @multiple_non_pow2_missing_nuw(i64 %count) { +; CHECK-LABEL: define void @multiple_non_pow2_missing_nuw( +; CHECK-SAME: i64 [[COUNT:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[END:%.*]] = mul i64 [[COUNT]], 3 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 3 +; CHECK-NEXT: [[CMP_I_NOT:%.*]] = icmp eq i64 [[IV]], [[END]] +; CHECK-NEXT: br i1 [[CMP_I_NOT]], label [[EXIT:%.*]], label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[CMP2_I_I:%.*]] = icmp ult i64 [[IV]], [[END]] +; CHECK-NEXT: br i1 [[CMP2_I_I]], label [[LOOP]], label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %end = mul i64 %count, 3 + br label %loop + +loop: + %iv = phi i64 [ %iv.next, %loop.latch ], [ 0, %entry ] + %iv.next = add i64 %iv, 3 + %cmp.i.not = icmp eq i64 %iv, %end + br i1 %cmp.i.not, label %exit, label %loop.latch + +loop.latch: + %cmp2.i.i = icmp ult i64 %iv, %end + br i1 %cmp2.i.i, label %loop, label %exit + +exit: + ret void +} From 7954c57124b495fbdc73674d71f2e366e4afe522 Mon Sep 17 00:00:00 2001 From: Jannik Silvanus <37809848+jasilvanus@users.noreply.github.com> Date: Thu, 4 Jan 2024 10:08:21 +0100 Subject: [PATCH 207/313] [IR] Fix GEP offset computations for vector GEPs (#75448) Vectors are always bit-packed and don't respect the elements' alignment requirements. This is different from arrays. This means offsets of vector GEPs need to be computed differently than offsets of array GEPs. This PR fixes many places that rely on an incorrect pattern that always relies on `DL.getTypeAllocSize(GTI.getIndexedType())`. We replace these by usages of `GTI.getSequentialElementStride(DL)`, which is a new helper function added in this PR. This changes behavior for GEPs into vectors with element types for which the (bit) size and alloc size is different. This includes two cases: * Types with a bit size that is not a multiple of a byte, e.g. i1. GEPs into such vectors are questionable to begin with, as some elements are not even addressable. * Overaligned types, e.g. i16 with 32-bit alignment. Existing tests are unaffected, but a miscompilation of a new test is fixed. 
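To make the two strides concrete, here is a minimal standalone sketch
(illustration only, not part of this patch); the only assumption is the same
`f16:32` layout string that the updated InstCombine test below uses, under
which `half` has a 2-byte store size but a 4-byte ABI alignment:

  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Type.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;

  int main() {
    LLVMContext Ctx;
    DataLayout DL("e-p:64:64-f16:32");
    Type *Half = Type::getHalfTy(Ctx);

    // Stride of `getelementptr [2 x half], ptr %p, i64 0, i64 1`:
    // array elements respect ABI alignment, so the alloc size is used.
    uint64_t ArrayStride = DL.getTypeAllocSize(Half).getFixedValue();

    // Stride of `getelementptr <2 x half>, ptr %p, i64 0, i64 1` after this
    // change: vector elements are bit-packed, so the new
    // getSequentialElementStride helper returns the store size instead.
    uint64_t VectorStride = DL.getTypeStoreSize(Half).getFixedValue();

    outs() << "array stride = " << ArrayStride
           << ", vector stride = " << VectorStride << "\n";
    return 0;
  }

This should print "array stride = 4, vector stride = 2", matching the byte
offset of 2 that the new @test_overaligned_vec test expects.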
--------- Co-authored-by: Nikita Popov --- clang/lib/CodeGen/CGExprScalar.cpp | 4 +- .../llvm/Analysis/TargetTransformInfoImpl.h | 2 +- .../llvm/IR/GetElementPtrTypeIterator.h | 57 ++++++++++++++++++- llvm/lib/Analysis/BasicAliasAnalysis.cpp | 4 +- llvm/lib/Analysis/InlineCost.cpp | 2 +- llvm/lib/Analysis/Local.cpp | 2 +- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 5 +- llvm/lib/Analysis/ValueTracking.cpp | 4 +- llvm/lib/CodeGen/CodeGenPrepare.cpp | 2 +- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 2 +- llvm/lib/CodeGen/SelectionDAG/FastISel.cpp | 6 +- .../SelectionDAG/SelectionDAGBuilder.cpp | 2 +- .../ExecutionEngine/Interpreter/Execution.cpp | 2 +- llvm/lib/IR/DataLayout.cpp | 5 +- llvm/lib/IR/Operator.cpp | 12 ++-- llvm/lib/IR/Value.cpp | 2 +- llvm/lib/Target/AArch64/AArch64FastISel.cpp | 10 ++-- llvm/lib/Target/ARM/ARMFastISel.cpp | 2 +- llvm/lib/Target/Mips/MipsFastISel.cpp | 2 +- llvm/lib/Target/PowerPC/PPCFastISel.cpp | 2 +- .../RISCV/RISCVGatherScatterLowering.cpp | 2 +- .../WebAssembly/WebAssemblyFastISel.cpp | 2 +- llvm/lib/Target/X86/X86FastISel.cpp | 2 +- llvm/lib/Transforms/Scalar/SROA.cpp | 6 +- .../Scalar/SeparateConstOffsetFromGEP.cpp | 6 +- .../Scalar/StraightLineStrengthReduce.cpp | 2 +- .../Vectorize/LoadStoreVectorizer.cpp | 2 +- .../Transforms/InstCombine/getelementptr.ll | 12 +++- 28 files changed, 109 insertions(+), 54 deletions(-) diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index 6adf99531e30e..d2c4c7ee50bc8 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -5292,8 +5292,8 @@ static GEPOffsetAndOverflow EmitGEPOffsetInBytes(Value *BasePtr, Value *GEPVal, } else { // Otherwise this is array-like indexing. The local offset is the index // multiplied by the element size. - auto *ElementSize = llvm::ConstantInt::get( - IntPtrTy, DL.getTypeAllocSize(GTI.getIndexedType())); + auto *ElementSize = + llvm::ConstantInt::get(IntPtrTy, GTI.getSequentialElementStride(DL)); auto *IndexS = Builder.CreateIntCast(Index, IntPtrTy, /*isSigned=*/true); LocalOffset = eval(BO_Mul, ElementSize, IndexS); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 7ad3ce512a355..2be7256423e42 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -1048,7 +1048,7 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase { if (TargetType->isScalableTy()) return TTI::TCC_Basic; int64_t ElementSize = - DL.getTypeAllocSize(GTI.getIndexedType()).getFixedValue(); + GTI.getSequentialElementStride(DL).getFixedValue(); if (ConstIdx) { BaseOffset += ConstIdx->getValue().sextOrTrunc(PtrSizeBits) * ElementSize; diff --git a/llvm/include/llvm/IR/GetElementPtrTypeIterator.h b/llvm/include/llvm/IR/GetElementPtrTypeIterator.h index f3272327c3f8b..1092b636e023a 100644 --- a/llvm/include/llvm/IR/GetElementPtrTypeIterator.h +++ b/llvm/include/llvm/IR/GetElementPtrTypeIterator.h @@ -16,6 +16,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/PointerUnion.h" +#include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Operator.h" #include "llvm/IR/User.h" @@ -30,7 +31,39 @@ template class generic_gep_type_iterator { ItTy OpIt; - PointerUnion CurTy; + // We use two different mechanisms to store the type a GEP index applies to. + // In some cases, we need to know the outer aggregate type the index is + // applied within, e.g. a struct. 
In such cases, we store the aggregate type + // in the iterator, and derive the element type on the fly. + // + // However, this is not always possible, because for the outermost index there + // is no containing type. In such cases, or if the containing type is not + // relevant, e.g. for arrays, the element type is stored as Type* in CurTy. + // + // If CurTy contains a Type* value, this does not imply anything about the + // type itself, because it is the element type and not the outer type. + // In particular, Type* can be a struct type. + // + // Consider this example: + // + // %my.struct = type { i32, [ 4 x float ] } + // [...] + // %gep = getelementptr %my.struct, ptr %ptr, i32 10, i32 1, 32 3 + // + // Iterating over the indices of this GEP, CurTy will contain the following + // values: + // * i32 10: The outer index always operates on the GEP value type. + // CurTy contains a Type* pointing at `%my.struct`. + // * i32 1: This index is within a struct. + // CurTy contains a StructType* pointing at `%my.struct`. + // * i32 3: This index is within an array. We reuse the "flat" indexing + // for arrays which is also used in the top level GEP index. + // CurTy contains a Type* pointing at `float`. + // + // Vectors are handled separately because the layout of vectors is different + // for overaligned elements: Vectors are always bit-packed, whereas arrays + // respect ABI alignment of the elements. + PointerUnion CurTy; generic_gep_type_iterator() = default; @@ -69,6 +102,8 @@ class generic_gep_type_iterator { Type *getIndexedType() const { if (auto *T = dyn_cast_if_present(CurTy)) return T; + if (auto *VT = dyn_cast_if_present(CurTy)) + return VT->getElementType(); return cast(CurTy)->getTypeAtIndex(getOperand()); } @@ -79,7 +114,7 @@ class generic_gep_type_iterator { if (auto *ATy = dyn_cast(Ty)) CurTy = ATy->getElementType(); else if (auto *VTy = dyn_cast(Ty)) - CurTy = VTy->getElementType(); + CurTy = VTy; else CurTy = dyn_cast(Ty); ++OpIt; @@ -108,7 +143,23 @@ class generic_gep_type_iterator { // that. bool isStruct() const { return isa(CurTy); } - bool isSequential() const { return isa(CurTy); } + bool isVector() const { return isa(CurTy); } + bool isSequential() const { return !isStruct(); } + + // For sequential GEP indices (all except those into structs), the index value + // can be translated into a byte offset by multiplying with an element stride. + // This function returns this stride, which both depends on the element type, + // and the containing aggregate type, as vectors always tightly bit-pack their + // elements. + TypeSize getSequentialElementStride(const DataLayout &DL) const { + assert(isSequential()); + Type *ElemTy = getIndexedType(); + if (isVector()) { + assert(DL.typeSizeEqualsStoreSize(ElemTy) && "Not byte-addressable"); + return DL.getTypeStoreSize(ElemTy); + } + return DL.getTypeAllocSize(ElemTy); + } StructType *getStructType() const { return cast(CurTy); } diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp index 97f60d28e4991..9eb7e914687ce 100644 --- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -639,7 +639,7 @@ BasicAAResult::DecomposeGEPExpression(const Value *V, const DataLayout &DL, continue; // Don't attempt to analyze GEPs if the scalable index is not zero. 
- TypeSize AllocTypeSize = DL.getTypeAllocSize(GTI.getIndexedType()); + TypeSize AllocTypeSize = GTI.getSequentialElementStride(DL); if (AllocTypeSize.isScalable()) { Decomposed.Base = V; return Decomposed; @@ -650,7 +650,7 @@ BasicAAResult::DecomposeGEPExpression(const Value *V, const DataLayout &DL, continue; } - TypeSize AllocTypeSize = DL.getTypeAllocSize(GTI.getIndexedType()); + TypeSize AllocTypeSize = GTI.getSequentialElementStride(DL); if (AllocTypeSize.isScalable()) { Decomposed.Base = V; return Decomposed; diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp index 7096e06d925ad..1fa7badaa4fa0 100644 --- a/llvm/lib/Analysis/InlineCost.cpp +++ b/llvm/lib/Analysis/InlineCost.cpp @@ -1429,7 +1429,7 @@ bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) { continue; } - APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType())); + APInt TypeSize(IntPtrWidth, GTI.getSequentialElementStride(DL)); Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize; } return true; diff --git a/llvm/lib/Analysis/Local.cpp b/llvm/lib/Analysis/Local.cpp index 30757abeb0980..f5e080d2c78e6 100644 --- a/llvm/lib/Analysis/Local.cpp +++ b/llvm/lib/Analysis/Local.cpp @@ -64,7 +64,7 @@ Value *llvm::emitGEPOffset(IRBuilderBase *Builder, const DataLayout &DL, // Convert to correct type. if (Op->getType() != IntIdxTy) Op = Builder->CreateIntCast(Op, IntIdxTy, true, Op->getName() + ".c"); - TypeSize TSize = DL.getTypeAllocSize(GTI.getIndexedType()); + TypeSize TSize = GTI.getSequentialElementStride(DL); if (TSize != TypeSize::getFixed(1)) { Value *Scale = Builder->CreateTypeSize(IntIdxTy->getScalarType(), TSize); if (IntIdxTy->isVectorTy()) diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 89666018d9251..aed60cc5a3f5e 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -2703,7 +2703,10 @@ static unsigned getGEPInductionOperand(const GetElementPtrInst *Gep) { // If it's a type with the same allocation size as the result of the GEP we // can peel off the zero index. - if (DL.getTypeAllocSize(GEPTI.getIndexedType()) != GEPAllocSize) + TypeSize ElemSize = GEPTI.isStruct() + ? DL.getTypeAllocSize(GEPTI.getIndexedType()) + : GEPTI.getSequentialElementStride(DL); + if (ElemSize != GEPAllocSize) break; --LastOperand; } diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 16d78c1ded6d7..439127e5c9540 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -1196,7 +1196,7 @@ static void computeKnownBitsFromOperator(const Operator *I, unsigned IndexBitWidth = Index->getType()->getScalarSizeInBits(); KnownBits IndexBits(IndexBitWidth); computeKnownBits(Index, IndexBits, Depth + 1, Q); - TypeSize IndexTypeSize = Q.DL.getTypeAllocSize(IndexedTy); + TypeSize IndexTypeSize = GTI.getSequentialElementStride(Q.DL); uint64_t TypeSizeInBytes = IndexTypeSize.getKnownMinValue(); KnownBits ScalingFactor(IndexBitWidth); // Multiply by current sizeof type. @@ -2128,7 +2128,7 @@ static bool isGEPKnownNonNull(const GEPOperator *GEP, unsigned Depth, } // If we have a zero-sized type, the index doesn't matter. Keep looping. 
- if (Q.DL.getTypeAllocSize(GTI.getIndexedType()).isZero()) + if (GTI.getSequentialElementStride(Q.DL).isZero()) continue; // Fast path the constant operand case both for efficiency and so we don't diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 6e99fb133e26a..5bd4c6b067d79 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -4776,7 +4776,7 @@ bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode, cast(AddrInst->getOperand(i))->getZExtValue(); ConstantOffset += SL->getElementOffset(Idx); } else { - TypeSize TS = DL.getTypeAllocSize(GTI.getIndexedType()); + TypeSize TS = GTI.getSequentialElementStride(DL); if (TS.isNonZero()) { // The optimisations below currently only work for fixed offsets. if (TS.isScalable()) diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index bea29642cd003..9c11113902a24 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1545,7 +1545,7 @@ bool IRTranslator::translateGetElementPtr(const User &U, Offset += DL->getStructLayout(StTy)->getElementOffset(Field); continue; } else { - uint64_t ElementSize = DL->getTypeAllocSize(GTI.getIndexedType()); + uint64_t ElementSize = GTI.getSequentialElementStride(*DL); // If this is a scalar constant or a splat vector of constants, // handle it quickly. diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index f3d8edb8926b6..09e260394262b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -560,15 +560,13 @@ bool FastISel::selectGetElementPtr(const User *I) { } } } else { - Type *Ty = GTI.getIndexedType(); - // If this is a constant subscript, handle it quickly. if (const auto *CI = dyn_cast(Idx)) { if (CI->isZero()) continue; // N = N + Offset uint64_t IdxN = CI->getValue().sextOrTrunc(64).getSExtValue(); - TotalOffs += DL.getTypeAllocSize(Ty) * IdxN; + TotalOffs += GTI.getSequentialElementStride(DL) * IdxN; if (TotalOffs >= MaxOffs) { N = fastEmit_ri_(VT, ISD::ADD, N, TotalOffs, VT); if (!N) // Unhandled operand. Halt "fast" selection and bail. @@ -585,7 +583,7 @@ bool FastISel::selectGetElementPtr(const User *I) { } // N = N + Idx * ElementSize; - uint64_t ElementSize = DL.getTypeAllocSize(Ty); + uint64_t ElementSize = GTI.getSequentialElementStride(DL); Register IdxN = getRegForGEPIndex(Idx); if (!IdxN) // Unhandled operand. Halt "fast" selection and bail. return false; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 192f7bc8d2aa1..78ebd2d33459a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -4114,7 +4114,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { unsigned IdxSize = DAG.getDataLayout().getIndexSizeInBits(AS); MVT IdxTy = MVT::getIntegerVT(IdxSize); TypeSize ElementSize = - DAG.getDataLayout().getTypeAllocSize(GTI.getIndexedType()); + GTI.getSequentialElementStride(DAG.getDataLayout()); // We intentionally mask away the high bits here; ElementSize may not // fit in IdxTy. 
APInt ElementMul(IdxSize, ElementSize.getKnownMinValue()); diff --git a/llvm/lib/ExecutionEngine/Interpreter/Execution.cpp b/llvm/lib/ExecutionEngine/Interpreter/Execution.cpp index 770fc93490835..ae978070ac9f9 100644 --- a/llvm/lib/ExecutionEngine/Interpreter/Execution.cpp +++ b/llvm/lib/ExecutionEngine/Interpreter/Execution.cpp @@ -1074,7 +1074,7 @@ GenericValue Interpreter::executeGEPOperation(Value *Ptr, gep_type_iterator I, assert(BitWidth == 64 && "Invalid index type for getelementptr"); Idx = (int64_t)IdxGV.IntVal.getZExtValue(); } - Total += getDataLayout().getTypeAllocSize(I.getIndexedType()) * Idx; + Total += I.getSequentialElementStride(getDataLayout()) * Idx; } } diff --git a/llvm/lib/IR/DataLayout.cpp b/llvm/lib/IR/DataLayout.cpp index e28f043cf9e0d..a2f5714c70687 100644 --- a/llvm/lib/IR/DataLayout.cpp +++ b/llvm/lib/IR/DataLayout.cpp @@ -936,9 +936,8 @@ int64_t DataLayout::getIndexedOffsetInType(Type *ElemTy, // Add in the offset, as calculated by the structure layout info... Result += Layout->getElementOffset(FieldNo); } else { - // Get the array index and the size of each array element. - if (int64_t arrayIdx = cast(Idx)->getSExtValue()) - Result += arrayIdx * getTypeAllocSize(GTI.getIndexedType()); + if (int64_t ArrayIdx = cast(Idx)->getSExtValue()) + Result += ArrayIdx * GTI.getSequentialElementStride(*this); } } diff --git a/llvm/lib/IR/Operator.cpp b/llvm/lib/IR/Operator.cpp index cd982c7da102a..16a89534b4b3e 100644 --- a/llvm/lib/IR/Operator.cpp +++ b/llvm/lib/IR/Operator.cpp @@ -87,7 +87,7 @@ Align GEPOperator::getMaxPreservedAlignment(const DataLayout &DL) const { /// If the index isn't known, we take 1 because it is the index that will /// give the worse alignment of the offset. const uint64_t ElemCount = OpC ? OpC->getZExtValue() : 1; - Offset = DL.getTypeAllocSize(GTI.getIndexedType()) * ElemCount; + Offset = GTI.getSequentialElementStride(DL) * ElemCount; } Result = Align(MinAlign(Offset, Result.value())); } @@ -157,7 +157,7 @@ bool GEPOperator::accumulateConstantOffset( continue; } if (!AccumulateOffset(ConstOffset->getValue(), - DL.getTypeAllocSize(GTI.getIndexedType()))) + GTI.getSequentialElementStride(DL))) return false; continue; } @@ -170,8 +170,7 @@ bool GEPOperator::accumulateConstantOffset( if (!ExternalAnalysis(*V, AnalysisIndex)) return false; UsedExternalAnalysis = true; - if (!AccumulateOffset(AnalysisIndex, - DL.getTypeAllocSize(GTI.getIndexedType()))) + if (!AccumulateOffset(AnalysisIndex, GTI.getSequentialElementStride(DL))) return false; } return true; @@ -218,14 +217,13 @@ bool GEPOperator::collectOffset( continue; } CollectConstantOffset(ConstOffset->getValue(), - DL.getTypeAllocSize(GTI.getIndexedType())); + GTI.getSequentialElementStride(DL)); continue; } if (STy || ScalableType) return false; - APInt IndexedSize = - APInt(BitWidth, DL.getTypeAllocSize(GTI.getIndexedType())); + APInt IndexedSize = APInt(BitWidth, GTI.getSequentialElementStride(DL)); // Insert an initial offset of 0 for V iff none exists already, then // increment the offset by IndexedSize. if (!IndexedSize.isZero()) { diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index b6e25c46b514d..94b0ae7435c94 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -1015,7 +1015,7 @@ getOffsetFromIndex(const GEPOperator *GEP, unsigned Idx, const DataLayout &DL) { // Otherwise, we have a sequential type like an array or fixed-length // vector. Multiply the index by the ElementSize. 
- TypeSize Size = DL.getTypeAllocSize(GTI.getIndexedType()); + TypeSize Size = GTI.getSequentialElementStride(DL); if (Size.isScalable()) return std::nullopt; Offset += Size.getFixedValue() * OpC->getSExtValue(); diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp index 1ea63a5d6ec08..e98f6c4984a75 100644 --- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -645,7 +645,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) unsigned Idx = cast(Op)->getZExtValue(); TmpOffset += SL->getElementOffset(Idx); } else { - uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); + uint64_t S = GTI.getSequentialElementStride(DL); while (true) { if (const ConstantInt *CI = dyn_cast(Op)) { // Constant-offset addressing. @@ -4978,15 +4978,13 @@ bool AArch64FastISel::selectGetElementPtr(const Instruction *I) { if (Field) TotalOffs += DL.getStructLayout(StTy)->getElementOffset(Field); } else { - Type *Ty = GTI.getIndexedType(); - // If this is a constant subscript, handle it quickly. if (const auto *CI = dyn_cast(Idx)) { if (CI->isZero()) continue; // N = N + Offset - TotalOffs += - DL.getTypeAllocSize(Ty) * cast(CI)->getSExtValue(); + TotalOffs += GTI.getSequentialElementStride(DL) * + cast(CI)->getSExtValue(); continue; } if (TotalOffs) { @@ -4997,7 +4995,7 @@ bool AArch64FastISel::selectGetElementPtr(const Instruction *I) { } // N = N + Idx * ElementSize; - uint64_t ElementSize = DL.getTypeAllocSize(Ty); + uint64_t ElementSize = GTI.getSequentialElementStride(DL); unsigned IdxN = getRegForGEPIndex(Idx); if (!IdxN) return false; diff --git a/llvm/lib/Target/ARM/ARMFastISel.cpp b/llvm/lib/Target/ARM/ARMFastISel.cpp index 1d6aaeb7433b0..cb3a709f7003b 100644 --- a/llvm/lib/Target/ARM/ARMFastISel.cpp +++ b/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -747,7 +747,7 @@ bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) { unsigned Idx = cast(Op)->getZExtValue(); TmpOffset += SL->getElementOffset(Idx); } else { - uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); + uint64_t S = GTI.getSequentialElementStride(DL); while (true) { if (const ConstantInt *CI = dyn_cast(Op)) { // Constant-offset addressing. diff --git a/llvm/lib/Target/Mips/MipsFastISel.cpp b/llvm/lib/Target/Mips/MipsFastISel.cpp index 7fcf375aa10b6..192ed1cec79a8 100644 --- a/llvm/lib/Target/Mips/MipsFastISel.cpp +++ b/llvm/lib/Target/Mips/MipsFastISel.cpp @@ -492,7 +492,7 @@ bool MipsFastISel::computeAddress(const Value *Obj, Address &Addr) { unsigned Idx = cast(Op)->getZExtValue(); TmpOffset += SL->getElementOffset(Idx); } else { - uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); + uint64_t S = GTI.getSequentialElementStride(DL); while (true) { if (const ConstantInt *CI = dyn_cast(Op)) { // Constant-offset addressing. diff --git a/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/llvm/lib/Target/PowerPC/PPCFastISel.cpp index 42f5a4e624c49..56af80f9cedee 100644 --- a/llvm/lib/Target/PowerPC/PPCFastISel.cpp +++ b/llvm/lib/Target/PowerPC/PPCFastISel.cpp @@ -350,7 +350,7 @@ bool PPCFastISel::PPCComputeAddress(const Value *Obj, Address &Addr) { unsigned Idx = cast(Op)->getZExtValue(); TmpOffset += SL->getElementOffset(Idx); } else { - uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); + uint64_t S = GTI.getSequentialElementStride(DL); for (;;) { if (const ConstantInt *CI = dyn_cast(Op)) { // Constant-offset addressing. 
diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp index 5ad1e082344e7..1129206800ad3 100644 --- a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp @@ -362,7 +362,7 @@ RISCVGatherScatterLowering::determineBaseAndStride(Instruction *Ptr, VecOperand = i; - TypeSize TS = DL->getTypeAllocSize(GTI.getIndexedType()); + TypeSize TS = GTI.getSequentialElementStride(*DL); if (TS.isScalable()) return std::make_pair(nullptr, nullptr); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 37abbb072cdd3..15dc44a043957 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -278,7 +278,7 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) { unsigned Idx = cast(Op)->getZExtValue(); TmpOffset += SL->getElementOffset(Idx); } else { - uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); + uint64_t S = GTI.getSequentialElementStride(DL); for (;;) { if (const auto *CI = dyn_cast(Op)) { // Constant-offset addressing. diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp index 0ba31e173a1a7..c789ac82ac943 100644 --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -916,7 +916,7 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { // A array/variable index is always of the form i*S where S is the // constant scale size. See if we can push the scale into immediates. - uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); + uint64_t S = GTI.getSequentialElementStride(DL); for (;;) { if (const ConstantInt *CI = dyn_cast(Op)) { // Constant-offset addressing. diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 656abdb0abbff..75cddfa16d6db 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -1097,10 +1097,8 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor { // For array or vector indices, scale the index by the size of the // type. APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth()); - GEPOffset += - Index * - APInt(Offset.getBitWidth(), - DL.getTypeAllocSize(GTI.getIndexedType()).getFixedValue()); + GEPOffset += Index * APInt(Offset.getBitWidth(), + GTI.getSequentialElementStride(DL)); } // If this index has computed an intermediate pointer which is not diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index b8c9d9d100f11..225dd454068c8 100644 --- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -843,7 +843,7 @@ SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP, // constant offset to a byte offset, and later offset the remainder of // the original GEP with this byte offset. AccumulativeByteOffset += - ConstantOffset * DL->getTypeAllocSize(GTI.getIndexedType()); + ConstantOffset * GTI.getSequentialElementStride(*DL); } } else if (LowerGEP) { StructType *StTy = GTI.getStructType(); @@ -884,7 +884,7 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs( continue; APInt ElementSize = APInt(PtrIndexTy->getIntegerBitWidth(), - DL->getTypeAllocSize(GTI.getIndexedType())); + GTI.getSequentialElementStride(*DL)); // Scale the index by element size. 
if (ElementSize != 1) { if (ElementSize.isPowerOf2()) { @@ -946,7 +946,7 @@ SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic, continue; APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(), - DL->getTypeAllocSize(GTI.getIndexedType())); + GTI.getSequentialElementStride(*DL)); // Scale the index by element size. if (ElementSize != 1) { if (ElementSize.isPowerOf2()) { diff --git a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp index 543469d62fe73..ca1f3a0c0ae34 100644 --- a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp @@ -547,7 +547,7 @@ void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP( // indices except this current one. const SCEV *BaseExpr = SE->getGEPExpr(cast(GEP), IndexExprs); Value *ArrayIdx = GEP->getOperand(I); - uint64_t ElementSize = DL->getTypeAllocSize(GTI.getIndexedType()); + uint64_t ElementSize = GTI.getSequentialElementStride(*DL); if (ArrayIdx->getType()->getIntegerBitWidth() <= DL->getIndexSizeInBits(GEP->getAddressSpace())) { // Skip factoring if ArrayIdx is wider than the index size, because diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index fa2459d1ca028..1f11d4894f775 100644 --- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -1193,7 +1193,7 @@ std::optional Vectorizer::getConstantOffsetComplexAddrs( OpA->getType() != OpB->getType()) return std::nullopt; - uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType()); + uint64_t Stride = GTIA.getSequentialElementStride(DL); // Only look through a ZExt/SExt. if (!isa(OpA) && !isa(OpA)) diff --git a/llvm/test/Transforms/InstCombine/getelementptr.ll b/llvm/test/Transforms/InstCombine/getelementptr.ll index 373b7f5f2fc0a..642c3eb2a0e41 100644 --- a/llvm/test/Transforms/InstCombine/getelementptr.ll +++ b/llvm/test/Transforms/InstCombine/getelementptr.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=instcombine -S | FileCheck %s -target datalayout = "e-p:64:64-p1:16:16-p2:32:32:32-p3:64:64:64" +target datalayout = "e-p:64:64-p1:16:16-p2:32:32:32-p3:64:64:64-f16:32" %intstruct = type { i32 } %pair = type { i32, i32 } @@ -111,6 +111,16 @@ define void @test_evaluate_gep_as_ptrs_array(ptr addrspace(2) %B) { ret void } +; This should be turned into a constexpr instead of being an instruction +define void @test_overaligned_vec(i8 %B) { +; CHECK-LABEL: @test_overaligned_vec( +; CHECK-NEXT: store i8 [[B:%.*]], ptr getelementptr inbounds ([10 x i8], ptr @Global, i64 0, i64 2), align 1 +; CHECK-NEXT: ret void + %A = getelementptr <2 x half>, ptr @Global, i64 0, i64 1 + store i8 %B, ptr %A + ret void +} + define ptr @test7(ptr %I, i64 %C, i64 %D) { ; CHECK-LABEL: @test7( ; CHECK-NEXT: [[A:%.*]] = getelementptr i32, ptr [[I:%.*]], i64 [[C:%.*]] From 2bd6642533ce858c07f1c412e1b8a669c17afb54 Mon Sep 17 00:00:00 2001 From: drblallo Date: Thu, 4 Jan 2024 10:28:12 +0100 Subject: [PATCH 208/313] [mlir][dataflow]Fix dense backward dataflow intraprocedural hook (#76865) The dataflow analysis framework within MLIR allows to customize the transfer function when a `call-like` operation is encuntered. 
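(For illustration only, not part of this change, here is a rough sketch of a
client analysis using that customization point; the lattice and class names
are invented for the example, and the exact signatures should be read as
approximate, following the interface as it appears in the diff below.)

  #include "mlir/Analysis/DataFlow/DenseAnalysis.h"
  #include "mlir/Interfaces/CallInterfaces.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace mlir;
  using namespace mlir::dataflow;

  namespace {
  // Placeholder lattice; a real analysis would track actual facts.
  struct NoFactsLattice : AbstractDenseLattice {
    using AbstractDenseLattice::AbstractDenseLattice;
    ChangeResult meet(const AbstractDenseLattice &) override {
      return ChangeResult::NoChange;
    }
    void print(raw_ostream &os) const override { os << "<no facts>"; }
  };

  struct ExampleBackwardAnalysis
      : DenseBackwardDataFlowAnalysis<NoFactsLattice> {
    using DenseBackwardDataFlowAnalysis::DenseBackwardDataFlowAnalysis;

    void visitOperation(Operation *, const NoFactsLattice &after,
                        NoFactsLattice *before) override {
      // Ordinary operations just propagate facts backwards.
      meet(before, after);
    }

    // The hook this patch makes reachable for indirect calls when the
    // solver is configured to run intraprocedurally.
    void visitCallControlFlowTransfer(CallOpInterface,
                                      CallControlFlowAction action,
                                      const NoFactsLattice &after,
                                      NoFactsLattice *before) override {
      if (action == CallControlFlowAction::ExternalCallee)
        setToExitState(before); // be conservative about unknown callees
      else
        meet(before, after);
    }

    void setToExitState(NoFactsLattice *) override {
      // Nothing to reset for the placeholder lattice.
    }
  };
  } // namespace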
The check to see if the analysis was executed in intraprocedural mode was executed after the check to see if the callee had the CallableOpInterface, and thus intraprocedural analyses would behave as interpocedural ones when performing indirect calls. This commit fixes the issue by performing the check for intraprocedurality first. Dense forward analyses were already behaving correctly. https://github.com/llvm/llvm-project/blob/main/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp#L63 Co-authored-by: massimo --- mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp | 16 +++++++++++----- .../Analysis/DataFlow/test-next-access.mlir | 18 ++++++++++++++++++ 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp b/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp index 08d89d6db788c..d4b9134ab9eea 100644 --- a/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp +++ b/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp @@ -283,17 +283,23 @@ void AbstractDenseBackwardDataFlowAnalysis::visitCallOperation( AbstractDenseLattice *before) { // Find the callee. Operation *callee = call.resolveCallable(&symbolTable); - auto callable = dyn_cast_or_null(callee); - if (!callable) - return setToExitState(before); + auto callable = dyn_cast_or_null(callee); // No region means the callee is only declared in this module. - Region *region = callable.getCallableRegion(); - if (!region || region->empty() || !getSolverConfig().isInterprocedural()) { + // If that is the case or if the solver is not interprocedural, + // let the hook handle it. + if (!getSolverConfig().isInterprocedural() || + (callable && (!callable.getCallableRegion() || + callable.getCallableRegion()->empty()))) { return visitCallControlFlowTransfer( call, CallControlFlowAction::ExternalCallee, after, before); } + if (!callable) + return setToExitState(before); + + Region *region = callable.getCallableRegion(); + // Call-level control flow specifies the data flow here. // // func.func @callee() { diff --git a/mlir/test/Analysis/DataFlow/test-next-access.mlir b/mlir/test/Analysis/DataFlow/test-next-access.mlir index 70069b10a9398..8825c699dd130 100644 --- a/mlir/test/Analysis/DataFlow/test-next-access.mlir +++ b/mlir/test/Analysis/DataFlow/test-next-access.mlir @@ -575,3 +575,21 @@ func.func @call_opaque_callee(%arg0: memref) { memref.load %arg0[] {name = "post"} : memref return } + +// ----- + +// CHECK-LABEL: @indirect_call +func.func @indirect_call(%arg0: memref, %arg1: (memref) -> ()) { + // IP: name = "pre" + // IP-SAME: next_access = ["unknown"] + // IP_AR: name = "pre" + // IP_AR-SAME: next_access = ["unknown"] + // LOCAL: name = "pre" + // LOCAL-SAME: next_access = ["unknown"] + // LC_AR: name = "pre" + // LC_AR-SAME: next_access = {{\[}}["call"]] + memref.load %arg0[] {name = "pre"} : memref + func.call_indirect %arg1(%arg0) {name = "call"} : (memref) -> () + memref.load %arg0[] {name = "post"} : memref + return +} From 0c23163184e098e1aac128cbbd7b4c2b4bd6bb26 Mon Sep 17 00:00:00 2001 From: Mitch Phillips Date: Thu, 4 Jan 2024 10:37:32 +0100 Subject: [PATCH 209/313] Revert "[mlir] Add `res()` method to `linalg::ContractionOpInterface` (#76539)" This reverts commit 53edf12e526704cc251b6a6917319c7cb7a653a0. Reason: Broke the sanitizer buildbots with a memory leak. 
More information available on https://github.com/llvm/llvm-project/commit/53edf12e526704cc251b6a6917319c7cb7a653a0 --- .../Dialect/Linalg/IR/LinalgInterfaces.td | 8 ---- mlir/unittests/Dialect/CMakeLists.txt | 1 - mlir/unittests/Dialect/Linalg/CMakeLists.txt | 8 ---- .../Dialect/Linalg/LinalgInterfacesTest.cpp | 43 ------------------- 4 files changed, 60 deletions(-) delete mode 100644 mlir/unittests/Dialect/Linalg/CMakeLists.txt delete mode 100644 mlir/unittests/Dialect/Linalg/LinalgInterfacesTest.cpp diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td index 777d7cfd558d2..fbf3f19cde0e9 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td @@ -54,14 +54,6 @@ def LinalgContractionOpInterface : OpInterface<"ContractionOpInterface"> { return $_op.getOperation()->getOperand(1); }]>, InterfaceMethod< - /*desc=*/"Returns the result value.", - /*retTy=*/"OpResult", - /*methodName=*/"res", - /*args=*/(ins), - /*methodBody=*/[{ - return $_op.getOperation()->getResult(0); - }]>, - InterfaceMethod< /*desc=*/[{ Returns whether the given op has indexing maps that correspond to a row-major matmul operation. diff --git a/mlir/unittests/Dialect/CMakeLists.txt b/mlir/unittests/Dialect/CMakeLists.txt index 76b698d1d1a7b..2dec4ba3c001e 100644 --- a/mlir/unittests/Dialect/CMakeLists.txt +++ b/mlir/unittests/Dialect/CMakeLists.txt @@ -8,7 +8,6 @@ target_link_libraries(MLIRDialectTests add_subdirectory(ArmSME) add_subdirectory(Index) -add_subdirectory(Linalg) add_subdirectory(LLVMIR) add_subdirectory(MemRef) add_subdirectory(SCF) diff --git a/mlir/unittests/Dialect/Linalg/CMakeLists.txt b/mlir/unittests/Dialect/Linalg/CMakeLists.txt deleted file mode 100644 index 080caab8d075e..0000000000000 --- a/mlir/unittests/Dialect/Linalg/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -add_mlir_unittest(MLIRLinalgTests - LinalgInterfacesTest.cpp -) -target_link_libraries(MLIRLinalgTests - PRIVATE - MLIRLinalgDialect - ) - diff --git a/mlir/unittests/Dialect/Linalg/LinalgInterfacesTest.cpp b/mlir/unittests/Dialect/Linalg/LinalgInterfacesTest.cpp deleted file mode 100644 index 8cc4a5e37c452..0000000000000 --- a/mlir/unittests/Dialect/Linalg/LinalgInterfacesTest.cpp +++ /dev/null @@ -1,43 +0,0 @@ -//===- LinalgInterfacesTest.cpp - LinalgInterfaces unit tests ----------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "mlir/Dialect/Linalg/IR/Linalg.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" - -#include "gtest/gtest.h" - -using namespace mlir; - -class LinalgInterfacesTest : public ::testing::Test { -protected: - LinalgInterfacesTest() { - context.getOrLoadDialect(); - } - - mlir::MLIRContext context; -}; - -TEST_F(LinalgInterfacesTest, ContractionOpOperandResultAccessor) { - OpBuilder b(&context); - SmallVector lhsShape = {1, 2}; - SmallVector rhsShape = {2, 4}; - SmallVector resShape = {1, 4}; - auto lhs = b.create(UnknownLoc::get(&context), lhsShape, - b.getF32Type()); - auto rhs = b.create(UnknownLoc::get(&context), rhsShape, - b.getF32Type()); - auto out = b.create(UnknownLoc::get(&context), resShape, - b.getF32Type()); - Operation *op = b.create( - UnknownLoc::get(&context), ValueRange{lhs, rhs}, ValueRange{out}); - auto contractOp = llvm::cast(op); - - EXPECT_EQ(contractOp.lhs(), op->getOperand(0)); - EXPECT_EQ(contractOp.rhs(), op->getOperand(1)); - EXPECT_EQ(contractOp.res(), op->getResult(0)); -} From b4ac4d2264123ab2672a9efed99068df8fb750aa Mon Sep 17 00:00:00 2001 From: sstipanovic <146831748+sstipanovic@users.noreply.github.com> Date: Thu, 4 Jan 2024 10:38:18 +0100 Subject: [PATCH 210/313] [NFC][AMDGPU] Move image-atomic-attributes test to test/Assembler. (#76917) --- .../amdgpu-image-atomic-attributes.ll | 191 ++++++++++ .../CodeGen/AMDGPU/image-atomic-attributes.ll | 356 ------------------ 2 files changed, 191 insertions(+), 356 deletions(-) create mode 100644 llvm/test/Assembler/amdgpu-image-atomic-attributes.ll delete mode 100644 llvm/test/CodeGen/AMDGPU/image-atomic-attributes.ll diff --git a/llvm/test/Assembler/amdgpu-image-atomic-attributes.ll b/llvm/test/Assembler/amdgpu-image-atomic-attributes.ll new file mode 100644 index 0000000000000..bc03908d5f218 --- /dev/null +++ b/llvm/test/Assembler/amdgpu-image-atomic-attributes.ll @@ -0,0 +1,191 @@ +; RUN: opt -S -mtriple=amdgcn-unknown-unknown < %s | FileCheck %s + +define amdgpu_ps float @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps <2 x float> @atomic_swap_1d_i64(<8 x i32> inreg %rsrc, i64 %data, i32 %s) { +main_body: + %v = call i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i64 %v to <2 x float> + ret <2 x float> %out +} + +define amdgpu_ps float @atomic_add_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_sub_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_smin_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_umin_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +main_body: + %v = call i32 
@llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_smax_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_umax_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_and_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_or_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_xor_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_inc_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_dec_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_cmpswap_1d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %swap, i32 %s) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps <2 x float> @atomic_cmpswap_1d_64(<8 x i32> inreg %rsrc, i64 %cmp, i64 %swap, i32 %s) { +main_body: + %v = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64 %cmp, i64 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i64 %v to <2 x float> + ret <2 x float> %out +} + +define amdgpu_ps float @atomic_add_2d(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_add_3d(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %r) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.add.3d.i32.i32(i32 %data, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_add_cube(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %face) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.add.cube.i32.i32(i32 %data, i32 %s, i32 %t, i32 %face, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_add_1darray(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %slice) { +main_body: + %v = call i32 
@llvm.amdgcn.image.atomic.add.1darray.i32.i32(i32 %data, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_add_2darray(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %slice) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.add.2darray.i32.i32(i32 %data, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_add_2dmsaa(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %fragid) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.add.2dmsaa.i32.i32(i32 %data, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_add_2darraymsaa(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %slice, i32 %fragid) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.add.2darraymsaa.i32.i32(i32 %data, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +define amdgpu_ps float @atomic_add_1d_slc(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 2) + %out = bitcast i32 %v to float + ret float %out +} + +declare i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0 + +declare i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64, i32, <8 x i32>, i32, i32) #0 +declare i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64, i64, i32, <8 x i32>, i32, i32) #0 + +declare i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.add.3d.i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.add.cube.i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.add.1darray.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.add.2darray.i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.add.2dmsaa.i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.add.2darraymsaa.i32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0 + +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn } +;. 
\ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/image-atomic-attributes.ll b/llvm/test/CodeGen/AMDGPU/image-atomic-attributes.ll deleted file mode 100644 index dd60d9d702716..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/image-atomic-attributes.ll +++ /dev/null @@ -1,356 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 4 -; RUN: opt -S -mtriple=amdgcn-unknown-unknown < %s | FileCheck -check-prefixes=CHECK %s - -define amdgpu_ps float @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { -; CHECK-LABEL: define amdgpu_ps float @atomic_swap_1d( -; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) { -; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) -; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float -; CHECK-NEXT: ret float [[OUT]] -; -main_body: - %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) - %out = bitcast i32 %v to float - ret float %out -} - -define amdgpu_ps <2 x float> @atomic_swap_1d_i64(<8 x i32> inreg %rsrc, i64 %data, i32 %s) { -; CHECK-LABEL: define amdgpu_ps <2 x float> @atomic_swap_1d_i64( -; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i64 [[DATA:%.*]], i32 [[S:%.*]]) { -; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) -; CHECK-NEXT: [[OUT:%.*]] = bitcast i64 [[V]] to <2 x float> -; CHECK-NEXT: ret <2 x float> [[OUT]] -; -main_body: - %v = call i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) - %out = bitcast i64 %v to <2 x float> - ret <2 x float> %out -} - -define amdgpu_ps float @atomic_add_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { -; CHECK-LABEL: define amdgpu_ps float @atomic_add_1d( -; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) { -; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) -; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float -; CHECK-NEXT: ret float [[OUT]] -; -main_body: - %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) - %out = bitcast i32 %v to float - ret float %out -} - -define amdgpu_ps float @atomic_sub_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { -; CHECK-LABEL: define amdgpu_ps float @atomic_sub_1d( -; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) { -; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) -; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float -; CHECK-NEXT: ret float [[OUT]] -; -main_body: - %v = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) - %out = bitcast i32 %v to float - ret float %out -} - -define amdgpu_ps float @atomic_smin_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { -; CHECK-LABEL: define amdgpu_ps float @atomic_smin_1d( -; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) { -; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) -; 
CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float -; CHECK-NEXT: ret float [[OUT]] -; -main_body: - %v = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) - %out = bitcast i32 %v to float - ret float %out -} - -define amdgpu_ps float @atomic_umin_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { -; CHECK-LABEL: define amdgpu_ps float @atomic_umin_1d( -; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) { -; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) -; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float -; CHECK-NEXT: ret float [[OUT]] -; -main_body: - %v = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) - %out = bitcast i32 %v to float - ret float %out -} - -define amdgpu_ps float @atomic_smax_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { -; CHECK-LABEL: define amdgpu_ps float @atomic_smax_1d( -; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) { -; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) -; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float -; CHECK-NEXT: ret float [[OUT]] -; -main_body: - %v = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) - %out = bitcast i32 %v to float - ret float %out -} - -define amdgpu_ps float @atomic_umax_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { -; CHECK-LABEL: define amdgpu_ps float @atomic_umax_1d( -; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) { -; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) -; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float -; CHECK-NEXT: ret float [[OUT]] -; -main_body: - %v = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) - %out = bitcast i32 %v to float - ret float %out -} - -define amdgpu_ps float @atomic_and_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { -; CHECK-LABEL: define amdgpu_ps float @atomic_and_1d( -; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) { -; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) -; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float -; CHECK-NEXT: ret float [[OUT]] -; -main_body: - %v = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) - %out = bitcast i32 %v to float - ret float %out -} - -define amdgpu_ps float @atomic_or_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { -; CHECK-LABEL: define amdgpu_ps float @atomic_or_1d( -; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) { -; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) -; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float -; CHECK-NEXT: ret float [[OUT]] -; -main_body: - %v = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) - %out = bitcast i32 %v to float - ret float %out -} - -define amdgpu_ps float @atomic_xor_1d(<8 x i32> 
inreg %rsrc, i32 %data, i32 %s) { -; CHECK-LABEL: define amdgpu_ps float @atomic_xor_1d( -; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) { -; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) -; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float -; CHECK-NEXT: ret float [[OUT]] -; -main_body: - %v = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) - %out = bitcast i32 %v to float - ret float %out -} - -define amdgpu_ps float @atomic_inc_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { -; CHECK-LABEL: define amdgpu_ps float @atomic_inc_1d( -; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) { -; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) -; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float -; CHECK-NEXT: ret float [[OUT]] -; -main_body: - %v = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) - %out = bitcast i32 %v to float - ret float %out -} - -define amdgpu_ps float @atomic_dec_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { -; CHECK-LABEL: define amdgpu_ps float @atomic_dec_1d( -; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) { -; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) -; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float -; CHECK-NEXT: ret float [[OUT]] -; -main_body: - %v = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) - %out = bitcast i32 %v to float - ret float %out -} - -define amdgpu_ps float @atomic_cmpswap_1d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %swap, i32 %s) { -; CHECK-LABEL: define amdgpu_ps float @atomic_cmpswap_1d( -; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[CMP:%.*]], i32 [[SWAP:%.*]], i32 [[S:%.*]]) { -; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 [[CMP]], i32 [[SWAP]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) -; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float -; CHECK-NEXT: ret float [[OUT]] -; -main_body: - %v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) - %out = bitcast i32 %v to float - ret float %out -} - -define amdgpu_ps <2 x float> @atomic_cmpswap_1d_64(<8 x i32> inreg %rsrc, i64 %cmp, i64 %swap, i32 %s) { -; CHECK-LABEL: define amdgpu_ps <2 x float> @atomic_cmpswap_1d_64( -; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i64 [[CMP:%.*]], i64 [[SWAP:%.*]], i32 [[S:%.*]]) { -; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64 [[CMP]], i64 [[SWAP]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) -; CHECK-NEXT: [[OUT:%.*]] = bitcast i64 [[V]] to <2 x float> -; CHECK-NEXT: ret <2 x float> [[OUT]] -; -main_body: - %v = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64 %cmp, i64 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) - %out = bitcast i64 %v to <2 x float> - ret <2 x float> %out -} - -define amdgpu_ps float @atomic_add_2d(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t) { -; CHECK-LABEL: define amdgpu_ps float @atomic_add_2d( -; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 
[[DATA:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) { -; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 [[DATA]], i32 [[S]], i32 [[T]], <8 x i32> [[RSRC]], i32 0, i32 0) -; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float -; CHECK-NEXT: ret float [[OUT]] -; -main_body: - %v = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) - %out = bitcast i32 %v to float - ret float %out -} - -define amdgpu_ps float @atomic_add_3d(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %r) { -; CHECK-LABEL: define amdgpu_ps float @atomic_add_3d( -; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[R:%.*]]) { -; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.add.3d.i32.i32(i32 [[DATA]], i32 [[S]], i32 [[T]], i32 [[R]], <8 x i32> [[RSRC]], i32 0, i32 0) -; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float -; CHECK-NEXT: ret float [[OUT]] -; -main_body: - %v = call i32 @llvm.amdgcn.image.atomic.add.3d.i32.i32(i32 %data, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) - %out = bitcast i32 %v to float - ret float %out -} - -define amdgpu_ps float @atomic_add_cube(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %face) { -; CHECK-LABEL: define amdgpu_ps float @atomic_add_cube( -; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[FACE:%.*]]) { -; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.add.cube.i32.i32(i32 [[DATA]], i32 [[S]], i32 [[T]], i32 [[FACE]], <8 x i32> [[RSRC]], i32 0, i32 0) -; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float -; CHECK-NEXT: ret float [[OUT]] -; -main_body: - %v = call i32 @llvm.amdgcn.image.atomic.add.cube.i32.i32(i32 %data, i32 %s, i32 %t, i32 %face, <8 x i32> %rsrc, i32 0, i32 0) - %out = bitcast i32 %v to float - ret float %out -} - -define amdgpu_ps float @atomic_add_1darray(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %slice) { -; CHECK-LABEL: define amdgpu_ps float @atomic_add_1darray( -; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]], i32 [[SLICE:%.*]]) { -; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.add.1darray.i32.i32(i32 [[DATA]], i32 [[S]], i32 [[SLICE]], <8 x i32> [[RSRC]], i32 0, i32 0) -; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float -; CHECK-NEXT: ret float [[OUT]] -; -main_body: - %v = call i32 @llvm.amdgcn.image.atomic.add.1darray.i32.i32(i32 %data, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) - %out = bitcast i32 %v to float - ret float %out -} - -define amdgpu_ps float @atomic_add_2darray(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %slice) { -; CHECK-LABEL: define amdgpu_ps float @atomic_add_2darray( -; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[SLICE:%.*]]) { -; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.add.2darray.i32.i32(i32 [[DATA]], i32 [[S]], i32 [[T]], i32 [[SLICE]], <8 x i32> [[RSRC]], i32 0, i32 0) -; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float -; CHECK-NEXT: ret float [[OUT]] -; -main_body: - %v = call i32 @llvm.amdgcn.image.atomic.add.2darray.i32.i32(i32 %data, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) - %out = bitcast i32 %v to float - ret float %out -} - -define amdgpu_ps float @atomic_add_2dmsaa(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 
%t, i32 %fragid) { -; CHECK-LABEL: define amdgpu_ps float @atomic_add_2dmsaa( -; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[FRAGID:%.*]]) { -; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.add.2dmsaa.i32.i32(i32 [[DATA]], i32 [[S]], i32 [[T]], i32 [[FRAGID]], <8 x i32> [[RSRC]], i32 0, i32 0) -; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float -; CHECK-NEXT: ret float [[OUT]] -; -main_body: - %v = call i32 @llvm.amdgcn.image.atomic.add.2dmsaa.i32.i32(i32 %data, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) - %out = bitcast i32 %v to float - ret float %out -} - -define amdgpu_ps float @atomic_add_2darraymsaa(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %slice, i32 %fragid) { -; CHECK-LABEL: define amdgpu_ps float @atomic_add_2darraymsaa( -; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[SLICE:%.*]], i32 [[FRAGID:%.*]]) { -; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.add.2darraymsaa.i32.i32(i32 [[DATA]], i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 [[FRAGID]], <8 x i32> [[RSRC]], i32 0, i32 0) -; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float -; CHECK-NEXT: ret float [[OUT]] -; -main_body: - %v = call i32 @llvm.amdgcn.image.atomic.add.2darraymsaa.i32.i32(i32 %data, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) - %out = bitcast i32 %v to float - ret float %out -} - -define amdgpu_ps float @atomic_add_1d_slc(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { -; CHECK-LABEL: define amdgpu_ps float @atomic_add_1d_slc( -; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) { -; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 2) -; CHECK-NEXT: [[OUT:%.*]] = bitcast i32 [[V]] to float -; CHECK-NEXT: ret float [[OUT]] -; -main_body: - %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 2) - %out = bitcast i32 %v to float - ret float %out -} - -declare i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 -declare i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 -declare i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 -declare i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 -declare i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 -declare i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 -declare i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 -declare i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 -declare i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 -declare i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 -declare i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 -declare i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 -declare i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0 - -declare i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64, i32, <8 x i32>, i32, i32) #0 -declare i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64, i64, i32, <8 x i32>, i32, i32) #0 - -declare i32 
@llvm.amdgcn.image.atomic.add.2d.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0 -declare i32 @llvm.amdgcn.image.atomic.add.3d.i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0 -declare i32 @llvm.amdgcn.image.atomic.add.cube.i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0 -declare i32 @llvm.amdgcn.image.atomic.add.1darray.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0 -declare i32 @llvm.amdgcn.image.atomic.add.2darray.i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0 -declare i32 @llvm.amdgcn.image.atomic.add.2dmsaa.i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0 -declare i32 @llvm.amdgcn.image.atomic.add.2darraymsaa.i32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0 - -attributes #0 = { nounwind } -attributes #1 = { nounwind readonly } -attributes #2 = { nounwind readnone } -;. -; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn } -;. From 26993f61673e3d9b29785f9baa5bac50c09f8bcf Mon Sep 17 00:00:00 2001 From: Mitch Phillips Date: Thu, 4 Jan 2024 10:47:51 +0100 Subject: [PATCH 211/313] Revert "[clang-format] Optimize processing .clang-format-ignore files (#76733)" This reverts commit 42ec976184acd40436acd7104ad715c60ca3e7ed. Reason: Broke the sanitizer buildbots. See more information on the github comment thread at https://github.com/llvm/llvm-project/commit/42ec976184acd40436acd7104ad715c60ca3e7ed --- clang/docs/ClangFormat.rst | 6 +-- clang/test/Format/clang-format-ignore.cpp | 25 +++------ clang/tools/clang-format/ClangFormat.cpp | 65 +++++++---------------- 3 files changed, 27 insertions(+), 69 deletions(-) diff --git a/clang/docs/ClangFormat.rst b/clang/docs/ClangFormat.rst index 819d9ee9f9cde..8d4017b29fb8e 100644 --- a/clang/docs/ClangFormat.rst +++ b/clang/docs/ClangFormat.rst @@ -131,9 +131,6 @@ An easy way to create the ``.clang-format`` file is: Available style options are described in :doc:`ClangFormatStyleOptions`. -.clang-format-ignore -==================== - You can create ``.clang-format-ignore`` files to make ``clang-format`` ignore certain files. A ``.clang-format-ignore`` file consists of patterns of file path names. It has the following format: @@ -144,8 +141,7 @@ names. It has the following format: * A non-comment line is a single pattern. * The slash (``/``) is used as the directory separator. * A pattern is relative to the directory of the ``.clang-format-ignore`` file - (or the root directory if the pattern starts with a slash). Patterns - containing drive names (e.g. ``C:``) are not supported. + (or the root directory if the pattern starts with a slash). * Patterns follow the rules specified in `POSIX 2.13.1, 2.13.2, and Rule 1 of 2.13.3 `_. diff --git a/clang/test/Format/clang-format-ignore.cpp b/clang/test/Format/clang-format-ignore.cpp index 5a2267b302d22..0d6396a64a668 100644 --- a/clang/test/Format/clang-format-ignore.cpp +++ b/clang/test/Format/clang-format-ignore.cpp @@ -21,26 +21,13 @@ // RUN: touch .clang-format-ignore // RUN: clang-format -verbose foo.c foo.js 2> %t.stderr -// RUN: grep -Fx "Formatting [1/2] foo.c" %t.stderr -// RUN: grep -Fx "Formatting [2/2] foo.js" %t.stderr +// RUN: grep "Formatting \[1/2] foo.c" %t.stderr +// RUN: grep "Formatting \[2/2] foo.js" %t.stderr // RUN: echo "*.js" > .clang-format-ignore // RUN: clang-format -verbose foo.c foo.js 2> %t.stderr -// RUN: grep -Fx "Formatting [1/2] foo.c" %t.stderr -// RUN: not grep -F foo.js %t.stderr +// RUN: grep "Formatting \[1/2] foo.c" %t.stderr +// RUN: not grep "Formatting \[2/2] foo.js" %t.stderr -// RUN: cd ../.. 
-// RUN: clang-format -verbose *.cc level1/*.c* level1/level2/foo.* 2> %t.stderr -// RUN: grep -x "Formatting \[1/5] .*foo\.c" %t.stderr -// RUN: not grep -F foo.js %t.stderr - -// RUN: rm .clang-format-ignore -// RUN: clang-format -verbose *.cc level1/*.c* level1/level2/foo.* 2> %t.stderr -// RUN: grep -x "Formatting \[1/5] .*foo\.cc" %t.stderr -// RUN: grep -x "Formatting \[2/5] .*bar\.cc" %t.stderr -// RUN: grep -x "Formatting \[3/5] .*baz\.c" %t.stderr -// RUN: grep -x "Formatting \[4/5] .*foo\.c" %t.stderr -// RUN: not grep -F foo.js %t.stderr - -// RUN: cd .. -// RUN: rm -r %t.dir +// RUN: cd ../../.. +// RUN: rm -rf %t.dir diff --git a/clang/tools/clang-format/ClangFormat.cpp b/clang/tools/clang-format/ClangFormat.cpp index 787e56a6eccc0..be34dbbe886a1 100644 --- a/clang/tools/clang-format/ClangFormat.cpp +++ b/clang/tools/clang-format/ClangFormat.cpp @@ -571,11 +571,6 @@ static int dumpConfig(bool IsSTDIN) { return 0; } -using String = SmallString<128>; -static String IgnoreDir; // Directory of .clang-format-ignore file. -static StringRef PrevDir; // Directory of previous `FilePath`. -static SmallVector Patterns; // Patterns in .clang-format-ignore file. - // Check whether `FilePath` is ignored according to the nearest // .clang-format-ignore file based on the rules below: // - A blank line is skipped. @@ -591,50 +586,33 @@ static bool isIgnored(StringRef FilePath) { if (!is_regular_file(FilePath)) return false; - String Path; - String AbsPath{FilePath}; - using namespace llvm::sys::path; + SmallString<128> Path, AbsPath{FilePath}; + make_absolute(AbsPath); remove_dots(AbsPath, /*remove_dot_dot=*/true); - if (StringRef Dir{parent_path(AbsPath)}; PrevDir != Dir) { - PrevDir = Dir; - - for (;;) { - Path = Dir; - append(Path, ".clang-format-ignore"); - if (is_regular_file(Path)) - break; - Dir = parent_path(Dir); - if (Dir.empty()) - return false; - } - - IgnoreDir = convert_to_slash(Dir); - - std::ifstream IgnoreFile{Path.c_str()}; - if (!IgnoreFile.good()) + StringRef IgnoreDir{AbsPath}; + do { + IgnoreDir = parent_path(IgnoreDir); + if (IgnoreDir.empty()) return false; - Patterns.clear(); + Path = IgnoreDir; + append(Path, ".clang-format-ignore"); + } while (!is_regular_file(Path)); - for (std::string Line; std::getline(IgnoreFile, Line);) { - if (const auto Pattern{StringRef{Line}.trim()}; - // Skip empty and comment lines. - !Pattern.empty() && Pattern[0] != '#') { - Patterns.push_back(Pattern); - } - } - } - - if (IgnoreDir.empty()) + std::ifstream IgnoreFile{Path.c_str()}; + if (!IgnoreFile.good()) return false; - const auto Pathname{convert_to_slash(AbsPath)}; - for (const auto &Pat : Patterns) { - const bool IsNegated = Pat[0] == '!'; - StringRef Pattern{Pat}; + const auto Pathname = convert_to_slash(AbsPath); + for (std::string Line; std::getline(IgnoreFile, Line);) { + auto Pattern = StringRef(Line).trim(); + if (Pattern.empty() || Pattern[0] == '#') + continue; + + const bool IsNegated = Pattern[0] == '!'; if (IsNegated) Pattern = Pattern.drop_front(); @@ -642,14 +620,11 @@ static bool isIgnored(StringRef FilePath) { continue; Pattern = Pattern.ltrim(); - - // `Pattern` is relative to `IgnoreDir` unless it starts with a slash. - // This doesn't support patterns containing drive names (e.g. `C:`). 
if (Pattern[0] != '/') { - Path = IgnoreDir; + Path = convert_to_slash(IgnoreDir); append(Path, Style::posix, Pattern); remove_dots(Path, /*remove_dot_dot=*/true, Style::posix); - Pattern = Path; + Pattern = Path.str(); } if (clang::format::matchFilePath(Pattern, Pathname) == !IsNegated) From 985bb3a20a788b3cda3256084fbdef20296ba8cb Mon Sep 17 00:00:00 2001 From: Alex Zinenko Date: Thu, 4 Jan 2024 09:45:30 +0000 Subject: [PATCH 212/313] [mlir] fix bytecode writer after c1eab57673ef3eb28 The change in c1eab57 fixed the behavior of `getDiscardableAttrDictionary` for ops that are not using properties to only return discardable attributes. Bytecode writer was relying on the wrong behavior and would assume all attributes are discardable, without appropriate testing. Fix that and add a test. --- mlir/lib/Bytecode/Writer/BytecodeWriter.cpp | 7 ++- mlir/lib/Bytecode/Writer/IRNumbering.cpp | 7 +-- mlir/unittests/Bytecode/BytecodeTest.cpp | 59 +++++++++++++++++++++ 3 files changed, 68 insertions(+), 5 deletions(-) diff --git a/mlir/lib/Bytecode/Writer/BytecodeWriter.cpp b/mlir/lib/Bytecode/Writer/BytecodeWriter.cpp index 6097f0eda469c..9493a6c19a106 100644 --- a/mlir/lib/Bytecode/Writer/BytecodeWriter.cpp +++ b/mlir/lib/Bytecode/Writer/BytecodeWriter.cpp @@ -962,9 +962,12 @@ LogicalResult BytecodeWriter::writeOp(EncodingEmitter &emitter, Operation *op) { DictionaryAttr attrs = op->getDiscardableAttrDictionary(); // Allow deployment to version getPropertiesStorage()) { attrs = op->getAttrDictionary(); + } if (!attrs.empty()) { opEncodingMask |= bytecode::OpEncodingMask::kHasAttrs; emitter.emitVarInt(numberingState.getNumber(attrs)); diff --git a/mlir/lib/Bytecode/Writer/IRNumbering.cpp b/mlir/lib/Bytecode/Writer/IRNumbering.cpp index 036a9477cce6c..a306010698f20 100644 --- a/mlir/lib/Bytecode/Writer/IRNumbering.cpp +++ b/mlir/lib/Bytecode/Writer/IRNumbering.cpp @@ -425,9 +425,10 @@ void IRNumberingState::number(Operation &op) { // Only number the operation's dictionary if it isn't empty. DictionaryAttr dictAttr = op.getDiscardableAttrDictionary(); - // Prior to version 5 we need to number also the merged dictionnary - // containing both the inherent and discardable attribute. - if (config.getDesiredBytecodeVersion() < 5) + // Prior to version 5, or when properties are not used, we need to number also + // the merged dictionary containing both the inherent and discardable + // attribute. + if (config.getDesiredBytecodeVersion() < 5 || !op.getPropertiesStorage()) dictAttr = op.getAttrDictionary(); if (!dictAttr.empty()) number(dictAttr); diff --git a/mlir/unittests/Bytecode/BytecodeTest.cpp b/mlir/unittests/Bytecode/BytecodeTest.cpp index db748b475a7ac..76ff1a8194db7 100644 --- a/mlir/unittests/Bytecode/BytecodeTest.cpp +++ b/mlir/unittests/Bytecode/BytecodeTest.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "mlir/Bytecode/BytecodeReader.h" #include "mlir/Bytecode/BytecodeWriter.h" #include "mlir/IR/AsmState.h" #include "mlir/IR/BuiltinAttributes.h" @@ -15,6 +16,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Support/Endian.h" +#include "llvm/Support/MemoryBufferRef.h" #include "gmock/gmock.h" #include "gtest/gtest.h" @@ -86,3 +88,60 @@ TEST(Bytecode, MultiModuleWithResource) { checkResourceAttribute(*module); checkResourceAttribute(*roundTripModule); } + +namespace { +/// A custom operation for the purpose of showcasing how discardable attributes +/// are handled in absence of properties. 
+class OpWithoutProperties : public Op { +public: + // Begin boilerplate. + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(OpWithoutProperties) + using Op::Op; + static ArrayRef getAttributeNames() { + static StringRef attributeNames[] = {StringRef("inherent_attr")}; + return ArrayRef(attributeNames); + }; + static StringRef getOperationName() { + return "test_op_properties.op_without_properties"; + } + // End boilerplate. +}; + +// A trivial supporting dialect to register the above operation. +class TestOpPropertiesDialect : public Dialect { +public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestOpPropertiesDialect) + static constexpr StringLiteral getDialectNamespace() { + return StringLiteral("test_op_properties"); + } + explicit TestOpPropertiesDialect(MLIRContext *context) + : Dialect(getDialectNamespace(), context, + TypeID::get()) { + addOperations(); + } +}; +} // namespace + +constexpr StringLiteral withoutPropertiesAttrsSrc = R"mlir( + "test_op_properties.op_without_properties"() + {inherent_attr = 42, other_attr = 56} : () -> () +)mlir"; + +TEST(Bytecode, OpWithoutProperties) { + MLIRContext context; + context.getOrLoadDialect(); + ParserConfig config(&context); + OwningOpRef op = + parseSourceString(withoutPropertiesAttrsSrc, config); + + std::string bytecode; + llvm::raw_string_ostream os(bytecode); + ASSERT_TRUE(succeeded(writeBytecodeToFile(op.get(), os))); + std::unique_ptr block = std::make_unique(); + ASSERT_TRUE(succeeded(readBytecodeFile( + llvm::MemoryBufferRef(os.str(), "string-buffer"), block.get(), config))); + Operation *roundtripped = &block->front(); + EXPECT_EQ(roundtripped->getAttrs().size(), 2u); + EXPECT_TRUE(roundtripped->getInherentAttr("inherent_attr") != std::nullopt); + EXPECT_TRUE(roundtripped->getDiscardableAttr("other_attr") != Attribute()); +} From 5ed11e767c0c39a3bc8e035588e7a383849d46a8 Mon Sep 17 00:00:00 2001 From: Alex Zinenko Date: Thu, 4 Jan 2024 09:48:29 +0000 Subject: [PATCH 213/313] [mlir] don't use magic numbers in IRNumbering.cpp Bytecode versions have named constants that should be used instead of magic numbers. --- mlir/lib/Bytecode/Writer/IRNumbering.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/mlir/lib/Bytecode/Writer/IRNumbering.cpp b/mlir/lib/Bytecode/Writer/IRNumbering.cpp index a306010698f20..f36c9ef060b6d 100644 --- a/mlir/lib/Bytecode/Writer/IRNumbering.cpp +++ b/mlir/lib/Bytecode/Writer/IRNumbering.cpp @@ -9,6 +9,7 @@ #include "IRNumbering.h" #include "mlir/Bytecode/BytecodeImplementation.h" #include "mlir/Bytecode/BytecodeOpInterface.h" +#include "mlir/Bytecode/Encoding.h" #include "mlir/IR/AsmState.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/OpDefinition.h" @@ -425,17 +426,20 @@ void IRNumberingState::number(Operation &op) { // Only number the operation's dictionary if it isn't empty. DictionaryAttr dictAttr = op.getDiscardableAttrDictionary(); - // Prior to version 5, or when properties are not used, we need to number also - // the merged dictionary containing both the inherent and discardable - // attribute. - if (config.getDesiredBytecodeVersion() < 5 || !op.getPropertiesStorage()) + // Prior to a version with native property encoding, or when properties are + // not used, we need to number also the merged dictionary containing both the + // inherent and discardable attribute. 
+ if (config.getDesiredBytecodeVersion() < + bytecode::kNativePropertiesEncoding || + !op.getPropertiesStorage()) { dictAttr = op.getAttrDictionary(); + } if (!dictAttr.empty()) number(dictAttr); // Visit the operation properties (if any) to make sure referenced attributes // are numbered. - if (config.getDesiredBytecodeVersion() >= 5 && + if (config.getDesiredBytecodeVersion() >= bytecode::kNativePropertiesEncoding && op.getPropertiesStorageSize()) { if (op.isRegistered()) { // Operation that have properties *must* implement this interface. From ce61b0e9a41fb55beaef04610967ccc7b69307f8 Mon Sep 17 00:00:00 2001 From: Thomas Preud'homme Date: Thu, 4 Jan 2024 10:15:16 +0000 Subject: [PATCH 214/313] Add out-of-line-atomics support to GlobalISel (#74588) This patch implements the GlobalISel counterpart to 4d7df43ffdb460dddb2877a886f75f45c3fee188. --- .../llvm/CodeGen/GlobalISel/LegalizerInfo.h | 5 + llvm/include/llvm/CodeGen/RuntimeLibcalls.h | 6 + .../CodeGen/GlobalISel/LegalizerHelper.cpp | 140 + llvm/lib/CodeGen/TargetLoweringBase.cpp | 41 +- .../AArch64/GISel/AArch64LegalizerInfo.cpp | 30 +- .../aarch64-atomic-load-outline_atomics.ll | 48 +- .../aarch64-atomic-store-outline_atomics.ll | 48 +- .../aarch64-atomicrmw-outline_atomics.ll | 2380 ++++------ .../aarch64-cmpxchg-outline_atomics.ll | 1683 ++----- .../AArch64/GlobalISel/arm64-atomic-128.ll | 239 + .../AArch64/GlobalISel/arm64-atomic.ll | 3918 ++++++++++++++++- .../GlobalISel/legalizer-info-validation.mir | 1 - 12 files changed, 5594 insertions(+), 2945 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h index 9880e82dd5e15..d09100b28f3e8 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h @@ -224,6 +224,11 @@ struct TypePairAndMemDesc { } }; +/// True iff P is false. +template <typename Predicate> Predicate predNot(Predicate P) { + return [=](const LegalityQuery &Query) { return !P(Query); }; +} + /// True iff P0 and P1 are true. template <typename Predicate> Predicate all(Predicate P0, Predicate P1) { diff --git a/llvm/include/llvm/CodeGen/RuntimeLibcalls.h b/llvm/include/llvm/CodeGen/RuntimeLibcalls.h index 6664206815107..3a407c4a4d940 100644 --- a/llvm/include/llvm/CodeGen/RuntimeLibcalls.h +++ b/llvm/include/llvm/CodeGen/RuntimeLibcalls.h @@ -82,6 +82,12 @@ namespace RTLIB { /// UNKNOWN_LIBCALL if there is none. Libcall getSYNC(unsigned Opc, MVT VT); + /// Return the outline atomics value for the given atomic ordering, access + /// size and set of libcalls for a given atomic, or UNKNOWN_LIBCALL if there + /// is none. + Libcall getOutlineAtomicHelper(const Libcall (&LC)[5][4], + AtomicOrdering Order, uint64_t MemSize); + /// Return the outline atomics value for the given opcode, atomic ordering /// and type, or UNKNOWN_LIBCALL if there is none.
Libcall getOUTLINE_ATOMIC(unsigned Opc, AtomicOrdering Order, MVT VT); diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index d2c41ebae5734..3de8fee1ac4e0 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -25,6 +25,7 @@ #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" @@ -796,6 +797,132 @@ llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, return LegalizerHelper::Legalized; } +static RTLIB::Libcall getOutlineAtomicLibcall(MachineInstr &MI) { + unsigned Opc = MI.getOpcode(); + auto &AtomicMI = cast(MI); + auto &MMO = AtomicMI.getMMO(); + auto Ordering = MMO.getMergedOrdering(); + LLT MemType = MMO.getMemoryType(); + uint64_t MemSize = MemType.getSizeInBytes(); + if (MemType.isVector()) + return RTLIB::UNKNOWN_LIBCALL; + +#define LCALLS(A, B) \ + { A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL } +#define LCALL5(A) \ + LCALLS(A, 1), LCALLS(A, 2), LCALLS(A, 4), LCALLS(A, 8), LCALLS(A, 16) + switch (Opc) { + case TargetOpcode::G_ATOMIC_CMPXCHG: + case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: { + const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_CAS)}; + return getOutlineAtomicHelper(LC, Ordering, MemSize); + } + case TargetOpcode::G_ATOMICRMW_XCHG: { + const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_SWP)}; + return getOutlineAtomicHelper(LC, Ordering, MemSize); + } + case TargetOpcode::G_ATOMICRMW_ADD: + case TargetOpcode::G_ATOMICRMW_SUB: { + const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDADD)}; + return getOutlineAtomicHelper(LC, Ordering, MemSize); + } + case TargetOpcode::G_ATOMICRMW_AND: { + const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDCLR)}; + return getOutlineAtomicHelper(LC, Ordering, MemSize); + } + case TargetOpcode::G_ATOMICRMW_OR: { + const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDSET)}; + return getOutlineAtomicHelper(LC, Ordering, MemSize); + } + case TargetOpcode::G_ATOMICRMW_XOR: { + const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDEOR)}; + return getOutlineAtomicHelper(LC, Ordering, MemSize); + } + default: + return RTLIB::UNKNOWN_LIBCALL; + } +#undef LCALLS +#undef LCALL5 +} + +static LegalizerHelper::LegalizeResult +createAtomicLibcall(MachineIRBuilder &MIRBuilder, MachineInstr &MI) { + auto &Ctx = MIRBuilder.getMF().getFunction().getContext(); + + Type *RetTy; + SmallVector RetRegs; + SmallVector Args; + unsigned Opc = MI.getOpcode(); + switch (Opc) { + case TargetOpcode::G_ATOMIC_CMPXCHG: + case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: { + Register Success; + LLT SuccessLLT; + auto [Ret, RetLLT, Mem, MemLLT, Cmp, CmpLLT, New, NewLLT] = + MI.getFirst4RegLLTs(); + RetRegs.push_back(Ret); + RetTy = IntegerType::get(Ctx, RetLLT.getSizeInBits()); + if (Opc == TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS) { + std::tie(Ret, RetLLT, Success, SuccessLLT, Mem, MemLLT, Cmp, CmpLLT, New, + NewLLT) = MI.getFirst5RegLLTs(); + RetRegs.push_back(Success); + RetTy = StructType::get( + Ctx, {RetTy, IntegerType::get(Ctx, SuccessLLT.getSizeInBits())}); + } + Args.push_back({Cmp, IntegerType::get(Ctx, CmpLLT.getSizeInBits()), 0}); + Args.push_back({New, IntegerType::get(Ctx, 
NewLLT.getSizeInBits()), 0}); + Args.push_back({Mem, PointerType::get(Ctx, MemLLT.getAddressSpace()), 0}); + break; + } + case TargetOpcode::G_ATOMICRMW_XCHG: + case TargetOpcode::G_ATOMICRMW_ADD: + case TargetOpcode::G_ATOMICRMW_SUB: + case TargetOpcode::G_ATOMICRMW_AND: + case TargetOpcode::G_ATOMICRMW_OR: + case TargetOpcode::G_ATOMICRMW_XOR: { + auto [Ret, RetLLT, Mem, MemLLT, Val, ValLLT] = MI.getFirst3RegLLTs(); + RetRegs.push_back(Ret); + RetTy = IntegerType::get(Ctx, RetLLT.getSizeInBits()); + if (Opc == TargetOpcode::G_ATOMICRMW_AND) + Val = + MIRBuilder.buildXor(ValLLT, MIRBuilder.buildConstant(ValLLT, -1), Val) + .getReg(0); + else if (Opc == TargetOpcode::G_ATOMICRMW_SUB) + Val = + MIRBuilder.buildSub(ValLLT, MIRBuilder.buildConstant(ValLLT, 0), Val) + .getReg(0); + Args.push_back({Val, IntegerType::get(Ctx, ValLLT.getSizeInBits()), 0}); + Args.push_back({Mem, PointerType::get(Ctx, MemLLT.getAddressSpace()), 0}); + break; + } + default: + llvm_unreachable("unsupported opcode"); + } + + auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering(); + auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering(); + RTLIB::Libcall RTLibcall = getOutlineAtomicLibcall(MI); + const char *Name = TLI.getLibcallName(RTLibcall); + + // Unsupported libcall on the target. + if (!Name) { + LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for " + << MIRBuilder.getTII().getName(Opc) << "\n"); + return LegalizerHelper::UnableToLegalize; + } + + CallLowering::CallLoweringInfo Info; + Info.CallConv = TLI.getLibcallCallingConv(RTLibcall); + Info.Callee = MachineOperand::CreateES(Name); + Info.OrigRet = CallLowering::ArgInfo(RetRegs, RetTy, 0); + + std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs)); + if (!CLI.lowerCall(MIRBuilder, Info)) + return LegalizerHelper::UnableToLegalize; + + return LegalizerHelper::Legalized; +} + static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType, Type *FromType) { auto ToMVT = MVT::getVT(ToType); @@ -1081,6 +1208,19 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) { return Status; break; } + case TargetOpcode::G_ATOMICRMW_XCHG: + case TargetOpcode::G_ATOMICRMW_ADD: + case TargetOpcode::G_ATOMICRMW_SUB: + case TargetOpcode::G_ATOMICRMW_AND: + case TargetOpcode::G_ATOMICRMW_OR: + case TargetOpcode::G_ATOMICRMW_XOR: + case TargetOpcode::G_ATOMIC_CMPXCHG: + case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: { + auto Status = createAtomicLibcall(MIRBuilder, MI); + if (Status != Legalized) + return Status; + break; + } case TargetOpcode::G_BZERO: case TargetOpcode::G_MEMCPY: case TargetOpcode::G_MEMMOVE: diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 2648c16bcd8d9..acbbfd9ddaf52 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -520,27 +520,28 @@ RTLIB::Libcall RTLIB::getFREXP(EVT RetVT) { FREXP_PPCF128); } -RTLIB::Libcall RTLIB::getOUTLINE_ATOMIC(unsigned Opc, AtomicOrdering Order, - MVT VT) { +RTLIB::Libcall RTLIB::getOutlineAtomicHelper(const Libcall (&LC)[5][4], + AtomicOrdering Order, + uint64_t MemSize) { unsigned ModeN, ModelN; - switch (VT.SimpleTy) { - case MVT::i8: + switch (MemSize) { + case 1: ModeN = 0; break; - case MVT::i16: + case 2: ModeN = 1; break; - case MVT::i32: + case 4: ModeN = 2; break; - case MVT::i64: + case 8: ModeN = 3; break; - case MVT::i128: + case 16: ModeN = 4; break; default: - return UNKNOWN_LIBCALL; + return RTLIB::UNKNOWN_LIBCALL; } switch (Order) { 
@@ -561,6 +562,16 @@ RTLIB::Libcall RTLIB::getOUTLINE_ATOMIC(unsigned Opc, AtomicOrdering Order, return UNKNOWN_LIBCALL; } + return LC[ModeN][ModelN]; +} + +RTLIB::Libcall RTLIB::getOUTLINE_ATOMIC(unsigned Opc, AtomicOrdering Order, + MVT VT) { + unsigned ModeN, ModelN; + if (!VT.isScalarInteger()) + return UNKNOWN_LIBCALL; + uint64_t MemSize = VT.getScalarSizeInBits() / 8; + #define LCALLS(A, B) \ { A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL } #define LCALL5(A) \ @@ -568,27 +579,27 @@ RTLIB::Libcall RTLIB::getOUTLINE_ATOMIC(unsigned Opc, AtomicOrdering Order, switch (Opc) { case ISD::ATOMIC_CMP_SWAP: { const Libcall LC[5][4] = {LCALL5(OUTLINE_ATOMIC_CAS)}; - return LC[ModeN][ModelN]; + return getOutlineAtomicHelper(LC, Order, MemSize); } case ISD::ATOMIC_SWAP: { const Libcall LC[5][4] = {LCALL5(OUTLINE_ATOMIC_SWP)}; - return LC[ModeN][ModelN]; + return getOutlineAtomicHelper(LC, Order, MemSize); } case ISD::ATOMIC_LOAD_ADD: { const Libcall LC[5][4] = {LCALL5(OUTLINE_ATOMIC_LDADD)}; - return LC[ModeN][ModelN]; + return getOutlineAtomicHelper(LC, Order, MemSize); } case ISD::ATOMIC_LOAD_OR: { const Libcall LC[5][4] = {LCALL5(OUTLINE_ATOMIC_LDSET)}; - return LC[ModeN][ModelN]; + return getOutlineAtomicHelper(LC, Order, MemSize); } case ISD::ATOMIC_LOAD_CLR: { const Libcall LC[5][4] = {LCALL5(OUTLINE_ATOMIC_LDCLR)}; - return LC[ModeN][ModelN]; + return getOutlineAtomicHelper(LC, Order, MemSize); } case ISD::ATOMIC_LOAD_XOR: { const Libcall LC[5][4] = {LCALL5(OUTLINE_ATOMIC_LDEOR)}; - return LC[ModeN][ModelN]; + return getOutlineAtomicHelper(LC, Order, MemSize); } default: return UNKNOWN_LIBCALL; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 149119c657272..4ba73736294fc 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -765,17 +765,35 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .lowerIf( all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0))); + LegalityPredicate UseOutlineAtomics = [&ST](const LegalityQuery &Query) { + return ST.outlineAtomics() && !ST.hasLSE(); + }; + getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) - .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0))) - .customIf([](const LegalityQuery &Query) { - return Query.Types[0].getSizeInBits() == 128; + .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0), + predNot(UseOutlineAtomics))) + .customIf(all(typeIs(0, s128), predNot(UseOutlineAtomics))) + .customIf([UseOutlineAtomics](const LegalityQuery &Query) { + return Query.Types[0].getSizeInBits() == 128 && + !UseOutlineAtomics(Query); }) + .libcallIf(all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(1, p0), + UseOutlineAtomics)) + .clampScalar(0, s32, s64); + + getActionDefinitionsBuilder({G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, + G_ATOMICRMW_SUB, G_ATOMICRMW_AND, G_ATOMICRMW_OR, + G_ATOMICRMW_XOR}) + .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0), + predNot(UseOutlineAtomics))) + .libcallIf(all(typeInSet(0, {s8, s16, s32, s64}), typeIs(1, p0), + UseOutlineAtomics)) .clampScalar(0, s32, s64); + // Do not outline these atomics operations, as per comment in + // AArch64ISelLowering.cpp's shouldExpandAtomicRMWInIR(). 
getActionDefinitionsBuilder( - {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND, - G_ATOMICRMW_OR, G_ATOMICRMW_XOR, G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, - G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX}) + {G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX}) .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0))) .clampScalar(0, s32, s64); diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-outline_atomics.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-outline_atomics.ll index fb4bef33d9b4f..fccafb29addbc 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-outline_atomics.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-outline_atomics.ll @@ -229,11 +229,7 @@ define dso_local i64 @load_atomic_i64_aligned_seq_cst_const(ptr readonly %ptr) { define dso_local i128 @load_atomic_i128_aligned_unordered(ptr %ptr) { ; -O0-LABEL: load_atomic_i128_aligned_unordered: -; -O0: ldxp x0, x1, [x9] -; -O0: cmp x0, x10 -; -O0: cmp x1, x10 -; -O0: stxp w8, x10, x10, [x9] -; -O0: stxp w8, x0, x1, [x9] +; -O0: bl __aarch64_cas16_relax ; ; -O1-LABEL: load_atomic_i128_aligned_unordered: ; -O1: ldxp x0, x1, [x8] @@ -244,11 +240,7 @@ define dso_local i128 @load_atomic_i128_aligned_unordered(ptr %ptr) { define dso_local i128 @load_atomic_i128_aligned_unordered_const(ptr readonly %ptr) { ; -O0-LABEL: load_atomic_i128_aligned_unordered_const: -; -O0: ldxp x0, x1, [x9] -; -O0: cmp x0, x10 -; -O0: cmp x1, x10 -; -O0: stxp w8, x10, x10, [x9] -; -O0: stxp w8, x0, x1, [x9] +; -O0: bl __aarch64_cas16_relax ; ; -O1-LABEL: load_atomic_i128_aligned_unordered_const: ; -O1: ldxp x0, x1, [x8] @@ -259,11 +251,7 @@ define dso_local i128 @load_atomic_i128_aligned_unordered_const(ptr readonly %pt define dso_local i128 @load_atomic_i128_aligned_monotonic(ptr %ptr) { ; -O0-LABEL: load_atomic_i128_aligned_monotonic: -; -O0: ldxp x0, x1, [x9] -; -O0: cmp x0, x10 -; -O0: cmp x1, x10 -; -O0: stxp w8, x10, x10, [x9] -; -O0: stxp w8, x0, x1, [x9] +; -O0: bl __aarch64_cas16_relax ; ; -O1-LABEL: load_atomic_i128_aligned_monotonic: ; -O1: ldxp x0, x1, [x8] @@ -274,11 +262,7 @@ define dso_local i128 @load_atomic_i128_aligned_monotonic(ptr %ptr) { define dso_local i128 @load_atomic_i128_aligned_monotonic_const(ptr readonly %ptr) { ; -O0-LABEL: load_atomic_i128_aligned_monotonic_const: -; -O0: ldxp x0, x1, [x9] -; -O0: cmp x0, x10 -; -O0: cmp x1, x10 -; -O0: stxp w8, x10, x10, [x9] -; -O0: stxp w8, x0, x1, [x9] +; -O0: bl __aarch64_cas16_relax ; ; -O1-LABEL: load_atomic_i128_aligned_monotonic_const: ; -O1: ldxp x0, x1, [x8] @@ -289,11 +273,7 @@ define dso_local i128 @load_atomic_i128_aligned_monotonic_const(ptr readonly %pt define dso_local i128 @load_atomic_i128_aligned_acquire(ptr %ptr) { ; -O0-LABEL: load_atomic_i128_aligned_acquire: -; -O0: ldaxp x0, x1, [x9] -; -O0: cmp x0, x10 -; -O0: cmp x1, x10 -; -O0: stxp w8, x10, x10, [x9] -; -O0: stxp w8, x0, x1, [x9] +; -O0: bl __aarch64_cas16_acq ; ; -O1-LABEL: load_atomic_i128_aligned_acquire: ; -O1: ldaxp x0, x1, [x8] @@ -304,11 +284,7 @@ define dso_local i128 @load_atomic_i128_aligned_acquire(ptr %ptr) { define dso_local i128 @load_atomic_i128_aligned_acquire_const(ptr readonly %ptr) { ; -O0-LABEL: load_atomic_i128_aligned_acquire_const: -; -O0: ldaxp x0, x1, [x9] -; -O0: cmp x0, x10 -; -O0: cmp x1, x10 -; -O0: stxp w8, x10, x10, [x9] -; -O0: stxp w8, x0, x1, [x9] +; -O0: bl __aarch64_cas16_acq ; ; -O1-LABEL: load_atomic_i128_aligned_acquire_const: ; -O1: ldaxp x0, x1, [x8] @@ -319,11 +295,7 @@ define dso_local i128 
@load_atomic_i128_aligned_acquire_const(ptr readonly %ptr) define dso_local i128 @load_atomic_i128_aligned_seq_cst(ptr %ptr) { ; -O0-LABEL: load_atomic_i128_aligned_seq_cst: -; -O0: ldaxp x0, x1, [x9] -; -O0: cmp x0, x10 -; -O0: cmp x1, x10 -; -O0: stlxp w8, x10, x10, [x9] -; -O0: stlxp w8, x0, x1, [x9] +; -O0: bl __aarch64_cas16_acq_rel ; ; -O1-LABEL: load_atomic_i128_aligned_seq_cst: ; -O1: ldaxp x0, x1, [x8] @@ -334,11 +306,7 @@ define dso_local i128 @load_atomic_i128_aligned_seq_cst(ptr %ptr) { define dso_local i128 @load_atomic_i128_aligned_seq_cst_const(ptr readonly %ptr) { ; -O0-LABEL: load_atomic_i128_aligned_seq_cst_const: -; -O0: ldaxp x0, x1, [x9] -; -O0: cmp x0, x10 -; -O0: cmp x1, x10 -; -O0: stlxp w8, x10, x10, [x9] -; -O0: stlxp w8, x0, x1, [x9] +; -O0: bl __aarch64_cas16_acq_rel ; ; -O1-LABEL: load_atomic_i128_aligned_seq_cst_const: ; -O1: ldaxp x0, x1, [x8] diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-outline_atomics.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-outline_atomics.ll index 3d204b734d4a0..e594561010464 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-outline_atomics.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-outline_atomics.ll @@ -117,14 +117,10 @@ define dso_local void @store_atomic_i64_aligned_seq_cst(i64 %value, ptr %ptr) { define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_unordered: -; -O0: ldxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stxp w8, x14, x15, [x11] -; -O0: stxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: bl __aarch64_cas16_relax +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: store_atomic_i128_aligned_unordered: @@ -136,14 +132,10 @@ define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_monotonic: -; -O0: ldxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stxp w8, x14, x15, [x11] -; -O0: stxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: bl __aarch64_cas16_relax +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: store_atomic_i128_aligned_monotonic: @@ -155,14 +147,10 @@ define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_release: -; -O0: ldxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: bl __aarch64_cas16_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: store_atomic_i128_aligned_release: @@ -174,14 +162,10 @@ define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr) define dso_local void @store_atomic_i128_aligned_seq_cst(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_seq_cst: -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; 
-O0: bl __aarch64_cas16_acq_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: store_atomic_i128_aligned_seq_cst: diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll index c660c139e35d4..e9b096e8c6c44 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll @@ -145,14 +145,10 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_xchg_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_monotonic: -; -O0: ldxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stxp w8, x14, x15, [x11] -; -O0: stxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: bl __aarch64_cas16_relax +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_monotonic: @@ -164,14 +160,10 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_monotonic(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_xchg_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_acquire: -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stxp w8, x14, x15, [x11] -; -O0: stxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: bl __aarch64_cas16_acq +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_acquire: @@ -183,14 +175,10 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_acquire(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_xchg_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_release: -; -O0: ldxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: bl __aarch64_cas16_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_release: @@ -202,14 +190,10 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_release(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_xchg_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_acq_rel: -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: bl __aarch64_cas16_acq_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_acq_rel: @@ -221,14 +205,10 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_acq_rel(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_xchg_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_seq_cst: -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: bl __aarch64_cas16_acq_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; 
-O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_seq_cst: @@ -555,16 +535,12 @@ define dso_local i64 @atomicrmw_add_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_add_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_monotonic: -; -O0: adds x14, x8, x10 +; -O0: adds x2, x8, x10 ; -O0: subs w10, w10, #1 -; -O0: ldxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stxp w8, x14, x15, [x11] -; -O0: stxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: bl __aarch64_cas16_relax +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic: @@ -577,16 +553,12 @@ define dso_local i128 @atomicrmw_add_i128_aligned_monotonic(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_add_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_acquire: -; -O0: adds x14, x8, x10 +; -O0: adds x2, x8, x10 ; -O0: subs w10, w10, #1 -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stxp w8, x14, x15, [x11] -; -O0: stxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: bl __aarch64_cas16_acq +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acquire: @@ -599,16 +571,12 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acquire(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_add_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_release: -; -O0: adds x14, x8, x10 +; -O0: adds x2, x8, x10 ; -O0: subs w10, w10, #1 -; -O0: ldxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: bl __aarch64_cas16_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_add_i128_aligned_release: @@ -621,16 +589,12 @@ define dso_local i128 @atomicrmw_add_i128_aligned_release(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_add_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_acq_rel: -; -O0: adds x14, x8, x10 +; -O0: adds x2, x8, x10 ; -O0: subs w10, w10, #1 -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: bl __aarch64_cas16_acq_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel: @@ -643,16 +607,12 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acq_rel(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_add_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_seq_cst: -; -O0: adds x14, x8, x10 +; -O0: adds x2, x8, x10 ; -O0: subs w10, w10, #1 -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: bl __aarch64_cas16_acq_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: 
atomicrmw_add_i128_aligned_seq_cst: @@ -1170,15 +1130,11 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_sub_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_monotonic: -; -O0: subs x14, x8, x10 -; -O0: ldxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stxp w8, x14, x15, [x11] -; -O0: stxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: subs x2, x8, x10 +; -O0: bl __aarch64_cas16_relax +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_monotonic: @@ -1191,15 +1147,11 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_monotonic(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_sub_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_acquire: -; -O0: subs x14, x8, x10 -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stxp w8, x14, x15, [x11] -; -O0: stxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: subs x2, x8, x10 +; -O0: bl __aarch64_cas16_acq +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire: @@ -1212,15 +1164,11 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acquire(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_sub_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_release: -; -O0: subs x14, x8, x10 -; -O0: ldxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: subs x2, x8, x10 +; -O0: bl __aarch64_cas16_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_release: @@ -1233,15 +1181,11 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_release(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_sub_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_acq_rel: -; -O0: subs x14, x8, x10 -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: subs x2, x8, x10 +; -O0: bl __aarch64_cas16_acq_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acq_rel: @@ -1254,15 +1198,11 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acq_rel(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_sub_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_seq_cst: -; -O0: subs x14, x8, x10 -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: subs x2, x8, x10 +; -O0: bl __aarch64_cas16_acq_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst: @@ -1575,7 +1515,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_seq_cst(ptr %ptr, i128 %valu define dso_local i8 
@atomicrmw_and_i8_aligned_monotonic(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_and_i8_aligned_monotonic: -; -O0: mvn w0, w8 +; -O0: eor w0, w8, w9 ; -O0: bl __aarch64_ldclr1_relax ; ; -O1-LABEL: atomicrmw_and_i8_aligned_monotonic: @@ -1587,7 +1527,7 @@ define dso_local i8 @atomicrmw_and_i8_aligned_monotonic(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_and_i8_aligned_acquire(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_and_i8_aligned_acquire: -; -O0: mvn w0, w8 +; -O0: eor w0, w8, w9 ; -O0: bl __aarch64_ldclr1_acq ; ; -O1-LABEL: atomicrmw_and_i8_aligned_acquire: @@ -1599,7 +1539,7 @@ define dso_local i8 @atomicrmw_and_i8_aligned_acquire(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_and_i8_aligned_release(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_and_i8_aligned_release: -; -O0: mvn w0, w8 +; -O0: eor w0, w8, w9 ; -O0: bl __aarch64_ldclr1_rel ; ; -O1-LABEL: atomicrmw_and_i8_aligned_release: @@ -1611,7 +1551,7 @@ define dso_local i8 @atomicrmw_and_i8_aligned_release(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_and_i8_aligned_acq_rel(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_and_i8_aligned_acq_rel: -; -O0: mvn w0, w8 +; -O0: eor w0, w8, w9 ; -O0: bl __aarch64_ldclr1_acq_rel ; ; -O1-LABEL: atomicrmw_and_i8_aligned_acq_rel: @@ -1623,7 +1563,7 @@ define dso_local i8 @atomicrmw_and_i8_aligned_acq_rel(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_and_i8_aligned_seq_cst(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_and_i8_aligned_seq_cst: -; -O0: mvn w0, w8 +; -O0: eor w0, w8, w9 ; -O0: bl __aarch64_ldclr1_acq_rel ; ; -O1-LABEL: atomicrmw_and_i8_aligned_seq_cst: @@ -1635,7 +1575,7 @@ define dso_local i8 @atomicrmw_and_i8_aligned_seq_cst(ptr %ptr, i8 %value) { define dso_local i16 @atomicrmw_and_i16_aligned_monotonic(ptr %ptr, i16 %value) { ; -O0-LABEL: atomicrmw_and_i16_aligned_monotonic: -; -O0: mvn w0, w8 +; -O0: eor w0, w8, w9 ; -O0: bl __aarch64_ldclr2_relax ; ; -O1-LABEL: atomicrmw_and_i16_aligned_monotonic: @@ -1647,7 +1587,7 @@ define dso_local i16 @atomicrmw_and_i16_aligned_monotonic(ptr %ptr, i16 %value) define dso_local i16 @atomicrmw_and_i16_aligned_acquire(ptr %ptr, i16 %value) { ; -O0-LABEL: atomicrmw_and_i16_aligned_acquire: -; -O0: mvn w0, w8 +; -O0: eor w0, w8, w9 ; -O0: bl __aarch64_ldclr2_acq ; ; -O1-LABEL: atomicrmw_and_i16_aligned_acquire: @@ -1659,7 +1599,7 @@ define dso_local i16 @atomicrmw_and_i16_aligned_acquire(ptr %ptr, i16 %value) { define dso_local i16 @atomicrmw_and_i16_aligned_release(ptr %ptr, i16 %value) { ; -O0-LABEL: atomicrmw_and_i16_aligned_release: -; -O0: mvn w0, w8 +; -O0: eor w0, w8, w9 ; -O0: bl __aarch64_ldclr2_rel ; ; -O1-LABEL: atomicrmw_and_i16_aligned_release: @@ -1671,7 +1611,7 @@ define dso_local i16 @atomicrmw_and_i16_aligned_release(ptr %ptr, i16 %value) { define dso_local i16 @atomicrmw_and_i16_aligned_acq_rel(ptr %ptr, i16 %value) { ; -O0-LABEL: atomicrmw_and_i16_aligned_acq_rel: -; -O0: mvn w0, w8 +; -O0: eor w0, w8, w9 ; -O0: bl __aarch64_ldclr2_acq_rel ; ; -O1-LABEL: atomicrmw_and_i16_aligned_acq_rel: @@ -1683,7 +1623,7 @@ define dso_local i16 @atomicrmw_and_i16_aligned_acq_rel(ptr %ptr, i16 %value) { define dso_local i16 @atomicrmw_and_i16_aligned_seq_cst(ptr %ptr, i16 %value) { ; -O0-LABEL: atomicrmw_and_i16_aligned_seq_cst: -; -O0: mvn w0, w8 +; -O0: eor w0, w8, w9 ; -O0: bl __aarch64_ldclr2_acq_rel ; ; -O1-LABEL: atomicrmw_and_i16_aligned_seq_cst: @@ -1695,7 +1635,7 @@ define dso_local i16 @atomicrmw_and_i16_aligned_seq_cst(ptr %ptr, i16 %value) { define dso_local i32 
@atomicrmw_and_i32_aligned_monotonic(ptr %ptr, i32 %value) { ; -O0-LABEL: atomicrmw_and_i32_aligned_monotonic: -; -O0: mvn w0, w8 +; -O0: eor w0, w8, w9 ; -O0: bl __aarch64_ldclr4_relax ; ; -O1-LABEL: atomicrmw_and_i32_aligned_monotonic: @@ -1707,7 +1647,7 @@ define dso_local i32 @atomicrmw_and_i32_aligned_monotonic(ptr %ptr, i32 %value) define dso_local i32 @atomicrmw_and_i32_aligned_acquire(ptr %ptr, i32 %value) { ; -O0-LABEL: atomicrmw_and_i32_aligned_acquire: -; -O0: mvn w0, w8 +; -O0: eor w0, w8, w9 ; -O0: bl __aarch64_ldclr4_acq ; ; -O1-LABEL: atomicrmw_and_i32_aligned_acquire: @@ -1719,7 +1659,7 @@ define dso_local i32 @atomicrmw_and_i32_aligned_acquire(ptr %ptr, i32 %value) { define dso_local i32 @atomicrmw_and_i32_aligned_release(ptr %ptr, i32 %value) { ; -O0-LABEL: atomicrmw_and_i32_aligned_release: -; -O0: mvn w0, w8 +; -O0: eor w0, w8, w9 ; -O0: bl __aarch64_ldclr4_rel ; ; -O1-LABEL: atomicrmw_and_i32_aligned_release: @@ -1731,7 +1671,7 @@ define dso_local i32 @atomicrmw_and_i32_aligned_release(ptr %ptr, i32 %value) { define dso_local i32 @atomicrmw_and_i32_aligned_acq_rel(ptr %ptr, i32 %value) { ; -O0-LABEL: atomicrmw_and_i32_aligned_acq_rel: -; -O0: mvn w0, w8 +; -O0: eor w0, w8, w9 ; -O0: bl __aarch64_ldclr4_acq_rel ; ; -O1-LABEL: atomicrmw_and_i32_aligned_acq_rel: @@ -1743,7 +1683,7 @@ define dso_local i32 @atomicrmw_and_i32_aligned_acq_rel(ptr %ptr, i32 %value) { define dso_local i32 @atomicrmw_and_i32_aligned_seq_cst(ptr %ptr, i32 %value) { ; -O0-LABEL: atomicrmw_and_i32_aligned_seq_cst: -; -O0: mvn w0, w8 +; -O0: eor w0, w8, w9 ; -O0: bl __aarch64_ldclr4_acq_rel ; ; -O1-LABEL: atomicrmw_and_i32_aligned_seq_cst: @@ -1755,7 +1695,7 @@ define dso_local i32 @atomicrmw_and_i32_aligned_seq_cst(ptr %ptr, i32 %value) { define dso_local i64 @atomicrmw_and_i64_aligned_monotonic(ptr %ptr, i64 %value) { ; -O0-LABEL: atomicrmw_and_i64_aligned_monotonic: -; -O0: mvn x0, x8 +; -O0: eor x0, x8, x9 ; -O0: bl __aarch64_ldclr8_relax ; ; -O1-LABEL: atomicrmw_and_i64_aligned_monotonic: @@ -1767,7 +1707,7 @@ define dso_local i64 @atomicrmw_and_i64_aligned_monotonic(ptr %ptr, i64 %value) define dso_local i64 @atomicrmw_and_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0-LABEL: atomicrmw_and_i64_aligned_acquire: -; -O0: mvn x0, x8 +; -O0: eor x0, x8, x9 ; -O0: bl __aarch64_ldclr8_acq ; ; -O1-LABEL: atomicrmw_and_i64_aligned_acquire: @@ -1779,7 +1719,7 @@ define dso_local i64 @atomicrmw_and_i64_aligned_acquire(ptr %ptr, i64 %value) { define dso_local i64 @atomicrmw_and_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0-LABEL: atomicrmw_and_i64_aligned_release: -; -O0: mvn x0, x8 +; -O0: eor x0, x8, x9 ; -O0: bl __aarch64_ldclr8_rel ; ; -O1-LABEL: atomicrmw_and_i64_aligned_release: @@ -1791,7 +1731,7 @@ define dso_local i64 @atomicrmw_and_i64_aligned_release(ptr %ptr, i64 %value) { define dso_local i64 @atomicrmw_and_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0-LABEL: atomicrmw_and_i64_aligned_acq_rel: -; -O0: mvn x0, x8 +; -O0: eor x0, x8, x9 ; -O0: bl __aarch64_ldclr8_acq_rel ; ; -O1-LABEL: atomicrmw_and_i64_aligned_acq_rel: @@ -1803,7 +1743,7 @@ define dso_local i64 @atomicrmw_and_i64_aligned_acq_rel(ptr %ptr, i64 %value) { define dso_local i64 @atomicrmw_and_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0-LABEL: atomicrmw_and_i64_aligned_seq_cst: -; -O0: mvn x0, x8 +; -O0: eor x0, x8, x9 ; -O0: bl __aarch64_ldclr8_acq_rel ; ; -O1-LABEL: atomicrmw_and_i64_aligned_seq_cst: @@ -1815,16 +1755,12 @@ define dso_local i64 @atomicrmw_and_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define 
dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_monotonic: -; -O0: and x14, x8, x10 -; -O0: and x15, x8, x9 -; -O0: ldxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stxp w8, x14, x15, [x11] -; -O0: stxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: and x2, x8, x10 +; -O0: and x3, x8, x9 +; -O0: bl __aarch64_cas16_relax +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic: @@ -1838,16 +1774,12 @@ define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_acquire: -; -O0: and x14, x8, x10 -; -O0: and x15, x8, x9 -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stxp w8, x14, x15, [x11] -; -O0: stxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: and x2, x8, x10 +; -O0: and x3, x8, x9 +; -O0: bl __aarch64_cas16_acq +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire: @@ -1861,16 +1793,12 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_release: -; -O0: and x14, x8, x10 -; -O0: and x15, x8, x9 -; -O0: ldxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: and x2, x8, x10 +; -O0: and x3, x8, x9 +; -O0: bl __aarch64_cas16_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_and_i128_aligned_release: @@ -1884,16 +1812,12 @@ define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_acq_rel: -; -O0: and x14, x8, x10 -; -O0: and x15, x8, x9 -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: and x2, x8, x10 +; -O0: and x3, x8, x9 +; -O0: bl __aarch64_cas16_acq_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel: @@ -1907,16 +1831,12 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_seq_cst: -; -O0: and x14, x8, x10 -; -O0: and x15, x8, x9 -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: and x2, x8, x10 +; -O0: and x3, x8, x9 +; -O0: bl __aarch64_cas16_acq_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst: @@ -1930,7 +1850,7 @@ define dso_local i128 
@atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value) define dso_local i8 @atomicrmw_and_i8_unaligned_monotonic(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_and_i8_unaligned_monotonic: -; -O0: mvn w0, w8 +; -O0: eor w0, w8, w9 ; -O0: bl __aarch64_ldclr1_relax ; ; -O1-LABEL: atomicrmw_and_i8_unaligned_monotonic: @@ -1942,7 +1862,7 @@ define dso_local i8 @atomicrmw_and_i8_unaligned_monotonic(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_and_i8_unaligned_acquire(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_and_i8_unaligned_acquire: -; -O0: mvn w0, w8 +; -O0: eor w0, w8, w9 ; -O0: bl __aarch64_ldclr1_acq ; ; -O1-LABEL: atomicrmw_and_i8_unaligned_acquire: @@ -1954,7 +1874,7 @@ define dso_local i8 @atomicrmw_and_i8_unaligned_acquire(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_and_i8_unaligned_release(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_and_i8_unaligned_release: -; -O0: mvn w0, w8 +; -O0: eor w0, w8, w9 ; -O0: bl __aarch64_ldclr1_rel ; ; -O1-LABEL: atomicrmw_and_i8_unaligned_release: @@ -1966,7 +1886,7 @@ define dso_local i8 @atomicrmw_and_i8_unaligned_release(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_and_i8_unaligned_acq_rel(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_and_i8_unaligned_acq_rel: -; -O0: mvn w0, w8 +; -O0: eor w0, w8, w9 ; -O0: bl __aarch64_ldclr1_acq_rel ; ; -O1-LABEL: atomicrmw_and_i8_unaligned_acq_rel: @@ -1978,7 +1898,7 @@ define dso_local i8 @atomicrmw_and_i8_unaligned_acq_rel(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_and_i8_unaligned_seq_cst(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_and_i8_unaligned_seq_cst: -; -O0: mvn w0, w8 +; -O0: eor w0, w8, w9 ; -O0: bl __aarch64_ldclr1_acq_rel ; ; -O1-LABEL: atomicrmw_and_i8_unaligned_seq_cst: @@ -2245,13 +2165,11 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_seq_cst(ptr %ptr, i128 %valu define dso_local i8 @atomicrmw_nand_i8_aligned_monotonic(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_nand_i8_aligned_monotonic: -; -O0: and w8, w10, w8 -; -O0: mvn w12, w8 -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: and w8, w0, w8 +; -O0: mvn w1, w8 +; -O0: bl __aarch64_cas1_relax +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_nand_i8_aligned_monotonic: ; -O1: ldxrb w8, [x0] @@ -2264,13 +2182,11 @@ define dso_local i8 @atomicrmw_nand_i8_aligned_monotonic(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_nand_i8_aligned_acquire(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_nand_i8_aligned_acquire: -; -O0: and w8, w10, w8 -; -O0: mvn w12, w8 -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: and w8, w0, w8 +; -O0: mvn w1, w8 +; -O0: bl __aarch64_cas1_acq +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_nand_i8_aligned_acquire: ; -O1: ldaxrb w8, [x0] @@ -2283,13 +2199,11 @@ define dso_local i8 @atomicrmw_nand_i8_aligned_acquire(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_nand_i8_aligned_release(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_nand_i8_aligned_release: -; -O0: and w8, w10, w8 -; -O0: mvn w12, w8 -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: and w8, w0, w8 +; -O0: mvn w1, w8 +; -O0: bl __aarch64_cas1_rel +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: 
atomicrmw_nand_i8_aligned_release: ; -O1: ldxrb w8, [x0] @@ -2302,13 +2216,11 @@ define dso_local i8 @atomicrmw_nand_i8_aligned_release(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_nand_i8_aligned_acq_rel(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_nand_i8_aligned_acq_rel: -; -O0: and w8, w10, w8 -; -O0: mvn w12, w8 -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: and w8, w0, w8 +; -O0: mvn w1, w8 +; -O0: bl __aarch64_cas1_acq_rel +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_nand_i8_aligned_acq_rel: ; -O1: ldaxrb w8, [x0] @@ -2321,13 +2233,11 @@ define dso_local i8 @atomicrmw_nand_i8_aligned_acq_rel(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_nand_i8_aligned_seq_cst(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_nand_i8_aligned_seq_cst: -; -O0: and w8, w10, w8 -; -O0: mvn w12, w8 -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: and w8, w0, w8 +; -O0: mvn w1, w8 +; -O0: bl __aarch64_cas1_acq_rel +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_nand_i8_aligned_seq_cst: ; -O1: ldaxrb w8, [x0] @@ -2340,12 +2250,10 @@ define dso_local i8 @atomicrmw_nand_i8_aligned_seq_cst(ptr %ptr, i8 %value) { define dso_local i16 @atomicrmw_nand_i16_aligned_monotonic(ptr %ptr, i16 %value) { ; -O0-LABEL: atomicrmw_nand_i16_aligned_monotonic: -; -O0: and w9, w8, w9 -; -O0: mvn w12, w9 -; -O0: ldaxrh w9, [x11] -; -O0: cmp w9, w8, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w8, w8, w9, uxth +; -O0: and w8, w0, w8 +; -O0: mvn w1, w8 +; -O0: bl __aarch64_cas2_relax +; -O0: subs w8, w8, w0, uxth ; ; -O1-LABEL: atomicrmw_nand_i16_aligned_monotonic: ; -O1: ldxrh w8, [x0] @@ -2358,12 +2266,10 @@ define dso_local i16 @atomicrmw_nand_i16_aligned_monotonic(ptr %ptr, i16 %value) define dso_local i16 @atomicrmw_nand_i16_aligned_acquire(ptr %ptr, i16 %value) { ; -O0-LABEL: atomicrmw_nand_i16_aligned_acquire: -; -O0: and w9, w8, w9 -; -O0: mvn w12, w9 -; -O0: ldaxrh w9, [x11] -; -O0: cmp w9, w8, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w8, w8, w9, uxth +; -O0: and w8, w0, w8 +; -O0: mvn w1, w8 +; -O0: bl __aarch64_cas2_acq +; -O0: subs w8, w8, w0, uxth ; ; -O1-LABEL: atomicrmw_nand_i16_aligned_acquire: ; -O1: ldaxrh w8, [x0] @@ -2376,12 +2282,10 @@ define dso_local i16 @atomicrmw_nand_i16_aligned_acquire(ptr %ptr, i16 %value) { define dso_local i16 @atomicrmw_nand_i16_aligned_release(ptr %ptr, i16 %value) { ; -O0-LABEL: atomicrmw_nand_i16_aligned_release: -; -O0: and w9, w8, w9 -; -O0: mvn w12, w9 -; -O0: ldaxrh w9, [x11] -; -O0: cmp w9, w8, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w8, w8, w9, uxth +; -O0: and w8, w0, w8 +; -O0: mvn w1, w8 +; -O0: bl __aarch64_cas2_rel +; -O0: subs w8, w8, w0, uxth ; ; -O1-LABEL: atomicrmw_nand_i16_aligned_release: ; -O1: ldxrh w8, [x0] @@ -2394,12 +2298,10 @@ define dso_local i16 @atomicrmw_nand_i16_aligned_release(ptr %ptr, i16 %value) { define dso_local i16 @atomicrmw_nand_i16_aligned_acq_rel(ptr %ptr, i16 %value) { ; -O0-LABEL: atomicrmw_nand_i16_aligned_acq_rel: -; -O0: and w9, w8, w9 -; -O0: mvn w12, w9 -; -O0: ldaxrh w9, [x11] -; -O0: cmp w9, w8, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w8, w8, w9, uxth +; -O0: and w8, w0, w8 +; -O0: mvn w1, w8 +; -O0: bl __aarch64_cas2_acq_rel +; -O0: subs w8, w8, w0, uxth ; ; -O1-LABEL: atomicrmw_nand_i16_aligned_acq_rel: ; -O1: ldaxrh w8, [x0] 
@@ -2412,12 +2314,10 @@ define dso_local i16 @atomicrmw_nand_i16_aligned_acq_rel(ptr %ptr, i16 %value) { define dso_local i16 @atomicrmw_nand_i16_aligned_seq_cst(ptr %ptr, i16 %value) { ; -O0-LABEL: atomicrmw_nand_i16_aligned_seq_cst: -; -O0: and w9, w8, w9 -; -O0: mvn w12, w9 -; -O0: ldaxrh w9, [x11] -; -O0: cmp w9, w8, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w8, w8, w9, uxth +; -O0: and w8, w0, w8 +; -O0: mvn w1, w8 +; -O0: bl __aarch64_cas2_acq_rel +; -O0: subs w8, w8, w0, uxth ; ; -O1-LABEL: atomicrmw_nand_i16_aligned_seq_cst: ; -O1: ldaxrh w8, [x0] @@ -2430,12 +2330,10 @@ define dso_local i16 @atomicrmw_nand_i16_aligned_seq_cst(ptr %ptr, i16 %value) { define dso_local i32 @atomicrmw_nand_i32_aligned_monotonic(ptr %ptr, i32 %value) { ; -O0-LABEL: atomicrmw_nand_i32_aligned_monotonic: -; -O0: and w9, w8, w9 -; -O0: mvn w12, w9 -; -O0: ldaxr w9, [x11] -; -O0: cmp w9, w8 -; -O0: stlxr w10, w12, [x11] -; -O0: subs w8, w9, w8 +; -O0: and w8, w0, w8 +; -O0: mvn w1, w8 +; -O0: bl __aarch64_cas4_relax +; -O0: subs w8, w0, w8 ; ; -O1-LABEL: atomicrmw_nand_i32_aligned_monotonic: ; -O1: ldxr w8, [x0] @@ -2448,12 +2346,10 @@ define dso_local i32 @atomicrmw_nand_i32_aligned_monotonic(ptr %ptr, i32 %value) define dso_local i32 @atomicrmw_nand_i32_aligned_acquire(ptr %ptr, i32 %value) { ; -O0-LABEL: atomicrmw_nand_i32_aligned_acquire: -; -O0: and w9, w8, w9 -; -O0: mvn w12, w9 -; -O0: ldaxr w9, [x11] -; -O0: cmp w9, w8 -; -O0: stlxr w10, w12, [x11] -; -O0: subs w8, w9, w8 +; -O0: and w8, w0, w8 +; -O0: mvn w1, w8 +; -O0: bl __aarch64_cas4_acq +; -O0: subs w8, w0, w8 ; ; -O1-LABEL: atomicrmw_nand_i32_aligned_acquire: ; -O1: ldaxr w8, [x0] @@ -2466,12 +2362,10 @@ define dso_local i32 @atomicrmw_nand_i32_aligned_acquire(ptr %ptr, i32 %value) { define dso_local i32 @atomicrmw_nand_i32_aligned_release(ptr %ptr, i32 %value) { ; -O0-LABEL: atomicrmw_nand_i32_aligned_release: -; -O0: and w9, w8, w9 -; -O0: mvn w12, w9 -; -O0: ldaxr w9, [x11] -; -O0: cmp w9, w8 -; -O0: stlxr w10, w12, [x11] -; -O0: subs w8, w9, w8 +; -O0: and w8, w0, w8 +; -O0: mvn w1, w8 +; -O0: bl __aarch64_cas4_rel +; -O0: subs w8, w0, w8 ; ; -O1-LABEL: atomicrmw_nand_i32_aligned_release: ; -O1: ldxr w8, [x0] @@ -2484,12 +2378,10 @@ define dso_local i32 @atomicrmw_nand_i32_aligned_release(ptr %ptr, i32 %value) { define dso_local i32 @atomicrmw_nand_i32_aligned_acq_rel(ptr %ptr, i32 %value) { ; -O0-LABEL: atomicrmw_nand_i32_aligned_acq_rel: -; -O0: and w9, w8, w9 -; -O0: mvn w12, w9 -; -O0: ldaxr w9, [x11] -; -O0: cmp w9, w8 -; -O0: stlxr w10, w12, [x11] -; -O0: subs w8, w9, w8 +; -O0: and w8, w0, w8 +; -O0: mvn w1, w8 +; -O0: bl __aarch64_cas4_acq_rel +; -O0: subs w8, w0, w8 ; ; -O1-LABEL: atomicrmw_nand_i32_aligned_acq_rel: ; -O1: ldaxr w8, [x0] @@ -2502,12 +2394,10 @@ define dso_local i32 @atomicrmw_nand_i32_aligned_acq_rel(ptr %ptr, i32 %value) { define dso_local i32 @atomicrmw_nand_i32_aligned_seq_cst(ptr %ptr, i32 %value) { ; -O0-LABEL: atomicrmw_nand_i32_aligned_seq_cst: -; -O0: and w9, w8, w9 -; -O0: mvn w12, w9 -; -O0: ldaxr w9, [x11] -; -O0: cmp w9, w8 -; -O0: stlxr w10, w12, [x11] -; -O0: subs w8, w9, w8 +; -O0: and w8, w0, w8 +; -O0: mvn w1, w8 +; -O0: bl __aarch64_cas4_acq_rel +; -O0: subs w8, w0, w8 ; ; -O1-LABEL: atomicrmw_nand_i32_aligned_seq_cst: ; -O1: ldaxr w8, [x0] @@ -2520,12 +2410,10 @@ define dso_local i32 @atomicrmw_nand_i32_aligned_seq_cst(ptr %ptr, i32 %value) { define dso_local i64 @atomicrmw_nand_i64_aligned_monotonic(ptr %ptr, i64 %value) { ; -O0-LABEL: atomicrmw_nand_i64_aligned_monotonic: -; -O0: and 
x9, x8, x9 -; -O0: mvn x12, x9 -; -O0: ldaxr x9, [x11] -; -O0: cmp x9, x8 -; -O0: stlxr w10, x12, [x11] -; -O0: subs x8, x9, x8 +; -O0: and x8, x0, x8 +; -O0: mvn x1, x8 +; -O0: bl __aarch64_cas8_relax +; -O0: subs x8, x0, x8 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_monotonic: ; -O1: ldxr x0, [x8] @@ -2538,12 +2426,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_monotonic(ptr %ptr, i64 %value) define dso_local i64 @atomicrmw_nand_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0-LABEL: atomicrmw_nand_i64_aligned_acquire: -; -O0: and x9, x8, x9 -; -O0: mvn x12, x9 -; -O0: ldaxr x9, [x11] -; -O0: cmp x9, x8 -; -O0: stlxr w10, x12, [x11] -; -O0: subs x8, x9, x8 +; -O0: and x8, x0, x8 +; -O0: mvn x1, x8 +; -O0: bl __aarch64_cas8_acq +; -O0: subs x8, x0, x8 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_acquire: ; -O1: ldaxr x0, [x8] @@ -2556,12 +2442,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_acquire(ptr %ptr, i64 %value) { define dso_local i64 @atomicrmw_nand_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0-LABEL: atomicrmw_nand_i64_aligned_release: -; -O0: and x9, x8, x9 -; -O0: mvn x12, x9 -; -O0: ldaxr x9, [x11] -; -O0: cmp x9, x8 -; -O0: stlxr w10, x12, [x11] -; -O0: subs x8, x9, x8 +; -O0: and x8, x0, x8 +; -O0: mvn x1, x8 +; -O0: bl __aarch64_cas8_rel +; -O0: subs x8, x0, x8 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_release: ; -O1: ldxr x0, [x8] @@ -2574,12 +2458,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_release(ptr %ptr, i64 %value) { define dso_local i64 @atomicrmw_nand_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0-LABEL: atomicrmw_nand_i64_aligned_acq_rel: -; -O0: and x9, x8, x9 -; -O0: mvn x12, x9 -; -O0: ldaxr x9, [x11] -; -O0: cmp x9, x8 -; -O0: stlxr w10, x12, [x11] -; -O0: subs x8, x9, x8 +; -O0: and x8, x0, x8 +; -O0: mvn x1, x8 +; -O0: bl __aarch64_cas8_acq_rel +; -O0: subs x8, x0, x8 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_acq_rel: ; -O1: ldaxr x0, [x8] @@ -2592,12 +2474,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_acq_rel(ptr %ptr, i64 %value) { define dso_local i64 @atomicrmw_nand_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0-LABEL: atomicrmw_nand_i64_aligned_seq_cst: -; -O0: and x9, x8, x9 -; -O0: mvn x12, x9 -; -O0: ldaxr x9, [x11] -; -O0: cmp x9, x8 -; -O0: stlxr w10, x12, [x11] -; -O0: subs x8, x9, x8 +; -O0: and x8, x0, x8 +; -O0: mvn x1, x8 +; -O0: bl __aarch64_cas8_acq_rel +; -O0: subs x8, x0, x8 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_seq_cst: ; -O1: ldaxr x0, [x8] @@ -2612,16 +2492,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0-LABEL: atomicrmw_nand_i128_aligned_monotonic: ; -O0: and x9, x8, x9 ; -O0: and x8, x8, x10 -; -O0: mvn x14, x9 -; -O0: mvn x15, x8 -; -O0: ldxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stxp w8, x14, x15, [x11] -; -O0: stxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: mvn x2, x9 +; -O0: mvn x3, x8 +; -O0: bl __aarch64_cas16_relax +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic: @@ -2639,16 +2515,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0-LABEL: atomicrmw_nand_i128_aligned_acquire: ; -O0: and x9, x8, x9 ; -O0: and x8, x8, x10 -; -O0: mvn x14, x9 -; -O0: mvn x15, x8 -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stxp w8, x14, x15, [x11] -; -O0: stxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; 
-O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: mvn x2, x9 +; -O0: mvn x3, x8 +; -O0: bl __aarch64_cas16_acq +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire: @@ -2666,16 +2538,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_release(ptr %ptr, i128 %value ; -O0-LABEL: atomicrmw_nand_i128_aligned_release: ; -O0: and x9, x8, x9 ; -O0: and x8, x8, x10 -; -O0: mvn x14, x9 -; -O0: mvn x15, x8 -; -O0: ldxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: mvn x2, x9 +; -O0: mvn x3, x8 +; -O0: bl __aarch64_cas16_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_release: @@ -2693,16 +2561,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0-LABEL: atomicrmw_nand_i128_aligned_acq_rel: ; -O0: and x9, x8, x9 ; -O0: and x8, x8, x10 -; -O0: mvn x14, x9 -; -O0: mvn x15, x8 -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: mvn x2, x9 +; -O0: mvn x3, x8 +; -O0: bl __aarch64_cas16_acq_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel: @@ -2720,16 +2584,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0-LABEL: atomicrmw_nand_i128_aligned_seq_cst: ; -O0: and x9, x8, x9 ; -O0: and x8, x8, x10 -; -O0: mvn x14, x9 -; -O0: mvn x15, x8 -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: mvn x2, x9 +; -O0: mvn x3, x8 +; -O0: bl __aarch64_cas16_acq_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_seq_cst: @@ -2745,13 +2605,11 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value define dso_local i8 @atomicrmw_nand_i8_unaligned_monotonic(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_nand_i8_unaligned_monotonic: -; -O0: and w8, w10, w8 -; -O0: mvn w12, w8 -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: and w8, w0, w8 +; -O0: mvn w1, w8 +; -O0: bl __aarch64_cas1_relax +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_nand_i8_unaligned_monotonic: ; -O1: ldxrb w8, [x0] @@ -2764,13 +2622,11 @@ define dso_local i8 @atomicrmw_nand_i8_unaligned_monotonic(ptr %ptr, i8 %value) define dso_local i8 @atomicrmw_nand_i8_unaligned_acquire(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_nand_i8_unaligned_acquire: -; -O0: and w8, w10, w8 -; -O0: mvn w12, w8 -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: and w8, w0, w8 +; -O0: mvn w1, w8 +; -O0: bl __aarch64_cas1_acq +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_nand_i8_unaligned_acquire: ; -O1: ldaxrb w8, [x0] @@ -2783,13 +2639,11 @@ define dso_local i8 
@atomicrmw_nand_i8_unaligned_acquire(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_nand_i8_unaligned_release(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_nand_i8_unaligned_release: -; -O0: and w8, w10, w8 -; -O0: mvn w12, w8 -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: and w8, w0, w8 +; -O0: mvn w1, w8 +; -O0: bl __aarch64_cas1_rel +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_nand_i8_unaligned_release: ; -O1: ldxrb w8, [x0] @@ -2802,13 +2656,11 @@ define dso_local i8 @atomicrmw_nand_i8_unaligned_release(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_nand_i8_unaligned_acq_rel(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_nand_i8_unaligned_acq_rel: -; -O0: and w8, w10, w8 -; -O0: mvn w12, w8 -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: and w8, w0, w8 +; -O0: mvn w1, w8 +; -O0: bl __aarch64_cas1_acq_rel +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_nand_i8_unaligned_acq_rel: ; -O1: ldaxrb w8, [x0] @@ -2821,13 +2673,11 @@ define dso_local i8 @atomicrmw_nand_i8_unaligned_acq_rel(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_nand_i8_unaligned_seq_cst(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_nand_i8_unaligned_seq_cst: -; -O0: and w8, w10, w8 -; -O0: mvn w12, w8 -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: and w8, w0, w8 +; -O0: mvn w1, w8 +; -O0: bl __aarch64_cas1_acq_rel +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_nand_i8_unaligned_seq_cst: ; -O1: ldaxrb w8, [x0] @@ -3285,16 +3135,12 @@ define dso_local i64 @atomicrmw_or_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_or_i128_aligned_monotonic: -; -O0: orr x14, x8, x10 -; -O0: orr x15, x8, x9 -; -O0: ldxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stxp w8, x14, x15, [x11] -; -O0: stxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: orr x2, x8, x10 +; -O0: orr x3, x8, x9 +; -O0: bl __aarch64_cas16_relax +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic: @@ -3308,16 +3154,12 @@ define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_or_i128_aligned_acquire: -; -O0: orr x14, x8, x10 -; -O0: orr x15, x8, x9 -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stxp w8, x14, x15, [x11] -; -O0: stxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: orr x2, x8, x10 +; -O0: orr x3, x8, x9 +; -O0: bl __aarch64_cas16_acq +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acquire: @@ -3331,16 +3173,12 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_or_i128_aligned_release: -; -O0: orr x14, x8, x10 -; -O0: orr x15, x8, x9 -; -O0: ldxp x10, x9, 
[x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: orr x2, x8, x10 +; -O0: orr x3, x8, x9 +; -O0: bl __aarch64_cas16_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_or_i128_aligned_release: @@ -3354,16 +3192,12 @@ define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_or_i128_aligned_acq_rel: -; -O0: orr x14, x8, x10 -; -O0: orr x15, x8, x9 -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: orr x2, x8, x10 +; -O0: orr x3, x8, x9 +; -O0: bl __aarch64_cas16_acq_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel: @@ -3377,16 +3211,12 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_or_i128_aligned_seq_cst: -; -O0: orr x14, x8, x10 -; -O0: orr x15, x8, x9 -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: orr x2, x8, x10 +; -O0: orr x3, x8, x9 +; -O0: bl __aarch64_cas16_acq_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst: @@ -3830,16 +3660,12 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xor_i128_aligned_monotonic: -; -O0: eor x14, x8, x10 -; -O0: eor x15, x8, x9 -; -O0: ldxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stxp w8, x14, x15, [x11] -; -O0: stxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: eor x2, x8, x10 +; -O0: eor x3, x8, x9 +; -O0: bl __aarch64_cas16_relax +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic: @@ -3853,16 +3679,12 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xor_i128_aligned_acquire: -; -O0: eor x14, x8, x10 -; -O0: eor x15, x8, x9 -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stxp w8, x14, x15, [x11] -; -O0: stxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: eor x2, x8, x10 +; -O0: eor x3, x8, x9 +; -O0: bl __aarch64_cas16_acq +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire: @@ -3876,16 +3698,12 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xor_i128_aligned_release: -; -O0: eor x14, x8, x10 -; -O0: eor 
x15, x8, x9 -; -O0: ldxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: eor x2, x8, x10 +; -O0: eor x3, x8, x9 +; -O0: bl __aarch64_cas16_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_release: @@ -3899,16 +3717,12 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xor_i128_aligned_acq_rel: -; -O0: eor x14, x8, x10 -; -O0: eor x15, x8, x9 -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: eor x2, x8, x10 +; -O0: eor x3, x8, x9 +; -O0: bl __aarch64_cas16_acq_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel: @@ -3922,16 +3736,12 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_xor_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xor_i128_aligned_seq_cst: -; -O0: eor x14, x8, x10 -; -O0: eor x15, x8, x9 -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: eor x2, x8, x10 +; -O0: eor x3, x8, x9 +; -O0: bl __aarch64_cas16_acq_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst: @@ -4235,14 +4045,12 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_seq_cst(ptr %ptr, i128 %valu define dso_local i8 @atomicrmw_max_i8_aligned_monotonic(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_max_i8_aligned_monotonic: -; -O0: sxtb w9, w10 +; -O0: sxtb w9, w0 ; -O0: subs w9, w9, w8, sxtb -; -O0: csel w12, w10, w8, gt -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, gt +; -O0: bl __aarch64_cas1_relax +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_max_i8_aligned_monotonic: ; -O1: ldxrb w9, [x0] @@ -4256,14 +4064,12 @@ define dso_local i8 @atomicrmw_max_i8_aligned_monotonic(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_max_i8_aligned_acquire(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_max_i8_aligned_acquire: -; -O0: sxtb w9, w10 +; -O0: sxtb w9, w0 ; -O0: subs w9, w9, w8, sxtb -; -O0: csel w12, w10, w8, gt -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, gt +; -O0: bl __aarch64_cas1_acq +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_max_i8_aligned_acquire: ; -O1: ldaxrb w9, [x0] @@ -4277,14 +4083,12 @@ define dso_local i8 @atomicrmw_max_i8_aligned_acquire(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_max_i8_aligned_release(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_max_i8_aligned_release: -; -O0: sxtb w9, w10 +; -O0: sxtb w9, w0 ; -O0: subs w9, w9, w8, sxtb -; -O0: csel w12, w10, w8, gt -; -O0: ldaxrb w9, [x11] -; -O0: 
cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, gt +; -O0: bl __aarch64_cas1_rel +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_max_i8_aligned_release: ; -O1: ldxrb w9, [x0] @@ -4298,14 +4102,12 @@ define dso_local i8 @atomicrmw_max_i8_aligned_release(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_max_i8_aligned_acq_rel(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_max_i8_aligned_acq_rel: -; -O0: sxtb w9, w10 +; -O0: sxtb w9, w0 ; -O0: subs w9, w9, w8, sxtb -; -O0: csel w12, w10, w8, gt -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, gt +; -O0: bl __aarch64_cas1_acq_rel +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_max_i8_aligned_acq_rel: ; -O1: ldaxrb w9, [x0] @@ -4319,14 +4121,12 @@ define dso_local i8 @atomicrmw_max_i8_aligned_acq_rel(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_max_i8_aligned_seq_cst(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_max_i8_aligned_seq_cst: -; -O0: sxtb w9, w10 +; -O0: sxtb w9, w0 ; -O0: subs w9, w9, w8, sxtb -; -O0: csel w12, w10, w8, gt -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, gt +; -O0: bl __aarch64_cas1_acq_rel +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_max_i8_aligned_seq_cst: ; -O1: ldaxrb w9, [x0] @@ -4340,13 +4140,11 @@ define dso_local i8 @atomicrmw_max_i8_aligned_seq_cst(ptr %ptr, i8 %value) { define dso_local i16 @atomicrmw_max_i16_aligned_monotonic(ptr %ptr, i16 %value) { ; -O0-LABEL: atomicrmw_max_i16_aligned_monotonic: -; -O0: sxth w10, w8 -; -O0: subs w10, w10, w9, sxth -; -O0: csel w12, w8, w9, gt -; -O0: ldaxrh w9, [x11] -; -O0: cmp w9, w8, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w8, w8, w9, uxth +; -O0: sxth w9, w0 +; -O0: subs w9, w9, w8, sxth +; -O0: csel w1, w0, w8, gt +; -O0: bl __aarch64_cas2_relax +; -O0: subs w8, w8, w0, uxth ; ; -O1-LABEL: atomicrmw_max_i16_aligned_monotonic: ; -O1: ldxrh w9, [x0] @@ -4360,13 +4158,11 @@ define dso_local i16 @atomicrmw_max_i16_aligned_monotonic(ptr %ptr, i16 %value) define dso_local i16 @atomicrmw_max_i16_aligned_acquire(ptr %ptr, i16 %value) { ; -O0-LABEL: atomicrmw_max_i16_aligned_acquire: -; -O0: sxth w10, w8 -; -O0: subs w10, w10, w9, sxth -; -O0: csel w12, w8, w9, gt -; -O0: ldaxrh w9, [x11] -; -O0: cmp w9, w8, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w8, w8, w9, uxth +; -O0: sxth w9, w0 +; -O0: subs w9, w9, w8, sxth +; -O0: csel w1, w0, w8, gt +; -O0: bl __aarch64_cas2_acq +; -O0: subs w8, w8, w0, uxth ; ; -O1-LABEL: atomicrmw_max_i16_aligned_acquire: ; -O1: ldaxrh w9, [x0] @@ -4380,13 +4176,11 @@ define dso_local i16 @atomicrmw_max_i16_aligned_acquire(ptr %ptr, i16 %value) { define dso_local i16 @atomicrmw_max_i16_aligned_release(ptr %ptr, i16 %value) { ; -O0-LABEL: atomicrmw_max_i16_aligned_release: -; -O0: sxth w10, w8 -; -O0: subs w10, w10, w9, sxth -; -O0: csel w12, w8, w9, gt -; -O0: ldaxrh w9, [x11] -; -O0: cmp w9, w8, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w8, w8, w9, uxth +; -O0: sxth w9, w0 +; -O0: subs w9, w9, w8, sxth +; -O0: csel w1, w0, w8, gt +; -O0: bl __aarch64_cas2_rel +; -O0: subs w8, w8, w0, uxth ; ; -O1-LABEL: atomicrmw_max_i16_aligned_release: ; -O1: ldxrh w9, [x0] @@ -4400,13 +4194,11 @@ define dso_local i16 
@atomicrmw_max_i16_aligned_release(ptr %ptr, i16 %value) { define dso_local i16 @atomicrmw_max_i16_aligned_acq_rel(ptr %ptr, i16 %value) { ; -O0-LABEL: atomicrmw_max_i16_aligned_acq_rel: -; -O0: sxth w10, w8 -; -O0: subs w10, w10, w9, sxth -; -O0: csel w12, w8, w9, gt -; -O0: ldaxrh w9, [x11] -; -O0: cmp w9, w8, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w8, w8, w9, uxth +; -O0: sxth w9, w0 +; -O0: subs w9, w9, w8, sxth +; -O0: csel w1, w0, w8, gt +; -O0: bl __aarch64_cas2_acq_rel +; -O0: subs w8, w8, w0, uxth ; ; -O1-LABEL: atomicrmw_max_i16_aligned_acq_rel: ; -O1: ldaxrh w9, [x0] @@ -4420,13 +4212,11 @@ define dso_local i16 @atomicrmw_max_i16_aligned_acq_rel(ptr %ptr, i16 %value) { define dso_local i16 @atomicrmw_max_i16_aligned_seq_cst(ptr %ptr, i16 %value) { ; -O0-LABEL: atomicrmw_max_i16_aligned_seq_cst: -; -O0: sxth w10, w8 -; -O0: subs w10, w10, w9, sxth -; -O0: csel w12, w8, w9, gt -; -O0: ldaxrh w9, [x11] -; -O0: cmp w9, w8, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w8, w8, w9, uxth +; -O0: sxth w9, w0 +; -O0: subs w9, w9, w8, sxth +; -O0: csel w1, w0, w8, gt +; -O0: bl __aarch64_cas2_acq_rel +; -O0: subs w8, w8, w0, uxth ; ; -O1-LABEL: atomicrmw_max_i16_aligned_seq_cst: ; -O1: ldaxrh w9, [x0] @@ -4440,12 +4230,10 @@ define dso_local i16 @atomicrmw_max_i16_aligned_seq_cst(ptr %ptr, i16 %value) { define dso_local i32 @atomicrmw_max_i32_aligned_monotonic(ptr %ptr, i32 %value) { ; -O0-LABEL: atomicrmw_max_i32_aligned_monotonic: -; -O0: subs w10, w8, w9 -; -O0: csel w12, w8, w9, gt -; -O0: ldaxr w9, [x11] -; -O0: cmp w9, w8 -; -O0: stlxr w10, w12, [x11] -; -O0: subs w8, w9, w8 +; -O0: subs w9, w0, w8 +; -O0: csel w1, w0, w8, gt +; -O0: bl __aarch64_cas4_relax +; -O0: subs w8, w0, w8 ; ; -O1-LABEL: atomicrmw_max_i32_aligned_monotonic: ; -O1: ldxr w8, [x0] @@ -4458,12 +4246,10 @@ define dso_local i32 @atomicrmw_max_i32_aligned_monotonic(ptr %ptr, i32 %value) define dso_local i32 @atomicrmw_max_i32_aligned_acquire(ptr %ptr, i32 %value) { ; -O0-LABEL: atomicrmw_max_i32_aligned_acquire: -; -O0: subs w10, w8, w9 -; -O0: csel w12, w8, w9, gt -; -O0: ldaxr w9, [x11] -; -O0: cmp w9, w8 -; -O0: stlxr w10, w12, [x11] -; -O0: subs w8, w9, w8 +; -O0: subs w9, w0, w8 +; -O0: csel w1, w0, w8, gt +; -O0: bl __aarch64_cas4_acq +; -O0: subs w8, w0, w8 ; ; -O1-LABEL: atomicrmw_max_i32_aligned_acquire: ; -O1: ldaxr w8, [x0] @@ -4476,12 +4262,10 @@ define dso_local i32 @atomicrmw_max_i32_aligned_acquire(ptr %ptr, i32 %value) { define dso_local i32 @atomicrmw_max_i32_aligned_release(ptr %ptr, i32 %value) { ; -O0-LABEL: atomicrmw_max_i32_aligned_release: -; -O0: subs w10, w8, w9 -; -O0: csel w12, w8, w9, gt -; -O0: ldaxr w9, [x11] -; -O0: cmp w9, w8 -; -O0: stlxr w10, w12, [x11] -; -O0: subs w8, w9, w8 +; -O0: subs w9, w0, w8 +; -O0: csel w1, w0, w8, gt +; -O0: bl __aarch64_cas4_rel +; -O0: subs w8, w0, w8 ; ; -O1-LABEL: atomicrmw_max_i32_aligned_release: ; -O1: ldxr w8, [x0] @@ -4494,12 +4278,10 @@ define dso_local i32 @atomicrmw_max_i32_aligned_release(ptr %ptr, i32 %value) { define dso_local i32 @atomicrmw_max_i32_aligned_acq_rel(ptr %ptr, i32 %value) { ; -O0-LABEL: atomicrmw_max_i32_aligned_acq_rel: -; -O0: subs w10, w8, w9 -; -O0: csel w12, w8, w9, gt -; -O0: ldaxr w9, [x11] -; -O0: cmp w9, w8 -; -O0: stlxr w10, w12, [x11] -; -O0: subs w8, w9, w8 +; -O0: subs w9, w0, w8 +; -O0: csel w1, w0, w8, gt +; -O0: bl __aarch64_cas4_acq_rel +; -O0: subs w8, w0, w8 ; ; -O1-LABEL: atomicrmw_max_i32_aligned_acq_rel: ; -O1: ldaxr w8, [x0] @@ -4512,12 +4294,10 @@ define dso_local i32 
@atomicrmw_max_i32_aligned_acq_rel(ptr %ptr, i32 %value) { define dso_local i32 @atomicrmw_max_i32_aligned_seq_cst(ptr %ptr, i32 %value) { ; -O0-LABEL: atomicrmw_max_i32_aligned_seq_cst: -; -O0: subs w10, w8, w9 -; -O0: csel w12, w8, w9, gt -; -O0: ldaxr w9, [x11] -; -O0: cmp w9, w8 -; -O0: stlxr w10, w12, [x11] -; -O0: subs w8, w9, w8 +; -O0: subs w9, w0, w8 +; -O0: csel w1, w0, w8, gt +; -O0: bl __aarch64_cas4_acq_rel +; -O0: subs w8, w0, w8 ; ; -O1-LABEL: atomicrmw_max_i32_aligned_seq_cst: ; -O1: ldaxr w8, [x0] @@ -4530,12 +4310,10 @@ define dso_local i32 @atomicrmw_max_i32_aligned_seq_cst(ptr %ptr, i32 %value) { define dso_local i64 @atomicrmw_max_i64_aligned_monotonic(ptr %ptr, i64 %value) { ; -O0-LABEL: atomicrmw_max_i64_aligned_monotonic: -; -O0: subs x10, x8, x9 -; -O0: csel x12, x8, x9, gt -; -O0: ldaxr x9, [x11] -; -O0: cmp x9, x8 -; -O0: stlxr w10, x12, [x11] -; -O0: subs x8, x9, x8 +; -O0: subs x9, x0, x8 +; -O0: csel x1, x0, x8, gt +; -O0: bl __aarch64_cas8_relax +; -O0: subs x8, x0, x8 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_monotonic: ; -O1: ldxr x0, [x8] @@ -4548,12 +4326,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_monotonic(ptr %ptr, i64 %value) define dso_local i64 @atomicrmw_max_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0-LABEL: atomicrmw_max_i64_aligned_acquire: -; -O0: subs x10, x8, x9 -; -O0: csel x12, x8, x9, gt -; -O0: ldaxr x9, [x11] -; -O0: cmp x9, x8 -; -O0: stlxr w10, x12, [x11] -; -O0: subs x8, x9, x8 +; -O0: subs x9, x0, x8 +; -O0: csel x1, x0, x8, gt +; -O0: bl __aarch64_cas8_acq +; -O0: subs x8, x0, x8 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_acquire: ; -O1: ldaxr x0, [x8] @@ -4566,12 +4342,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_acquire(ptr %ptr, i64 %value) { define dso_local i64 @atomicrmw_max_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0-LABEL: atomicrmw_max_i64_aligned_release: -; -O0: subs x10, x8, x9 -; -O0: csel x12, x8, x9, gt -; -O0: ldaxr x9, [x11] -; -O0: cmp x9, x8 -; -O0: stlxr w10, x12, [x11] -; -O0: subs x8, x9, x8 +; -O0: subs x9, x0, x8 +; -O0: csel x1, x0, x8, gt +; -O0: bl __aarch64_cas8_rel +; -O0: subs x8, x0, x8 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_release: ; -O1: ldxr x0, [x8] @@ -4584,12 +4358,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_release(ptr %ptr, i64 %value) { define dso_local i64 @atomicrmw_max_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0-LABEL: atomicrmw_max_i64_aligned_acq_rel: -; -O0: subs x10, x8, x9 -; -O0: csel x12, x8, x9, gt -; -O0: ldaxr x9, [x11] -; -O0: cmp x9, x8 -; -O0: stlxr w10, x12, [x11] -; -O0: subs x8, x9, x8 +; -O0: subs x9, x0, x8 +; -O0: csel x1, x0, x8, gt +; -O0: bl __aarch64_cas8_acq_rel +; -O0: subs x8, x0, x8 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_acq_rel: ; -O1: ldaxr x0, [x8] @@ -4602,12 +4374,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_acq_rel(ptr %ptr, i64 %value) { define dso_local i64 @atomicrmw_max_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0-LABEL: atomicrmw_max_i64_aligned_seq_cst: -; -O0: subs x10, x8, x9 -; -O0: csel x12, x8, x9, gt -; -O0: ldaxr x9, [x11] -; -O0: cmp x9, x8 -; -O0: stlxr w10, x12, [x11] -; -O0: subs x8, x9, x8 +; -O0: subs x9, x0, x8 +; -O0: csel x1, x0, x8, gt +; -O0: bl __aarch64_cas8_acq_rel +; -O0: subs x8, x0, x8 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_seq_cst: ; -O1: ldaxr x0, [x8] @@ -4621,21 +4391,17 @@ define dso_local i64 @atomicrmw_max_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: 
atomicrmw_max_i128_aligned_monotonic: ; -O0: subs x8, x8, x9 -; -O0: subs x8, x8, x12 -; -O0: subs x13, x13, x9 +; -O0: subs x8, x8, x11 +; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq -; -O0: ands w13, w10, #0x1 -; -O0: csel x14, x8, x12, ne +; -O0: ands w12, w10, #0x1 +; -O0: csel x2, x8, x11, ne ; -O0: ands w10, w10, #0x1 -; -O0: csel x15, x8, x9, ne -; -O0: ldxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stxp w8, x14, x15, [x11] -; -O0: stxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: csel x3, x8, x9, ne +; -O0: bl __aarch64_cas16_relax +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_max_i128_aligned_monotonic: @@ -4651,21 +4417,17 @@ define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_acquire: ; -O0: subs x8, x8, x9 -; -O0: subs x8, x8, x12 -; -O0: subs x13, x13, x9 +; -O0: subs x8, x8, x11 +; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq -; -O0: ands w13, w10, #0x1 -; -O0: csel x14, x8, x12, ne +; -O0: ands w12, w10, #0x1 +; -O0: csel x2, x8, x11, ne ; -O0: ands w10, w10, #0x1 -; -O0: csel x15, x8, x9, ne -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stxp w8, x14, x15, [x11] -; -O0: stxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: csel x3, x8, x9, ne +; -O0: bl __aarch64_cas16_acq +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acquire: @@ -4681,21 +4443,17 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_release: ; -O0: subs x8, x8, x9 -; -O0: subs x8, x8, x12 -; -O0: subs x13, x13, x9 +; -O0: subs x8, x8, x11 +; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq -; -O0: ands w13, w10, #0x1 -; -O0: csel x14, x8, x12, ne +; -O0: ands w12, w10, #0x1 +; -O0: csel x2, x8, x11, ne ; -O0: ands w10, w10, #0x1 -; -O0: csel x15, x8, x9, ne -; -O0: ldxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: csel x3, x8, x9, ne +; -O0: bl __aarch64_cas16_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_max_i128_aligned_release: @@ -4711,21 +4469,17 @@ define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_acq_rel: ; -O0: subs x8, x8, x9 -; -O0: subs x8, x8, x12 -; -O0: subs x13, x13, x9 +; -O0: subs x8, x8, x11 +; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq -; -O0: ands w13, w10, #0x1 -; -O0: csel x14, x8, x12, ne +; -O0: ands w12, w10, #0x1 +; -O0: csel x2, x8, x11, ne ; -O0: ands w10, w10, #0x1 -; -O0: csel x15, x8, x9, ne -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: csel x3, x8, x9, ne +; -O0: bl 
__aarch64_cas16_acq_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acq_rel: @@ -4741,21 +4495,17 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_max_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_seq_cst: ; -O0: subs x8, x8, x9 -; -O0: subs x8, x8, x12 -; -O0: subs x13, x13, x9 +; -O0: subs x8, x8, x11 +; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq -; -O0: ands w13, w10, #0x1 -; -O0: csel x14, x8, x12, ne +; -O0: ands w12, w10, #0x1 +; -O0: csel x2, x8, x11, ne ; -O0: ands w10, w10, #0x1 -; -O0: csel x15, x8, x9, ne -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: csel x3, x8, x9, ne +; -O0: bl __aarch64_cas16_acq_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_max_i128_aligned_seq_cst: @@ -4770,14 +4520,12 @@ define dso_local i128 @atomicrmw_max_i128_aligned_seq_cst(ptr %ptr, i128 %value) define dso_local i8 @atomicrmw_max_i8_unaligned_monotonic(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_max_i8_unaligned_monotonic: -; -O0: sxtb w9, w10 +; -O0: sxtb w9, w0 ; -O0: subs w9, w9, w8, sxtb -; -O0: csel w12, w10, w8, gt -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, gt +; -O0: bl __aarch64_cas1_relax +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_max_i8_unaligned_monotonic: ; -O1: ldxrb w9, [x0] @@ -4791,14 +4539,12 @@ define dso_local i8 @atomicrmw_max_i8_unaligned_monotonic(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_max_i8_unaligned_acquire(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_max_i8_unaligned_acquire: -; -O0: sxtb w9, w10 +; -O0: sxtb w9, w0 ; -O0: subs w9, w9, w8, sxtb -; -O0: csel w12, w10, w8, gt -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, gt +; -O0: bl __aarch64_cas1_acq +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_max_i8_unaligned_acquire: ; -O1: ldaxrb w9, [x0] @@ -4812,14 +4558,12 @@ define dso_local i8 @atomicrmw_max_i8_unaligned_acquire(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_max_i8_unaligned_release(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_max_i8_unaligned_release: -; -O0: sxtb w9, w10 +; -O0: sxtb w9, w0 ; -O0: subs w9, w9, w8, sxtb -; -O0: csel w12, w10, w8, gt -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, gt +; -O0: bl __aarch64_cas1_rel +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_max_i8_unaligned_release: ; -O1: ldxrb w9, [x0] @@ -4833,14 +4577,12 @@ define dso_local i8 @atomicrmw_max_i8_unaligned_release(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_max_i8_unaligned_acq_rel(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_max_i8_unaligned_acq_rel: -; -O0: sxtb w9, w10 +; -O0: sxtb w9, w0 ; -O0: subs w9, w9, w8, sxtb -; -O0: csel w12, w10, w8, gt -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, 
w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, gt +; -O0: bl __aarch64_cas1_acq_rel +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_max_i8_unaligned_acq_rel: ; -O1: ldaxrb w9, [x0] @@ -4854,14 +4596,12 @@ define dso_local i8 @atomicrmw_max_i8_unaligned_acq_rel(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_max_i8_unaligned_seq_cst(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_max_i8_unaligned_seq_cst: -; -O0: sxtb w9, w10 +; -O0: sxtb w9, w0 ; -O0: subs w9, w9, w8, sxtb -; -O0: csel w12, w10, w8, gt -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, gt +; -O0: bl __aarch64_cas1_acq_rel +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_max_i8_unaligned_seq_cst: ; -O1: ldaxrb w9, [x0] @@ -5205,14 +4945,12 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_seq_cst(ptr %ptr, i128 %valu define dso_local i8 @atomicrmw_min_i8_aligned_monotonic(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_min_i8_aligned_monotonic: -; -O0: sxtb w9, w10 +; -O0: sxtb w9, w0 ; -O0: subs w9, w9, w8, sxtb -; -O0: csel w12, w10, w8, le -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, le +; -O0: bl __aarch64_cas1_relax +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_min_i8_aligned_monotonic: ; -O1: ldxrb w9, [x0] @@ -5226,14 +4964,12 @@ define dso_local i8 @atomicrmw_min_i8_aligned_monotonic(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_min_i8_aligned_acquire(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_min_i8_aligned_acquire: -; -O0: sxtb w9, w10 +; -O0: sxtb w9, w0 ; -O0: subs w9, w9, w8, sxtb -; -O0: csel w12, w10, w8, le -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, le +; -O0: bl __aarch64_cas1_acq +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_min_i8_aligned_acquire: ; -O1: ldaxrb w9, [x0] @@ -5247,14 +4983,12 @@ define dso_local i8 @atomicrmw_min_i8_aligned_acquire(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_min_i8_aligned_release(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_min_i8_aligned_release: -; -O0: sxtb w9, w10 +; -O0: sxtb w9, w0 ; -O0: subs w9, w9, w8, sxtb -; -O0: csel w12, w10, w8, le -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, le +; -O0: bl __aarch64_cas1_rel +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_min_i8_aligned_release: ; -O1: ldxrb w9, [x0] @@ -5268,14 +5002,12 @@ define dso_local i8 @atomicrmw_min_i8_aligned_release(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_min_i8_aligned_acq_rel(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_min_i8_aligned_acq_rel: -; -O0: sxtb w9, w10 +; -O0: sxtb w9, w0 ; -O0: subs w9, w9, w8, sxtb -; -O0: csel w12, w10, w8, le -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, le +; -O0: bl __aarch64_cas1_acq_rel +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_min_i8_aligned_acq_rel: ; -O1: ldaxrb w9, [x0] @@ -5289,14 +5021,12 @@ define dso_local i8 
@atomicrmw_min_i8_aligned_acq_rel(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_min_i8_aligned_seq_cst(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_min_i8_aligned_seq_cst: -; -O0: sxtb w9, w10 +; -O0: sxtb w9, w0 ; -O0: subs w9, w9, w8, sxtb -; -O0: csel w12, w10, w8, le -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, le +; -O0: bl __aarch64_cas1_acq_rel +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_min_i8_aligned_seq_cst: ; -O1: ldaxrb w9, [x0] @@ -5310,13 +5040,11 @@ define dso_local i8 @atomicrmw_min_i8_aligned_seq_cst(ptr %ptr, i8 %value) { define dso_local i16 @atomicrmw_min_i16_aligned_monotonic(ptr %ptr, i16 %value) { ; -O0-LABEL: atomicrmw_min_i16_aligned_monotonic: -; -O0: sxth w10, w8 -; -O0: subs w10, w10, w9, sxth -; -O0: csel w12, w8, w9, le -; -O0: ldaxrh w9, [x11] -; -O0: cmp w9, w8, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w8, w8, w9, uxth +; -O0: sxth w9, w0 +; -O0: subs w9, w9, w8, sxth +; -O0: csel w1, w0, w8, le +; -O0: bl __aarch64_cas2_relax +; -O0: subs w8, w8, w0, uxth ; ; -O1-LABEL: atomicrmw_min_i16_aligned_monotonic: ; -O1: ldxrh w9, [x0] @@ -5330,13 +5058,11 @@ define dso_local i16 @atomicrmw_min_i16_aligned_monotonic(ptr %ptr, i16 %value) define dso_local i16 @atomicrmw_min_i16_aligned_acquire(ptr %ptr, i16 %value) { ; -O0-LABEL: atomicrmw_min_i16_aligned_acquire: -; -O0: sxth w10, w8 -; -O0: subs w10, w10, w9, sxth -; -O0: csel w12, w8, w9, le -; -O0: ldaxrh w9, [x11] -; -O0: cmp w9, w8, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w8, w8, w9, uxth +; -O0: sxth w9, w0 +; -O0: subs w9, w9, w8, sxth +; -O0: csel w1, w0, w8, le +; -O0: bl __aarch64_cas2_acq +; -O0: subs w8, w8, w0, uxth ; ; -O1-LABEL: atomicrmw_min_i16_aligned_acquire: ; -O1: ldaxrh w9, [x0] @@ -5350,13 +5076,11 @@ define dso_local i16 @atomicrmw_min_i16_aligned_acquire(ptr %ptr, i16 %value) { define dso_local i16 @atomicrmw_min_i16_aligned_release(ptr %ptr, i16 %value) { ; -O0-LABEL: atomicrmw_min_i16_aligned_release: -; -O0: sxth w10, w8 -; -O0: subs w10, w10, w9, sxth -; -O0: csel w12, w8, w9, le -; -O0: ldaxrh w9, [x11] -; -O0: cmp w9, w8, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w8, w8, w9, uxth +; -O0: sxth w9, w0 +; -O0: subs w9, w9, w8, sxth +; -O0: csel w1, w0, w8, le +; -O0: bl __aarch64_cas2_rel +; -O0: subs w8, w8, w0, uxth ; ; -O1-LABEL: atomicrmw_min_i16_aligned_release: ; -O1: ldxrh w9, [x0] @@ -5370,13 +5094,11 @@ define dso_local i16 @atomicrmw_min_i16_aligned_release(ptr %ptr, i16 %value) { define dso_local i16 @atomicrmw_min_i16_aligned_acq_rel(ptr %ptr, i16 %value) { ; -O0-LABEL: atomicrmw_min_i16_aligned_acq_rel: -; -O0: sxth w10, w8 -; -O0: subs w10, w10, w9, sxth -; -O0: csel w12, w8, w9, le -; -O0: ldaxrh w9, [x11] -; -O0: cmp w9, w8, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w8, w8, w9, uxth +; -O0: sxth w9, w0 +; -O0: subs w9, w9, w8, sxth +; -O0: csel w1, w0, w8, le +; -O0: bl __aarch64_cas2_acq_rel +; -O0: subs w8, w8, w0, uxth ; ; -O1-LABEL: atomicrmw_min_i16_aligned_acq_rel: ; -O1: ldaxrh w9, [x0] @@ -5390,13 +5112,11 @@ define dso_local i16 @atomicrmw_min_i16_aligned_acq_rel(ptr %ptr, i16 %value) { define dso_local i16 @atomicrmw_min_i16_aligned_seq_cst(ptr %ptr, i16 %value) { ; -O0-LABEL: atomicrmw_min_i16_aligned_seq_cst: -; -O0: sxth w10, w8 -; -O0: subs w10, w10, w9, sxth -; -O0: csel w12, w8, w9, le -; -O0: ldaxrh w9, [x11] -; -O0: cmp w9, w8, uxth -; -O0: stlxrh w10, w12, [x11] 
-; -O0: subs w8, w8, w9, uxth +; -O0: sxth w9, w0 +; -O0: subs w9, w9, w8, sxth +; -O0: csel w1, w0, w8, le +; -O0: bl __aarch64_cas2_acq_rel +; -O0: subs w8, w8, w0, uxth ; ; -O1-LABEL: atomicrmw_min_i16_aligned_seq_cst: ; -O1: ldaxrh w9, [x0] @@ -5410,12 +5130,10 @@ define dso_local i16 @atomicrmw_min_i16_aligned_seq_cst(ptr %ptr, i16 %value) { define dso_local i32 @atomicrmw_min_i32_aligned_monotonic(ptr %ptr, i32 %value) { ; -O0-LABEL: atomicrmw_min_i32_aligned_monotonic: -; -O0: subs w10, w8, w9 -; -O0: csel w12, w8, w9, le -; -O0: ldaxr w9, [x11] -; -O0: cmp w9, w8 -; -O0: stlxr w10, w12, [x11] -; -O0: subs w8, w9, w8 +; -O0: subs w9, w0, w8 +; -O0: csel w1, w0, w8, le +; -O0: bl __aarch64_cas4_relax +; -O0: subs w8, w0, w8 ; ; -O1-LABEL: atomicrmw_min_i32_aligned_monotonic: ; -O1: ldxr w8, [x0] @@ -5428,12 +5146,10 @@ define dso_local i32 @atomicrmw_min_i32_aligned_monotonic(ptr %ptr, i32 %value) define dso_local i32 @atomicrmw_min_i32_aligned_acquire(ptr %ptr, i32 %value) { ; -O0-LABEL: atomicrmw_min_i32_aligned_acquire: -; -O0: subs w10, w8, w9 -; -O0: csel w12, w8, w9, le -; -O0: ldaxr w9, [x11] -; -O0: cmp w9, w8 -; -O0: stlxr w10, w12, [x11] -; -O0: subs w8, w9, w8 +; -O0: subs w9, w0, w8 +; -O0: csel w1, w0, w8, le +; -O0: bl __aarch64_cas4_acq +; -O0: subs w8, w0, w8 ; ; -O1-LABEL: atomicrmw_min_i32_aligned_acquire: ; -O1: ldaxr w8, [x0] @@ -5446,12 +5162,10 @@ define dso_local i32 @atomicrmw_min_i32_aligned_acquire(ptr %ptr, i32 %value) { define dso_local i32 @atomicrmw_min_i32_aligned_release(ptr %ptr, i32 %value) { ; -O0-LABEL: atomicrmw_min_i32_aligned_release: -; -O0: subs w10, w8, w9 -; -O0: csel w12, w8, w9, le -; -O0: ldaxr w9, [x11] -; -O0: cmp w9, w8 -; -O0: stlxr w10, w12, [x11] -; -O0: subs w8, w9, w8 +; -O0: subs w9, w0, w8 +; -O0: csel w1, w0, w8, le +; -O0: bl __aarch64_cas4_rel +; -O0: subs w8, w0, w8 ; ; -O1-LABEL: atomicrmw_min_i32_aligned_release: ; -O1: ldxr w8, [x0] @@ -5464,12 +5178,10 @@ define dso_local i32 @atomicrmw_min_i32_aligned_release(ptr %ptr, i32 %value) { define dso_local i32 @atomicrmw_min_i32_aligned_acq_rel(ptr %ptr, i32 %value) { ; -O0-LABEL: atomicrmw_min_i32_aligned_acq_rel: -; -O0: subs w10, w8, w9 -; -O0: csel w12, w8, w9, le -; -O0: ldaxr w9, [x11] -; -O0: cmp w9, w8 -; -O0: stlxr w10, w12, [x11] -; -O0: subs w8, w9, w8 +; -O0: subs w9, w0, w8 +; -O0: csel w1, w0, w8, le +; -O0: bl __aarch64_cas4_acq_rel +; -O0: subs w8, w0, w8 ; ; -O1-LABEL: atomicrmw_min_i32_aligned_acq_rel: ; -O1: ldaxr w8, [x0] @@ -5482,12 +5194,10 @@ define dso_local i32 @atomicrmw_min_i32_aligned_acq_rel(ptr %ptr, i32 %value) { define dso_local i32 @atomicrmw_min_i32_aligned_seq_cst(ptr %ptr, i32 %value) { ; -O0-LABEL: atomicrmw_min_i32_aligned_seq_cst: -; -O0: subs w10, w8, w9 -; -O0: csel w12, w8, w9, le -; -O0: ldaxr w9, [x11] -; -O0: cmp w9, w8 -; -O0: stlxr w10, w12, [x11] -; -O0: subs w8, w9, w8 +; -O0: subs w9, w0, w8 +; -O0: csel w1, w0, w8, le +; -O0: bl __aarch64_cas4_acq_rel +; -O0: subs w8, w0, w8 ; ; -O1-LABEL: atomicrmw_min_i32_aligned_seq_cst: ; -O1: ldaxr w8, [x0] @@ -5500,12 +5210,10 @@ define dso_local i32 @atomicrmw_min_i32_aligned_seq_cst(ptr %ptr, i32 %value) { define dso_local i64 @atomicrmw_min_i64_aligned_monotonic(ptr %ptr, i64 %value) { ; -O0-LABEL: atomicrmw_min_i64_aligned_monotonic: -; -O0: subs x10, x8, x9 -; -O0: csel x12, x8, x9, le -; -O0: ldaxr x9, [x11] -; -O0: cmp x9, x8 -; -O0: stlxr w10, x12, [x11] -; -O0: subs x8, x9, x8 +; -O0: subs x9, x0, x8 +; -O0: csel x1, x0, x8, le +; -O0: bl __aarch64_cas8_relax +; -O0: subs x8, 
x0, x8 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_monotonic: ; -O1: ldxr x0, [x8] @@ -5518,12 +5226,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_monotonic(ptr %ptr, i64 %value) define dso_local i64 @atomicrmw_min_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0-LABEL: atomicrmw_min_i64_aligned_acquire: -; -O0: subs x10, x8, x9 -; -O0: csel x12, x8, x9, le -; -O0: ldaxr x9, [x11] -; -O0: cmp x9, x8 -; -O0: stlxr w10, x12, [x11] -; -O0: subs x8, x9, x8 +; -O0: subs x9, x0, x8 +; -O0: csel x1, x0, x8, le +; -O0: bl __aarch64_cas8_acq +; -O0: subs x8, x0, x8 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_acquire: ; -O1: ldaxr x0, [x8] @@ -5536,12 +5242,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_acquire(ptr %ptr, i64 %value) { define dso_local i64 @atomicrmw_min_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0-LABEL: atomicrmw_min_i64_aligned_release: -; -O0: subs x10, x8, x9 -; -O0: csel x12, x8, x9, le -; -O0: ldaxr x9, [x11] -; -O0: cmp x9, x8 -; -O0: stlxr w10, x12, [x11] -; -O0: subs x8, x9, x8 +; -O0: subs x9, x0, x8 +; -O0: csel x1, x0, x8, le +; -O0: bl __aarch64_cas8_rel +; -O0: subs x8, x0, x8 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_release: ; -O1: ldxr x0, [x8] @@ -5554,12 +5258,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_release(ptr %ptr, i64 %value) { define dso_local i64 @atomicrmw_min_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0-LABEL: atomicrmw_min_i64_aligned_acq_rel: -; -O0: subs x10, x8, x9 -; -O0: csel x12, x8, x9, le -; -O0: ldaxr x9, [x11] -; -O0: cmp x9, x8 -; -O0: stlxr w10, x12, [x11] -; -O0: subs x8, x9, x8 +; -O0: subs x9, x0, x8 +; -O0: csel x1, x0, x8, le +; -O0: bl __aarch64_cas8_acq_rel +; -O0: subs x8, x0, x8 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_acq_rel: ; -O1: ldaxr x0, [x8] @@ -5572,12 +5274,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_acq_rel(ptr %ptr, i64 %value) { define dso_local i64 @atomicrmw_min_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0-LABEL: atomicrmw_min_i64_aligned_seq_cst: -; -O0: subs x10, x8, x9 -; -O0: csel x12, x8, x9, le -; -O0: ldaxr x9, [x11] -; -O0: cmp x9, x8 -; -O0: stlxr w10, x12, [x11] -; -O0: subs x8, x9, x8 +; -O0: subs x9, x0, x8 +; -O0: csel x1, x0, x8, le +; -O0: bl __aarch64_cas8_acq_rel +; -O0: subs x8, x0, x8 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_seq_cst: ; -O1: ldaxr x0, [x8] @@ -5591,21 +5291,17 @@ define dso_local i64 @atomicrmw_min_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_monotonic: ; -O0: subs x8, x8, x9 -; -O0: subs x8, x8, x12 -; -O0: subs x13, x13, x9 +; -O0: subs x8, x8, x11 +; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq -; -O0: ands w13, w10, #0x1 -; -O0: csel x14, x8, x12, ne +; -O0: ands w12, w10, #0x1 +; -O0: csel x2, x8, x11, ne ; -O0: ands w10, w10, #0x1 -; -O0: csel x15, x8, x9, ne -; -O0: ldxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stxp w8, x14, x15, [x11] -; -O0: stxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: csel x3, x8, x9, ne +; -O0: bl __aarch64_cas16_relax +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_min_i128_aligned_monotonic: @@ -5621,21 +5317,17 @@ define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_acquire: ; -O0: 
subs x8, x8, x9 -; -O0: subs x8, x8, x12 -; -O0: subs x13, x13, x9 +; -O0: subs x8, x8, x11 +; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq -; -O0: ands w13, w10, #0x1 -; -O0: csel x14, x8, x12, ne +; -O0: ands w12, w10, #0x1 +; -O0: csel x2, x8, x11, ne ; -O0: ands w10, w10, #0x1 -; -O0: csel x15, x8, x9, ne -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stxp w8, x14, x15, [x11] -; -O0: stxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: csel x3, x8, x9, ne +; -O0: bl __aarch64_cas16_acq +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acquire: @@ -5651,21 +5343,17 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_release: ; -O0: subs x8, x8, x9 -; -O0: subs x8, x8, x12 -; -O0: subs x13, x13, x9 +; -O0: subs x8, x8, x11 +; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq -; -O0: ands w13, w10, #0x1 -; -O0: csel x14, x8, x12, ne +; -O0: ands w12, w10, #0x1 +; -O0: csel x2, x8, x11, ne ; -O0: ands w10, w10, #0x1 -; -O0: csel x15, x8, x9, ne -; -O0: ldxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: csel x3, x8, x9, ne +; -O0: bl __aarch64_cas16_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_min_i128_aligned_release: @@ -5681,21 +5369,17 @@ define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_acq_rel: ; -O0: subs x8, x8, x9 -; -O0: subs x8, x8, x12 -; -O0: subs x13, x13, x9 +; -O0: subs x8, x8, x11 +; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq -; -O0: ands w13, w10, #0x1 -; -O0: csel x14, x8, x12, ne +; -O0: ands w12, w10, #0x1 +; -O0: csel x2, x8, x11, ne ; -O0: ands w10, w10, #0x1 -; -O0: csel x15, x8, x9, ne -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: csel x3, x8, x9, ne +; -O0: bl __aarch64_cas16_acq_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acq_rel: @@ -5711,21 +5395,17 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_min_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_seq_cst: ; -O0: subs x8, x8, x9 -; -O0: subs x8, x8, x12 -; -O0: subs x13, x13, x9 +; -O0: subs x8, x8, x11 +; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq -; -O0: ands w13, w10, #0x1 -; -O0: csel x14, x8, x12, ne +; -O0: ands w12, w10, #0x1 +; -O0: csel x2, x8, x11, ne ; -O0: ands w10, w10, #0x1 -; -O0: csel x15, x8, x9, ne -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: csel x3, x8, x9, ne +; -O0: bl __aarch64_cas16_acq_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, 
x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_min_i128_aligned_seq_cst: @@ -5740,14 +5420,12 @@ define dso_local i128 @atomicrmw_min_i128_aligned_seq_cst(ptr %ptr, i128 %value) define dso_local i8 @atomicrmw_min_i8_unaligned_monotonic(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_min_i8_unaligned_monotonic: -; -O0: sxtb w9, w10 +; -O0: sxtb w9, w0 ; -O0: subs w9, w9, w8, sxtb -; -O0: csel w12, w10, w8, le -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, le +; -O0: bl __aarch64_cas1_relax +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_min_i8_unaligned_monotonic: ; -O1: ldxrb w9, [x0] @@ -5761,14 +5439,12 @@ define dso_local i8 @atomicrmw_min_i8_unaligned_monotonic(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_min_i8_unaligned_acquire(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_min_i8_unaligned_acquire: -; -O0: sxtb w9, w10 +; -O0: sxtb w9, w0 ; -O0: subs w9, w9, w8, sxtb -; -O0: csel w12, w10, w8, le -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, le +; -O0: bl __aarch64_cas1_acq +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_min_i8_unaligned_acquire: ; -O1: ldaxrb w9, [x0] @@ -5782,14 +5458,12 @@ define dso_local i8 @atomicrmw_min_i8_unaligned_acquire(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_min_i8_unaligned_release(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_min_i8_unaligned_release: -; -O0: sxtb w9, w10 +; -O0: sxtb w9, w0 ; -O0: subs w9, w9, w8, sxtb -; -O0: csel w12, w10, w8, le -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, le +; -O0: bl __aarch64_cas1_rel +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_min_i8_unaligned_release: ; -O1: ldxrb w9, [x0] @@ -5803,14 +5477,12 @@ define dso_local i8 @atomicrmw_min_i8_unaligned_release(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_min_i8_unaligned_acq_rel(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_min_i8_unaligned_acq_rel: -; -O0: sxtb w9, w10 +; -O0: sxtb w9, w0 ; -O0: subs w9, w9, w8, sxtb -; -O0: csel w12, w10, w8, le -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, le +; -O0: bl __aarch64_cas1_acq_rel +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_min_i8_unaligned_acq_rel: ; -O1: ldaxrb w9, [x0] @@ -5824,14 +5496,12 @@ define dso_local i8 @atomicrmw_min_i8_unaligned_acq_rel(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_min_i8_unaligned_seq_cst(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_min_i8_unaligned_seq_cst: -; -O0: sxtb w9, w10 +; -O0: sxtb w9, w0 ; -O0: subs w9, w9, w8, sxtb -; -O0: csel w12, w10, w8, le -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, le +; -O0: bl __aarch64_cas1_acq_rel +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_min_i8_unaligned_seq_cst: ; -O1: ldaxrb w9, [x0] @@ -6175,14 +5845,12 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_seq_cst(ptr %ptr, i128 %valu define dso_local i8 
@atomicrmw_umax_i8_aligned_monotonic(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_umax_i8_aligned_monotonic: -; -O0: and w9, w10, #0xff +; -O0: and w9, w0, #0xff ; -O0: subs w9, w9, w8, uxtb -; -O0: csel w12, w10, w8, hi -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, hi +; -O0: bl __aarch64_cas1_relax +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_umax_i8_aligned_monotonic: ; -O1: and w9, w1, #0xff @@ -6196,14 +5864,12 @@ define dso_local i8 @atomicrmw_umax_i8_aligned_monotonic(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_umax_i8_aligned_acquire(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_umax_i8_aligned_acquire: -; -O0: and w9, w10, #0xff +; -O0: and w9, w0, #0xff ; -O0: subs w9, w9, w8, uxtb -; -O0: csel w12, w10, w8, hi -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, hi +; -O0: bl __aarch64_cas1_acq +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_umax_i8_aligned_acquire: ; -O1: and w9, w1, #0xff @@ -6217,14 +5883,12 @@ define dso_local i8 @atomicrmw_umax_i8_aligned_acquire(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_umax_i8_aligned_release(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_umax_i8_aligned_release: -; -O0: and w9, w10, #0xff +; -O0: and w9, w0, #0xff ; -O0: subs w9, w9, w8, uxtb -; -O0: csel w12, w10, w8, hi -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, hi +; -O0: bl __aarch64_cas1_rel +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_umax_i8_aligned_release: ; -O1: and w9, w1, #0xff @@ -6238,14 +5902,12 @@ define dso_local i8 @atomicrmw_umax_i8_aligned_release(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_umax_i8_aligned_acq_rel(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_umax_i8_aligned_acq_rel: -; -O0: and w9, w10, #0xff +; -O0: and w9, w0, #0xff ; -O0: subs w9, w9, w8, uxtb -; -O0: csel w12, w10, w8, hi -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, hi +; -O0: bl __aarch64_cas1_acq_rel +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_umax_i8_aligned_acq_rel: ; -O1: and w9, w1, #0xff @@ -6259,14 +5921,12 @@ define dso_local i8 @atomicrmw_umax_i8_aligned_acq_rel(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_umax_i8_aligned_seq_cst(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_umax_i8_aligned_seq_cst: -; -O0: and w9, w10, #0xff +; -O0: and w9, w0, #0xff ; -O0: subs w9, w9, w8, uxtb -; -O0: csel w12, w10, w8, hi -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, hi +; -O0: bl __aarch64_cas1_acq_rel +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_umax_i8_aligned_seq_cst: ; -O1: and w9, w1, #0xff @@ -6280,12 +5940,10 @@ define dso_local i8 @atomicrmw_umax_i8_aligned_seq_cst(ptr %ptr, i8 %value) { define dso_local i16 @atomicrmw_umax_i16_aligned_monotonic(ptr %ptr, i16 %value) { ; -O0-LABEL: atomicrmw_umax_i16_aligned_monotonic: -; -O0: subs w10, w10, w9, uxth -; -O0: csel w12, w8, w9, hi -; -O0: ldaxrh w9, [x11] 
-; -O0: cmp w9, w8, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w8, w8, w9, uxth +; -O0: subs w9, w9, w8, uxth +; -O0: csel w1, w0, w8, hi +; -O0: bl __aarch64_cas2_relax +; -O0: subs w8, w8, w0, uxth ; ; -O1-LABEL: atomicrmw_umax_i16_aligned_monotonic: ; -O1: and w9, w1, #0xffff @@ -6299,12 +5957,10 @@ define dso_local i16 @atomicrmw_umax_i16_aligned_monotonic(ptr %ptr, i16 %value) define dso_local i16 @atomicrmw_umax_i16_aligned_acquire(ptr %ptr, i16 %value) { ; -O0-LABEL: atomicrmw_umax_i16_aligned_acquire: -; -O0: subs w10, w10, w9, uxth -; -O0: csel w12, w8, w9, hi -; -O0: ldaxrh w9, [x11] -; -O0: cmp w9, w8, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w8, w8, w9, uxth +; -O0: subs w9, w9, w8, uxth +; -O0: csel w1, w0, w8, hi +; -O0: bl __aarch64_cas2_acq +; -O0: subs w8, w8, w0, uxth ; ; -O1-LABEL: atomicrmw_umax_i16_aligned_acquire: ; -O1: and w9, w1, #0xffff @@ -6318,12 +5974,10 @@ define dso_local i16 @atomicrmw_umax_i16_aligned_acquire(ptr %ptr, i16 %value) { define dso_local i16 @atomicrmw_umax_i16_aligned_release(ptr %ptr, i16 %value) { ; -O0-LABEL: atomicrmw_umax_i16_aligned_release: -; -O0: subs w10, w10, w9, uxth -; -O0: csel w12, w8, w9, hi -; -O0: ldaxrh w9, [x11] -; -O0: cmp w9, w8, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w8, w8, w9, uxth +; -O0: subs w9, w9, w8, uxth +; -O0: csel w1, w0, w8, hi +; -O0: bl __aarch64_cas2_rel +; -O0: subs w8, w8, w0, uxth ; ; -O1-LABEL: atomicrmw_umax_i16_aligned_release: ; -O1: and w9, w1, #0xffff @@ -6337,12 +5991,10 @@ define dso_local i16 @atomicrmw_umax_i16_aligned_release(ptr %ptr, i16 %value) { define dso_local i16 @atomicrmw_umax_i16_aligned_acq_rel(ptr %ptr, i16 %value) { ; -O0-LABEL: atomicrmw_umax_i16_aligned_acq_rel: -; -O0: subs w10, w10, w9, uxth -; -O0: csel w12, w8, w9, hi -; -O0: ldaxrh w9, [x11] -; -O0: cmp w9, w8, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w8, w8, w9, uxth +; -O0: subs w9, w9, w8, uxth +; -O0: csel w1, w0, w8, hi +; -O0: bl __aarch64_cas2_acq_rel +; -O0: subs w8, w8, w0, uxth ; ; -O1-LABEL: atomicrmw_umax_i16_aligned_acq_rel: ; -O1: and w9, w1, #0xffff @@ -6356,12 +6008,10 @@ define dso_local i16 @atomicrmw_umax_i16_aligned_acq_rel(ptr %ptr, i16 %value) { define dso_local i16 @atomicrmw_umax_i16_aligned_seq_cst(ptr %ptr, i16 %value) { ; -O0-LABEL: atomicrmw_umax_i16_aligned_seq_cst: -; -O0: subs w10, w10, w9, uxth -; -O0: csel w12, w8, w9, hi -; -O0: ldaxrh w9, [x11] -; -O0: cmp w9, w8, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w8, w8, w9, uxth +; -O0: subs w9, w9, w8, uxth +; -O0: csel w1, w0, w8, hi +; -O0: bl __aarch64_cas2_acq_rel +; -O0: subs w8, w8, w0, uxth ; ; -O1-LABEL: atomicrmw_umax_i16_aligned_seq_cst: ; -O1: and w9, w1, #0xffff @@ -6375,12 +6025,10 @@ define dso_local i16 @atomicrmw_umax_i16_aligned_seq_cst(ptr %ptr, i16 %value) { define dso_local i32 @atomicrmw_umax_i32_aligned_monotonic(ptr %ptr, i32 %value) { ; -O0-LABEL: atomicrmw_umax_i32_aligned_monotonic: -; -O0: subs w10, w8, w9 -; -O0: csel w12, w8, w9, hi -; -O0: ldaxr w9, [x11] -; -O0: cmp w9, w8 -; -O0: stlxr w10, w12, [x11] -; -O0: subs w8, w9, w8 +; -O0: subs w9, w0, w8 +; -O0: csel w1, w0, w8, hi +; -O0: bl __aarch64_cas4_relax +; -O0: subs w8, w0, w8 ; ; -O1-LABEL: atomicrmw_umax_i32_aligned_monotonic: ; -O1: ldxr w8, [x0] @@ -6393,12 +6041,10 @@ define dso_local i32 @atomicrmw_umax_i32_aligned_monotonic(ptr %ptr, i32 %value) define dso_local i32 @atomicrmw_umax_i32_aligned_acquire(ptr %ptr, i32 %value) { ; -O0-LABEL: atomicrmw_umax_i32_aligned_acquire: -; -O0: subs w10, w8, w9 -; -O0: csel 
w12, w8, w9, hi -; -O0: ldaxr w9, [x11] -; -O0: cmp w9, w8 -; -O0: stlxr w10, w12, [x11] -; -O0: subs w8, w9, w8 +; -O0: subs w9, w0, w8 +; -O0: csel w1, w0, w8, hi +; -O0: bl __aarch64_cas4_acq +; -O0: subs w8, w0, w8 ; ; -O1-LABEL: atomicrmw_umax_i32_aligned_acquire: ; -O1: ldaxr w8, [x0] @@ -6411,12 +6057,10 @@ define dso_local i32 @atomicrmw_umax_i32_aligned_acquire(ptr %ptr, i32 %value) { define dso_local i32 @atomicrmw_umax_i32_aligned_release(ptr %ptr, i32 %value) { ; -O0-LABEL: atomicrmw_umax_i32_aligned_release: -; -O0: subs w10, w8, w9 -; -O0: csel w12, w8, w9, hi -; -O0: ldaxr w9, [x11] -; -O0: cmp w9, w8 -; -O0: stlxr w10, w12, [x11] -; -O0: subs w8, w9, w8 +; -O0: subs w9, w0, w8 +; -O0: csel w1, w0, w8, hi +; -O0: bl __aarch64_cas4_rel +; -O0: subs w8, w0, w8 ; ; -O1-LABEL: atomicrmw_umax_i32_aligned_release: ; -O1: ldxr w8, [x0] @@ -6429,12 +6073,10 @@ define dso_local i32 @atomicrmw_umax_i32_aligned_release(ptr %ptr, i32 %value) { define dso_local i32 @atomicrmw_umax_i32_aligned_acq_rel(ptr %ptr, i32 %value) { ; -O0-LABEL: atomicrmw_umax_i32_aligned_acq_rel: -; -O0: subs w10, w8, w9 -; -O0: csel w12, w8, w9, hi -; -O0: ldaxr w9, [x11] -; -O0: cmp w9, w8 -; -O0: stlxr w10, w12, [x11] -; -O0: subs w8, w9, w8 +; -O0: subs w9, w0, w8 +; -O0: csel w1, w0, w8, hi +; -O0: bl __aarch64_cas4_acq_rel +; -O0: subs w8, w0, w8 ; ; -O1-LABEL: atomicrmw_umax_i32_aligned_acq_rel: ; -O1: ldaxr w8, [x0] @@ -6447,12 +6089,10 @@ define dso_local i32 @atomicrmw_umax_i32_aligned_acq_rel(ptr %ptr, i32 %value) { define dso_local i32 @atomicrmw_umax_i32_aligned_seq_cst(ptr %ptr, i32 %value) { ; -O0-LABEL: atomicrmw_umax_i32_aligned_seq_cst: -; -O0: subs w10, w8, w9 -; -O0: csel w12, w8, w9, hi -; -O0: ldaxr w9, [x11] -; -O0: cmp w9, w8 -; -O0: stlxr w10, w12, [x11] -; -O0: subs w8, w9, w8 +; -O0: subs w9, w0, w8 +; -O0: csel w1, w0, w8, hi +; -O0: bl __aarch64_cas4_acq_rel +; -O0: subs w8, w0, w8 ; ; -O1-LABEL: atomicrmw_umax_i32_aligned_seq_cst: ; -O1: ldaxr w8, [x0] @@ -6465,12 +6105,10 @@ define dso_local i32 @atomicrmw_umax_i32_aligned_seq_cst(ptr %ptr, i32 %value) { define dso_local i64 @atomicrmw_umax_i64_aligned_monotonic(ptr %ptr, i64 %value) { ; -O0-LABEL: atomicrmw_umax_i64_aligned_monotonic: -; -O0: subs x10, x8, x9 -; -O0: csel x12, x8, x9, hi -; -O0: ldaxr x9, [x11] -; -O0: cmp x9, x8 -; -O0: stlxr w10, x12, [x11] -; -O0: subs x8, x9, x8 +; -O0: subs x9, x0, x8 +; -O0: csel x1, x0, x8, hi +; -O0: bl __aarch64_cas8_relax +; -O0: subs x8, x0, x8 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_monotonic: ; -O1: ldxr x0, [x8] @@ -6483,12 +6121,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_monotonic(ptr %ptr, i64 %value) define dso_local i64 @atomicrmw_umax_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0-LABEL: atomicrmw_umax_i64_aligned_acquire: -; -O0: subs x10, x8, x9 -; -O0: csel x12, x8, x9, hi -; -O0: ldaxr x9, [x11] -; -O0: cmp x9, x8 -; -O0: stlxr w10, x12, [x11] -; -O0: subs x8, x9, x8 +; -O0: subs x9, x0, x8 +; -O0: csel x1, x0, x8, hi +; -O0: bl __aarch64_cas8_acq +; -O0: subs x8, x0, x8 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_acquire: ; -O1: ldaxr x0, [x8] @@ -6501,12 +6137,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_acquire(ptr %ptr, i64 %value) { define dso_local i64 @atomicrmw_umax_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0-LABEL: atomicrmw_umax_i64_aligned_release: -; -O0: subs x10, x8, x9 -; -O0: csel x12, x8, x9, hi -; -O0: ldaxr x9, [x11] -; -O0: cmp x9, x8 -; -O0: stlxr w10, x12, [x11] -; -O0: subs x8, x9, x8 +; -O0: subs x9, x0, x8 +; -O0: csel 
x1, x0, x8, hi +; -O0: bl __aarch64_cas8_rel +; -O0: subs x8, x0, x8 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_release: ; -O1: ldxr x0, [x8] @@ -6519,12 +6153,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_release(ptr %ptr, i64 %value) { define dso_local i64 @atomicrmw_umax_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0-LABEL: atomicrmw_umax_i64_aligned_acq_rel: -; -O0: subs x10, x8, x9 -; -O0: csel x12, x8, x9, hi -; -O0: ldaxr x9, [x11] -; -O0: cmp x9, x8 -; -O0: stlxr w10, x12, [x11] -; -O0: subs x8, x9, x8 +; -O0: subs x9, x0, x8 +; -O0: csel x1, x0, x8, hi +; -O0: bl __aarch64_cas8_acq_rel +; -O0: subs x8, x0, x8 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_acq_rel: ; -O1: ldaxr x0, [x8] @@ -6537,12 +6169,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_acq_rel(ptr %ptr, i64 %value) { define dso_local i64 @atomicrmw_umax_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0-LABEL: atomicrmw_umax_i64_aligned_seq_cst: -; -O0: subs x10, x8, x9 -; -O0: csel x12, x8, x9, hi -; -O0: ldaxr x9, [x11] -; -O0: cmp x9, x8 -; -O0: stlxr w10, x12, [x11] -; -O0: subs x8, x9, x8 +; -O0: subs x9, x0, x8 +; -O0: csel x1, x0, x8, hi +; -O0: bl __aarch64_cas8_acq_rel +; -O0: subs x8, x0, x8 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_seq_cst: ; -O1: ldaxr x0, [x8] @@ -6556,21 +6186,17 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_monotonic: ; -O0: subs x8, x8, x9 -; -O0: subs x8, x8, x12 -; -O0: subs x13, x13, x9 +; -O0: subs x8, x8, x11 +; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq -; -O0: ands w13, w10, #0x1 -; -O0: csel x14, x8, x12, ne +; -O0: ands w12, w10, #0x1 +; -O0: csel x2, x8, x11, ne ; -O0: ands w10, w10, #0x1 -; -O0: csel x15, x8, x9, ne -; -O0: ldxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stxp w8, x14, x15, [x11] -; -O0: stxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: csel x3, x8, x9, ne +; -O0: bl __aarch64_cas16_relax +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_monotonic: @@ -6586,21 +6212,17 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_acquire: ; -O0: subs x8, x8, x9 -; -O0: subs x8, x8, x12 -; -O0: subs x13, x13, x9 +; -O0: subs x8, x8, x11 +; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq -; -O0: ands w13, w10, #0x1 -; -O0: csel x14, x8, x12, ne +; -O0: ands w12, w10, #0x1 +; -O0: csel x2, x8, x11, ne ; -O0: ands w10, w10, #0x1 -; -O0: csel x15, x8, x9, ne -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stxp w8, x14, x15, [x11] -; -O0: stxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: csel x3, x8, x9, ne +; -O0: bl __aarch64_cas16_acq +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acquire: @@ -6616,21 +6238,17 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_release: ; -O0: subs x8, x8, x9 -; -O0: subs x8, x8, x12 -; -O0: subs x13, x13, x9 +; 
-O0: subs x8, x8, x11 +; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq -; -O0: ands w13, w10, #0x1 -; -O0: csel x14, x8, x12, ne +; -O0: ands w12, w10, #0x1 +; -O0: csel x2, x8, x11, ne ; -O0: ands w10, w10, #0x1 -; -O0: csel x15, x8, x9, ne -; -O0: ldxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: csel x3, x8, x9, ne +; -O0: bl __aarch64_cas16_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_release: @@ -6646,21 +6264,17 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_acq_rel: ; -O0: subs x8, x8, x9 -; -O0: subs x8, x8, x12 -; -O0: subs x13, x13, x9 +; -O0: subs x8, x8, x11 +; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq -; -O0: ands w13, w10, #0x1 -; -O0: csel x14, x8, x12, ne +; -O0: ands w12, w10, #0x1 +; -O0: csel x2, x8, x11, ne ; -O0: ands w10, w10, #0x1 -; -O0: csel x15, x8, x9, ne -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: csel x3, x8, x9, ne +; -O0: bl __aarch64_cas16_acq_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acq_rel: @@ -6676,21 +6290,17 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umax_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_seq_cst: ; -O0: subs x8, x8, x9 -; -O0: subs x8, x8, x12 -; -O0: subs x13, x13, x9 +; -O0: subs x8, x8, x11 +; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq -; -O0: ands w13, w10, #0x1 -; -O0: csel x14, x8, x12, ne +; -O0: ands w12, w10, #0x1 +; -O0: csel x2, x8, x11, ne ; -O0: ands w10, w10, #0x1 -; -O0: csel x15, x8, x9, ne -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: csel x3, x8, x9, ne +; -O0: bl __aarch64_cas16_acq_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_seq_cst: @@ -6705,14 +6315,12 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_seq_cst(ptr %ptr, i128 %value define dso_local i8 @atomicrmw_umax_i8_unaligned_monotonic(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_umax_i8_unaligned_monotonic: -; -O0: and w9, w10, #0xff +; -O0: and w9, w0, #0xff ; -O0: subs w9, w9, w8, uxtb -; -O0: csel w12, w10, w8, hi -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, hi +; -O0: bl __aarch64_cas1_relax +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_umax_i8_unaligned_monotonic: ; -O1: and w9, w1, #0xff @@ -6726,14 +6334,12 @@ define dso_local i8 @atomicrmw_umax_i8_unaligned_monotonic(ptr %ptr, i8 %value) define dso_local i8 @atomicrmw_umax_i8_unaligned_acquire(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_umax_i8_unaligned_acquire: -; -O0: and 
w9, w10, #0xff +; -O0: and w9, w0, #0xff ; -O0: subs w9, w9, w8, uxtb -; -O0: csel w12, w10, w8, hi -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, hi +; -O0: bl __aarch64_cas1_acq +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_umax_i8_unaligned_acquire: ; -O1: and w9, w1, #0xff @@ -6747,14 +6353,12 @@ define dso_local i8 @atomicrmw_umax_i8_unaligned_acquire(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_umax_i8_unaligned_release(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_umax_i8_unaligned_release: -; -O0: and w9, w10, #0xff +; -O0: and w9, w0, #0xff ; -O0: subs w9, w9, w8, uxtb -; -O0: csel w12, w10, w8, hi -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, hi +; -O0: bl __aarch64_cas1_rel +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_umax_i8_unaligned_release: ; -O1: and w9, w1, #0xff @@ -6768,14 +6372,12 @@ define dso_local i8 @atomicrmw_umax_i8_unaligned_release(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_umax_i8_unaligned_acq_rel(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_umax_i8_unaligned_acq_rel: -; -O0: and w9, w10, #0xff +; -O0: and w9, w0, #0xff ; -O0: subs w9, w9, w8, uxtb -; -O0: csel w12, w10, w8, hi -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, hi +; -O0: bl __aarch64_cas1_acq_rel +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_umax_i8_unaligned_acq_rel: ; -O1: and w9, w1, #0xff @@ -6789,14 +6391,12 @@ define dso_local i8 @atomicrmw_umax_i8_unaligned_acq_rel(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_umax_i8_unaligned_seq_cst(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_umax_i8_unaligned_seq_cst: -; -O0: and w9, w10, #0xff +; -O0: and w9, w0, #0xff ; -O0: subs w9, w9, w8, uxtb -; -O0: csel w12, w10, w8, hi -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, hi +; -O0: bl __aarch64_cas1_acq_rel +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_umax_i8_unaligned_seq_cst: ; -O1: and w9, w1, #0xff @@ -7135,14 +6735,12 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_seq_cst(ptr %ptr, i128 %val define dso_local i8 @atomicrmw_umin_i8_aligned_monotonic(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_umin_i8_aligned_monotonic: -; -O0: and w9, w10, #0xff +; -O0: and w9, w0, #0xff ; -O0: subs w9, w9, w8, uxtb -; -O0: csel w12, w10, w8, ls -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, ls +; -O0: bl __aarch64_cas1_relax +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_umin_i8_aligned_monotonic: ; -O1: and w9, w1, #0xff @@ -7156,14 +6754,12 @@ define dso_local i8 @atomicrmw_umin_i8_aligned_monotonic(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_umin_i8_aligned_acquire(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_umin_i8_aligned_acquire: -; -O0: and w9, w10, #0xff +; -O0: and w9, w0, #0xff ; -O0: subs w9, w9, w8, uxtb -; -O0: csel w12, w10, w8, ls -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, 
w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, ls +; -O0: bl __aarch64_cas1_acq +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_umin_i8_aligned_acquire: ; -O1: and w9, w1, #0xff @@ -7177,14 +6773,12 @@ define dso_local i8 @atomicrmw_umin_i8_aligned_acquire(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_umin_i8_aligned_release(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_umin_i8_aligned_release: -; -O0: and w9, w10, #0xff +; -O0: and w9, w0, #0xff ; -O0: subs w9, w9, w8, uxtb -; -O0: csel w12, w10, w8, ls -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, ls +; -O0: bl __aarch64_cas1_rel +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_umin_i8_aligned_release: ; -O1: and w9, w1, #0xff @@ -7198,14 +6792,12 @@ define dso_local i8 @atomicrmw_umin_i8_aligned_release(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_umin_i8_aligned_acq_rel(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_umin_i8_aligned_acq_rel: -; -O0: and w9, w10, #0xff +; -O0: and w9, w0, #0xff ; -O0: subs w9, w9, w8, uxtb -; -O0: csel w12, w10, w8, ls -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, ls +; -O0: bl __aarch64_cas1_acq_rel +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_umin_i8_aligned_acq_rel: ; -O1: and w9, w1, #0xff @@ -7219,14 +6811,12 @@ define dso_local i8 @atomicrmw_umin_i8_aligned_acq_rel(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_umin_i8_aligned_seq_cst(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_umin_i8_aligned_seq_cst: -; -O0: and w9, w10, #0xff +; -O0: and w9, w0, #0xff ; -O0: subs w9, w9, w8, uxtb -; -O0: csel w12, w10, w8, ls -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, ls +; -O0: bl __aarch64_cas1_acq_rel +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_umin_i8_aligned_seq_cst: ; -O1: and w9, w1, #0xff @@ -7240,12 +6830,10 @@ define dso_local i8 @atomicrmw_umin_i8_aligned_seq_cst(ptr %ptr, i8 %value) { define dso_local i16 @atomicrmw_umin_i16_aligned_monotonic(ptr %ptr, i16 %value) { ; -O0-LABEL: atomicrmw_umin_i16_aligned_monotonic: -; -O0: subs w10, w10, w9, uxth -; -O0: csel w12, w8, w9, ls -; -O0: ldaxrh w9, [x11] -; -O0: cmp w9, w8, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w8, w8, w9, uxth +; -O0: subs w9, w9, w8, uxth +; -O0: csel w1, w0, w8, ls +; -O0: bl __aarch64_cas2_relax +; -O0: subs w8, w8, w0, uxth ; ; -O1-LABEL: atomicrmw_umin_i16_aligned_monotonic: ; -O1: and w9, w1, #0xffff @@ -7259,12 +6847,10 @@ define dso_local i16 @atomicrmw_umin_i16_aligned_monotonic(ptr %ptr, i16 %value) define dso_local i16 @atomicrmw_umin_i16_aligned_acquire(ptr %ptr, i16 %value) { ; -O0-LABEL: atomicrmw_umin_i16_aligned_acquire: -; -O0: subs w10, w10, w9, uxth -; -O0: csel w12, w8, w9, ls -; -O0: ldaxrh w9, [x11] -; -O0: cmp w9, w8, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w8, w8, w9, uxth +; -O0: subs w9, w9, w8, uxth +; -O0: csel w1, w0, w8, ls +; -O0: bl __aarch64_cas2_acq +; -O0: subs w8, w8, w0, uxth ; ; -O1-LABEL: atomicrmw_umin_i16_aligned_acquire: ; -O1: and w9, w1, #0xffff @@ -7278,12 +6864,10 @@ define dso_local i16 
@atomicrmw_umin_i16_aligned_acquire(ptr %ptr, i16 %value) { define dso_local i16 @atomicrmw_umin_i16_aligned_release(ptr %ptr, i16 %value) { ; -O0-LABEL: atomicrmw_umin_i16_aligned_release: -; -O0: subs w10, w10, w9, uxth -; -O0: csel w12, w8, w9, ls -; -O0: ldaxrh w9, [x11] -; -O0: cmp w9, w8, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w8, w8, w9, uxth +; -O0: subs w9, w9, w8, uxth +; -O0: csel w1, w0, w8, ls +; -O0: bl __aarch64_cas2_rel +; -O0: subs w8, w8, w0, uxth ; ; -O1-LABEL: atomicrmw_umin_i16_aligned_release: ; -O1: and w9, w1, #0xffff @@ -7297,12 +6881,10 @@ define dso_local i16 @atomicrmw_umin_i16_aligned_release(ptr %ptr, i16 %value) { define dso_local i16 @atomicrmw_umin_i16_aligned_acq_rel(ptr %ptr, i16 %value) { ; -O0-LABEL: atomicrmw_umin_i16_aligned_acq_rel: -; -O0: subs w10, w10, w9, uxth -; -O0: csel w12, w8, w9, ls -; -O0: ldaxrh w9, [x11] -; -O0: cmp w9, w8, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w8, w8, w9, uxth +; -O0: subs w9, w9, w8, uxth +; -O0: csel w1, w0, w8, ls +; -O0: bl __aarch64_cas2_acq_rel +; -O0: subs w8, w8, w0, uxth ; ; -O1-LABEL: atomicrmw_umin_i16_aligned_acq_rel: ; -O1: and w9, w1, #0xffff @@ -7316,12 +6898,10 @@ define dso_local i16 @atomicrmw_umin_i16_aligned_acq_rel(ptr %ptr, i16 %value) { define dso_local i16 @atomicrmw_umin_i16_aligned_seq_cst(ptr %ptr, i16 %value) { ; -O0-LABEL: atomicrmw_umin_i16_aligned_seq_cst: -; -O0: subs w10, w10, w9, uxth -; -O0: csel w12, w8, w9, ls -; -O0: ldaxrh w9, [x11] -; -O0: cmp w9, w8, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w8, w8, w9, uxth +; -O0: subs w9, w9, w8, uxth +; -O0: csel w1, w0, w8, ls +; -O0: bl __aarch64_cas2_acq_rel +; -O0: subs w8, w8, w0, uxth ; ; -O1-LABEL: atomicrmw_umin_i16_aligned_seq_cst: ; -O1: and w9, w1, #0xffff @@ -7335,12 +6915,10 @@ define dso_local i16 @atomicrmw_umin_i16_aligned_seq_cst(ptr %ptr, i16 %value) { define dso_local i32 @atomicrmw_umin_i32_aligned_monotonic(ptr %ptr, i32 %value) { ; -O0-LABEL: atomicrmw_umin_i32_aligned_monotonic: -; -O0: subs w10, w8, w9 -; -O0: csel w12, w8, w9, ls -; -O0: ldaxr w9, [x11] -; -O0: cmp w9, w8 -; -O0: stlxr w10, w12, [x11] -; -O0: subs w8, w9, w8 +; -O0: subs w9, w0, w8 +; -O0: csel w1, w0, w8, ls +; -O0: bl __aarch64_cas4_relax +; -O0: subs w8, w0, w8 ; ; -O1-LABEL: atomicrmw_umin_i32_aligned_monotonic: ; -O1: ldxr w8, [x0] @@ -7353,12 +6931,10 @@ define dso_local i32 @atomicrmw_umin_i32_aligned_monotonic(ptr %ptr, i32 %value) define dso_local i32 @atomicrmw_umin_i32_aligned_acquire(ptr %ptr, i32 %value) { ; -O0-LABEL: atomicrmw_umin_i32_aligned_acquire: -; -O0: subs w10, w8, w9 -; -O0: csel w12, w8, w9, ls -; -O0: ldaxr w9, [x11] -; -O0: cmp w9, w8 -; -O0: stlxr w10, w12, [x11] -; -O0: subs w8, w9, w8 +; -O0: subs w9, w0, w8 +; -O0: csel w1, w0, w8, ls +; -O0: bl __aarch64_cas4_acq +; -O0: subs w8, w0, w8 ; ; -O1-LABEL: atomicrmw_umin_i32_aligned_acquire: ; -O1: ldaxr w8, [x0] @@ -7371,12 +6947,10 @@ define dso_local i32 @atomicrmw_umin_i32_aligned_acquire(ptr %ptr, i32 %value) { define dso_local i32 @atomicrmw_umin_i32_aligned_release(ptr %ptr, i32 %value) { ; -O0-LABEL: atomicrmw_umin_i32_aligned_release: -; -O0: subs w10, w8, w9 -; -O0: csel w12, w8, w9, ls -; -O0: ldaxr w9, [x11] -; -O0: cmp w9, w8 -; -O0: stlxr w10, w12, [x11] -; -O0: subs w8, w9, w8 +; -O0: subs w9, w0, w8 +; -O0: csel w1, w0, w8, ls +; -O0: bl __aarch64_cas4_rel +; -O0: subs w8, w0, w8 ; ; -O1-LABEL: atomicrmw_umin_i32_aligned_release: ; -O1: ldxr w8, [x0] @@ -7389,12 +6963,10 @@ define dso_local i32 
@atomicrmw_umin_i32_aligned_release(ptr %ptr, i32 %value) { define dso_local i32 @atomicrmw_umin_i32_aligned_acq_rel(ptr %ptr, i32 %value) { ; -O0-LABEL: atomicrmw_umin_i32_aligned_acq_rel: -; -O0: subs w10, w8, w9 -; -O0: csel w12, w8, w9, ls -; -O0: ldaxr w9, [x11] -; -O0: cmp w9, w8 -; -O0: stlxr w10, w12, [x11] -; -O0: subs w8, w9, w8 +; -O0: subs w9, w0, w8 +; -O0: csel w1, w0, w8, ls +; -O0: bl __aarch64_cas4_acq_rel +; -O0: subs w8, w0, w8 ; ; -O1-LABEL: atomicrmw_umin_i32_aligned_acq_rel: ; -O1: ldaxr w8, [x0] @@ -7407,12 +6979,10 @@ define dso_local i32 @atomicrmw_umin_i32_aligned_acq_rel(ptr %ptr, i32 %value) { define dso_local i32 @atomicrmw_umin_i32_aligned_seq_cst(ptr %ptr, i32 %value) { ; -O0-LABEL: atomicrmw_umin_i32_aligned_seq_cst: -; -O0: subs w10, w8, w9 -; -O0: csel w12, w8, w9, ls -; -O0: ldaxr w9, [x11] -; -O0: cmp w9, w8 -; -O0: stlxr w10, w12, [x11] -; -O0: subs w8, w9, w8 +; -O0: subs w9, w0, w8 +; -O0: csel w1, w0, w8, ls +; -O0: bl __aarch64_cas4_acq_rel +; -O0: subs w8, w0, w8 ; ; -O1-LABEL: atomicrmw_umin_i32_aligned_seq_cst: ; -O1: ldaxr w8, [x0] @@ -7425,12 +6995,10 @@ define dso_local i32 @atomicrmw_umin_i32_aligned_seq_cst(ptr %ptr, i32 %value) { define dso_local i64 @atomicrmw_umin_i64_aligned_monotonic(ptr %ptr, i64 %value) { ; -O0-LABEL: atomicrmw_umin_i64_aligned_monotonic: -; -O0: subs x10, x8, x9 -; -O0: csel x12, x8, x9, ls -; -O0: ldaxr x9, [x11] -; -O0: cmp x9, x8 -; -O0: stlxr w10, x12, [x11] -; -O0: subs x8, x9, x8 +; -O0: subs x9, x0, x8 +; -O0: csel x1, x0, x8, ls +; -O0: bl __aarch64_cas8_relax +; -O0: subs x8, x0, x8 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_monotonic: ; -O1: ldxr x0, [x8] @@ -7443,12 +7011,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_monotonic(ptr %ptr, i64 %value) define dso_local i64 @atomicrmw_umin_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0-LABEL: atomicrmw_umin_i64_aligned_acquire: -; -O0: subs x10, x8, x9 -; -O0: csel x12, x8, x9, ls -; -O0: ldaxr x9, [x11] -; -O0: cmp x9, x8 -; -O0: stlxr w10, x12, [x11] -; -O0: subs x8, x9, x8 +; -O0: subs x9, x0, x8 +; -O0: csel x1, x0, x8, ls +; -O0: bl __aarch64_cas8_acq +; -O0: subs x8, x0, x8 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_acquire: ; -O1: ldaxr x0, [x8] @@ -7461,12 +7027,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_acquire(ptr %ptr, i64 %value) { define dso_local i64 @atomicrmw_umin_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0-LABEL: atomicrmw_umin_i64_aligned_release: -; -O0: subs x10, x8, x9 -; -O0: csel x12, x8, x9, ls -; -O0: ldaxr x9, [x11] -; -O0: cmp x9, x8 -; -O0: stlxr w10, x12, [x11] -; -O0: subs x8, x9, x8 +; -O0: subs x9, x0, x8 +; -O0: csel x1, x0, x8, ls +; -O0: bl __aarch64_cas8_rel +; -O0: subs x8, x0, x8 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_release: ; -O1: ldxr x0, [x8] @@ -7479,12 +7043,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_release(ptr %ptr, i64 %value) { define dso_local i64 @atomicrmw_umin_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0-LABEL: atomicrmw_umin_i64_aligned_acq_rel: -; -O0: subs x10, x8, x9 -; -O0: csel x12, x8, x9, ls -; -O0: ldaxr x9, [x11] -; -O0: cmp x9, x8 -; -O0: stlxr w10, x12, [x11] -; -O0: subs x8, x9, x8 +; -O0: subs x9, x0, x8 +; -O0: csel x1, x0, x8, ls +; -O0: bl __aarch64_cas8_acq_rel +; -O0: subs x8, x0, x8 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_acq_rel: ; -O1: ldaxr x0, [x8] @@ -7497,12 +7059,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_acq_rel(ptr %ptr, i64 %value) { define dso_local i64 @atomicrmw_umin_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; 
-O0-LABEL: atomicrmw_umin_i64_aligned_seq_cst: -; -O0: subs x10, x8, x9 -; -O0: csel x12, x8, x9, ls -; -O0: ldaxr x9, [x11] -; -O0: cmp x9, x8 -; -O0: stlxr w10, x12, [x11] -; -O0: subs x8, x9, x8 +; -O0: subs x9, x0, x8 +; -O0: csel x1, x0, x8, ls +; -O0: bl __aarch64_cas8_acq_rel +; -O0: subs x8, x0, x8 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_seq_cst: ; -O1: ldaxr x0, [x8] @@ -7516,21 +7076,17 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_monotonic: ; -O0: subs x8, x8, x9 -; -O0: subs x8, x8, x12 -; -O0: subs x13, x13, x9 +; -O0: subs x8, x8, x11 +; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq -; -O0: ands w13, w10, #0x1 -; -O0: csel x14, x8, x12, ne +; -O0: ands w12, w10, #0x1 +; -O0: csel x2, x8, x11, ne ; -O0: ands w10, w10, #0x1 -; -O0: csel x15, x8, x9, ne -; -O0: ldxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stxp w8, x14, x15, [x11] -; -O0: stxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: csel x3, x8, x9, ne +; -O0: bl __aarch64_cas16_relax +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_monotonic: @@ -7546,21 +7102,17 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_acquire: ; -O0: subs x8, x8, x9 -; -O0: subs x8, x8, x12 -; -O0: subs x13, x13, x9 +; -O0: subs x8, x8, x11 +; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq -; -O0: ands w13, w10, #0x1 -; -O0: csel x14, x8, x12, ne +; -O0: ands w12, w10, #0x1 +; -O0: csel x2, x8, x11, ne ; -O0: ands w10, w10, #0x1 -; -O0: csel x15, x8, x9, ne -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stxp w8, x14, x15, [x11] -; -O0: stxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: csel x3, x8, x9, ne +; -O0: bl __aarch64_cas16_acq +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acquire: @@ -7576,21 +7128,17 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_release: ; -O0: subs x8, x8, x9 -; -O0: subs x8, x8, x12 -; -O0: subs x13, x13, x9 +; -O0: subs x8, x8, x11 +; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq -; -O0: ands w13, w10, #0x1 -; -O0: csel x14, x8, x12, ne +; -O0: ands w12, w10, #0x1 +; -O0: csel x2, x8, x11, ne ; -O0: ands w10, w10, #0x1 -; -O0: csel x15, x8, x9, ne -; -O0: ldxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: csel x3, x8, x9, ne +; -O0: bl __aarch64_cas16_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_release: @@ -7606,21 +7154,17 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_acq_rel: ; 
-O0: subs x8, x8, x9 -; -O0: subs x8, x8, x12 -; -O0: subs x13, x13, x9 +; -O0: subs x8, x8, x11 +; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq -; -O0: ands w13, w10, #0x1 -; -O0: csel x14, x8, x12, ne +; -O0: ands w12, w10, #0x1 +; -O0: csel x2, x8, x11, ne ; -O0: ands w10, w10, #0x1 -; -O0: csel x15, x8, x9, ne -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: csel x3, x8, x9, ne +; -O0: bl __aarch64_cas16_acq_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acq_rel: @@ -7636,21 +7180,17 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umin_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_seq_cst: ; -O0: subs x8, x8, x9 -; -O0: subs x8, x8, x12 -; -O0: subs x13, x13, x9 +; -O0: subs x8, x8, x11 +; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq -; -O0: ands w13, w10, #0x1 -; -O0: csel x14, x8, x12, ne +; -O0: ands w12, w10, #0x1 +; -O0: csel x2, x8, x11, ne ; -O0: ands w10, w10, #0x1 -; -O0: csel x15, x8, x9, ne -; -O0: ldaxp x10, x9, [x11] -; -O0: cmp x10, x12 -; -O0: cmp x9, x13 -; -O0: stlxp w8, x14, x15, [x11] -; -O0: stlxp w8, x10, x9, [x11] -; -O0: eor x8, x10, x8 -; -O0: eor x11, x9, x11 -; -O0: orr x8, x8, x11 +; -O0: csel x3, x8, x9, ne +; -O0: bl __aarch64_cas16_acq_rel +; -O0: eor x8, x0, x8 +; -O0: eor x9, x1, x9 +; -O0: orr x8, x8, x9 ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_seq_cst: @@ -7665,14 +7205,12 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_seq_cst(ptr %ptr, i128 %value define dso_local i8 @atomicrmw_umin_i8_unaligned_monotonic(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_umin_i8_unaligned_monotonic: -; -O0: and w9, w10, #0xff +; -O0: and w9, w0, #0xff ; -O0: subs w9, w9, w8, uxtb -; -O0: csel w12, w10, w8, ls -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, ls +; -O0: bl __aarch64_cas1_relax +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_umin_i8_unaligned_monotonic: ; -O1: and w9, w1, #0xff @@ -7686,14 +7224,12 @@ define dso_local i8 @atomicrmw_umin_i8_unaligned_monotonic(ptr %ptr, i8 %value) define dso_local i8 @atomicrmw_umin_i8_unaligned_acquire(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_umin_i8_unaligned_acquire: -; -O0: and w9, w10, #0xff +; -O0: and w9, w0, #0xff ; -O0: subs w9, w9, w8, uxtb -; -O0: csel w12, w10, w8, ls -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, ls +; -O0: bl __aarch64_cas1_acq +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_umin_i8_unaligned_acquire: ; -O1: and w9, w1, #0xff @@ -7707,14 +7243,12 @@ define dso_local i8 @atomicrmw_umin_i8_unaligned_acquire(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_umin_i8_unaligned_release(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_umin_i8_unaligned_release: -; -O0: and w9, w10, #0xff +; -O0: and w9, w0, #0xff ; -O0: subs w9, w9, w8, uxtb -; -O0: csel w12, w10, w8, ls -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, 
w10, uxtb +; -O0: csel w1, w0, w8, ls +; -O0: bl __aarch64_cas1_rel +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_umin_i8_unaligned_release: ; -O1: and w9, w1, #0xff @@ -7728,14 +7262,12 @@ define dso_local i8 @atomicrmw_umin_i8_unaligned_release(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_umin_i8_unaligned_acq_rel(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_umin_i8_unaligned_acq_rel: -; -O0: and w9, w10, #0xff +; -O0: and w9, w0, #0xff ; -O0: subs w9, w9, w8, uxtb -; -O0: csel w12, w10, w8, ls -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, ls +; -O0: bl __aarch64_cas1_acq_rel +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_umin_i8_unaligned_acq_rel: ; -O1: and w9, w1, #0xff @@ -7749,14 +7281,12 @@ define dso_local i8 @atomicrmw_umin_i8_unaligned_acq_rel(ptr %ptr, i8 %value) { define dso_local i8 @atomicrmw_umin_i8_unaligned_seq_cst(ptr %ptr, i8 %value) { ; -O0-LABEL: atomicrmw_umin_i8_unaligned_seq_cst: -; -O0: and w9, w10, #0xff +; -O0: and w9, w0, #0xff ; -O0: subs w9, w9, w8, uxtb -; -O0: csel w12, w10, w8, ls -; -O0: ldaxrb w9, [x11] -; -O0: cmp w9, w10, uxtb -; -O0: stlxrb w8, w12, [x11] -; -O0: and w8, w9, #0xff -; -O0: subs w8, w8, w10, uxtb +; -O0: csel w1, w0, w8, ls +; -O0: bl __aarch64_cas1_acq_rel +; -O0: and w8, w0, #0xff +; -O0: subs w8, w8, w9, uxtb ; ; -O1-LABEL: atomicrmw_umin_i8_unaligned_seq_cst: ; -O1: and w9, w1, #0xff diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-cmpxchg-outline_atomics.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-cmpxchg-outline_atomics.ll index 403e4770e17f9..86c040cc35935 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-cmpxchg-outline_atomics.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-cmpxchg-outline_atomics.ll @@ -4,2400 +4,1440 @@ ; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+outline-atomics -O1 | FileCheck %s --check-prefixes=CHECK,-O1 define dso_local i8 @cmpxchg_i8_aligned_monotonic_monotonic(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_aligned_monotonic_monotonic: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_aligned_monotonic_monotonic: -; -O1: bl __aarch64_cas1_relax +; CHECK-LABEL: cmpxchg_i8_aligned_monotonic_monotonic: +; CHECK: bl __aarch64_cas1_relax %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new monotonic monotonic, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_aligned_monotonic_monotonic_weak(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_aligned_monotonic_monotonic_weak: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_aligned_monotonic_monotonic_weak: -; -O1: bl __aarch64_cas1_relax +; CHECK-LABEL: cmpxchg_i8_aligned_monotonic_monotonic_weak: +; CHECK: bl __aarch64_cas1_relax %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new monotonic monotonic, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_aligned_monotonic_acquire(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_aligned_monotonic_acquire: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_aligned_monotonic_acquire: -; -O1: bl __aarch64_cas1_acq +; CHECK-LABEL: cmpxchg_i8_aligned_monotonic_acquire: +; CHECK: bl __aarch64_cas1_acq %pair = cmpxchg 
ptr %ptr, i8 %expected, i8 %new monotonic acquire, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_aligned_monotonic_acquire_weak(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_aligned_monotonic_acquire_weak: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_aligned_monotonic_acquire_weak: -; -O1: bl __aarch64_cas1_acq +; CHECK-LABEL: cmpxchg_i8_aligned_monotonic_acquire_weak: +; CHECK: bl __aarch64_cas1_acq %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new monotonic acquire, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_aligned_monotonic_seq_cst(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_aligned_monotonic_seq_cst: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_aligned_monotonic_seq_cst: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_aligned_monotonic_seq_cst: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new monotonic seq_cst, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_aligned_monotonic_seq_cst_weak(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_aligned_monotonic_seq_cst_weak: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_aligned_monotonic_seq_cst_weak: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_aligned_monotonic_seq_cst_weak: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new monotonic seq_cst, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_aligned_acquire_monotonic(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_aligned_acquire_monotonic: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_aligned_acquire_monotonic: -; -O1: bl __aarch64_cas1_acq +; CHECK-LABEL: cmpxchg_i8_aligned_acquire_monotonic: +; CHECK: bl __aarch64_cas1_acq %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new acquire monotonic, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_aligned_acquire_monotonic_weak(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_aligned_acquire_monotonic_weak: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_aligned_acquire_monotonic_weak: -; -O1: bl __aarch64_cas1_acq +; CHECK-LABEL: cmpxchg_i8_aligned_acquire_monotonic_weak: +; CHECK: bl __aarch64_cas1_acq %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new acquire monotonic, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_aligned_acquire_acquire(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_aligned_acquire_acquire: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_aligned_acquire_acquire: -; -O1: bl __aarch64_cas1_acq +; CHECK-LABEL: cmpxchg_i8_aligned_acquire_acquire: +; CHECK: bl __aarch64_cas1_acq %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new acquire acquire, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_aligned_acquire_acquire_weak(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_aligned_acquire_acquire_weak: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: 
cmpxchg_i8_aligned_acquire_acquire_weak: -; -O1: bl __aarch64_cas1_acq +; CHECK-LABEL: cmpxchg_i8_aligned_acquire_acquire_weak: +; CHECK: bl __aarch64_cas1_acq %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new acquire acquire, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_aligned_acquire_seq_cst(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_aligned_acquire_seq_cst: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_aligned_acquire_seq_cst: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_aligned_acquire_seq_cst: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new acquire seq_cst, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_aligned_acquire_seq_cst_weak(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_aligned_acquire_seq_cst_weak: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_aligned_acquire_seq_cst_weak: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_aligned_acquire_seq_cst_weak: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new acquire seq_cst, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_aligned_release_monotonic(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_aligned_release_monotonic: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_aligned_release_monotonic: -; -O1: bl __aarch64_cas1_rel +; CHECK-LABEL: cmpxchg_i8_aligned_release_monotonic: +; CHECK: bl __aarch64_cas1_rel %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new release monotonic, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_aligned_release_monotonic_weak(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_aligned_release_monotonic_weak: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_aligned_release_monotonic_weak: -; -O1: bl __aarch64_cas1_rel +; CHECK-LABEL: cmpxchg_i8_aligned_release_monotonic_weak: +; CHECK: bl __aarch64_cas1_rel %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new release monotonic, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_aligned_release_acquire(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_aligned_release_acquire: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_aligned_release_acquire: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_aligned_release_acquire: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new release acquire, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_aligned_release_acquire_weak(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_aligned_release_acquire_weak: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_aligned_release_acquire_weak: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_aligned_release_acquire_weak: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new release acquire, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_aligned_release_seq_cst(i8 %expected, i8 %new, ptr %ptr) { 
-; -O0-LABEL: cmpxchg_i8_aligned_release_seq_cst: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_aligned_release_seq_cst: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_aligned_release_seq_cst: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new release seq_cst, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_aligned_release_seq_cst_weak(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_aligned_release_seq_cst_weak: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_aligned_release_seq_cst_weak: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_aligned_release_seq_cst_weak: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new release seq_cst, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_aligned_acq_rel_monotonic(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_aligned_acq_rel_monotonic: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_aligned_acq_rel_monotonic: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_aligned_acq_rel_monotonic: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new acq_rel monotonic, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_aligned_acq_rel_monotonic_weak(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_aligned_acq_rel_monotonic_weak: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_aligned_acq_rel_monotonic_weak: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_aligned_acq_rel_monotonic_weak: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new acq_rel monotonic, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_aligned_acq_rel_acquire(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_aligned_acq_rel_acquire: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_aligned_acq_rel_acquire: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_aligned_acq_rel_acquire: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new acq_rel acquire, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_aligned_acq_rel_acquire_weak(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_aligned_acq_rel_acquire_weak: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_aligned_acq_rel_acquire_weak: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_aligned_acq_rel_acquire_weak: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new acq_rel acquire, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_aligned_acq_rel_seq_cst(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_aligned_acq_rel_seq_cst: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_aligned_acq_rel_seq_cst: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_aligned_acq_rel_seq_cst: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new acq_rel 
seq_cst, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_aligned_acq_rel_seq_cst_weak(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_aligned_acq_rel_seq_cst_weak: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_aligned_acq_rel_seq_cst_weak: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_aligned_acq_rel_seq_cst_weak: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new acq_rel seq_cst, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_aligned_seq_cst_monotonic(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_aligned_seq_cst_monotonic: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_aligned_seq_cst_monotonic: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_aligned_seq_cst_monotonic: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new seq_cst monotonic, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_aligned_seq_cst_monotonic_weak(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_aligned_seq_cst_monotonic_weak: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_aligned_seq_cst_monotonic_weak: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_aligned_seq_cst_monotonic_weak: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new seq_cst monotonic, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_aligned_seq_cst_acquire(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_aligned_seq_cst_acquire: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_aligned_seq_cst_acquire: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_aligned_seq_cst_acquire: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new seq_cst acquire, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_aligned_seq_cst_acquire_weak(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_aligned_seq_cst_acquire_weak: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_aligned_seq_cst_acquire_weak: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_aligned_seq_cst_acquire_weak: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new seq_cst acquire, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_aligned_seq_cst_seq_cst(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_aligned_seq_cst_seq_cst: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_aligned_seq_cst_seq_cst: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_aligned_seq_cst_seq_cst: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new seq_cst seq_cst, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_aligned_seq_cst_seq_cst_weak(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_aligned_seq_cst_seq_cst_weak: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_aligned_seq_cst_seq_cst_weak: -; -O1: bl 
__aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_aligned_seq_cst_seq_cst_weak: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new seq_cst seq_cst, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i16 @cmpxchg_i16_aligned_monotonic_monotonic(i16 %expected, i16 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i16_aligned_monotonic_monotonic: -; -O0: ldaxrh w0, [x2] -; -O0: cmp w0, w9, uxth -; -O0: stlxrh w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i16_aligned_monotonic_monotonic: -; -O1: bl __aarch64_cas2_relax +; CHECK-LABEL: cmpxchg_i16_aligned_monotonic_monotonic: +; CHECK: bl __aarch64_cas2_relax %pair = cmpxchg ptr %ptr, i16 %expected, i16 %new monotonic monotonic, align 2 %r = extractvalue { i16, i1 } %pair, 0 ret i16 %r } define dso_local i16 @cmpxchg_i16_aligned_monotonic_monotonic_weak(i16 %expected, i16 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i16_aligned_monotonic_monotonic_weak: -; -O0: ldaxrh w0, [x2] -; -O0: cmp w0, w9, uxth -; -O0: stlxrh w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i16_aligned_monotonic_monotonic_weak: -; -O1: bl __aarch64_cas2_relax +; CHECK-LABEL: cmpxchg_i16_aligned_monotonic_monotonic_weak: +; CHECK: bl __aarch64_cas2_relax %pair = cmpxchg weak ptr %ptr, i16 %expected, i16 %new monotonic monotonic, align 2 %r = extractvalue { i16, i1 } %pair, 0 ret i16 %r } define dso_local i16 @cmpxchg_i16_aligned_monotonic_acquire(i16 %expected, i16 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i16_aligned_monotonic_acquire: -; -O0: ldaxrh w0, [x2] -; -O0: cmp w0, w9, uxth -; -O0: stlxrh w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i16_aligned_monotonic_acquire: -; -O1: bl __aarch64_cas2_acq +; CHECK-LABEL: cmpxchg_i16_aligned_monotonic_acquire: +; CHECK: bl __aarch64_cas2_acq %pair = cmpxchg ptr %ptr, i16 %expected, i16 %new monotonic acquire, align 2 %r = extractvalue { i16, i1 } %pair, 0 ret i16 %r } define dso_local i16 @cmpxchg_i16_aligned_monotonic_acquire_weak(i16 %expected, i16 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i16_aligned_monotonic_acquire_weak: -; -O0: ldaxrh w0, [x2] -; -O0: cmp w0, w9, uxth -; -O0: stlxrh w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i16_aligned_monotonic_acquire_weak: -; -O1: bl __aarch64_cas2_acq +; CHECK-LABEL: cmpxchg_i16_aligned_monotonic_acquire_weak: +; CHECK: bl __aarch64_cas2_acq %pair = cmpxchg weak ptr %ptr, i16 %expected, i16 %new monotonic acquire, align 2 %r = extractvalue { i16, i1 } %pair, 0 ret i16 %r } define dso_local i16 @cmpxchg_i16_aligned_monotonic_seq_cst(i16 %expected, i16 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i16_aligned_monotonic_seq_cst: -; -O0: ldaxrh w0, [x2] -; -O0: cmp w0, w9, uxth -; -O0: stlxrh w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i16_aligned_monotonic_seq_cst: -; -O1: bl __aarch64_cas2_acq_rel +; CHECK-LABEL: cmpxchg_i16_aligned_monotonic_seq_cst: +; CHECK: bl __aarch64_cas2_acq_rel %pair = cmpxchg ptr %ptr, i16 %expected, i16 %new monotonic seq_cst, align 2 %r = extractvalue { i16, i1 } %pair, 0 ret i16 %r } define dso_local i16 @cmpxchg_i16_aligned_monotonic_seq_cst_weak(i16 %expected, i16 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i16_aligned_monotonic_seq_cst_weak: -; -O0: ldaxrh w0, [x2] -; -O0: cmp w0, w9, uxth -; -O0: stlxrh w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i16_aligned_monotonic_seq_cst_weak: -; -O1: bl __aarch64_cas2_acq_rel +; CHECK-LABEL: cmpxchg_i16_aligned_monotonic_seq_cst_weak: +; CHECK: bl __aarch64_cas2_acq_rel %pair = cmpxchg weak ptr %ptr, i16 %expected, i16 %new monotonic seq_cst, align 2 %r = extractvalue { i16, i1 } %pair, 0 ret i16 %r } define dso_local 
i16 @cmpxchg_i16_aligned_acquire_monotonic(i16 %expected, i16 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i16_aligned_acquire_monotonic: -; -O0: ldaxrh w0, [x2] -; -O0: cmp w0, w9, uxth -; -O0: stlxrh w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i16_aligned_acquire_monotonic: -; -O1: bl __aarch64_cas2_acq +; CHECK-LABEL: cmpxchg_i16_aligned_acquire_monotonic: +; CHECK: bl __aarch64_cas2_acq %pair = cmpxchg ptr %ptr, i16 %expected, i16 %new acquire monotonic, align 2 %r = extractvalue { i16, i1 } %pair, 0 ret i16 %r } define dso_local i16 @cmpxchg_i16_aligned_acquire_monotonic_weak(i16 %expected, i16 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i16_aligned_acquire_monotonic_weak: -; -O0: ldaxrh w0, [x2] -; -O0: cmp w0, w9, uxth -; -O0: stlxrh w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i16_aligned_acquire_monotonic_weak: -; -O1: bl __aarch64_cas2_acq +; CHECK-LABEL: cmpxchg_i16_aligned_acquire_monotonic_weak: +; CHECK: bl __aarch64_cas2_acq %pair = cmpxchg weak ptr %ptr, i16 %expected, i16 %new acquire monotonic, align 2 %r = extractvalue { i16, i1 } %pair, 0 ret i16 %r } define dso_local i16 @cmpxchg_i16_aligned_acquire_acquire(i16 %expected, i16 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i16_aligned_acquire_acquire: -; -O0: ldaxrh w0, [x2] -; -O0: cmp w0, w9, uxth -; -O0: stlxrh w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i16_aligned_acquire_acquire: -; -O1: bl __aarch64_cas2_acq +; CHECK-LABEL: cmpxchg_i16_aligned_acquire_acquire: +; CHECK: bl __aarch64_cas2_acq %pair = cmpxchg ptr %ptr, i16 %expected, i16 %new acquire acquire, align 2 %r = extractvalue { i16, i1 } %pair, 0 ret i16 %r } define dso_local i16 @cmpxchg_i16_aligned_acquire_acquire_weak(i16 %expected, i16 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i16_aligned_acquire_acquire_weak: -; -O0: ldaxrh w0, [x2] -; -O0: cmp w0, w9, uxth -; -O0: stlxrh w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i16_aligned_acquire_acquire_weak: -; -O1: bl __aarch64_cas2_acq +; CHECK-LABEL: cmpxchg_i16_aligned_acquire_acquire_weak: +; CHECK: bl __aarch64_cas2_acq %pair = cmpxchg weak ptr %ptr, i16 %expected, i16 %new acquire acquire, align 2 %r = extractvalue { i16, i1 } %pair, 0 ret i16 %r } define dso_local i16 @cmpxchg_i16_aligned_acquire_seq_cst(i16 %expected, i16 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i16_aligned_acquire_seq_cst: -; -O0: ldaxrh w0, [x2] -; -O0: cmp w0, w9, uxth -; -O0: stlxrh w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i16_aligned_acquire_seq_cst: -; -O1: bl __aarch64_cas2_acq_rel +; CHECK-LABEL: cmpxchg_i16_aligned_acquire_seq_cst: +; CHECK: bl __aarch64_cas2_acq_rel %pair = cmpxchg ptr %ptr, i16 %expected, i16 %new acquire seq_cst, align 2 %r = extractvalue { i16, i1 } %pair, 0 ret i16 %r } define dso_local i16 @cmpxchg_i16_aligned_acquire_seq_cst_weak(i16 %expected, i16 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i16_aligned_acquire_seq_cst_weak: -; -O0: ldaxrh w0, [x2] -; -O0: cmp w0, w9, uxth -; -O0: stlxrh w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i16_aligned_acquire_seq_cst_weak: -; -O1: bl __aarch64_cas2_acq_rel +; CHECK-LABEL: cmpxchg_i16_aligned_acquire_seq_cst_weak: +; CHECK: bl __aarch64_cas2_acq_rel %pair = cmpxchg weak ptr %ptr, i16 %expected, i16 %new acquire seq_cst, align 2 %r = extractvalue { i16, i1 } %pair, 0 ret i16 %r } define dso_local i16 @cmpxchg_i16_aligned_release_monotonic(i16 %expected, i16 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i16_aligned_release_monotonic: -; -O0: ldaxrh w0, [x2] -; -O0: cmp w0, w9, uxth -; -O0: stlxrh w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i16_aligned_release_monotonic: -; -O1: bl __aarch64_cas2_rel +; CHECK-LABEL: 
cmpxchg_i16_aligned_release_monotonic: +; CHECK: bl __aarch64_cas2_rel %pair = cmpxchg ptr %ptr, i16 %expected, i16 %new release monotonic, align 2 %r = extractvalue { i16, i1 } %pair, 0 ret i16 %r } define dso_local i16 @cmpxchg_i16_aligned_release_monotonic_weak(i16 %expected, i16 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i16_aligned_release_monotonic_weak: -; -O0: ldaxrh w0, [x2] -; -O0: cmp w0, w9, uxth -; -O0: stlxrh w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i16_aligned_release_monotonic_weak: -; -O1: bl __aarch64_cas2_rel +; CHECK-LABEL: cmpxchg_i16_aligned_release_monotonic_weak: +; CHECK: bl __aarch64_cas2_rel %pair = cmpxchg weak ptr %ptr, i16 %expected, i16 %new release monotonic, align 2 %r = extractvalue { i16, i1 } %pair, 0 ret i16 %r } define dso_local i16 @cmpxchg_i16_aligned_release_acquire(i16 %expected, i16 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i16_aligned_release_acquire: -; -O0: ldaxrh w0, [x2] -; -O0: cmp w0, w9, uxth -; -O0: stlxrh w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i16_aligned_release_acquire: -; -O1: bl __aarch64_cas2_acq_rel +; CHECK-LABEL: cmpxchg_i16_aligned_release_acquire: +; CHECK: bl __aarch64_cas2_acq_rel %pair = cmpxchg ptr %ptr, i16 %expected, i16 %new release acquire, align 2 %r = extractvalue { i16, i1 } %pair, 0 ret i16 %r } define dso_local i16 @cmpxchg_i16_aligned_release_acquire_weak(i16 %expected, i16 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i16_aligned_release_acquire_weak: -; -O0: ldaxrh w0, [x2] -; -O0: cmp w0, w9, uxth -; -O0: stlxrh w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i16_aligned_release_acquire_weak: -; -O1: bl __aarch64_cas2_acq_rel +; CHECK-LABEL: cmpxchg_i16_aligned_release_acquire_weak: +; CHECK: bl __aarch64_cas2_acq_rel %pair = cmpxchg weak ptr %ptr, i16 %expected, i16 %new release acquire, align 2 %r = extractvalue { i16, i1 } %pair, 0 ret i16 %r } define dso_local i16 @cmpxchg_i16_aligned_release_seq_cst(i16 %expected, i16 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i16_aligned_release_seq_cst: -; -O0: ldaxrh w0, [x2] -; -O0: cmp w0, w9, uxth -; -O0: stlxrh w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i16_aligned_release_seq_cst: -; -O1: bl __aarch64_cas2_acq_rel +; CHECK-LABEL: cmpxchg_i16_aligned_release_seq_cst: +; CHECK: bl __aarch64_cas2_acq_rel %pair = cmpxchg ptr %ptr, i16 %expected, i16 %new release seq_cst, align 2 %r = extractvalue { i16, i1 } %pair, 0 ret i16 %r } define dso_local i16 @cmpxchg_i16_aligned_release_seq_cst_weak(i16 %expected, i16 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i16_aligned_release_seq_cst_weak: -; -O0: ldaxrh w0, [x2] -; -O0: cmp w0, w9, uxth -; -O0: stlxrh w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i16_aligned_release_seq_cst_weak: -; -O1: bl __aarch64_cas2_acq_rel +; CHECK-LABEL: cmpxchg_i16_aligned_release_seq_cst_weak: +; CHECK: bl __aarch64_cas2_acq_rel %pair = cmpxchg weak ptr %ptr, i16 %expected, i16 %new release seq_cst, align 2 %r = extractvalue { i16, i1 } %pair, 0 ret i16 %r } define dso_local i16 @cmpxchg_i16_aligned_acq_rel_monotonic(i16 %expected, i16 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i16_aligned_acq_rel_monotonic: -; -O0: ldaxrh w0, [x2] -; -O0: cmp w0, w9, uxth -; -O0: stlxrh w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i16_aligned_acq_rel_monotonic: -; -O1: bl __aarch64_cas2_acq_rel +; CHECK-LABEL: cmpxchg_i16_aligned_acq_rel_monotonic: +; CHECK: bl __aarch64_cas2_acq_rel %pair = cmpxchg ptr %ptr, i16 %expected, i16 %new acq_rel monotonic, align 2 %r = extractvalue { i16, i1 } %pair, 0 ret i16 %r } define dso_local i16 @cmpxchg_i16_aligned_acq_rel_monotonic_weak(i16 %expected, i16 %new, ptr %ptr) { -; 
-O0-LABEL: cmpxchg_i16_aligned_acq_rel_monotonic_weak: -; -O0: ldaxrh w0, [x2] -; -O0: cmp w0, w9, uxth -; -O0: stlxrh w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i16_aligned_acq_rel_monotonic_weak: -; -O1: bl __aarch64_cas2_acq_rel +; CHECK-LABEL: cmpxchg_i16_aligned_acq_rel_monotonic_weak: +; CHECK: bl __aarch64_cas2_acq_rel %pair = cmpxchg weak ptr %ptr, i16 %expected, i16 %new acq_rel monotonic, align 2 %r = extractvalue { i16, i1 } %pair, 0 ret i16 %r } define dso_local i16 @cmpxchg_i16_aligned_acq_rel_acquire(i16 %expected, i16 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i16_aligned_acq_rel_acquire: -; -O0: ldaxrh w0, [x2] -; -O0: cmp w0, w9, uxth -; -O0: stlxrh w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i16_aligned_acq_rel_acquire: -; -O1: bl __aarch64_cas2_acq_rel +; CHECK-LABEL: cmpxchg_i16_aligned_acq_rel_acquire: +; CHECK: bl __aarch64_cas2_acq_rel %pair = cmpxchg ptr %ptr, i16 %expected, i16 %new acq_rel acquire, align 2 %r = extractvalue { i16, i1 } %pair, 0 ret i16 %r } define dso_local i16 @cmpxchg_i16_aligned_acq_rel_acquire_weak(i16 %expected, i16 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i16_aligned_acq_rel_acquire_weak: -; -O0: ldaxrh w0, [x2] -; -O0: cmp w0, w9, uxth -; -O0: stlxrh w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i16_aligned_acq_rel_acquire_weak: -; -O1: bl __aarch64_cas2_acq_rel +; CHECK-LABEL: cmpxchg_i16_aligned_acq_rel_acquire_weak: +; CHECK: bl __aarch64_cas2_acq_rel %pair = cmpxchg weak ptr %ptr, i16 %expected, i16 %new acq_rel acquire, align 2 %r = extractvalue { i16, i1 } %pair, 0 ret i16 %r } define dso_local i16 @cmpxchg_i16_aligned_acq_rel_seq_cst(i16 %expected, i16 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i16_aligned_acq_rel_seq_cst: -; -O0: ldaxrh w0, [x2] -; -O0: cmp w0, w9, uxth -; -O0: stlxrh w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i16_aligned_acq_rel_seq_cst: -; -O1: bl __aarch64_cas2_acq_rel +; CHECK-LABEL: cmpxchg_i16_aligned_acq_rel_seq_cst: +; CHECK: bl __aarch64_cas2_acq_rel %pair = cmpxchg ptr %ptr, i16 %expected, i16 %new acq_rel seq_cst, align 2 %r = extractvalue { i16, i1 } %pair, 0 ret i16 %r } define dso_local i16 @cmpxchg_i16_aligned_acq_rel_seq_cst_weak(i16 %expected, i16 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i16_aligned_acq_rel_seq_cst_weak: -; -O0: ldaxrh w0, [x2] -; -O0: cmp w0, w9, uxth -; -O0: stlxrh w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i16_aligned_acq_rel_seq_cst_weak: -; -O1: bl __aarch64_cas2_acq_rel +; CHECK-LABEL: cmpxchg_i16_aligned_acq_rel_seq_cst_weak: +; CHECK: bl __aarch64_cas2_acq_rel %pair = cmpxchg weak ptr %ptr, i16 %expected, i16 %new acq_rel seq_cst, align 2 %r = extractvalue { i16, i1 } %pair, 0 ret i16 %r } define dso_local i16 @cmpxchg_i16_aligned_seq_cst_monotonic(i16 %expected, i16 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i16_aligned_seq_cst_monotonic: -; -O0: ldaxrh w0, [x2] -; -O0: cmp w0, w9, uxth -; -O0: stlxrh w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i16_aligned_seq_cst_monotonic: -; -O1: bl __aarch64_cas2_acq_rel +; CHECK-LABEL: cmpxchg_i16_aligned_seq_cst_monotonic: +; CHECK: bl __aarch64_cas2_acq_rel %pair = cmpxchg ptr %ptr, i16 %expected, i16 %new seq_cst monotonic, align 2 %r = extractvalue { i16, i1 } %pair, 0 ret i16 %r } define dso_local i16 @cmpxchg_i16_aligned_seq_cst_monotonic_weak(i16 %expected, i16 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i16_aligned_seq_cst_monotonic_weak: -; -O0: ldaxrh w0, [x2] -; -O0: cmp w0, w9, uxth -; -O0: stlxrh w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i16_aligned_seq_cst_monotonic_weak: -; -O1: bl __aarch64_cas2_acq_rel +; CHECK-LABEL: cmpxchg_i16_aligned_seq_cst_monotonic_weak: +; CHECK: 
bl __aarch64_cas2_acq_rel %pair = cmpxchg weak ptr %ptr, i16 %expected, i16 %new seq_cst monotonic, align 2 %r = extractvalue { i16, i1 } %pair, 0 ret i16 %r } define dso_local i16 @cmpxchg_i16_aligned_seq_cst_acquire(i16 %expected, i16 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i16_aligned_seq_cst_acquire: -; -O0: ldaxrh w0, [x2] -; -O0: cmp w0, w9, uxth -; -O0: stlxrh w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i16_aligned_seq_cst_acquire: -; -O1: bl __aarch64_cas2_acq_rel +; CHECK-LABEL: cmpxchg_i16_aligned_seq_cst_acquire: +; CHECK: bl __aarch64_cas2_acq_rel %pair = cmpxchg ptr %ptr, i16 %expected, i16 %new seq_cst acquire, align 2 %r = extractvalue { i16, i1 } %pair, 0 ret i16 %r } define dso_local i16 @cmpxchg_i16_aligned_seq_cst_acquire_weak(i16 %expected, i16 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i16_aligned_seq_cst_acquire_weak: -; -O0: ldaxrh w0, [x2] -; -O0: cmp w0, w9, uxth -; -O0: stlxrh w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i16_aligned_seq_cst_acquire_weak: -; -O1: bl __aarch64_cas2_acq_rel +; CHECK-LABEL: cmpxchg_i16_aligned_seq_cst_acquire_weak: +; CHECK: bl __aarch64_cas2_acq_rel %pair = cmpxchg weak ptr %ptr, i16 %expected, i16 %new seq_cst acquire, align 2 %r = extractvalue { i16, i1 } %pair, 0 ret i16 %r } define dso_local i16 @cmpxchg_i16_aligned_seq_cst_seq_cst(i16 %expected, i16 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i16_aligned_seq_cst_seq_cst: -; -O0: ldaxrh w0, [x2] -; -O0: cmp w0, w9, uxth -; -O0: stlxrh w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i16_aligned_seq_cst_seq_cst: -; -O1: bl __aarch64_cas2_acq_rel +; CHECK-LABEL: cmpxchg_i16_aligned_seq_cst_seq_cst: +; CHECK: bl __aarch64_cas2_acq_rel %pair = cmpxchg ptr %ptr, i16 %expected, i16 %new seq_cst seq_cst, align 2 %r = extractvalue { i16, i1 } %pair, 0 ret i16 %r } define dso_local i16 @cmpxchg_i16_aligned_seq_cst_seq_cst_weak(i16 %expected, i16 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i16_aligned_seq_cst_seq_cst_weak: -; -O0: ldaxrh w0, [x2] -; -O0: cmp w0, w9, uxth -; -O0: stlxrh w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i16_aligned_seq_cst_seq_cst_weak: -; -O1: bl __aarch64_cas2_acq_rel +; CHECK-LABEL: cmpxchg_i16_aligned_seq_cst_seq_cst_weak: +; CHECK: bl __aarch64_cas2_acq_rel %pair = cmpxchg weak ptr %ptr, i16 %expected, i16 %new seq_cst seq_cst, align 2 %r = extractvalue { i16, i1 } %pair, 0 ret i16 %r } define dso_local i32 @cmpxchg_i32_aligned_monotonic_monotonic(i32 %expected, i32 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i32_aligned_monotonic_monotonic: -; -O0: ldaxr w0, [x2] -; -O0: cmp w0, w9 -; -O0: stlxr w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i32_aligned_monotonic_monotonic: -; -O1: bl __aarch64_cas4_relax +; CHECK-LABEL: cmpxchg_i32_aligned_monotonic_monotonic: +; CHECK: bl __aarch64_cas4_relax %pair = cmpxchg ptr %ptr, i32 %expected, i32 %new monotonic monotonic, align 4 %r = extractvalue { i32, i1 } %pair, 0 ret i32 %r } define dso_local i32 @cmpxchg_i32_aligned_monotonic_monotonic_weak(i32 %expected, i32 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i32_aligned_monotonic_monotonic_weak: -; -O0: ldaxr w0, [x2] -; -O0: cmp w0, w9 -; -O0: stlxr w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i32_aligned_monotonic_monotonic_weak: -; -O1: bl __aarch64_cas4_relax +; CHECK-LABEL: cmpxchg_i32_aligned_monotonic_monotonic_weak: +; CHECK: bl __aarch64_cas4_relax %pair = cmpxchg weak ptr %ptr, i32 %expected, i32 %new monotonic monotonic, align 4 %r = extractvalue { i32, i1 } %pair, 0 ret i32 %r } define dso_local i32 @cmpxchg_i32_aligned_monotonic_acquire(i32 %expected, i32 %new, ptr %ptr) { -; -O0-LABEL: 
cmpxchg_i32_aligned_monotonic_acquire: -; -O0: ldaxr w0, [x2] -; -O0: cmp w0, w9 -; -O0: stlxr w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i32_aligned_monotonic_acquire: -; -O1: bl __aarch64_cas4_acq +; CHECK-LABEL: cmpxchg_i32_aligned_monotonic_acquire: +; CHECK: bl __aarch64_cas4_acq %pair = cmpxchg ptr %ptr, i32 %expected, i32 %new monotonic acquire, align 4 %r = extractvalue { i32, i1 } %pair, 0 ret i32 %r } define dso_local i32 @cmpxchg_i32_aligned_monotonic_acquire_weak(i32 %expected, i32 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i32_aligned_monotonic_acquire_weak: -; -O0: ldaxr w0, [x2] -; -O0: cmp w0, w9 -; -O0: stlxr w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i32_aligned_monotonic_acquire_weak: -; -O1: bl __aarch64_cas4_acq +; CHECK-LABEL: cmpxchg_i32_aligned_monotonic_acquire_weak: +; CHECK: bl __aarch64_cas4_acq %pair = cmpxchg weak ptr %ptr, i32 %expected, i32 %new monotonic acquire, align 4 %r = extractvalue { i32, i1 } %pair, 0 ret i32 %r } define dso_local i32 @cmpxchg_i32_aligned_monotonic_seq_cst(i32 %expected, i32 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i32_aligned_monotonic_seq_cst: -; -O0: ldaxr w0, [x2] -; -O0: cmp w0, w9 -; -O0: stlxr w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i32_aligned_monotonic_seq_cst: -; -O1: bl __aarch64_cas4_acq_rel +; CHECK-LABEL: cmpxchg_i32_aligned_monotonic_seq_cst: +; CHECK: bl __aarch64_cas4_acq_rel %pair = cmpxchg ptr %ptr, i32 %expected, i32 %new monotonic seq_cst, align 4 %r = extractvalue { i32, i1 } %pair, 0 ret i32 %r } define dso_local i32 @cmpxchg_i32_aligned_monotonic_seq_cst_weak(i32 %expected, i32 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i32_aligned_monotonic_seq_cst_weak: -; -O0: ldaxr w0, [x2] -; -O0: cmp w0, w9 -; -O0: stlxr w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i32_aligned_monotonic_seq_cst_weak: -; -O1: bl __aarch64_cas4_acq_rel +; CHECK-LABEL: cmpxchg_i32_aligned_monotonic_seq_cst_weak: +; CHECK: bl __aarch64_cas4_acq_rel %pair = cmpxchg weak ptr %ptr, i32 %expected, i32 %new monotonic seq_cst, align 4 %r = extractvalue { i32, i1 } %pair, 0 ret i32 %r } define dso_local i32 @cmpxchg_i32_aligned_acquire_monotonic(i32 %expected, i32 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i32_aligned_acquire_monotonic: -; -O0: ldaxr w0, [x2] -; -O0: cmp w0, w9 -; -O0: stlxr w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i32_aligned_acquire_monotonic: -; -O1: bl __aarch64_cas4_acq +; CHECK-LABEL: cmpxchg_i32_aligned_acquire_monotonic: +; CHECK: bl __aarch64_cas4_acq %pair = cmpxchg ptr %ptr, i32 %expected, i32 %new acquire monotonic, align 4 %r = extractvalue { i32, i1 } %pair, 0 ret i32 %r } define dso_local i32 @cmpxchg_i32_aligned_acquire_monotonic_weak(i32 %expected, i32 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i32_aligned_acquire_monotonic_weak: -; -O0: ldaxr w0, [x2] -; -O0: cmp w0, w9 -; -O0: stlxr w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i32_aligned_acquire_monotonic_weak: -; -O1: bl __aarch64_cas4_acq +; CHECK-LABEL: cmpxchg_i32_aligned_acquire_monotonic_weak: +; CHECK: bl __aarch64_cas4_acq %pair = cmpxchg weak ptr %ptr, i32 %expected, i32 %new acquire monotonic, align 4 %r = extractvalue { i32, i1 } %pair, 0 ret i32 %r } define dso_local i32 @cmpxchg_i32_aligned_acquire_acquire(i32 %expected, i32 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i32_aligned_acquire_acquire: -; -O0: ldaxr w0, [x2] -; -O0: cmp w0, w9 -; -O0: stlxr w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i32_aligned_acquire_acquire: -; -O1: bl __aarch64_cas4_acq +; CHECK-LABEL: cmpxchg_i32_aligned_acquire_acquire: +; CHECK: bl __aarch64_cas4_acq %pair = cmpxchg ptr %ptr, i32 %expected, i32 %new acquire 
acquire, align 4 %r = extractvalue { i32, i1 } %pair, 0 ret i32 %r } define dso_local i32 @cmpxchg_i32_aligned_acquire_acquire_weak(i32 %expected, i32 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i32_aligned_acquire_acquire_weak: -; -O0: ldaxr w0, [x2] -; -O0: cmp w0, w9 -; -O0: stlxr w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i32_aligned_acquire_acquire_weak: -; -O1: bl __aarch64_cas4_acq +; CHECK-LABEL: cmpxchg_i32_aligned_acquire_acquire_weak: +; CHECK: bl __aarch64_cas4_acq %pair = cmpxchg weak ptr %ptr, i32 %expected, i32 %new acquire acquire, align 4 %r = extractvalue { i32, i1 } %pair, 0 ret i32 %r } define dso_local i32 @cmpxchg_i32_aligned_acquire_seq_cst(i32 %expected, i32 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i32_aligned_acquire_seq_cst: -; -O0: ldaxr w0, [x2] -; -O0: cmp w0, w9 -; -O0: stlxr w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i32_aligned_acquire_seq_cst: -; -O1: bl __aarch64_cas4_acq_rel +; CHECK-LABEL: cmpxchg_i32_aligned_acquire_seq_cst: +; CHECK: bl __aarch64_cas4_acq_rel %pair = cmpxchg ptr %ptr, i32 %expected, i32 %new acquire seq_cst, align 4 %r = extractvalue { i32, i1 } %pair, 0 ret i32 %r } define dso_local i32 @cmpxchg_i32_aligned_acquire_seq_cst_weak(i32 %expected, i32 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i32_aligned_acquire_seq_cst_weak: -; -O0: ldaxr w0, [x2] -; -O0: cmp w0, w9 -; -O0: stlxr w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i32_aligned_acquire_seq_cst_weak: -; -O1: bl __aarch64_cas4_acq_rel +; CHECK-LABEL: cmpxchg_i32_aligned_acquire_seq_cst_weak: +; CHECK: bl __aarch64_cas4_acq_rel %pair = cmpxchg weak ptr %ptr, i32 %expected, i32 %new acquire seq_cst, align 4 %r = extractvalue { i32, i1 } %pair, 0 ret i32 %r } define dso_local i32 @cmpxchg_i32_aligned_release_monotonic(i32 %expected, i32 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i32_aligned_release_monotonic: -; -O0: ldaxr w0, [x2] -; -O0: cmp w0, w9 -; -O0: stlxr w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i32_aligned_release_monotonic: -; -O1: bl __aarch64_cas4_rel +; CHECK-LABEL: cmpxchg_i32_aligned_release_monotonic: +; CHECK: bl __aarch64_cas4_rel %pair = cmpxchg ptr %ptr, i32 %expected, i32 %new release monotonic, align 4 %r = extractvalue { i32, i1 } %pair, 0 ret i32 %r } define dso_local i32 @cmpxchg_i32_aligned_release_monotonic_weak(i32 %expected, i32 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i32_aligned_release_monotonic_weak: -; -O0: ldaxr w0, [x2] -; -O0: cmp w0, w9 -; -O0: stlxr w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i32_aligned_release_monotonic_weak: -; -O1: bl __aarch64_cas4_rel +; CHECK-LABEL: cmpxchg_i32_aligned_release_monotonic_weak: +; CHECK: bl __aarch64_cas4_rel %pair = cmpxchg weak ptr %ptr, i32 %expected, i32 %new release monotonic, align 4 %r = extractvalue { i32, i1 } %pair, 0 ret i32 %r } define dso_local i32 @cmpxchg_i32_aligned_release_acquire(i32 %expected, i32 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i32_aligned_release_acquire: -; -O0: ldaxr w0, [x2] -; -O0: cmp w0, w9 -; -O0: stlxr w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i32_aligned_release_acquire: -; -O1: bl __aarch64_cas4_acq_rel +; CHECK-LABEL: cmpxchg_i32_aligned_release_acquire: +; CHECK: bl __aarch64_cas4_acq_rel %pair = cmpxchg ptr %ptr, i32 %expected, i32 %new release acquire, align 4 %r = extractvalue { i32, i1 } %pair, 0 ret i32 %r } define dso_local i32 @cmpxchg_i32_aligned_release_acquire_weak(i32 %expected, i32 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i32_aligned_release_acquire_weak: -; -O0: ldaxr w0, [x2] -; -O0: cmp w0, w9 -; -O0: stlxr w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i32_aligned_release_acquire_weak: -; -O1: bl 
__aarch64_cas4_acq_rel +; CHECK-LABEL: cmpxchg_i32_aligned_release_acquire_weak: +; CHECK: bl __aarch64_cas4_acq_rel %pair = cmpxchg weak ptr %ptr, i32 %expected, i32 %new release acquire, align 4 %r = extractvalue { i32, i1 } %pair, 0 ret i32 %r } define dso_local i32 @cmpxchg_i32_aligned_release_seq_cst(i32 %expected, i32 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i32_aligned_release_seq_cst: -; -O0: ldaxr w0, [x2] -; -O0: cmp w0, w9 -; -O0: stlxr w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i32_aligned_release_seq_cst: -; -O1: bl __aarch64_cas4_acq_rel +; CHECK-LABEL: cmpxchg_i32_aligned_release_seq_cst: +; CHECK: bl __aarch64_cas4_acq_rel %pair = cmpxchg ptr %ptr, i32 %expected, i32 %new release seq_cst, align 4 %r = extractvalue { i32, i1 } %pair, 0 ret i32 %r } define dso_local i32 @cmpxchg_i32_aligned_release_seq_cst_weak(i32 %expected, i32 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i32_aligned_release_seq_cst_weak: -; -O0: ldaxr w0, [x2] -; -O0: cmp w0, w9 -; -O0: stlxr w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i32_aligned_release_seq_cst_weak: -; -O1: bl __aarch64_cas4_acq_rel +; CHECK-LABEL: cmpxchg_i32_aligned_release_seq_cst_weak: +; CHECK: bl __aarch64_cas4_acq_rel %pair = cmpxchg weak ptr %ptr, i32 %expected, i32 %new release seq_cst, align 4 %r = extractvalue { i32, i1 } %pair, 0 ret i32 %r } define dso_local i32 @cmpxchg_i32_aligned_acq_rel_monotonic(i32 %expected, i32 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i32_aligned_acq_rel_monotonic: -; -O0: ldaxr w0, [x2] -; -O0: cmp w0, w9 -; -O0: stlxr w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i32_aligned_acq_rel_monotonic: -; -O1: bl __aarch64_cas4_acq_rel +; CHECK-LABEL: cmpxchg_i32_aligned_acq_rel_monotonic: +; CHECK: bl __aarch64_cas4_acq_rel %pair = cmpxchg ptr %ptr, i32 %expected, i32 %new acq_rel monotonic, align 4 %r = extractvalue { i32, i1 } %pair, 0 ret i32 %r } define dso_local i32 @cmpxchg_i32_aligned_acq_rel_monotonic_weak(i32 %expected, i32 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i32_aligned_acq_rel_monotonic_weak: -; -O0: ldaxr w0, [x2] -; -O0: cmp w0, w9 -; -O0: stlxr w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i32_aligned_acq_rel_monotonic_weak: -; -O1: bl __aarch64_cas4_acq_rel +; CHECK-LABEL: cmpxchg_i32_aligned_acq_rel_monotonic_weak: +; CHECK: bl __aarch64_cas4_acq_rel %pair = cmpxchg weak ptr %ptr, i32 %expected, i32 %new acq_rel monotonic, align 4 %r = extractvalue { i32, i1 } %pair, 0 ret i32 %r } define dso_local i32 @cmpxchg_i32_aligned_acq_rel_acquire(i32 %expected, i32 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i32_aligned_acq_rel_acquire: -; -O0: ldaxr w0, [x2] -; -O0: cmp w0, w9 -; -O0: stlxr w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i32_aligned_acq_rel_acquire: -; -O1: bl __aarch64_cas4_acq_rel +; CHECK-LABEL: cmpxchg_i32_aligned_acq_rel_acquire: +; CHECK: bl __aarch64_cas4_acq_rel %pair = cmpxchg ptr %ptr, i32 %expected, i32 %new acq_rel acquire, align 4 %r = extractvalue { i32, i1 } %pair, 0 ret i32 %r } define dso_local i32 @cmpxchg_i32_aligned_acq_rel_acquire_weak(i32 %expected, i32 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i32_aligned_acq_rel_acquire_weak: -; -O0: ldaxr w0, [x2] -; -O0: cmp w0, w9 -; -O0: stlxr w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i32_aligned_acq_rel_acquire_weak: -; -O1: bl __aarch64_cas4_acq_rel +; CHECK-LABEL: cmpxchg_i32_aligned_acq_rel_acquire_weak: +; CHECK: bl __aarch64_cas4_acq_rel %pair = cmpxchg weak ptr %ptr, i32 %expected, i32 %new acq_rel acquire, align 4 %r = extractvalue { i32, i1 } %pair, 0 ret i32 %r } define dso_local i32 @cmpxchg_i32_aligned_acq_rel_seq_cst(i32 %expected, i32 %new, ptr %ptr) { 
-; -O0-LABEL: cmpxchg_i32_aligned_acq_rel_seq_cst: -; -O0: ldaxr w0, [x2] -; -O0: cmp w0, w9 -; -O0: stlxr w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i32_aligned_acq_rel_seq_cst: -; -O1: bl __aarch64_cas4_acq_rel +; CHECK-LABEL: cmpxchg_i32_aligned_acq_rel_seq_cst: +; CHECK: bl __aarch64_cas4_acq_rel %pair = cmpxchg ptr %ptr, i32 %expected, i32 %new acq_rel seq_cst, align 4 %r = extractvalue { i32, i1 } %pair, 0 ret i32 %r } define dso_local i32 @cmpxchg_i32_aligned_acq_rel_seq_cst_weak(i32 %expected, i32 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i32_aligned_acq_rel_seq_cst_weak: -; -O0: ldaxr w0, [x2] -; -O0: cmp w0, w9 -; -O0: stlxr w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i32_aligned_acq_rel_seq_cst_weak: -; -O1: bl __aarch64_cas4_acq_rel +; CHECK-LABEL: cmpxchg_i32_aligned_acq_rel_seq_cst_weak: +; CHECK: bl __aarch64_cas4_acq_rel %pair = cmpxchg weak ptr %ptr, i32 %expected, i32 %new acq_rel seq_cst, align 4 %r = extractvalue { i32, i1 } %pair, 0 ret i32 %r } define dso_local i32 @cmpxchg_i32_aligned_seq_cst_monotonic(i32 %expected, i32 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i32_aligned_seq_cst_monotonic: -; -O0: ldaxr w0, [x2] -; -O0: cmp w0, w9 -; -O0: stlxr w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i32_aligned_seq_cst_monotonic: -; -O1: bl __aarch64_cas4_acq_rel +; CHECK-LABEL: cmpxchg_i32_aligned_seq_cst_monotonic: +; CHECK: bl __aarch64_cas4_acq_rel %pair = cmpxchg ptr %ptr, i32 %expected, i32 %new seq_cst monotonic, align 4 %r = extractvalue { i32, i1 } %pair, 0 ret i32 %r } define dso_local i32 @cmpxchg_i32_aligned_seq_cst_monotonic_weak(i32 %expected, i32 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i32_aligned_seq_cst_monotonic_weak: -; -O0: ldaxr w0, [x2] -; -O0: cmp w0, w9 -; -O0: stlxr w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i32_aligned_seq_cst_monotonic_weak: -; -O1: bl __aarch64_cas4_acq_rel +; CHECK-LABEL: cmpxchg_i32_aligned_seq_cst_monotonic_weak: +; CHECK: bl __aarch64_cas4_acq_rel %pair = cmpxchg weak ptr %ptr, i32 %expected, i32 %new seq_cst monotonic, align 4 %r = extractvalue { i32, i1 } %pair, 0 ret i32 %r } define dso_local i32 @cmpxchg_i32_aligned_seq_cst_acquire(i32 %expected, i32 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i32_aligned_seq_cst_acquire: -; -O0: ldaxr w0, [x2] -; -O0: cmp w0, w9 -; -O0: stlxr w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i32_aligned_seq_cst_acquire: -; -O1: bl __aarch64_cas4_acq_rel +; CHECK-LABEL: cmpxchg_i32_aligned_seq_cst_acquire: +; CHECK: bl __aarch64_cas4_acq_rel %pair = cmpxchg ptr %ptr, i32 %expected, i32 %new seq_cst acquire, align 4 %r = extractvalue { i32, i1 } %pair, 0 ret i32 %r } define dso_local i32 @cmpxchg_i32_aligned_seq_cst_acquire_weak(i32 %expected, i32 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i32_aligned_seq_cst_acquire_weak: -; -O0: ldaxr w0, [x2] -; -O0: cmp w0, w9 -; -O0: stlxr w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i32_aligned_seq_cst_acquire_weak: -; -O1: bl __aarch64_cas4_acq_rel +; CHECK-LABEL: cmpxchg_i32_aligned_seq_cst_acquire_weak: +; CHECK: bl __aarch64_cas4_acq_rel %pair = cmpxchg weak ptr %ptr, i32 %expected, i32 %new seq_cst acquire, align 4 %r = extractvalue { i32, i1 } %pair, 0 ret i32 %r } define dso_local i32 @cmpxchg_i32_aligned_seq_cst_seq_cst(i32 %expected, i32 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i32_aligned_seq_cst_seq_cst: -; -O0: ldaxr w0, [x2] -; -O0: cmp w0, w9 -; -O0: stlxr w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i32_aligned_seq_cst_seq_cst: -; -O1: bl __aarch64_cas4_acq_rel +; CHECK-LABEL: cmpxchg_i32_aligned_seq_cst_seq_cst: +; CHECK: bl __aarch64_cas4_acq_rel %pair = cmpxchg ptr %ptr, i32 %expected, i32 
%new seq_cst seq_cst, align 4 %r = extractvalue { i32, i1 } %pair, 0 ret i32 %r } define dso_local i32 @cmpxchg_i32_aligned_seq_cst_seq_cst_weak(i32 %expected, i32 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i32_aligned_seq_cst_seq_cst_weak: -; -O0: ldaxr w0, [x2] -; -O0: cmp w0, w9 -; -O0: stlxr w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i32_aligned_seq_cst_seq_cst_weak: -; -O1: bl __aarch64_cas4_acq_rel +; CHECK-LABEL: cmpxchg_i32_aligned_seq_cst_seq_cst_weak: +; CHECK: bl __aarch64_cas4_acq_rel %pair = cmpxchg weak ptr %ptr, i32 %expected, i32 %new seq_cst seq_cst, align 4 %r = extractvalue { i32, i1 } %pair, 0 ret i32 %r } define dso_local i64 @cmpxchg_i64_aligned_monotonic_monotonic(i64 %expected, i64 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i64_aligned_monotonic_monotonic: -; -O0: ldaxr x0, [x2] -; -O0: cmp x0, x9 -; -O0: stlxr w8, x1, [x2] -; -; -O1-LABEL: cmpxchg_i64_aligned_monotonic_monotonic: -; -O1: bl __aarch64_cas8_relax +; CHECK-LABEL: cmpxchg_i64_aligned_monotonic_monotonic: +; CHECK: bl __aarch64_cas8_relax %pair = cmpxchg ptr %ptr, i64 %expected, i64 %new monotonic monotonic, align 8 %r = extractvalue { i64, i1 } %pair, 0 ret i64 %r } define dso_local i64 @cmpxchg_i64_aligned_monotonic_monotonic_weak(i64 %expected, i64 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i64_aligned_monotonic_monotonic_weak: -; -O0: ldaxr x0, [x2] -; -O0: cmp x0, x9 -; -O0: stlxr w8, x1, [x2] -; -; -O1-LABEL: cmpxchg_i64_aligned_monotonic_monotonic_weak: -; -O1: bl __aarch64_cas8_relax +; CHECK-LABEL: cmpxchg_i64_aligned_monotonic_monotonic_weak: +; CHECK: bl __aarch64_cas8_relax %pair = cmpxchg weak ptr %ptr, i64 %expected, i64 %new monotonic monotonic, align 8 %r = extractvalue { i64, i1 } %pair, 0 ret i64 %r } define dso_local i64 @cmpxchg_i64_aligned_monotonic_acquire(i64 %expected, i64 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i64_aligned_monotonic_acquire: -; -O0: ldaxr x0, [x2] -; -O0: cmp x0, x9 -; -O0: stlxr w8, x1, [x2] -; -; -O1-LABEL: cmpxchg_i64_aligned_monotonic_acquire: -; -O1: bl __aarch64_cas8_acq +; CHECK-LABEL: cmpxchg_i64_aligned_monotonic_acquire: +; CHECK: bl __aarch64_cas8_acq %pair = cmpxchg ptr %ptr, i64 %expected, i64 %new monotonic acquire, align 8 %r = extractvalue { i64, i1 } %pair, 0 ret i64 %r } define dso_local i64 @cmpxchg_i64_aligned_monotonic_acquire_weak(i64 %expected, i64 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i64_aligned_monotonic_acquire_weak: -; -O0: ldaxr x0, [x2] -; -O0: cmp x0, x9 -; -O0: stlxr w8, x1, [x2] -; -; -O1-LABEL: cmpxchg_i64_aligned_monotonic_acquire_weak: -; -O1: bl __aarch64_cas8_acq +; CHECK-LABEL: cmpxchg_i64_aligned_monotonic_acquire_weak: +; CHECK: bl __aarch64_cas8_acq %pair = cmpxchg weak ptr %ptr, i64 %expected, i64 %new monotonic acquire, align 8 %r = extractvalue { i64, i1 } %pair, 0 ret i64 %r } define dso_local i64 @cmpxchg_i64_aligned_monotonic_seq_cst(i64 %expected, i64 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i64_aligned_monotonic_seq_cst: -; -O0: ldaxr x0, [x2] -; -O0: cmp x0, x9 -; -O0: stlxr w8, x1, [x2] -; -; -O1-LABEL: cmpxchg_i64_aligned_monotonic_seq_cst: -; -O1: bl __aarch64_cas8_acq_rel +; CHECK-LABEL: cmpxchg_i64_aligned_monotonic_seq_cst: +; CHECK: bl __aarch64_cas8_acq_rel %pair = cmpxchg ptr %ptr, i64 %expected, i64 %new monotonic seq_cst, align 8 %r = extractvalue { i64, i1 } %pair, 0 ret i64 %r } define dso_local i64 @cmpxchg_i64_aligned_monotonic_seq_cst_weak(i64 %expected, i64 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i64_aligned_monotonic_seq_cst_weak: -; -O0: ldaxr x0, [x2] -; -O0: cmp x0, x9 -; -O0: stlxr w8, x1, [x2] -; -; 
-O1-LABEL: cmpxchg_i64_aligned_monotonic_seq_cst_weak: -; -O1: bl __aarch64_cas8_acq_rel +; CHECK-LABEL: cmpxchg_i64_aligned_monotonic_seq_cst_weak: +; CHECK: bl __aarch64_cas8_acq_rel %pair = cmpxchg weak ptr %ptr, i64 %expected, i64 %new monotonic seq_cst, align 8 %r = extractvalue { i64, i1 } %pair, 0 ret i64 %r } define dso_local i64 @cmpxchg_i64_aligned_acquire_monotonic(i64 %expected, i64 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i64_aligned_acquire_monotonic: -; -O0: ldaxr x0, [x2] -; -O0: cmp x0, x9 -; -O0: stlxr w8, x1, [x2] -; -; -O1-LABEL: cmpxchg_i64_aligned_acquire_monotonic: -; -O1: bl __aarch64_cas8_acq +; CHECK-LABEL: cmpxchg_i64_aligned_acquire_monotonic: +; CHECK: bl __aarch64_cas8_acq %pair = cmpxchg ptr %ptr, i64 %expected, i64 %new acquire monotonic, align 8 %r = extractvalue { i64, i1 } %pair, 0 ret i64 %r } define dso_local i64 @cmpxchg_i64_aligned_acquire_monotonic_weak(i64 %expected, i64 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i64_aligned_acquire_monotonic_weak: -; -O0: ldaxr x0, [x2] -; -O0: cmp x0, x9 -; -O0: stlxr w8, x1, [x2] -; -; -O1-LABEL: cmpxchg_i64_aligned_acquire_monotonic_weak: -; -O1: bl __aarch64_cas8_acq +; CHECK-LABEL: cmpxchg_i64_aligned_acquire_monotonic_weak: +; CHECK: bl __aarch64_cas8_acq %pair = cmpxchg weak ptr %ptr, i64 %expected, i64 %new acquire monotonic, align 8 %r = extractvalue { i64, i1 } %pair, 0 ret i64 %r } define dso_local i64 @cmpxchg_i64_aligned_acquire_acquire(i64 %expected, i64 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i64_aligned_acquire_acquire: -; -O0: ldaxr x0, [x2] -; -O0: cmp x0, x9 -; -O0: stlxr w8, x1, [x2] -; -; -O1-LABEL: cmpxchg_i64_aligned_acquire_acquire: -; -O1: bl __aarch64_cas8_acq +; CHECK-LABEL: cmpxchg_i64_aligned_acquire_acquire: +; CHECK: bl __aarch64_cas8_acq %pair = cmpxchg ptr %ptr, i64 %expected, i64 %new acquire acquire, align 8 %r = extractvalue { i64, i1 } %pair, 0 ret i64 %r } define dso_local i64 @cmpxchg_i64_aligned_acquire_acquire_weak(i64 %expected, i64 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i64_aligned_acquire_acquire_weak: -; -O0: ldaxr x0, [x2] -; -O0: cmp x0, x9 -; -O0: stlxr w8, x1, [x2] -; -; -O1-LABEL: cmpxchg_i64_aligned_acquire_acquire_weak: -; -O1: bl __aarch64_cas8_acq +; CHECK-LABEL: cmpxchg_i64_aligned_acquire_acquire_weak: +; CHECK: bl __aarch64_cas8_acq %pair = cmpxchg weak ptr %ptr, i64 %expected, i64 %new acquire acquire, align 8 %r = extractvalue { i64, i1 } %pair, 0 ret i64 %r } define dso_local i64 @cmpxchg_i64_aligned_acquire_seq_cst(i64 %expected, i64 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i64_aligned_acquire_seq_cst: -; -O0: ldaxr x0, [x2] -; -O0: cmp x0, x9 -; -O0: stlxr w8, x1, [x2] -; -; -O1-LABEL: cmpxchg_i64_aligned_acquire_seq_cst: -; -O1: bl __aarch64_cas8_acq_rel +; CHECK-LABEL: cmpxchg_i64_aligned_acquire_seq_cst: +; CHECK: bl __aarch64_cas8_acq_rel %pair = cmpxchg ptr %ptr, i64 %expected, i64 %new acquire seq_cst, align 8 %r = extractvalue { i64, i1 } %pair, 0 ret i64 %r } define dso_local i64 @cmpxchg_i64_aligned_acquire_seq_cst_weak(i64 %expected, i64 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i64_aligned_acquire_seq_cst_weak: -; -O0: ldaxr x0, [x2] -; -O0: cmp x0, x9 -; -O0: stlxr w8, x1, [x2] -; -; -O1-LABEL: cmpxchg_i64_aligned_acquire_seq_cst_weak: -; -O1: bl __aarch64_cas8_acq_rel +; CHECK-LABEL: cmpxchg_i64_aligned_acquire_seq_cst_weak: +; CHECK: bl __aarch64_cas8_acq_rel %pair = cmpxchg weak ptr %ptr, i64 %expected, i64 %new acquire seq_cst, align 8 %r = extractvalue { i64, i1 } %pair, 0 ret i64 %r } define dso_local i64 
@cmpxchg_i64_aligned_release_monotonic(i64 %expected, i64 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i64_aligned_release_monotonic: -; -O0: ldaxr x0, [x2] -; -O0: cmp x0, x9 -; -O0: stlxr w8, x1, [x2] -; -; -O1-LABEL: cmpxchg_i64_aligned_release_monotonic: -; -O1: bl __aarch64_cas8_rel +; CHECK-LABEL: cmpxchg_i64_aligned_release_monotonic: +; CHECK: bl __aarch64_cas8_rel %pair = cmpxchg ptr %ptr, i64 %expected, i64 %new release monotonic, align 8 %r = extractvalue { i64, i1 } %pair, 0 ret i64 %r } define dso_local i64 @cmpxchg_i64_aligned_release_monotonic_weak(i64 %expected, i64 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i64_aligned_release_monotonic_weak: -; -O0: ldaxr x0, [x2] -; -O0: cmp x0, x9 -; -O0: stlxr w8, x1, [x2] -; -; -O1-LABEL: cmpxchg_i64_aligned_release_monotonic_weak: -; -O1: bl __aarch64_cas8_rel +; CHECK-LABEL: cmpxchg_i64_aligned_release_monotonic_weak: +; CHECK: bl __aarch64_cas8_rel %pair = cmpxchg weak ptr %ptr, i64 %expected, i64 %new release monotonic, align 8 %r = extractvalue { i64, i1 } %pair, 0 ret i64 %r } define dso_local i64 @cmpxchg_i64_aligned_release_acquire(i64 %expected, i64 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i64_aligned_release_acquire: -; -O0: ldaxr x0, [x2] -; -O0: cmp x0, x9 -; -O0: stlxr w8, x1, [x2] -; -; -O1-LABEL: cmpxchg_i64_aligned_release_acquire: -; -O1: bl __aarch64_cas8_acq_rel +; CHECK-LABEL: cmpxchg_i64_aligned_release_acquire: +; CHECK: bl __aarch64_cas8_acq_rel %pair = cmpxchg ptr %ptr, i64 %expected, i64 %new release acquire, align 8 %r = extractvalue { i64, i1 } %pair, 0 ret i64 %r } define dso_local i64 @cmpxchg_i64_aligned_release_acquire_weak(i64 %expected, i64 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i64_aligned_release_acquire_weak: -; -O0: ldaxr x0, [x2] -; -O0: cmp x0, x9 -; -O0: stlxr w8, x1, [x2] -; -; -O1-LABEL: cmpxchg_i64_aligned_release_acquire_weak: -; -O1: bl __aarch64_cas8_acq_rel +; CHECK-LABEL: cmpxchg_i64_aligned_release_acquire_weak: +; CHECK: bl __aarch64_cas8_acq_rel %pair = cmpxchg weak ptr %ptr, i64 %expected, i64 %new release acquire, align 8 %r = extractvalue { i64, i1 } %pair, 0 ret i64 %r } define dso_local i64 @cmpxchg_i64_aligned_release_seq_cst(i64 %expected, i64 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i64_aligned_release_seq_cst: -; -O0: ldaxr x0, [x2] -; -O0: cmp x0, x9 -; -O0: stlxr w8, x1, [x2] -; -; -O1-LABEL: cmpxchg_i64_aligned_release_seq_cst: -; -O1: bl __aarch64_cas8_acq_rel +; CHECK-LABEL: cmpxchg_i64_aligned_release_seq_cst: +; CHECK: bl __aarch64_cas8_acq_rel %pair = cmpxchg ptr %ptr, i64 %expected, i64 %new release seq_cst, align 8 %r = extractvalue { i64, i1 } %pair, 0 ret i64 %r } define dso_local i64 @cmpxchg_i64_aligned_release_seq_cst_weak(i64 %expected, i64 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i64_aligned_release_seq_cst_weak: -; -O0: ldaxr x0, [x2] -; -O0: cmp x0, x9 -; -O0: stlxr w8, x1, [x2] -; -; -O1-LABEL: cmpxchg_i64_aligned_release_seq_cst_weak: -; -O1: bl __aarch64_cas8_acq_rel +; CHECK-LABEL: cmpxchg_i64_aligned_release_seq_cst_weak: +; CHECK: bl __aarch64_cas8_acq_rel %pair = cmpxchg weak ptr %ptr, i64 %expected, i64 %new release seq_cst, align 8 %r = extractvalue { i64, i1 } %pair, 0 ret i64 %r } define dso_local i64 @cmpxchg_i64_aligned_acq_rel_monotonic(i64 %expected, i64 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i64_aligned_acq_rel_monotonic: -; -O0: ldaxr x0, [x2] -; -O0: cmp x0, x9 -; -O0: stlxr w8, x1, [x2] -; -; -O1-LABEL: cmpxchg_i64_aligned_acq_rel_monotonic: -; -O1: bl __aarch64_cas8_acq_rel +; CHECK-LABEL: cmpxchg_i64_aligned_acq_rel_monotonic: +; CHECK: bl 
__aarch64_cas8_acq_rel %pair = cmpxchg ptr %ptr, i64 %expected, i64 %new acq_rel monotonic, align 8 %r = extractvalue { i64, i1 } %pair, 0 ret i64 %r } define dso_local i64 @cmpxchg_i64_aligned_acq_rel_monotonic_weak(i64 %expected, i64 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i64_aligned_acq_rel_monotonic_weak: -; -O0: ldaxr x0, [x2] -; -O0: cmp x0, x9 -; -O0: stlxr w8, x1, [x2] -; -; -O1-LABEL: cmpxchg_i64_aligned_acq_rel_monotonic_weak: -; -O1: bl __aarch64_cas8_acq_rel +; CHECK-LABEL: cmpxchg_i64_aligned_acq_rel_monotonic_weak: +; CHECK: bl __aarch64_cas8_acq_rel %pair = cmpxchg weak ptr %ptr, i64 %expected, i64 %new acq_rel monotonic, align 8 %r = extractvalue { i64, i1 } %pair, 0 ret i64 %r } define dso_local i64 @cmpxchg_i64_aligned_acq_rel_acquire(i64 %expected, i64 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i64_aligned_acq_rel_acquire: -; -O0: ldaxr x0, [x2] -; -O0: cmp x0, x9 -; -O0: stlxr w8, x1, [x2] -; -; -O1-LABEL: cmpxchg_i64_aligned_acq_rel_acquire: -; -O1: bl __aarch64_cas8_acq_rel +; CHECK-LABEL: cmpxchg_i64_aligned_acq_rel_acquire: +; CHECK: bl __aarch64_cas8_acq_rel %pair = cmpxchg ptr %ptr, i64 %expected, i64 %new acq_rel acquire, align 8 %r = extractvalue { i64, i1 } %pair, 0 ret i64 %r } define dso_local i64 @cmpxchg_i64_aligned_acq_rel_acquire_weak(i64 %expected, i64 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i64_aligned_acq_rel_acquire_weak: -; -O0: ldaxr x0, [x2] -; -O0: cmp x0, x9 -; -O0: stlxr w8, x1, [x2] -; -; -O1-LABEL: cmpxchg_i64_aligned_acq_rel_acquire_weak: -; -O1: bl __aarch64_cas8_acq_rel +; CHECK-LABEL: cmpxchg_i64_aligned_acq_rel_acquire_weak: +; CHECK: bl __aarch64_cas8_acq_rel %pair = cmpxchg weak ptr %ptr, i64 %expected, i64 %new acq_rel acquire, align 8 %r = extractvalue { i64, i1 } %pair, 0 ret i64 %r } define dso_local i64 @cmpxchg_i64_aligned_acq_rel_seq_cst(i64 %expected, i64 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i64_aligned_acq_rel_seq_cst: -; -O0: ldaxr x0, [x2] -; -O0: cmp x0, x9 -; -O0: stlxr w8, x1, [x2] -; -; -O1-LABEL: cmpxchg_i64_aligned_acq_rel_seq_cst: -; -O1: bl __aarch64_cas8_acq_rel +; CHECK-LABEL: cmpxchg_i64_aligned_acq_rel_seq_cst: +; CHECK: bl __aarch64_cas8_acq_rel %pair = cmpxchg ptr %ptr, i64 %expected, i64 %new acq_rel seq_cst, align 8 %r = extractvalue { i64, i1 } %pair, 0 ret i64 %r } define dso_local i64 @cmpxchg_i64_aligned_acq_rel_seq_cst_weak(i64 %expected, i64 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i64_aligned_acq_rel_seq_cst_weak: -; -O0: ldaxr x0, [x2] -; -O0: cmp x0, x9 -; -O0: stlxr w8, x1, [x2] -; -; -O1-LABEL: cmpxchg_i64_aligned_acq_rel_seq_cst_weak: -; -O1: bl __aarch64_cas8_acq_rel +; CHECK-LABEL: cmpxchg_i64_aligned_acq_rel_seq_cst_weak: +; CHECK: bl __aarch64_cas8_acq_rel %pair = cmpxchg weak ptr %ptr, i64 %expected, i64 %new acq_rel seq_cst, align 8 %r = extractvalue { i64, i1 } %pair, 0 ret i64 %r } define dso_local i64 @cmpxchg_i64_aligned_seq_cst_monotonic(i64 %expected, i64 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i64_aligned_seq_cst_monotonic: -; -O0: ldaxr x0, [x2] -; -O0: cmp x0, x9 -; -O0: stlxr w8, x1, [x2] -; -; -O1-LABEL: cmpxchg_i64_aligned_seq_cst_monotonic: -; -O1: bl __aarch64_cas8_acq_rel +; CHECK-LABEL: cmpxchg_i64_aligned_seq_cst_monotonic: +; CHECK: bl __aarch64_cas8_acq_rel %pair = cmpxchg ptr %ptr, i64 %expected, i64 %new seq_cst monotonic, align 8 %r = extractvalue { i64, i1 } %pair, 0 ret i64 %r } define dso_local i64 @cmpxchg_i64_aligned_seq_cst_monotonic_weak(i64 %expected, i64 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i64_aligned_seq_cst_monotonic_weak: -; -O0: ldaxr x0, [x2] -; -O0: 
cmp x0, x9 -; -O0: stlxr w8, x1, [x2] -; -; -O1-LABEL: cmpxchg_i64_aligned_seq_cst_monotonic_weak: -; -O1: bl __aarch64_cas8_acq_rel +; CHECK-LABEL: cmpxchg_i64_aligned_seq_cst_monotonic_weak: +; CHECK: bl __aarch64_cas8_acq_rel %pair = cmpxchg weak ptr %ptr, i64 %expected, i64 %new seq_cst monotonic, align 8 %r = extractvalue { i64, i1 } %pair, 0 ret i64 %r } define dso_local i64 @cmpxchg_i64_aligned_seq_cst_acquire(i64 %expected, i64 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i64_aligned_seq_cst_acquire: -; -O0: ldaxr x0, [x2] -; -O0: cmp x0, x9 -; -O0: stlxr w8, x1, [x2] -; -; -O1-LABEL: cmpxchg_i64_aligned_seq_cst_acquire: -; -O1: bl __aarch64_cas8_acq_rel +; CHECK-LABEL: cmpxchg_i64_aligned_seq_cst_acquire: +; CHECK: bl __aarch64_cas8_acq_rel %pair = cmpxchg ptr %ptr, i64 %expected, i64 %new seq_cst acquire, align 8 %r = extractvalue { i64, i1 } %pair, 0 ret i64 %r } define dso_local i64 @cmpxchg_i64_aligned_seq_cst_acquire_weak(i64 %expected, i64 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i64_aligned_seq_cst_acquire_weak: -; -O0: ldaxr x0, [x2] -; -O0: cmp x0, x9 -; -O0: stlxr w8, x1, [x2] -; -; -O1-LABEL: cmpxchg_i64_aligned_seq_cst_acquire_weak: -; -O1: bl __aarch64_cas8_acq_rel +; CHECK-LABEL: cmpxchg_i64_aligned_seq_cst_acquire_weak: +; CHECK: bl __aarch64_cas8_acq_rel %pair = cmpxchg weak ptr %ptr, i64 %expected, i64 %new seq_cst acquire, align 8 %r = extractvalue { i64, i1 } %pair, 0 ret i64 %r } define dso_local i64 @cmpxchg_i64_aligned_seq_cst_seq_cst(i64 %expected, i64 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i64_aligned_seq_cst_seq_cst: -; -O0: ldaxr x0, [x2] -; -O0: cmp x0, x9 -; -O0: stlxr w8, x1, [x2] -; -; -O1-LABEL: cmpxchg_i64_aligned_seq_cst_seq_cst: -; -O1: bl __aarch64_cas8_acq_rel +; CHECK-LABEL: cmpxchg_i64_aligned_seq_cst_seq_cst: +; CHECK: bl __aarch64_cas8_acq_rel %pair = cmpxchg ptr %ptr, i64 %expected, i64 %new seq_cst seq_cst, align 8 %r = extractvalue { i64, i1 } %pair, 0 ret i64 %r } define dso_local i64 @cmpxchg_i64_aligned_seq_cst_seq_cst_weak(i64 %expected, i64 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i64_aligned_seq_cst_seq_cst_weak: -; -O0: ldaxr x0, [x2] -; -O0: cmp x0, x9 -; -O0: stlxr w8, x1, [x2] -; -; -O1-LABEL: cmpxchg_i64_aligned_seq_cst_seq_cst_weak: -; -O1: bl __aarch64_cas8_acq_rel +; CHECK-LABEL: cmpxchg_i64_aligned_seq_cst_seq_cst_weak: +; CHECK: bl __aarch64_cas8_acq_rel %pair = cmpxchg weak ptr %ptr, i64 %expected, i64 %new seq_cst seq_cst, align 8 %r = extractvalue { i64, i1 } %pair, 0 ret i64 %r } define dso_local i128 @cmpxchg_i128_aligned_monotonic_monotonic(i128 %expected, i128 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i128_aligned_monotonic_monotonic: -; -O0: ldxp x0, x1, [x4] -; -O0: cmp x0, x9 -; -O0: cmp x1, x10 -; -O0: stxp w8, x2, x3, [x4] -; -O0: stxp w8, x0, x1, [x4] -; -; -O1-LABEL: cmpxchg_i128_aligned_monotonic_monotonic: -; -O1: bl __aarch64_cas16_relax +; CHECK-LABEL: cmpxchg_i128_aligned_monotonic_monotonic: +; CHECK: bl __aarch64_cas16_relax %pair = cmpxchg ptr %ptr, i128 %expected, i128 %new monotonic monotonic, align 16 %r = extractvalue { i128, i1 } %pair, 0 ret i128 %r } define dso_local i128 @cmpxchg_i128_aligned_monotonic_monotonic_weak(i128 %expected, i128 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i128_aligned_monotonic_monotonic_weak: -; -O0: ldxp x0, x1, [x4] -; -O0: cmp x0, x9 -; -O0: cmp x1, x10 -; -O0: stxp w8, x2, x3, [x4] -; -O0: stxp w8, x0, x1, [x4] -; -; -O1-LABEL: cmpxchg_i128_aligned_monotonic_monotonic_weak: -; -O1: bl __aarch64_cas16_relax +; CHECK-LABEL: cmpxchg_i128_aligned_monotonic_monotonic_weak: +; 
CHECK: bl __aarch64_cas16_relax %pair = cmpxchg weak ptr %ptr, i128 %expected, i128 %new monotonic monotonic, align 16 %r = extractvalue { i128, i1 } %pair, 0 ret i128 %r } define dso_local i128 @cmpxchg_i128_aligned_monotonic_acquire(i128 %expected, i128 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i128_aligned_monotonic_acquire: -; -O0: ldaxp x0, x1, [x4] -; -O0: cmp x0, x9 -; -O0: cmp x1, x10 -; -O0: stxp w8, x2, x3, [x4] -; -O0: stxp w8, x0, x1, [x4] -; -; -O1-LABEL: cmpxchg_i128_aligned_monotonic_acquire: -; -O1: bl __aarch64_cas16_acq +; CHECK-LABEL: cmpxchg_i128_aligned_monotonic_acquire: +; CHECK: bl __aarch64_cas16_acq %pair = cmpxchg ptr %ptr, i128 %expected, i128 %new monotonic acquire, align 16 %r = extractvalue { i128, i1 } %pair, 0 ret i128 %r } define dso_local i128 @cmpxchg_i128_aligned_monotonic_acquire_weak(i128 %expected, i128 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i128_aligned_monotonic_acquire_weak: -; -O0: ldaxp x0, x1, [x4] -; -O0: cmp x0, x9 -; -O0: cmp x1, x10 -; -O0: stxp w8, x2, x3, [x4] -; -O0: stxp w8, x0, x1, [x4] -; -; -O1-LABEL: cmpxchg_i128_aligned_monotonic_acquire_weak: -; -O1: bl __aarch64_cas16_acq +; CHECK-LABEL: cmpxchg_i128_aligned_monotonic_acquire_weak: +; CHECK: bl __aarch64_cas16_acq %pair = cmpxchg weak ptr %ptr, i128 %expected, i128 %new monotonic acquire, align 16 %r = extractvalue { i128, i1 } %pair, 0 ret i128 %r } define dso_local i128 @cmpxchg_i128_aligned_monotonic_seq_cst(i128 %expected, i128 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i128_aligned_monotonic_seq_cst: -; -O0: ldaxp x0, x1, [x4] -; -O0: cmp x0, x9 -; -O0: cmp x1, x10 -; -O0: stlxp w8, x2, x3, [x4] -; -O0: stlxp w8, x0, x1, [x4] -; -; -O1-LABEL: cmpxchg_i128_aligned_monotonic_seq_cst: -; -O1: bl __aarch64_cas16_acq_rel +; CHECK-LABEL: cmpxchg_i128_aligned_monotonic_seq_cst: +; CHECK: bl __aarch64_cas16_acq_rel %pair = cmpxchg ptr %ptr, i128 %expected, i128 %new monotonic seq_cst, align 16 %r = extractvalue { i128, i1 } %pair, 0 ret i128 %r } define dso_local i128 @cmpxchg_i128_aligned_monotonic_seq_cst_weak(i128 %expected, i128 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i128_aligned_monotonic_seq_cst_weak: -; -O0: ldaxp x0, x1, [x4] -; -O0: cmp x0, x9 -; -O0: cmp x1, x10 -; -O0: stlxp w8, x2, x3, [x4] -; -O0: stlxp w8, x0, x1, [x4] -; -; -O1-LABEL: cmpxchg_i128_aligned_monotonic_seq_cst_weak: -; -O1: bl __aarch64_cas16_acq_rel +; CHECK-LABEL: cmpxchg_i128_aligned_monotonic_seq_cst_weak: +; CHECK: bl __aarch64_cas16_acq_rel %pair = cmpxchg weak ptr %ptr, i128 %expected, i128 %new monotonic seq_cst, align 16 %r = extractvalue { i128, i1 } %pair, 0 ret i128 %r } define dso_local i128 @cmpxchg_i128_aligned_acquire_monotonic(i128 %expected, i128 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i128_aligned_acquire_monotonic: -; -O0: ldaxp x0, x1, [x4] -; -O0: cmp x0, x9 -; -O0: cmp x1, x10 -; -O0: stxp w8, x2, x3, [x4] -; -O0: stxp w8, x0, x1, [x4] -; -; -O1-LABEL: cmpxchg_i128_aligned_acquire_monotonic: -; -O1: bl __aarch64_cas16_acq +; CHECK-LABEL: cmpxchg_i128_aligned_acquire_monotonic: +; CHECK: bl __aarch64_cas16_acq %pair = cmpxchg ptr %ptr, i128 %expected, i128 %new acquire monotonic, align 16 %r = extractvalue { i128, i1 } %pair, 0 ret i128 %r } define dso_local i128 @cmpxchg_i128_aligned_acquire_monotonic_weak(i128 %expected, i128 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i128_aligned_acquire_monotonic_weak: -; -O0: ldaxp x0, x1, [x4] -; -O0: cmp x0, x9 -; -O0: cmp x1, x10 -; -O0: stxp w8, x2, x3, [x4] -; -O0: stxp w8, x0, x1, [x4] -; -; -O1-LABEL: 
cmpxchg_i128_aligned_acquire_monotonic_weak: -; -O1: bl __aarch64_cas16_acq +; CHECK-LABEL: cmpxchg_i128_aligned_acquire_monotonic_weak: +; CHECK: bl __aarch64_cas16_acq %pair = cmpxchg weak ptr %ptr, i128 %expected, i128 %new acquire monotonic, align 16 %r = extractvalue { i128, i1 } %pair, 0 ret i128 %r } define dso_local i128 @cmpxchg_i128_aligned_acquire_acquire(i128 %expected, i128 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i128_aligned_acquire_acquire: -; -O0: ldaxp x0, x1, [x4] -; -O0: cmp x0, x9 -; -O0: cmp x1, x10 -; -O0: stxp w8, x2, x3, [x4] -; -O0: stxp w8, x0, x1, [x4] -; -; -O1-LABEL: cmpxchg_i128_aligned_acquire_acquire: -; -O1: bl __aarch64_cas16_acq +; CHECK-LABEL: cmpxchg_i128_aligned_acquire_acquire: +; CHECK: bl __aarch64_cas16_acq %pair = cmpxchg ptr %ptr, i128 %expected, i128 %new acquire acquire, align 16 %r = extractvalue { i128, i1 } %pair, 0 ret i128 %r } define dso_local i128 @cmpxchg_i128_aligned_acquire_acquire_weak(i128 %expected, i128 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i128_aligned_acquire_acquire_weak: -; -O0: ldaxp x0, x1, [x4] -; -O0: cmp x0, x9 -; -O0: cmp x1, x10 -; -O0: stxp w8, x2, x3, [x4] -; -O0: stxp w8, x0, x1, [x4] -; -; -O1-LABEL: cmpxchg_i128_aligned_acquire_acquire_weak: -; -O1: bl __aarch64_cas16_acq +; CHECK-LABEL: cmpxchg_i128_aligned_acquire_acquire_weak: +; CHECK: bl __aarch64_cas16_acq %pair = cmpxchg weak ptr %ptr, i128 %expected, i128 %new acquire acquire, align 16 %r = extractvalue { i128, i1 } %pair, 0 ret i128 %r } define dso_local i128 @cmpxchg_i128_aligned_acquire_seq_cst(i128 %expected, i128 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i128_aligned_acquire_seq_cst: -; -O0: ldaxp x0, x1, [x4] -; -O0: cmp x0, x9 -; -O0: cmp x1, x10 -; -O0: stlxp w8, x2, x3, [x4] -; -O0: stlxp w8, x0, x1, [x4] -; -; -O1-LABEL: cmpxchg_i128_aligned_acquire_seq_cst: -; -O1: bl __aarch64_cas16_acq_rel +; CHECK-LABEL: cmpxchg_i128_aligned_acquire_seq_cst: +; CHECK: bl __aarch64_cas16_acq_rel %pair = cmpxchg ptr %ptr, i128 %expected, i128 %new acquire seq_cst, align 16 %r = extractvalue { i128, i1 } %pair, 0 ret i128 %r } define dso_local i128 @cmpxchg_i128_aligned_acquire_seq_cst_weak(i128 %expected, i128 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i128_aligned_acquire_seq_cst_weak: -; -O0: ldaxp x0, x1, [x4] -; -O0: cmp x0, x9 -; -O0: cmp x1, x10 -; -O0: stlxp w8, x2, x3, [x4] -; -O0: stlxp w8, x0, x1, [x4] -; -; -O1-LABEL: cmpxchg_i128_aligned_acquire_seq_cst_weak: -; -O1: bl __aarch64_cas16_acq_rel +; CHECK-LABEL: cmpxchg_i128_aligned_acquire_seq_cst_weak: +; CHECK: bl __aarch64_cas16_acq_rel %pair = cmpxchg weak ptr %ptr, i128 %expected, i128 %new acquire seq_cst, align 16 %r = extractvalue { i128, i1 } %pair, 0 ret i128 %r } define dso_local i128 @cmpxchg_i128_aligned_release_monotonic(i128 %expected, i128 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i128_aligned_release_monotonic: -; -O0: ldxp x0, x1, [x4] -; -O0: cmp x0, x9 -; -O0: cmp x1, x10 -; -O0: stlxp w8, x2, x3, [x4] -; -O0: stlxp w8, x0, x1, [x4] -; -; -O1-LABEL: cmpxchg_i128_aligned_release_monotonic: -; -O1: bl __aarch64_cas16_rel +; CHECK-LABEL: cmpxchg_i128_aligned_release_monotonic: +; CHECK: bl __aarch64_cas16_rel %pair = cmpxchg ptr %ptr, i128 %expected, i128 %new release monotonic, align 16 %r = extractvalue { i128, i1 } %pair, 0 ret i128 %r } define dso_local i128 @cmpxchg_i128_aligned_release_monotonic_weak(i128 %expected, i128 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i128_aligned_release_monotonic_weak: -; -O0: ldxp x0, x1, [x4] -; -O0: cmp x0, x9 -; -O0: cmp x1, x10 -; -O0: stlxp w8, x2, 
x3, [x4] -; -O0: stlxp w8, x0, x1, [x4] -; -; -O1-LABEL: cmpxchg_i128_aligned_release_monotonic_weak: -; -O1: bl __aarch64_cas16_rel +; CHECK-LABEL: cmpxchg_i128_aligned_release_monotonic_weak: +; CHECK: bl __aarch64_cas16_rel %pair = cmpxchg weak ptr %ptr, i128 %expected, i128 %new release monotonic, align 16 %r = extractvalue { i128, i1 } %pair, 0 ret i128 %r } define dso_local i128 @cmpxchg_i128_aligned_release_acquire(i128 %expected, i128 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i128_aligned_release_acquire: -; -O0: ldaxp x0, x1, [x4] -; -O0: cmp x0, x9 -; -O0: cmp x1, x10 -; -O0: stlxp w8, x2, x3, [x4] -; -O0: stlxp w8, x0, x1, [x4] -; -; -O1-LABEL: cmpxchg_i128_aligned_release_acquire: -; -O1: bl __aarch64_cas16_acq_rel +; CHECK-LABEL: cmpxchg_i128_aligned_release_acquire: +; CHECK: bl __aarch64_cas16_acq_rel %pair = cmpxchg ptr %ptr, i128 %expected, i128 %new release acquire, align 16 %r = extractvalue { i128, i1 } %pair, 0 ret i128 %r } define dso_local i128 @cmpxchg_i128_aligned_release_acquire_weak(i128 %expected, i128 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i128_aligned_release_acquire_weak: -; -O0: ldaxp x0, x1, [x4] -; -O0: cmp x0, x9 -; -O0: cmp x1, x10 -; -O0: stlxp w8, x2, x3, [x4] -; -O0: stlxp w8, x0, x1, [x4] -; -; -O1-LABEL: cmpxchg_i128_aligned_release_acquire_weak: -; -O1: bl __aarch64_cas16_acq_rel +; CHECK-LABEL: cmpxchg_i128_aligned_release_acquire_weak: +; CHECK: bl __aarch64_cas16_acq_rel %pair = cmpxchg weak ptr %ptr, i128 %expected, i128 %new release acquire, align 16 %r = extractvalue { i128, i1 } %pair, 0 ret i128 %r } define dso_local i128 @cmpxchg_i128_aligned_release_seq_cst(i128 %expected, i128 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i128_aligned_release_seq_cst: -; -O0: ldaxp x0, x1, [x4] -; -O0: cmp x0, x9 -; -O0: cmp x1, x10 -; -O0: stlxp w8, x2, x3, [x4] -; -O0: stlxp w8, x0, x1, [x4] -; -; -O1-LABEL: cmpxchg_i128_aligned_release_seq_cst: -; -O1: bl __aarch64_cas16_acq_rel +; CHECK-LABEL: cmpxchg_i128_aligned_release_seq_cst: +; CHECK: bl __aarch64_cas16_acq_rel %pair = cmpxchg ptr %ptr, i128 %expected, i128 %new release seq_cst, align 16 %r = extractvalue { i128, i1 } %pair, 0 ret i128 %r } define dso_local i128 @cmpxchg_i128_aligned_release_seq_cst_weak(i128 %expected, i128 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i128_aligned_release_seq_cst_weak: -; -O0: ldaxp x0, x1, [x4] -; -O0: cmp x0, x9 -; -O0: cmp x1, x10 -; -O0: stlxp w8, x2, x3, [x4] -; -O0: stlxp w8, x0, x1, [x4] -; -; -O1-LABEL: cmpxchg_i128_aligned_release_seq_cst_weak: -; -O1: bl __aarch64_cas16_acq_rel +; CHECK-LABEL: cmpxchg_i128_aligned_release_seq_cst_weak: +; CHECK: bl __aarch64_cas16_acq_rel %pair = cmpxchg weak ptr %ptr, i128 %expected, i128 %new release seq_cst, align 16 %r = extractvalue { i128, i1 } %pair, 0 ret i128 %r } define dso_local i128 @cmpxchg_i128_aligned_acq_rel_monotonic(i128 %expected, i128 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i128_aligned_acq_rel_monotonic: -; -O0: ldaxp x0, x1, [x4] -; -O0: cmp x0, x9 -; -O0: cmp x1, x10 -; -O0: stlxp w8, x2, x3, [x4] -; -O0: stlxp w8, x0, x1, [x4] -; -; -O1-LABEL: cmpxchg_i128_aligned_acq_rel_monotonic: -; -O1: bl __aarch64_cas16_acq_rel +; CHECK-LABEL: cmpxchg_i128_aligned_acq_rel_monotonic: +; CHECK: bl __aarch64_cas16_acq_rel %pair = cmpxchg ptr %ptr, i128 %expected, i128 %new acq_rel monotonic, align 16 %r = extractvalue { i128, i1 } %pair, 0 ret i128 %r } define dso_local i128 @cmpxchg_i128_aligned_acq_rel_monotonic_weak(i128 %expected, i128 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i128_aligned_acq_rel_monotonic_weak: -; 
-O0: ldaxp x0, x1, [x4] -; -O0: cmp x0, x9 -; -O0: cmp x1, x10 -; -O0: stlxp w8, x2, x3, [x4] -; -O0: stlxp w8, x0, x1, [x4] -; -; -O1-LABEL: cmpxchg_i128_aligned_acq_rel_monotonic_weak: -; -O1: bl __aarch64_cas16_acq_rel +; CHECK-LABEL: cmpxchg_i128_aligned_acq_rel_monotonic_weak: +; CHECK: bl __aarch64_cas16_acq_rel %pair = cmpxchg weak ptr %ptr, i128 %expected, i128 %new acq_rel monotonic, align 16 %r = extractvalue { i128, i1 } %pair, 0 ret i128 %r } define dso_local i128 @cmpxchg_i128_aligned_acq_rel_acquire(i128 %expected, i128 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i128_aligned_acq_rel_acquire: -; -O0: ldaxp x0, x1, [x4] -; -O0: cmp x0, x9 -; -O0: cmp x1, x10 -; -O0: stlxp w8, x2, x3, [x4] -; -O0: stlxp w8, x0, x1, [x4] -; -; -O1-LABEL: cmpxchg_i128_aligned_acq_rel_acquire: -; -O1: bl __aarch64_cas16_acq_rel +; CHECK-LABEL: cmpxchg_i128_aligned_acq_rel_acquire: +; CHECK: bl __aarch64_cas16_acq_rel %pair = cmpxchg ptr %ptr, i128 %expected, i128 %new acq_rel acquire, align 16 %r = extractvalue { i128, i1 } %pair, 0 ret i128 %r } define dso_local i128 @cmpxchg_i128_aligned_acq_rel_acquire_weak(i128 %expected, i128 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i128_aligned_acq_rel_acquire_weak: -; -O0: ldaxp x0, x1, [x4] -; -O0: cmp x0, x9 -; -O0: cmp x1, x10 -; -O0: stlxp w8, x2, x3, [x4] -; -O0: stlxp w8, x0, x1, [x4] -; -; -O1-LABEL: cmpxchg_i128_aligned_acq_rel_acquire_weak: -; -O1: bl __aarch64_cas16_acq_rel +; CHECK-LABEL: cmpxchg_i128_aligned_acq_rel_acquire_weak: +; CHECK: bl __aarch64_cas16_acq_rel %pair = cmpxchg weak ptr %ptr, i128 %expected, i128 %new acq_rel acquire, align 16 %r = extractvalue { i128, i1 } %pair, 0 ret i128 %r } define dso_local i128 @cmpxchg_i128_aligned_acq_rel_seq_cst(i128 %expected, i128 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i128_aligned_acq_rel_seq_cst: -; -O0: ldaxp x0, x1, [x4] -; -O0: cmp x0, x9 -; -O0: cmp x1, x10 -; -O0: stlxp w8, x2, x3, [x4] -; -O0: stlxp w8, x0, x1, [x4] -; -; -O1-LABEL: cmpxchg_i128_aligned_acq_rel_seq_cst: -; -O1: bl __aarch64_cas16_acq_rel +; CHECK-LABEL: cmpxchg_i128_aligned_acq_rel_seq_cst: +; CHECK: bl __aarch64_cas16_acq_rel %pair = cmpxchg ptr %ptr, i128 %expected, i128 %new acq_rel seq_cst, align 16 %r = extractvalue { i128, i1 } %pair, 0 ret i128 %r } define dso_local i128 @cmpxchg_i128_aligned_acq_rel_seq_cst_weak(i128 %expected, i128 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i128_aligned_acq_rel_seq_cst_weak: -; -O0: ldaxp x0, x1, [x4] -; -O0: cmp x0, x9 -; -O0: cmp x1, x10 -; -O0: stlxp w8, x2, x3, [x4] -; -O0: stlxp w8, x0, x1, [x4] -; -; -O1-LABEL: cmpxchg_i128_aligned_acq_rel_seq_cst_weak: -; -O1: bl __aarch64_cas16_acq_rel +; CHECK-LABEL: cmpxchg_i128_aligned_acq_rel_seq_cst_weak: +; CHECK: bl __aarch64_cas16_acq_rel %pair = cmpxchg weak ptr %ptr, i128 %expected, i128 %new acq_rel seq_cst, align 16 %r = extractvalue { i128, i1 } %pair, 0 ret i128 %r } define dso_local i128 @cmpxchg_i128_aligned_seq_cst_monotonic(i128 %expected, i128 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i128_aligned_seq_cst_monotonic: -; -O0: ldaxp x0, x1, [x4] -; -O0: cmp x0, x9 -; -O0: cmp x1, x10 -; -O0: stlxp w8, x2, x3, [x4] -; -O0: stlxp w8, x0, x1, [x4] -; -; -O1-LABEL: cmpxchg_i128_aligned_seq_cst_monotonic: -; -O1: bl __aarch64_cas16_acq_rel +; CHECK-LABEL: cmpxchg_i128_aligned_seq_cst_monotonic: +; CHECK: bl __aarch64_cas16_acq_rel %pair = cmpxchg ptr %ptr, i128 %expected, i128 %new seq_cst monotonic, align 16 %r = extractvalue { i128, i1 } %pair, 0 ret i128 %r } define dso_local i128 @cmpxchg_i128_aligned_seq_cst_monotonic_weak(i128 
%expected, i128 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i128_aligned_seq_cst_monotonic_weak: -; -O0: ldaxp x0, x1, [x4] -; -O0: cmp x0, x9 -; -O0: cmp x1, x10 -; -O0: stlxp w8, x2, x3, [x4] -; -O0: stlxp w8, x0, x1, [x4] -; -; -O1-LABEL: cmpxchg_i128_aligned_seq_cst_monotonic_weak: -; -O1: bl __aarch64_cas16_acq_rel +; CHECK-LABEL: cmpxchg_i128_aligned_seq_cst_monotonic_weak: +; CHECK: bl __aarch64_cas16_acq_rel %pair = cmpxchg weak ptr %ptr, i128 %expected, i128 %new seq_cst monotonic, align 16 %r = extractvalue { i128, i1 } %pair, 0 ret i128 %r } define dso_local i128 @cmpxchg_i128_aligned_seq_cst_acquire(i128 %expected, i128 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i128_aligned_seq_cst_acquire: -; -O0: ldaxp x0, x1, [x4] -; -O0: cmp x0, x9 -; -O0: cmp x1, x10 -; -O0: stlxp w8, x2, x3, [x4] -; -O0: stlxp w8, x0, x1, [x4] -; -; -O1-LABEL: cmpxchg_i128_aligned_seq_cst_acquire: -; -O1: bl __aarch64_cas16_acq_rel +; CHECK-LABEL: cmpxchg_i128_aligned_seq_cst_acquire: +; CHECK: bl __aarch64_cas16_acq_rel %pair = cmpxchg ptr %ptr, i128 %expected, i128 %new seq_cst acquire, align 16 %r = extractvalue { i128, i1 } %pair, 0 ret i128 %r } define dso_local i128 @cmpxchg_i128_aligned_seq_cst_acquire_weak(i128 %expected, i128 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i128_aligned_seq_cst_acquire_weak: -; -O0: ldaxp x0, x1, [x4] -; -O0: cmp x0, x9 -; -O0: cmp x1, x10 -; -O0: stlxp w8, x2, x3, [x4] -; -O0: stlxp w8, x0, x1, [x4] -; -; -O1-LABEL: cmpxchg_i128_aligned_seq_cst_acquire_weak: -; -O1: bl __aarch64_cas16_acq_rel +; CHECK-LABEL: cmpxchg_i128_aligned_seq_cst_acquire_weak: +; CHECK: bl __aarch64_cas16_acq_rel %pair = cmpxchg weak ptr %ptr, i128 %expected, i128 %new seq_cst acquire, align 16 %r = extractvalue { i128, i1 } %pair, 0 ret i128 %r } define dso_local i128 @cmpxchg_i128_aligned_seq_cst_seq_cst(i128 %expected, i128 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i128_aligned_seq_cst_seq_cst: -; -O0: ldaxp x0, x1, [x4] -; -O0: cmp x0, x9 -; -O0: cmp x1, x10 -; -O0: stlxp w8, x2, x3, [x4] -; -O0: stlxp w8, x0, x1, [x4] -; -; -O1-LABEL: cmpxchg_i128_aligned_seq_cst_seq_cst: -; -O1: bl __aarch64_cas16_acq_rel +; CHECK-LABEL: cmpxchg_i128_aligned_seq_cst_seq_cst: +; CHECK: bl __aarch64_cas16_acq_rel %pair = cmpxchg ptr %ptr, i128 %expected, i128 %new seq_cst seq_cst, align 16 %r = extractvalue { i128, i1 } %pair, 0 ret i128 %r } define dso_local i128 @cmpxchg_i128_aligned_seq_cst_seq_cst_weak(i128 %expected, i128 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i128_aligned_seq_cst_seq_cst_weak: -; -O0: ldaxp x0, x1, [x4] -; -O0: cmp x0, x9 -; -O0: cmp x1, x10 -; -O0: stlxp w8, x2, x3, [x4] -; -O0: stlxp w8, x0, x1, [x4] -; -; -O1-LABEL: cmpxchg_i128_aligned_seq_cst_seq_cst_weak: -; -O1: bl __aarch64_cas16_acq_rel +; CHECK-LABEL: cmpxchg_i128_aligned_seq_cst_seq_cst_weak: +; CHECK: bl __aarch64_cas16_acq_rel %pair = cmpxchg weak ptr %ptr, i128 %expected, i128 %new seq_cst seq_cst, align 16 %r = extractvalue { i128, i1 } %pair, 0 ret i128 %r } define dso_local i8 @cmpxchg_i8_unaligned_monotonic_monotonic(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_unaligned_monotonic_monotonic: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_unaligned_monotonic_monotonic: -; -O1: bl __aarch64_cas1_relax +; CHECK-LABEL: cmpxchg_i8_unaligned_monotonic_monotonic: +; CHECK: bl __aarch64_cas1_relax %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new monotonic monotonic, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 
@cmpxchg_i8_unaligned_monotonic_monotonic_weak(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_unaligned_monotonic_monotonic_weak: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_unaligned_monotonic_monotonic_weak: -; -O1: bl __aarch64_cas1_relax +; CHECK-LABEL: cmpxchg_i8_unaligned_monotonic_monotonic_weak: +; CHECK: bl __aarch64_cas1_relax %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new monotonic monotonic, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_unaligned_monotonic_acquire(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_unaligned_monotonic_acquire: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_unaligned_monotonic_acquire: -; -O1: bl __aarch64_cas1_acq +; CHECK-LABEL: cmpxchg_i8_unaligned_monotonic_acquire: +; CHECK: bl __aarch64_cas1_acq %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new monotonic acquire, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_unaligned_monotonic_acquire_weak(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_unaligned_monotonic_acquire_weak: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_unaligned_monotonic_acquire_weak: -; -O1: bl __aarch64_cas1_acq +; CHECK-LABEL: cmpxchg_i8_unaligned_monotonic_acquire_weak: +; CHECK: bl __aarch64_cas1_acq %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new monotonic acquire, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_unaligned_monotonic_seq_cst(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_unaligned_monotonic_seq_cst: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_unaligned_monotonic_seq_cst: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_unaligned_monotonic_seq_cst: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new monotonic seq_cst, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_unaligned_monotonic_seq_cst_weak(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_unaligned_monotonic_seq_cst_weak: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_unaligned_monotonic_seq_cst_weak: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_unaligned_monotonic_seq_cst_weak: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new monotonic seq_cst, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_unaligned_acquire_monotonic(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_unaligned_acquire_monotonic: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_unaligned_acquire_monotonic: -; -O1: bl __aarch64_cas1_acq +; CHECK-LABEL: cmpxchg_i8_unaligned_acquire_monotonic: +; CHECK: bl __aarch64_cas1_acq %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new acquire monotonic, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_unaligned_acquire_monotonic_weak(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_unaligned_acquire_monotonic_weak: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_unaligned_acquire_monotonic_weak: -; -O1: bl 
__aarch64_cas1_acq +; CHECK-LABEL: cmpxchg_i8_unaligned_acquire_monotonic_weak: +; CHECK: bl __aarch64_cas1_acq %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new acquire monotonic, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_unaligned_acquire_acquire(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_unaligned_acquire_acquire: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_unaligned_acquire_acquire: -; -O1: bl __aarch64_cas1_acq +; CHECK-LABEL: cmpxchg_i8_unaligned_acquire_acquire: +; CHECK: bl __aarch64_cas1_acq %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new acquire acquire, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_unaligned_acquire_acquire_weak(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_unaligned_acquire_acquire_weak: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_unaligned_acquire_acquire_weak: -; -O1: bl __aarch64_cas1_acq +; CHECK-LABEL: cmpxchg_i8_unaligned_acquire_acquire_weak: +; CHECK: bl __aarch64_cas1_acq %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new acquire acquire, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_unaligned_acquire_seq_cst(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_unaligned_acquire_seq_cst: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_unaligned_acquire_seq_cst: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_unaligned_acquire_seq_cst: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new acquire seq_cst, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_unaligned_acquire_seq_cst_weak(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_unaligned_acquire_seq_cst_weak: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_unaligned_acquire_seq_cst_weak: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_unaligned_acquire_seq_cst_weak: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new acquire seq_cst, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_unaligned_release_monotonic(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_unaligned_release_monotonic: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_unaligned_release_monotonic: -; -O1: bl __aarch64_cas1_rel +; CHECK-LABEL: cmpxchg_i8_unaligned_release_monotonic: +; CHECK: bl __aarch64_cas1_rel %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new release monotonic, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_unaligned_release_monotonic_weak(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_unaligned_release_monotonic_weak: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_unaligned_release_monotonic_weak: -; -O1: bl __aarch64_cas1_rel +; CHECK-LABEL: cmpxchg_i8_unaligned_release_monotonic_weak: +; CHECK: bl __aarch64_cas1_rel %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new release monotonic, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_unaligned_release_acquire(i8 %expected, i8 %new, ptr %ptr) { -; 
-O0-LABEL: cmpxchg_i8_unaligned_release_acquire: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_unaligned_release_acquire: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_unaligned_release_acquire: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new release acquire, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_unaligned_release_acquire_weak(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_unaligned_release_acquire_weak: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_unaligned_release_acquire_weak: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_unaligned_release_acquire_weak: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new release acquire, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_unaligned_release_seq_cst(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_unaligned_release_seq_cst: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_unaligned_release_seq_cst: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_unaligned_release_seq_cst: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new release seq_cst, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_unaligned_release_seq_cst_weak(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_unaligned_release_seq_cst_weak: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_unaligned_release_seq_cst_weak: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_unaligned_release_seq_cst_weak: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new release seq_cst, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_unaligned_acq_rel_monotonic(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_unaligned_acq_rel_monotonic: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_unaligned_acq_rel_monotonic: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_unaligned_acq_rel_monotonic: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new acq_rel monotonic, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_unaligned_acq_rel_monotonic_weak(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_unaligned_acq_rel_monotonic_weak: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_unaligned_acq_rel_monotonic_weak: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_unaligned_acq_rel_monotonic_weak: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new acq_rel monotonic, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_unaligned_acq_rel_acquire(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_unaligned_acq_rel_acquire: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_unaligned_acq_rel_acquire: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_unaligned_acq_rel_acquire: +; CHECK: bl __aarch64_cas1_acq_rel %pair = 
cmpxchg ptr %ptr, i8 %expected, i8 %new acq_rel acquire, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_unaligned_acq_rel_acquire_weak(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_unaligned_acq_rel_acquire_weak: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_unaligned_acq_rel_acquire_weak: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_unaligned_acq_rel_acquire_weak: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new acq_rel acquire, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_unaligned_acq_rel_seq_cst(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_unaligned_acq_rel_seq_cst: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_unaligned_acq_rel_seq_cst: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_unaligned_acq_rel_seq_cst: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new acq_rel seq_cst, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_unaligned_acq_rel_seq_cst_weak(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_unaligned_acq_rel_seq_cst_weak: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_unaligned_acq_rel_seq_cst_weak: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_unaligned_acq_rel_seq_cst_weak: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new acq_rel seq_cst, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_unaligned_seq_cst_monotonic(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_unaligned_seq_cst_monotonic: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_unaligned_seq_cst_monotonic: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_unaligned_seq_cst_monotonic: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new seq_cst monotonic, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_unaligned_seq_cst_monotonic_weak(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_unaligned_seq_cst_monotonic_weak: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_unaligned_seq_cst_monotonic_weak: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_unaligned_seq_cst_monotonic_weak: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new seq_cst monotonic, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_unaligned_seq_cst_acquire(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_unaligned_seq_cst_acquire: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_unaligned_seq_cst_acquire: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_unaligned_seq_cst_acquire: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new seq_cst acquire, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_unaligned_seq_cst_acquire_weak(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_unaligned_seq_cst_acquire_weak: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, 
uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_unaligned_seq_cst_acquire_weak: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_unaligned_seq_cst_acquire_weak: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new seq_cst acquire, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_unaligned_seq_cst_seq_cst(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_unaligned_seq_cst_seq_cst: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_unaligned_seq_cst_seq_cst: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_unaligned_seq_cst_seq_cst: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new seq_cst seq_cst, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r } define dso_local i8 @cmpxchg_i8_unaligned_seq_cst_seq_cst_weak(i8 %expected, i8 %new, ptr %ptr) { -; -O0-LABEL: cmpxchg_i8_unaligned_seq_cst_seq_cst_weak: -; -O0: ldaxrb w0, [x2] -; -O0: cmp w0, w9, uxtb -; -O0: stlxrb w8, w1, [x2] -; -; -O1-LABEL: cmpxchg_i8_unaligned_seq_cst_seq_cst_weak: -; -O1: bl __aarch64_cas1_acq_rel +; CHECK-LABEL: cmpxchg_i8_unaligned_seq_cst_seq_cst_weak: +; CHECK: bl __aarch64_cas1_acq_rel %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new seq_cst seq_cst, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r @@ -3362,3 +2402,6 @@ define dso_local i128 @cmpxchg_i128_unaligned_seq_cst_seq_cst_weak(i128 %expecte %r = extractvalue { i128, i1 } %pair, 0 ret i128 %r } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; -O0: {{.*}} +; -O1: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll index a3d8531f5c765..1fe63c9be8c62 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-LLSC-O1 +; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -mattr=+outline-atomics -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-OUTLINE-LLSC-O1 ; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -mattr=+lse -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-CAS-O1 +; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -mattr=+lse,+outline-atomics -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-CAS-O1 ; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -O0 -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-LLSC-O0 +; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -O0 -mattr=+outline-atomics -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-OUTLINE-LLSC-O0 ; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -O0 -mattr=+lse -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-CAS-O0 +; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -O0 -mattr=+lse,+outline-atomics -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-CAS-O0 @var = global i128 0 define void @val_compare_and_swap(ptr %p, i128 %oldval, i128 %newval) { @@ -28,6 +32,25 @@ define void @val_compare_and_swap(ptr %p, i128 
%oldval, i128 %newval) { ; CHECK-LLSC-O1-NEXT: str q0, [x0] ; CHECK-LLSC-O1-NEXT: ret ; +; CHECK-OUTLINE-LLSC-O1-LABEL: val_compare_and_swap: +; CHECK-OUTLINE-LLSC-O1: // %bb.0: +; CHECK-OUTLINE-LLSC-O1-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill +; CHECK-OUTLINE-LLSC-O1-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OUTLINE-LLSC-O1-NEXT: .cfi_offset w19, -8 +; CHECK-OUTLINE-LLSC-O1-NEXT: .cfi_offset w30, -16 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov x19, x0 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov x0, x2 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov x1, x3 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov x2, x4 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov x3, x5 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov x4, x19 +; CHECK-OUTLINE-LLSC-O1-NEXT: bl __aarch64_cas16_acq +; CHECK-OUTLINE-LLSC-O1-NEXT: mov v0.d[0], x0 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov v0.d[1], x1 +; CHECK-OUTLINE-LLSC-O1-NEXT: str q0, [x19] +; CHECK-OUTLINE-LLSC-O1-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; CHECK-OUTLINE-LLSC-O1-NEXT: ret +; ; CHECK-CAS-O1-LABEL: val_compare_and_swap: ; CHECK-CAS-O1: // %bb.0: ; CHECK-CAS-O1-NEXT: // kill: def $x2 killed $x2 killed $x2_x3 def $x2_x3 @@ -63,6 +86,29 @@ define void @val_compare_and_swap(ptr %p, i128 %oldval, i128 %newval) { ; CHECK-LLSC-O0-NEXT: str q0, [x0] ; CHECK-LLSC-O0-NEXT: ret ; +; CHECK-OUTLINE-LLSC-O0-LABEL: val_compare_and_swap: +; CHECK-OUTLINE-LLSC-O0: // %bb.0: +; CHECK-OUTLINE-LLSC-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-LLSC-O0-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-OUTLINE-LLSC-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-LLSC-O0-NEXT: .cfi_offset w30, -16 +; CHECK-OUTLINE-LLSC-O0-NEXT: str x0, [sp, #8] // 8-byte Folded Spill +; CHECK-OUTLINE-LLSC-O0-NEXT: mov x0, x2 +; CHECK-OUTLINE-LLSC-O0-NEXT: mov x1, x3 +; CHECK-OUTLINE-LLSC-O0-NEXT: mov x2, x4 +; CHECK-OUTLINE-LLSC-O0-NEXT: ldr x4, [sp, #8] // 8-byte Folded Reload +; CHECK-OUTLINE-LLSC-O0-NEXT: mov x3, x5 +; CHECK-OUTLINE-LLSC-O0-NEXT: bl __aarch64_cas16_acq +; CHECK-OUTLINE-LLSC-O0-NEXT: mov x8, x0 +; CHECK-OUTLINE-LLSC-O0-NEXT: ldr x0, [sp, #8] // 8-byte Folded Reload +; CHECK-OUTLINE-LLSC-O0-NEXT: // implicit-def: $q0 +; CHECK-OUTLINE-LLSC-O0-NEXT: mov v0.d[0], x8 +; CHECK-OUTLINE-LLSC-O0-NEXT: mov v0.d[1], x1 +; CHECK-OUTLINE-LLSC-O0-NEXT: str q0, [x0] +; CHECK-OUTLINE-LLSC-O0-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-OUTLINE-LLSC-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-LLSC-O0-NEXT: ret +; ; CHECK-CAS-O0-LABEL: val_compare_and_swap: ; CHECK-CAS-O0: // %bb.0: ; CHECK-CAS-O0-NEXT: sub sp, sp, #16 @@ -113,6 +159,25 @@ define void @val_compare_and_swap_monotonic_seqcst(ptr %p, i128 %oldval, i128 %n ; CHECK-LLSC-O1-NEXT: str q0, [x0] ; CHECK-LLSC-O1-NEXT: ret ; +; CHECK-OUTLINE-LLSC-O1-LABEL: val_compare_and_swap_monotonic_seqcst: +; CHECK-OUTLINE-LLSC-O1: // %bb.0: +; CHECK-OUTLINE-LLSC-O1-NEXT: stp x30, x19, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-OUTLINE-LLSC-O1-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OUTLINE-LLSC-O1-NEXT: .cfi_offset w19, -8 +; CHECK-OUTLINE-LLSC-O1-NEXT: .cfi_offset w30, -16 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov x19, x0 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov x0, x2 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov x1, x3 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov x2, x4 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov x3, x5 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov x4, x19 +; CHECK-OUTLINE-LLSC-O1-NEXT: bl __aarch64_cas16_acq_rel +; CHECK-OUTLINE-LLSC-O1-NEXT: mov v0.d[0], x0 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov v0.d[1], x1 +; CHECK-OUTLINE-LLSC-O1-NEXT: str q0, [x19] +; CHECK-OUTLINE-LLSC-O1-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; CHECK-OUTLINE-LLSC-O1-NEXT: ret +; ; CHECK-CAS-O1-LABEL: val_compare_and_swap_monotonic_seqcst: ; CHECK-CAS-O1: // %bb.0: ; CHECK-CAS-O1-NEXT: // kill: def $x2 killed $x2 killed $x2_x3 def $x2_x3 @@ -148,6 +213,29 @@ define void @val_compare_and_swap_monotonic_seqcst(ptr %p, i128 %oldval, i128 %n ; CHECK-LLSC-O0-NEXT: str q0, [x0] ; CHECK-LLSC-O0-NEXT: ret ; +; CHECK-OUTLINE-LLSC-O0-LABEL: val_compare_and_swap_monotonic_seqcst: +; CHECK-OUTLINE-LLSC-O0: // %bb.0: +; CHECK-OUTLINE-LLSC-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-LLSC-O0-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-OUTLINE-LLSC-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-LLSC-O0-NEXT: .cfi_offset w30, -16 +; CHECK-OUTLINE-LLSC-O0-NEXT: str x0, [sp, #8] // 8-byte Folded Spill +; CHECK-OUTLINE-LLSC-O0-NEXT: mov x0, x2 +; CHECK-OUTLINE-LLSC-O0-NEXT: mov x1, x3 +; CHECK-OUTLINE-LLSC-O0-NEXT: mov x2, x4 +; CHECK-OUTLINE-LLSC-O0-NEXT: ldr x4, [sp, #8] // 8-byte Folded Reload +; CHECK-OUTLINE-LLSC-O0-NEXT: mov x3, x5 +; CHECK-OUTLINE-LLSC-O0-NEXT: bl __aarch64_cas16_acq_rel +; CHECK-OUTLINE-LLSC-O0-NEXT: mov x8, x0 +; CHECK-OUTLINE-LLSC-O0-NEXT: ldr x0, [sp, #8] // 8-byte Folded Reload +; CHECK-OUTLINE-LLSC-O0-NEXT: // implicit-def: $q0 +; CHECK-OUTLINE-LLSC-O0-NEXT: mov v0.d[0], x8 +; CHECK-OUTLINE-LLSC-O0-NEXT: mov v0.d[1], x1 +; CHECK-OUTLINE-LLSC-O0-NEXT: str q0, [x0] +; CHECK-OUTLINE-LLSC-O0-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-OUTLINE-LLSC-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-LLSC-O0-NEXT: ret +; ; CHECK-CAS-O0-LABEL: val_compare_and_swap_monotonic_seqcst: ; CHECK-CAS-O0: // %bb.0: ; CHECK-CAS-O0-NEXT: sub sp, sp, #16 @@ -198,6 +286,25 @@ define void @val_compare_and_swap_release_acquire(ptr %p, i128 %oldval, i128 %ne ; CHECK-LLSC-O1-NEXT: str q0, [x0] ; CHECK-LLSC-O1-NEXT: ret ; +; CHECK-OUTLINE-LLSC-O1-LABEL: val_compare_and_swap_release_acquire: +; CHECK-OUTLINE-LLSC-O1: // %bb.0: +; CHECK-OUTLINE-LLSC-O1-NEXT: stp x30, x19, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-OUTLINE-LLSC-O1-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OUTLINE-LLSC-O1-NEXT: .cfi_offset w19, -8 +; CHECK-OUTLINE-LLSC-O1-NEXT: .cfi_offset w30, -16 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov x19, x0 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov x0, x2 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov x1, x3 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov x2, x4 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov x3, x5 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov x4, x19 +; CHECK-OUTLINE-LLSC-O1-NEXT: bl __aarch64_cas16_acq_rel +; CHECK-OUTLINE-LLSC-O1-NEXT: mov v0.d[0], x0 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov v0.d[1], x1 +; CHECK-OUTLINE-LLSC-O1-NEXT: str q0, [x19] +; CHECK-OUTLINE-LLSC-O1-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; CHECK-OUTLINE-LLSC-O1-NEXT: ret +; ; CHECK-CAS-O1-LABEL: val_compare_and_swap_release_acquire: ; CHECK-CAS-O1: // %bb.0: ; CHECK-CAS-O1-NEXT: // kill: def $x2 killed $x2 killed $x2_x3 def $x2_x3 @@ -233,6 +340,29 @@ define void @val_compare_and_swap_release_acquire(ptr %p, i128 %oldval, i128 %ne ; CHECK-LLSC-O0-NEXT: str q0, [x0] ; CHECK-LLSC-O0-NEXT: ret ; +; CHECK-OUTLINE-LLSC-O0-LABEL: val_compare_and_swap_release_acquire: +; CHECK-OUTLINE-LLSC-O0: // %bb.0: +; CHECK-OUTLINE-LLSC-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-LLSC-O0-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-OUTLINE-LLSC-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-LLSC-O0-NEXT: .cfi_offset w30, -16 +; CHECK-OUTLINE-LLSC-O0-NEXT: str x0, [sp, #8] // 8-byte Folded Spill +; CHECK-OUTLINE-LLSC-O0-NEXT: mov x0, x2 +; CHECK-OUTLINE-LLSC-O0-NEXT: mov x1, x3 +; CHECK-OUTLINE-LLSC-O0-NEXT: mov x2, x4 +; CHECK-OUTLINE-LLSC-O0-NEXT: ldr x4, [sp, #8] // 8-byte Folded Reload +; CHECK-OUTLINE-LLSC-O0-NEXT: mov x3, x5 +; CHECK-OUTLINE-LLSC-O0-NEXT: bl __aarch64_cas16_acq_rel +; CHECK-OUTLINE-LLSC-O0-NEXT: mov x8, x0 +; CHECK-OUTLINE-LLSC-O0-NEXT: ldr x0, [sp, #8] // 8-byte Folded Reload +; CHECK-OUTLINE-LLSC-O0-NEXT: // implicit-def: $q0 +; CHECK-OUTLINE-LLSC-O0-NEXT: mov v0.d[0], x8 +; CHECK-OUTLINE-LLSC-O0-NEXT: mov v0.d[1], x1 +; CHECK-OUTLINE-LLSC-O0-NEXT: str q0, [x0] +; CHECK-OUTLINE-LLSC-O0-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-OUTLINE-LLSC-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-LLSC-O0-NEXT: ret +; ; CHECK-CAS-O0-LABEL: val_compare_and_swap_release_acquire: ; CHECK-CAS-O0: // %bb.0: ; CHECK-CAS-O0-NEXT: sub sp, sp, #16 @@ -283,6 +413,25 @@ define void @val_compare_and_swap_monotonic(ptr %p, i128 %oldval, i128 %newval) ; CHECK-LLSC-O1-NEXT: str q0, [x0] ; CHECK-LLSC-O1-NEXT: ret ; +; CHECK-OUTLINE-LLSC-O1-LABEL: val_compare_and_swap_monotonic: +; CHECK-OUTLINE-LLSC-O1: // %bb.0: +; CHECK-OUTLINE-LLSC-O1-NEXT: stp x30, x19, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-OUTLINE-LLSC-O1-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OUTLINE-LLSC-O1-NEXT: .cfi_offset w19, -8 +; CHECK-OUTLINE-LLSC-O1-NEXT: .cfi_offset w30, -16 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov x19, x0 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov x0, x2 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov x1, x3 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov x2, x4 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov x3, x5 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov x4, x19 +; CHECK-OUTLINE-LLSC-O1-NEXT: bl __aarch64_cas16_acq_rel +; CHECK-OUTLINE-LLSC-O1-NEXT: mov v0.d[0], x0 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov v0.d[1], x1 +; CHECK-OUTLINE-LLSC-O1-NEXT: str q0, [x19] +; CHECK-OUTLINE-LLSC-O1-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; CHECK-OUTLINE-LLSC-O1-NEXT: ret +; ; CHECK-CAS-O1-LABEL: val_compare_and_swap_monotonic: ; CHECK-CAS-O1: // %bb.0: ; CHECK-CAS-O1-NEXT: // kill: def $x2 killed $x2 killed $x2_x3 def $x2_x3 @@ -318,6 +467,29 @@ define void @val_compare_and_swap_monotonic(ptr %p, i128 %oldval, i128 %newval) ; CHECK-LLSC-O0-NEXT: str q0, [x0] ; CHECK-LLSC-O0-NEXT: ret ; +; CHECK-OUTLINE-LLSC-O0-LABEL: val_compare_and_swap_monotonic: +; CHECK-OUTLINE-LLSC-O0: // %bb.0: +; CHECK-OUTLINE-LLSC-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-LLSC-O0-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-OUTLINE-LLSC-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-LLSC-O0-NEXT: .cfi_offset w30, -16 +; CHECK-OUTLINE-LLSC-O0-NEXT: str x0, [sp, #8] // 8-byte Folded Spill +; CHECK-OUTLINE-LLSC-O0-NEXT: mov x0, x2 +; CHECK-OUTLINE-LLSC-O0-NEXT: mov x1, x3 +; CHECK-OUTLINE-LLSC-O0-NEXT: mov x2, x4 +; CHECK-OUTLINE-LLSC-O0-NEXT: ldr x4, [sp, #8] // 8-byte Folded Reload +; CHECK-OUTLINE-LLSC-O0-NEXT: mov x3, x5 +; CHECK-OUTLINE-LLSC-O0-NEXT: bl __aarch64_cas16_acq_rel +; CHECK-OUTLINE-LLSC-O0-NEXT: mov x8, x0 +; CHECK-OUTLINE-LLSC-O0-NEXT: ldr x0, [sp, #8] // 8-byte Folded Reload +; CHECK-OUTLINE-LLSC-O0-NEXT: // implicit-def: $q0 +; CHECK-OUTLINE-LLSC-O0-NEXT: mov v0.d[0], x8 +; CHECK-OUTLINE-LLSC-O0-NEXT: mov v0.d[1], x1 +; CHECK-OUTLINE-LLSC-O0-NEXT: str q0, [x0] +; CHECK-OUTLINE-LLSC-O0-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-OUTLINE-LLSC-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-LLSC-O0-NEXT: ret +; ; CHECK-CAS-O0-LABEL: val_compare_and_swap_monotonic: ; CHECK-CAS-O0: // %bb.0: ; CHECK-CAS-O0-NEXT: sub sp, sp, #16 @@ -358,6 +530,19 @@ define void @atomic_load_relaxed(i64, i64, ptr %p, ptr %p2) { ; CHECK-LLSC-O1-NEXT: str q0, [x3] ; CHECK-LLSC-O1-NEXT: ret ; +; CHECK-OUTLINE-LLSC-O1-LABEL: atomic_load_relaxed: +; CHECK-OUTLINE-LLSC-O1: // %bb.0: +; CHECK-OUTLINE-LLSC-O1-NEXT: .LBB4_1: // %atomicrmw.start +; CHECK-OUTLINE-LLSC-O1-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-LLSC-O1-NEXT: ldxp x9, x8, [x2] +; CHECK-OUTLINE-LLSC-O1-NEXT: stxp w10, x9, x8, [x2] +; CHECK-OUTLINE-LLSC-O1-NEXT: cbnz w10, .LBB4_1 +; CHECK-OUTLINE-LLSC-O1-NEXT: // %bb.2: // %atomicrmw.end +; CHECK-OUTLINE-LLSC-O1-NEXT: mov v0.d[0], x9 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov v0.d[1], x8 +; CHECK-OUTLINE-LLSC-O1-NEXT: str q0, [x3] +; CHECK-OUTLINE-LLSC-O1-NEXT: ret +; ; CHECK-CAS-O1-LABEL: atomic_load_relaxed: ; CHECK-CAS-O1: // %bb.0: ; CHECK-CAS-O1-NEXT: mov x0, xzr @@ -392,6 +577,28 @@ define void @atomic_load_relaxed(i64, i64, ptr %p, ptr %p2) { ; CHECK-LLSC-O0-NEXT: str q0, [x3] ; CHECK-LLSC-O0-NEXT: ret ; +; CHECK-OUTLINE-LLSC-O0-LABEL: atomic_load_relaxed: +; CHECK-OUTLINE-LLSC-O0: // %bb.0: +; CHECK-OUTLINE-LLSC-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-LLSC-O0-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; 
CHECK-OUTLINE-LLSC-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-LLSC-O0-NEXT: .cfi_offset w30, -16 +; CHECK-OUTLINE-LLSC-O0-NEXT: mov x4, x2 +; CHECK-OUTLINE-LLSC-O0-NEXT: str x3, [sp, #8] // 8-byte Folded Spill +; CHECK-OUTLINE-LLSC-O0-NEXT: mov x3, xzr +; CHECK-OUTLINE-LLSC-O0-NEXT: mov x0, x3 +; CHECK-OUTLINE-LLSC-O0-NEXT: mov x1, x3 +; CHECK-OUTLINE-LLSC-O0-NEXT: mov x2, x3 +; CHECK-OUTLINE-LLSC-O0-NEXT: bl __aarch64_cas16_relax +; CHECK-OUTLINE-LLSC-O0-NEXT: ldr x3, [sp, #8] // 8-byte Folded Reload +; CHECK-OUTLINE-LLSC-O0-NEXT: // implicit-def: $q0 +; CHECK-OUTLINE-LLSC-O0-NEXT: mov v0.d[0], x0 +; CHECK-OUTLINE-LLSC-O0-NEXT: mov v0.d[1], x1 +; CHECK-OUTLINE-LLSC-O0-NEXT: str q0, [x3] +; CHECK-OUTLINE-LLSC-O0-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-OUTLINE-LLSC-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-LLSC-O0-NEXT: ret +; ; CHECK-CAS-O0-LABEL: atomic_load_relaxed: ; CHECK-CAS-O0: // %bb.0: ; CHECK-CAS-O0-NEXT: mov x8, xzr @@ -434,6 +641,21 @@ define i128 @val_compare_and_swap_return(ptr %p, i128 %oldval, i128 %newval) { ; CHECK-LLSC-O1-NEXT: mov x0, x8 ; CHECK-LLSC-O1-NEXT: ret ; +; CHECK-OUTLINE-LLSC-O1-LABEL: val_compare_and_swap_return: +; CHECK-OUTLINE-LLSC-O1: // %bb.0: +; CHECK-OUTLINE-LLSC-O1-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-OUTLINE-LLSC-O1-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OUTLINE-LLSC-O1-NEXT: .cfi_offset w30, -16 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov x6, x0 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov x0, x2 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov x1, x3 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov x2, x4 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov x3, x5 +; CHECK-OUTLINE-LLSC-O1-NEXT: mov x4, x6 +; CHECK-OUTLINE-LLSC-O1-NEXT: bl __aarch64_cas16_acq +; CHECK-OUTLINE-LLSC-O1-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-OUTLINE-LLSC-O1-NEXT: ret +; ; CHECK-CAS-O1-LABEL: val_compare_and_swap_return: ; CHECK-CAS-O1: // %bb.0: ; CHECK-CAS-O1-NEXT: // kill: def $x2 killed $x2 killed $x2_x3 def $x2_x3 @@ -465,6 +687,23 @@ define i128 @val_compare_and_swap_return(ptr %p, i128 %oldval, i128 %newval) { ; CHECK-LLSC-O0-NEXT: .LBB5_4: ; CHECK-LLSC-O0-NEXT: ret ; +; CHECK-OUTLINE-LLSC-O0-LABEL: val_compare_and_swap_return: +; CHECK-OUTLINE-LLSC-O0: // %bb.0: +; CHECK-OUTLINE-LLSC-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-LLSC-O0-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-OUTLINE-LLSC-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-LLSC-O0-NEXT: .cfi_offset w30, -16 +; CHECK-OUTLINE-LLSC-O0-NEXT: str x0, [sp, #8] // 8-byte Folded Spill +; CHECK-OUTLINE-LLSC-O0-NEXT: mov x0, x2 +; CHECK-OUTLINE-LLSC-O0-NEXT: mov x1, x3 +; CHECK-OUTLINE-LLSC-O0-NEXT: mov x2, x4 +; CHECK-OUTLINE-LLSC-O0-NEXT: ldr x4, [sp, #8] // 8-byte Folded Reload +; CHECK-OUTLINE-LLSC-O0-NEXT: mov x3, x5 +; CHECK-OUTLINE-LLSC-O0-NEXT: bl __aarch64_cas16_acq +; CHECK-OUTLINE-LLSC-O0-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-OUTLINE-LLSC-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-LLSC-O0-NEXT: ret +; ; CHECK-CAS-O0-LABEL: val_compare_and_swap_return: ; CHECK-CAS-O0: // %bb.0: ; CHECK-CAS-O0-NEXT: mov x8, x0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll index d03647f8b294e..739332414c198 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=arm64-apple-ios -global-isel -global-isel-abort=1 
-verify-machineinstrs | FileCheck %s --check-prefixes=CHECK-NOLSE,CHECK-NOLSE-O1 +; RUN: llc < %s -mtriple=arm64-apple-ios -mattr=+outline-atomics -global-isel -global-isel-abort=1 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK-OUTLINE,CHECK-OUTLINE-O1 ; RUN: llc < %s -mtriple=arm64-apple-ios -global-isel -global-isel-abort=1 -O0 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK-NOLSE,CHECK-NOLSE-O0 +; RUN: llc < %s -mtriple=arm64-apple-ios -mattr=+outline-atomics -global-isel -global-isel-abort=1 -O0 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK-OUTLINE,CHECK-OUTLINE-O0 ; RUN: llc < %s -mtriple=arm64-apple-ios -global-isel -global-isel-abort=1 -mcpu=apple-a13 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK-LSE-O1 +; RUN: llc < %s -mtriple=arm64-apple-ios -mattr=+outline-atomics -global-isel -global-isel-abort=1 -mcpu=apple-a13 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK-LSE-O1 ; RUN: llc < %s -mtriple=arm64-apple-ios -global-isel -global-isel-abort=1 -mcpu=apple-a13 -O0 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK-LSE-O0 +; RUN: llc < %s -mtriple=arm64-apple-ios -mattr=+outline-atomics -global-isel -global-isel-abort=1 -mcpu=apple-a13 -O0 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK-LSE-O0 define i32 @val_compare_and_swap(ptr %p, i32 %cmp, i32 %new) #0 { ; CHECK-NOLSE-O1-LABEL: val_compare_and_swap: @@ -24,6 +28,17 @@ define i32 @val_compare_and_swap(ptr %p, i32 %cmp, i32 %new) #0 { ; CHECK-NOLSE-O1-NEXT: mov w0, w8 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: val_compare_and_swap: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: mov x3, x0 +; CHECK-OUTLINE-O1-NEXT: mov w0, w1 +; CHECK-OUTLINE-O1-NEXT: mov w1, w2 +; CHECK-OUTLINE-O1-NEXT: mov x2, x3 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_cas4_acq +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: val_compare_and_swap: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: mov x9, x0 @@ -37,6 +52,19 @@ define i32 @val_compare_and_swap(ptr %p, i32 %cmp, i32 %new) #0 { ; CHECK-NOLSE-O0-NEXT: LBB0_3: ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: val_compare_and_swap: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov w0, w1 +; CHECK-OUTLINE-O0-NEXT: mov w1, w2 +; CHECK-OUTLINE-O0-NEXT: ldr x2, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas4_acq +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: val_compare_and_swap: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: casa w1, w2, [x0] @@ -75,6 +103,18 @@ define i32 @val_compare_and_swap_from_load(ptr %p, i32 %cmp, ptr %pnew) #0 { ; CHECK-NOLSE-O1-NEXT: mov w0, w8 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: val_compare_and_swap_from_load: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! 
; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: ldr w8, [x2] +; CHECK-OUTLINE-O1-NEXT: mov x3, x0 +; CHECK-OUTLINE-O1-NEXT: mov w0, w1 +; CHECK-OUTLINE-O1-NEXT: mov x2, x3 +; CHECK-OUTLINE-O1-NEXT: mov w1, w8 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_cas4_acq +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: val_compare_and_swap_from_load: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: mov x9, x0 @@ -89,6 +129,20 @@ define i32 @val_compare_and_swap_from_load(ptr %p, i32 %cmp, ptr %pnew) #0 { ; CHECK-NOLSE-O0-NEXT: LBB1_3: ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: val_compare_and_swap_from_load: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov w0, w1 +; CHECK-OUTLINE-O0-NEXT: mov x8, x2 +; CHECK-OUTLINE-O0-NEXT: ldr x2, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr w1, [x8] +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas4_acq +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: val_compare_and_swap_from_load: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: ldr w8, [x2] @@ -129,6 +183,17 @@ define i32 @val_compare_and_swap_rel(ptr %p, i32 %cmp, i32 %new) #0 { ; CHECK-NOLSE-O1-NEXT: mov w0, w8 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: val_compare_and_swap_rel: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: mov x3, x0 +; CHECK-OUTLINE-O1-NEXT: mov w0, w1 +; CHECK-OUTLINE-O1-NEXT: mov w1, w2 +; CHECK-OUTLINE-O1-NEXT: mov x2, x3 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_cas4_acq_rel +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: val_compare_and_swap_rel: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: mov x9, x0 @@ -142,6 +207,19 @@ define i32 @val_compare_and_swap_rel(ptr %p, i32 %cmp, i32 %new) #0 { ; CHECK-NOLSE-O0-NEXT: LBB2_3: ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: val_compare_and_swap_rel: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov w0, w1 +; CHECK-OUTLINE-O0-NEXT: mov w1, w2 +; CHECK-OUTLINE-O0-NEXT: ldr x2, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas4_acq_rel +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: val_compare_and_swap_rel: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: casal w1, w2, [x0] @@ -179,6 +257,17 @@ define i64 @val_compare_and_swap_64(ptr %p, i64 %cmp, i64 %new) #0 { ; CHECK-NOLSE-O1-NEXT: mov x0, x8 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: val_compare_and_swap_64: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! 
; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: mov x3, x0 +; CHECK-OUTLINE-O1-NEXT: mov x0, x1 +; CHECK-OUTLINE-O1-NEXT: mov x1, x2 +; CHECK-OUTLINE-O1-NEXT: mov x2, x3 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_cas8_relax +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: val_compare_and_swap_64: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: mov x9, x0 @@ -192,6 +281,19 @@ define i64 @val_compare_and_swap_64(ptr %p, i64 %cmp, i64 %new) #0 { ; CHECK-NOLSE-O0-NEXT: LBB3_3: ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: val_compare_and_swap_64: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov x0, x1 +; CHECK-OUTLINE-O0-NEXT: mov x1, x2 +; CHECK-OUTLINE-O0-NEXT: ldr x2, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas8_relax +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: val_compare_and_swap_64: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: cas x1, x2, [x0] @@ -229,6 +331,17 @@ define i64 @val_compare_and_swap_64_monotonic_seqcst(ptr %p, i64 %cmp, i64 %new) ; CHECK-NOLSE-O1-NEXT: mov x0, x8 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: val_compare_and_swap_64_monotonic_seqcst: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: mov x3, x0 +; CHECK-OUTLINE-O1-NEXT: mov x0, x1 +; CHECK-OUTLINE-O1-NEXT: mov x1, x2 +; CHECK-OUTLINE-O1-NEXT: mov x2, x3 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_cas8_acq_rel +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: val_compare_and_swap_64_monotonic_seqcst: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: mov x9, x0 @@ -242,6 +355,19 @@ define i64 @val_compare_and_swap_64_monotonic_seqcst(ptr %p, i64 %cmp, i64 %new) ; CHECK-NOLSE-O0-NEXT: LBB4_3: ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: val_compare_and_swap_64_monotonic_seqcst: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov x0, x1 +; CHECK-OUTLINE-O0-NEXT: mov x1, x2 +; CHECK-OUTLINE-O0-NEXT: ldr x2, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas8_acq_rel +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: val_compare_and_swap_64_monotonic_seqcst: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: casal x1, x2, [x0] @@ -279,6 +405,17 @@ define i64 @val_compare_and_swap_64_release_acquire(ptr %p, i64 %cmp, i64 %new) ; CHECK-NOLSE-O1-NEXT: mov x0, x8 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: val_compare_and_swap_64_release_acquire: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! 
; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: mov x3, x0 +; CHECK-OUTLINE-O1-NEXT: mov x0, x1 +; CHECK-OUTLINE-O1-NEXT: mov x1, x2 +; CHECK-OUTLINE-O1-NEXT: mov x2, x3 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_cas8_acq_rel +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: val_compare_and_swap_64_release_acquire: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: mov x9, x0 @@ -292,6 +429,19 @@ define i64 @val_compare_and_swap_64_release_acquire(ptr %p, i64 %cmp, i64 %new) ; CHECK-NOLSE-O0-NEXT: LBB5_3: ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: val_compare_and_swap_64_release_acquire: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov x0, x1 +; CHECK-OUTLINE-O0-NEXT: mov x1, x2 +; CHECK-OUTLINE-O0-NEXT: ldr x2, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas8_acq_rel +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: val_compare_and_swap_64_release_acquire: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: casal x1, x2, [x0] @@ -323,6 +473,19 @@ define i32 @fetch_and_nand(ptr %p) #0 { ; CHECK-NOLSE-O1-NEXT: mov w0, w8 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: fetch_and_nand: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: LBB6_1: ; %atomicrmw.start +; CHECK-OUTLINE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O1-NEXT: ldxr w8, [x0] +; CHECK-OUTLINE-O1-NEXT: and w9, w8, #0x7 +; CHECK-OUTLINE-O1-NEXT: mvn w9, w9 +; CHECK-OUTLINE-O1-NEXT: stlxr w10, w9, [x0] +; CHECK-OUTLINE-O1-NEXT: cbnz w10, LBB6_1 +; CHECK-OUTLINE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-OUTLINE-O1-NEXT: mov w0, w8 +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: fetch_and_nand: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 @@ -360,6 +523,35 @@ define i32 @fetch_and_nand(ptr %p) #0 { ; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: fetch_and_nand: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #48 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #32] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: ldr w0, [x0] +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #28] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: b LBB6_1 +; CHECK-OUTLINE-O0-NEXT: LBB6_1: ; %atomicrmw.start +; CHECK-OUTLINE-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O0-NEXT: ldr w0, [sp, #28] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr x2, [sp, #16] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #8] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: and w8, w0, #0x7 +; CHECK-OUTLINE-O0-NEXT: mvn w1, w8 +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas4_rel +; CHECK-OUTLINE-O0-NEXT: ldr w8, [sp, #8] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #12] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: subs w8, w0, w8 +; CHECK-OUTLINE-O0-NEXT: cset w8, eq +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #28] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: tbz w8, #0, LBB6_1 +; CHECK-OUTLINE-O0-NEXT: b LBB6_2 +; CHECK-OUTLINE-O0-NEXT: LBB6_2: ; %atomicrmw.end +; CHECK-OUTLINE-O0-NEXT: ldr w0, [sp, #12] ; 4-byte Folded 
Reload +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #48 +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: fetch_and_nand: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: mov x8, x0 @@ -418,6 +610,19 @@ define i64 @fetch_and_nand_64(ptr %p) #0 { ; CHECK-NOLSE-O1-NEXT: mov x0, x8 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: fetch_and_nand_64: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: LBB7_1: ; %atomicrmw.start +; CHECK-OUTLINE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O1-NEXT: ldaxr x8, [x0] +; CHECK-OUTLINE-O1-NEXT: and x9, x8, #0x7 +; CHECK-OUTLINE-O1-NEXT: mvn x9, x9 +; CHECK-OUTLINE-O1-NEXT: stlxr w10, x9, [x0] +; CHECK-OUTLINE-O1-NEXT: cbnz w10, LBB7_1 +; CHECK-OUTLINE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-OUTLINE-O1-NEXT: mov x0, x8 +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: fetch_and_nand_64: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 @@ -455,6 +660,35 @@ define i64 @fetch_and_nand_64(ptr %p) #0 { ; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: fetch_and_nand_64: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #48 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #32] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: ldr x0, [x0] +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #24] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: b LBB7_1 +; CHECK-OUTLINE-O0-NEXT: LBB7_1: ; %atomicrmw.start +; CHECK-OUTLINE-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O0-NEXT: ldr x0, [sp, #24] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr x2, [sp, #16] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str x0, [sp] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: and x8, x0, #0x7 +; CHECK-OUTLINE-O0-NEXT: mvn x1, x8 +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas8_acq_rel +; CHECK-OUTLINE-O0-NEXT: ldr x8, [sp] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: subs x8, x0, x8 +; CHECK-OUTLINE-O0-NEXT: cset w8, eq +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #24] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: tbz w8, #0, LBB7_1 +; CHECK-OUTLINE-O0-NEXT: b LBB7_2 +; CHECK-OUTLINE-O0-NEXT: LBB7_2: ; %atomicrmw.end +; CHECK-OUTLINE-O0-NEXT: ldr x0, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #48 +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: fetch_and_nand_64: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: mov x8, x0 @@ -513,6 +747,15 @@ define i32 @fetch_and_or(ptr %p) #0 { ; CHECK-NOLSE-O1-NEXT: mov w0, w8 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-LABEL: fetch_and_or: +; CHECK-OUTLINE: ; %bb.0: +; CHECK-OUTLINE-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-OUTLINE-NEXT: mov x1, x0 +; CHECK-OUTLINE-NEXT: mov w0, #5 ; =0x5 +; CHECK-OUTLINE-NEXT: bl ___aarch64_ldset4_acq_rel +; CHECK-OUTLINE-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: fetch_and_or: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 @@ -578,6 +821,15 @@ define i64 @fetch_and_or_64(ptr %p) #0 { ; CHECK-NOLSE-O1-NEXT: mov x0, x8 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: fetch_and_or_64: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! 
; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: mov x1, x0 +; CHECK-OUTLINE-O1-NEXT: mov w0, #7 ; =0x7 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldset8_relax +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: fetch_and_or_64: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 @@ -614,6 +866,16 @@ define i64 @fetch_and_or_64(ptr %p) #0 { ; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: fetch_and_or_64: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov x1, x0 +; CHECK-OUTLINE-O0-NEXT: mov w8, #7 ; =0x7 +; CHECK-OUTLINE-O0-NEXT: mov w0, w8 +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_ldset8_relax +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: fetch_and_or_64: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: mov w8, #7 ; =0x7 @@ -636,6 +898,11 @@ define void @acquire_fence() #0 { ; CHECK-NOLSE-NEXT: dmb ishld ; CHECK-NOLSE-NEXT: ret ; +; CHECK-OUTLINE-LABEL: acquire_fence: +; CHECK-OUTLINE: ; %bb.0: +; CHECK-OUTLINE-NEXT: dmb ishld +; CHECK-OUTLINE-NEXT: ret +; ; CHECK-LSE-O1-LABEL: acquire_fence: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: dmb ishld @@ -655,6 +922,11 @@ define void @release_fence() #0 { ; CHECK-NOLSE-NEXT: dmb ish ; CHECK-NOLSE-NEXT: ret ; +; CHECK-OUTLINE-LABEL: release_fence: +; CHECK-OUTLINE: ; %bb.0: +; CHECK-OUTLINE-NEXT: dmb ish +; CHECK-OUTLINE-NEXT: ret +; ; CHECK-LSE-O1-LABEL: release_fence: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: dmb ish @@ -674,6 +946,11 @@ define void @seq_cst_fence() #0 { ; CHECK-NOLSE-NEXT: dmb ish ; CHECK-NOLSE-NEXT: ret ; +; CHECK-OUTLINE-LABEL: seq_cst_fence: +; CHECK-OUTLINE: ; %bb.0: +; CHECK-OUTLINE-NEXT: dmb ish +; CHECK-OUTLINE-NEXT: ret +; ; CHECK-LSE-O1-LABEL: seq_cst_fence: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: dmb ish @@ -693,6 +970,11 @@ define i32 @atomic_load(ptr %p) #0 { ; CHECK-NOLSE-NEXT: ldar w0, [x0] ; CHECK-NOLSE-NEXT: ret ; +; CHECK-OUTLINE-LABEL: atomic_load: +; CHECK-OUTLINE: ; %bb.0: +; CHECK-OUTLINE-NEXT: ldar w0, [x0] +; CHECK-OUTLINE-NEXT: ret +; ; CHECK-LSE-O1-LABEL: atomic_load: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: ldar w0, [x0] @@ -719,6 +1001,18 @@ define i8 @atomic_load_relaxed_8(ptr %p, i32 %off32) #0 { ; CHECK-NOLSE-O1-NEXT: add w0, w8, w9 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: atomic_load_relaxed_8: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: ldrb w8, [x0, #4095] +; CHECK-OUTLINE-O1-NEXT: ldrb w9, [x0, w1, sxtw] +; CHECK-OUTLINE-O1-NEXT: add x11, x0, #291, lsl #12 ; =1191936 +; CHECK-OUTLINE-O1-NEXT: ldurb w10, [x0, #-256] +; CHECK-OUTLINE-O1-NEXT: add w8, w8, w9 +; CHECK-OUTLINE-O1-NEXT: ldrb w9, [x11] +; CHECK-OUTLINE-O1-NEXT: add w8, w8, w10 +; CHECK-OUTLINE-O1-NEXT: add w0, w8, w9 +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: atomic_load_relaxed_8: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: ldrb w9, [x0, #4095] @@ -733,6 +1027,20 @@ define i8 @atomic_load_relaxed_8(ptr %p, i32 %off32) #0 { ; CHECK-NOLSE-O0-NEXT: add w0, w8, w9, uxtb ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: atomic_load_relaxed_8: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: ldrb w9, [x0, #4095] +; CHECK-OUTLINE-O0-NEXT: add x8, x0, w1, sxtw +; CHECK-OUTLINE-O0-NEXT: ldrb w8, [x8] +; CHECK-OUTLINE-O0-NEXT: add w8, w8, w9, uxtb +; CHECK-OUTLINE-O0-NEXT: 
subs x9, x0, #256 +; CHECK-OUTLINE-O0-NEXT: ldrb w9, [x9] +; CHECK-OUTLINE-O0-NEXT: add w8, w8, w9, uxtb +; CHECK-OUTLINE-O0-NEXT: add x9, x0, #291, lsl #12 ; =1191936 +; CHECK-OUTLINE-O0-NEXT: ldrb w9, [x9] +; CHECK-OUTLINE-O0-NEXT: add w0, w8, w9, uxtb +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: atomic_load_relaxed_8: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: ldrb w8, [x0, #4095] @@ -789,6 +1097,18 @@ define i16 @atomic_load_relaxed_16(ptr %p, i32 %off32) #0 { ; CHECK-NOLSE-O1-NEXT: add w0, w8, w9 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: atomic_load_relaxed_16: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: ldrh w8, [x0, #8190] +; CHECK-OUTLINE-O1-NEXT: ldrh w9, [x0, w1, sxtw #1] +; CHECK-OUTLINE-O1-NEXT: add x11, x0, #291, lsl #12 ; =1191936 +; CHECK-OUTLINE-O1-NEXT: ldurh w10, [x0, #-256] +; CHECK-OUTLINE-O1-NEXT: add w8, w8, w9 +; CHECK-OUTLINE-O1-NEXT: ldrh w9, [x11] +; CHECK-OUTLINE-O1-NEXT: add w8, w8, w10 +; CHECK-OUTLINE-O1-NEXT: add w0, w8, w9 +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: atomic_load_relaxed_16: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: ldrh w9, [x0, #8190] @@ -803,6 +1123,20 @@ define i16 @atomic_load_relaxed_16(ptr %p, i32 %off32) #0 { ; CHECK-NOLSE-O0-NEXT: add w0, w8, w9, uxth ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: atomic_load_relaxed_16: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: ldrh w9, [x0, #8190] +; CHECK-OUTLINE-O0-NEXT: add x8, x0, w1, sxtw #1 +; CHECK-OUTLINE-O0-NEXT: ldrh w8, [x8] +; CHECK-OUTLINE-O0-NEXT: add w8, w8, w9, uxth +; CHECK-OUTLINE-O0-NEXT: subs x9, x0, #256 +; CHECK-OUTLINE-O0-NEXT: ldrh w9, [x9] +; CHECK-OUTLINE-O0-NEXT: add w8, w8, w9, uxth +; CHECK-OUTLINE-O0-NEXT: add x9, x0, #291, lsl #12 ; =1191936 +; CHECK-OUTLINE-O0-NEXT: ldrh w9, [x9] +; CHECK-OUTLINE-O0-NEXT: add w0, w8, w9, uxth +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: atomic_load_relaxed_16: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: ldrh w8, [x0, #8190] @@ -859,6 +1193,18 @@ define i32 @atomic_load_relaxed_32(ptr %p, i32 %off32) #0 { ; CHECK-NOLSE-O1-NEXT: add w0, w8, w9 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: atomic_load_relaxed_32: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: ldr w8, [x0, #16380] +; CHECK-OUTLINE-O1-NEXT: ldr w9, [x0, w1, sxtw #2] +; CHECK-OUTLINE-O1-NEXT: add x11, x0, #291, lsl #12 ; =1191936 +; CHECK-OUTLINE-O1-NEXT: ldur w10, [x0, #-256] +; CHECK-OUTLINE-O1-NEXT: add w8, w8, w9 +; CHECK-OUTLINE-O1-NEXT: ldr w9, [x11] +; CHECK-OUTLINE-O1-NEXT: add w8, w8, w10 +; CHECK-OUTLINE-O1-NEXT: add w0, w8, w9 +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: atomic_load_relaxed_32: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: ldr w8, [x0, #16380] @@ -871,6 +1217,18 @@ define i32 @atomic_load_relaxed_32(ptr %p, i32 %off32) #0 { ; CHECK-NOLSE-O0-NEXT: add w0, w8, w9 ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: atomic_load_relaxed_32: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: ldr w8, [x0, #16380] +; CHECK-OUTLINE-O0-NEXT: ldr w9, [x0, w1, sxtw #2] +; CHECK-OUTLINE-O0-NEXT: add w8, w8, w9 +; CHECK-OUTLINE-O0-NEXT: ldur w9, [x0, #-256] +; CHECK-OUTLINE-O0-NEXT: add w8, w8, w9 +; CHECK-OUTLINE-O0-NEXT: add x9, x0, #291, lsl #12 ; =1191936 +; CHECK-OUTLINE-O0-NEXT: ldr w9, [x9] +; CHECK-OUTLINE-O0-NEXT: add w0, w8, w9 +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: atomic_load_relaxed_32: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: ldr w8, [x0, #16380] @@ -925,6 +1283,18 @@ define i64 
@atomic_load_relaxed_64(ptr %p, i32 %off32) #0 { ; CHECK-NOLSE-O1-NEXT: add x0, x8, x9 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: atomic_load_relaxed_64: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: ldr x8, [x0, #32760] +; CHECK-OUTLINE-O1-NEXT: ldr x9, [x0, w1, sxtw #3] +; CHECK-OUTLINE-O1-NEXT: add x11, x0, #291, lsl #12 ; =1191936 +; CHECK-OUTLINE-O1-NEXT: ldur x10, [x0, #-256] +; CHECK-OUTLINE-O1-NEXT: add x8, x8, x9 +; CHECK-OUTLINE-O1-NEXT: ldr x9, [x11] +; CHECK-OUTLINE-O1-NEXT: add x8, x8, x10 +; CHECK-OUTLINE-O1-NEXT: add x0, x8, x9 +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: atomic_load_relaxed_64: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: ldr x8, [x0, #32760] @@ -937,6 +1307,18 @@ define i64 @atomic_load_relaxed_64(ptr %p, i32 %off32) #0 { ; CHECK-NOLSE-O0-NEXT: add x0, x8, x9 ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: atomic_load_relaxed_64: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: ldr x8, [x0, #32760] +; CHECK-OUTLINE-O0-NEXT: ldr x9, [x0, w1, sxtw #3] +; CHECK-OUTLINE-O0-NEXT: add x8, x8, x9 +; CHECK-OUTLINE-O0-NEXT: ldur x9, [x0, #-256] +; CHECK-OUTLINE-O0-NEXT: add x8, x8, x9 +; CHECK-OUTLINE-O0-NEXT: add x9, x0, #291, lsl #12 ; =1191936 +; CHECK-OUTLINE-O0-NEXT: ldr x9, [x9] +; CHECK-OUTLINE-O0-NEXT: add x0, x8, x9 +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: atomic_load_relaxed_64: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: ldr x8, [x0, #32760] @@ -986,6 +1368,12 @@ define void @atomc_store(ptr %p) #0 { ; CHECK-NOLSE-NEXT: stlr w8, [x0] ; CHECK-NOLSE-NEXT: ret ; +; CHECK-OUTLINE-LABEL: atomc_store: +; CHECK-OUTLINE: ; %bb.0: +; CHECK-OUTLINE-NEXT: mov w8, #4 ; =0x4 +; CHECK-OUTLINE-NEXT: stlr w8, [x0] +; CHECK-OUTLINE-NEXT: ret +; ; CHECK-LSE-O1-LABEL: atomc_store: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: mov w8, #4 ; =0x4 @@ -1011,6 +1399,15 @@ define void @atomic_store_relaxed_8(ptr %p, i32 %off32, i8 %val) #0 { ; CHECK-NOLSE-O1-NEXT: strb w2, [x8] ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: atomic_store_relaxed_8: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: add x8, x0, #291, lsl #12 ; =1191936 +; CHECK-OUTLINE-O1-NEXT: strb w2, [x0, #4095] +; CHECK-OUTLINE-O1-NEXT: strb w2, [x0, w1, sxtw] +; CHECK-OUTLINE-O1-NEXT: sturb w2, [x0, #-256] +; CHECK-OUTLINE-O1-NEXT: strb w2, [x8] +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: atomic_store_relaxed_8: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: strb w2, [x0, #4095] @@ -1020,6 +1417,15 @@ define void @atomic_store_relaxed_8(ptr %p, i32 %off32, i8 %val) #0 { ; CHECK-NOLSE-O0-NEXT: strb w2, [x8] ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: atomic_store_relaxed_8: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: strb w2, [x0, #4095] +; CHECK-OUTLINE-O0-NEXT: strb w2, [x0, w1, sxtw] +; CHECK-OUTLINE-O0-NEXT: sturb w2, [x0, #-256] +; CHECK-OUTLINE-O0-NEXT: add x8, x0, #291, lsl #12 ; =1191936 +; CHECK-OUTLINE-O0-NEXT: strb w2, [x8] +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: atomic_store_relaxed_8: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: strb w2, [x0, #4095] @@ -1062,6 +1468,15 @@ define void @atomic_store_relaxed_16(ptr %p, i32 %off32, i16 %val) #0 { ; CHECK-NOLSE-O1-NEXT: strh w2, [x8] ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: atomic_store_relaxed_16: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: add x8, x0, #291, lsl #12 ; =1191936 +; CHECK-OUTLINE-O1-NEXT: strh w2, [x0, #8190] +; CHECK-OUTLINE-O1-NEXT: strh w2, [x0, w1, sxtw #1] +; 
CHECK-OUTLINE-O1-NEXT: sturh w2, [x0, #-256] +; CHECK-OUTLINE-O1-NEXT: strh w2, [x8] +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: atomic_store_relaxed_16: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: strh w2, [x0, #8190] @@ -1071,6 +1486,15 @@ define void @atomic_store_relaxed_16(ptr %p, i32 %off32, i16 %val) #0 { ; CHECK-NOLSE-O0-NEXT: strh w2, [x8] ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: atomic_store_relaxed_16: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: strh w2, [x0, #8190] +; CHECK-OUTLINE-O0-NEXT: strh w2, [x0, w1, sxtw #1] +; CHECK-OUTLINE-O0-NEXT: sturh w2, [x0, #-256] +; CHECK-OUTLINE-O0-NEXT: add x8, x0, #291, lsl #12 ; =1191936 +; CHECK-OUTLINE-O0-NEXT: strh w2, [x8] +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: atomic_store_relaxed_16: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: strh w2, [x0, #8190] @@ -1113,6 +1537,15 @@ define void @atomic_store_relaxed_32(ptr %p, i32 %off32, i32 %val) #0 { ; CHECK-NOLSE-O1-NEXT: str w2, [x8] ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: atomic_store_relaxed_32: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: add x8, x0, #291, lsl #12 ; =1191936 +; CHECK-OUTLINE-O1-NEXT: str w2, [x0, #16380] +; CHECK-OUTLINE-O1-NEXT: str w2, [x0, w1, sxtw #2] +; CHECK-OUTLINE-O1-NEXT: stur w2, [x0, #-256] +; CHECK-OUTLINE-O1-NEXT: str w2, [x8] +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: atomic_store_relaxed_32: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: str w2, [x0, #16380] @@ -1122,6 +1555,15 @@ define void @atomic_store_relaxed_32(ptr %p, i32 %off32, i32 %val) #0 { ; CHECK-NOLSE-O0-NEXT: str w2, [x8] ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: atomic_store_relaxed_32: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: str w2, [x0, #16380] +; CHECK-OUTLINE-O0-NEXT: str w2, [x0, w1, sxtw #2] +; CHECK-OUTLINE-O0-NEXT: stur w2, [x0, #-256] +; CHECK-OUTLINE-O0-NEXT: add x8, x0, #291, lsl #12 ; =1191936 +; CHECK-OUTLINE-O0-NEXT: str w2, [x8] +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: atomic_store_relaxed_32: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: str w2, [x0, #16380] @@ -1164,6 +1606,15 @@ define void @atomic_store_relaxed_64(ptr %p, i32 %off32, i64 %val) #0 { ; CHECK-NOLSE-O1-NEXT: str x2, [x8] ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: atomic_store_relaxed_64: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: add x8, x0, #291, lsl #12 ; =1191936 +; CHECK-OUTLINE-O1-NEXT: str x2, [x0, #32760] +; CHECK-OUTLINE-O1-NEXT: str x2, [x0, w1, sxtw #3] +; CHECK-OUTLINE-O1-NEXT: stur x2, [x0, #-256] +; CHECK-OUTLINE-O1-NEXT: str x2, [x8] +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: atomic_store_relaxed_64: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: str x2, [x0, #32760] @@ -1173,6 +1624,15 @@ define void @atomic_store_relaxed_64(ptr %p, i32 %off32, i64 %val) #0 { ; CHECK-NOLSE-O0-NEXT: str x2, [x8] ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: atomic_store_relaxed_64: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: str x2, [x0, #32760] +; CHECK-OUTLINE-O0-NEXT: str x2, [x0, w1, sxtw #3] +; CHECK-OUTLINE-O0-NEXT: stur x2, [x0, #-256] +; CHECK-OUTLINE-O0-NEXT: add x8, x0, #291, lsl #12 ; =1191936 +; CHECK-OUTLINE-O0-NEXT: str x2, [x8] +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: atomic_store_relaxed_64: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: str x2, [x0, #32760] @@ -1213,6 +1673,13 @@ define i32 @load_zext(ptr %p8, ptr %p16) { ; CHECK-NOLSE-O1-NEXT: add w0, w9, w8, uxtb ; 
CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: load_zext: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: ldarb w8, [x0] +; CHECK-OUTLINE-O1-NEXT: ldrh w9, [x1] +; CHECK-OUTLINE-O1-NEXT: add w0, w9, w8, uxtb +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: load_zext: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: ldarb w9, [x0] @@ -1220,6 +1687,13 @@ define i32 @load_zext(ptr %p8, ptr %p16) { ; CHECK-NOLSE-O0-NEXT: add w0, w8, w9, uxtb ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: load_zext: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: ldarb w9, [x0] +; CHECK-OUTLINE-O0-NEXT: ldrh w8, [x1] +; CHECK-OUTLINE-O0-NEXT: add w0, w8, w9, uxtb +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: load_zext: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: ldaprb w8, [x0] @@ -1250,6 +1724,12 @@ define { i32, i64 } @load_acq(ptr %p32, ptr %p64) { ; CHECK-NOLSE-NEXT: ldar x1, [x1] ; CHECK-NOLSE-NEXT: ret ; +; CHECK-OUTLINE-LABEL: load_acq: +; CHECK-OUTLINE: ; %bb.0: +; CHECK-OUTLINE-NEXT: ldar w0, [x0] +; CHECK-OUTLINE-NEXT: ldar x1, [x1] +; CHECK-OUTLINE-NEXT: ret +; ; CHECK-LSE-O1-LABEL: load_acq: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: ldar w0, [x0] @@ -1279,6 +1759,14 @@ define i32 @load_sext(ptr %p8, ptr %p16) { ; CHECK-NOLSE-O1-NEXT: add w0, w9, w8, sxtb ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: load_sext: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: ldarb w8, [x0] +; CHECK-OUTLINE-O1-NEXT: ldrh w9, [x1] +; CHECK-OUTLINE-O1-NEXT: sxth w9, w9 +; CHECK-OUTLINE-O1-NEXT: add w0, w9, w8, sxtb +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: load_sext: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: ldarb w9, [x0] @@ -1287,6 +1775,14 @@ define i32 @load_sext(ptr %p8, ptr %p16) { ; CHECK-NOLSE-O0-NEXT: add w0, w8, w9, sxtb ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: load_sext: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: ldarb w9, [x0] +; CHECK-OUTLINE-O0-NEXT: ldrh w8, [x1] +; CHECK-OUTLINE-O0-NEXT: sxth w8, w8 +; CHECK-OUTLINE-O0-NEXT: add w0, w8, w9, sxtb +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: load_sext: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: ldaprb w8, [x0] @@ -1319,6 +1815,12 @@ define void @store_trunc(i32 %val, ptr %p8, ptr %p16) { ; CHECK-NOLSE-NEXT: strh w0, [x2] ; CHECK-NOLSE-NEXT: ret ; +; CHECK-OUTLINE-LABEL: store_trunc: +; CHECK-OUTLINE: ; %bb.0: +; CHECK-OUTLINE-NEXT: stlrb w0, [x1] +; CHECK-OUTLINE-NEXT: strh w0, [x2] +; CHECK-OUTLINE-NEXT: ret +; ; CHECK-LSE-O1-LABEL: store_trunc: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: stlrb w0, [x1] @@ -1352,6 +1854,19 @@ define i8 @atomicrmw_add_i8(ptr %ptr, i8 %rhs) { ; CHECK-NOLSE-O1-NEXT: mov w0, w8 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_add_i8: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! 
; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O1-NEXT: mov x2, x0 +; CHECK-OUTLINE-O1-NEXT: mov w0, w1 +; CHECK-OUTLINE-O1-NEXT: mov x1, x2 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldadd1_acq_rel +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: atomicrmw_add_i8: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 @@ -1392,6 +1907,21 @@ define i8 @atomicrmw_add_i8(ptr %ptr, i8 %rhs) { ; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_add_i8: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov w0, w1 +; CHECK-OUTLINE-O0-NEXT: ldr x1, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_ldadd1_acq_rel +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: atomicrmw_add_i8: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: ldaddalb w1, w0, [x0] @@ -1418,6 +1948,19 @@ define i8 @atomicrmw_xchg_i8(ptr %ptr, i8 %rhs) { ; CHECK-NOLSE-O1-NEXT: mov w0, w8 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_xchg_i8: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O1-NEXT: mov x2, x0 +; CHECK-OUTLINE-O1-NEXT: mov w0, w1 +; CHECK-OUTLINE-O1-NEXT: mov x1, x2 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_swp1_relax +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: atomicrmw_xchg_i8: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 @@ -1457,6 +2000,21 @@ define i8 @atomicrmw_xchg_i8(ptr %ptr, i8 %rhs) { ; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_xchg_i8: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov w0, w1 +; CHECK-OUTLINE-O0-NEXT: ldr x1, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_swp1_relax +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: atomicrmw_xchg_i8: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: swpb w1, w0, [x0] @@ -1483,6 +2041,19 @@ define i8 @atomicrmw_sub_i8(ptr %ptr, i8 %rhs) { ; CHECK-NOLSE-O1-NEXT: mov w0, w8 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_sub_i8: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! 
; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O1-NEXT: mov x2, x0 +; CHECK-OUTLINE-O1-NEXT: neg w0, w1 +; CHECK-OUTLINE-O1-NEXT: mov x1, x2 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldadd1_acq +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: atomicrmw_sub_i8: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 @@ -1523,6 +2094,23 @@ define i8 @atomicrmw_sub_i8(ptr %ptr, i8 %rhs) { ; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_sub_i8: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov w9, w1 +; CHECK-OUTLINE-O0-NEXT: ldr x1, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: mov w8, wzr +; CHECK-OUTLINE-O0-NEXT: subs w0, w8, w9 +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_ldadd1_acq +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: atomicrmw_sub_i8: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: neg w8, w1 @@ -1551,6 +2139,20 @@ define i8 @atomicrmw_and_i8(ptr %ptr, i8 %rhs) { ; CHECK-NOLSE-O1-NEXT: mov w0, w8 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_and_i8: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! 
; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O1-NEXT: mov x2, x0 +; CHECK-OUTLINE-O1-NEXT: mov w8, #-1 ; =0xffffffff +; CHECK-OUTLINE-O1-NEXT: eor w0, w8, w1 +; CHECK-OUTLINE-O1-NEXT: mov x1, x2 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldclr1_rel +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: atomicrmw_and_i8: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 @@ -1591,6 +2193,23 @@ define i8 @atomicrmw_and_i8(ptr %ptr, i8 %rhs) { ; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_and_i8: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov w9, w1 +; CHECK-OUTLINE-O0-NEXT: ldr x1, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: mov w8, #-1 ; =0xffffffff +; CHECK-OUTLINE-O0-NEXT: eor w0, w8, w9 +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_ldclr1_rel +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: atomicrmw_and_i8: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: mvn w8, w1 @@ -1619,6 +2238,19 @@ define i8 @atomicrmw_or_i8(ptr %ptr, i8 %rhs) { ; CHECK-NOLSE-O1-NEXT: mov w0, w8 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_or_i8: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! 
; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O1-NEXT: mov x2, x0 +; CHECK-OUTLINE-O1-NEXT: mov w0, w1 +; CHECK-OUTLINE-O1-NEXT: mov x1, x2 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldset1_acq_rel +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: atomicrmw_or_i8: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 @@ -1659,6 +2291,21 @@ define i8 @atomicrmw_or_i8(ptr %ptr, i8 %rhs) { ; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_or_i8: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov w0, w1 +; CHECK-OUTLINE-O0-NEXT: ldr x1, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_ldset1_acq_rel +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: atomicrmw_or_i8: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: ldsetalb w1, w0, [x0] @@ -1685,6 +2332,19 @@ define i8 @atomicrmw_xor_i8(ptr %ptr, i8 %rhs) { ; CHECK-NOLSE-O1-NEXT: mov w0, w8 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_xor_i8: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O1-NEXT: mov x2, x0 +; CHECK-OUTLINE-O1-NEXT: mov w0, w1 +; CHECK-OUTLINE-O1-NEXT: mov x1, x2 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldeor1_relax +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: atomicrmw_xor_i8: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 @@ -1725,6 +2385,21 @@ define i8 @atomicrmw_xor_i8(ptr %ptr, i8 %rhs) { ; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_xor_i8: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov w0, w1 +; CHECK-OUTLINE-O0-NEXT: ldr x1, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_ldeor1_relax +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: atomicrmw_xor_i8: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: ldeorb w1, w0, [x0] @@ -1753,6 +2428,20 @@ define i8 @atomicrmw_min_i8(ptr %ptr, i8 %rhs) { ; CHECK-NOLSE-O1-NEXT: mov w0, w8 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_min_i8: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: LBB33_1: ; %atomicrmw.start +; CHECK-OUTLINE-O1-NEXT: ; =>This Inner 
Loop Header: Depth=1 +; CHECK-OUTLINE-O1-NEXT: ldaxrb w8, [x0] +; CHECK-OUTLINE-O1-NEXT: sxtb w9, w8 +; CHECK-OUTLINE-O1-NEXT: cmp w9, w1, sxtb +; CHECK-OUTLINE-O1-NEXT: csel w9, w8, w1, le +; CHECK-OUTLINE-O1-NEXT: stxrb w10, w9, [x0] +; CHECK-OUTLINE-O1-NEXT: cbnz w10, LBB33_1 +; CHECK-OUTLINE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-OUTLINE-O1-NEXT: mov w0, w8 +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: atomicrmw_min_i8: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 @@ -1795,6 +2484,42 @@ define i8 @atomicrmw_min_i8(ptr %ptr, i8 %rhs) { ; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_min_i8: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #48 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #32] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 48 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: str w1, [sp, #24] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: ldrb w0, [x0] +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #28] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: b LBB33_1 +; CHECK-OUTLINE-O0-NEXT: LBB33_1: ; %atomicrmw.start +; CHECK-OUTLINE-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O0-NEXT: ldr w0, [sp, #28] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr x2, [sp, #16] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr w8, [sp, #24] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #8] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: sxtb w9, w0 +; CHECK-OUTLINE-O0-NEXT: subs w9, w9, w8, sxtb +; CHECK-OUTLINE-O0-NEXT: csel w1, w0, w8, le +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas1_acq +; CHECK-OUTLINE-O0-NEXT: ldr w9, [sp, #8] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #12] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: and w8, w0, #0xff +; CHECK-OUTLINE-O0-NEXT: subs w8, w8, w9, uxtb +; CHECK-OUTLINE-O0-NEXT: cset w8, eq +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #28] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: tbz w8, #0, LBB33_1 +; CHECK-OUTLINE-O0-NEXT: b LBB33_2 +; CHECK-OUTLINE-O0-NEXT: LBB33_2: ; %atomicrmw.end +; CHECK-OUTLINE-O0-NEXT: ldr w0, [sp, #12] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #48 +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: atomicrmw_min_i8: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: ldsminab w1, w0, [x0] @@ -1823,6 +2548,20 @@ define i8 @atomicrmw_max_i8(ptr %ptr, i8 %rhs) { ; CHECK-NOLSE-O1-NEXT: mov w0, w8 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_max_i8: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: LBB34_1: ; %atomicrmw.start +; CHECK-OUTLINE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O1-NEXT: ldxrb w8, [x0] +; CHECK-OUTLINE-O1-NEXT: sxtb w9, w8 +; CHECK-OUTLINE-O1-NEXT: cmp w9, w1, sxtb +; CHECK-OUTLINE-O1-NEXT: csel w9, w8, w1, gt +; CHECK-OUTLINE-O1-NEXT: stlxrb w10, w9, [x0] +; CHECK-OUTLINE-O1-NEXT: cbnz w10, LBB34_1 +; CHECK-OUTLINE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-OUTLINE-O1-NEXT: mov w0, w8 +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: atomicrmw_max_i8: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 @@ -1865,6 +2604,42 @@ define i8 @atomicrmw_max_i8(ptr %ptr, i8 %rhs) { ; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 ; CHECK-NOLSE-O0-NEXT: 
ret ; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_max_i8: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #48 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #32] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 48 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: str w1, [sp, #24] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: ldrb w0, [x0] +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #28] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: b LBB34_1 +; CHECK-OUTLINE-O0-NEXT: LBB34_1: ; %atomicrmw.start +; CHECK-OUTLINE-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O0-NEXT: ldr w0, [sp, #28] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr x2, [sp, #16] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr w8, [sp, #24] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #8] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: sxtb w9, w0 +; CHECK-OUTLINE-O0-NEXT: subs w9, w9, w8, sxtb +; CHECK-OUTLINE-O0-NEXT: csel w1, w0, w8, gt +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas1_rel +; CHECK-OUTLINE-O0-NEXT: ldr w9, [sp, #8] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #12] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: and w8, w0, #0xff +; CHECK-OUTLINE-O0-NEXT: subs w8, w8, w9, uxtb +; CHECK-OUTLINE-O0-NEXT: cset w8, eq +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #28] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: tbz w8, #0, LBB34_1 +; CHECK-OUTLINE-O0-NEXT: b LBB34_2 +; CHECK-OUTLINE-O0-NEXT: LBB34_2: ; %atomicrmw.end +; CHECK-OUTLINE-O0-NEXT: ldr w0, [sp, #12] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #48 +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: atomicrmw_max_i8: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: ldsmaxlb w1, w0, [x0] @@ -1894,6 +2669,21 @@ define i8 @atomicrmw_umin_i8(ptr %ptr, i8 %rhs) { ; CHECK-NOLSE-O1-NEXT: mov w0, w8 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_umin_i8: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: and w9, w1, #0xff +; CHECK-OUTLINE-O1-NEXT: LBB35_1: ; %atomicrmw.start +; CHECK-OUTLINE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O1-NEXT: ldaxrb w8, [x0] +; CHECK-OUTLINE-O1-NEXT: and w10, w8, #0xff +; CHECK-OUTLINE-O1-NEXT: cmp w10, w9 +; CHECK-OUTLINE-O1-NEXT: csel w10, w10, w9, ls +; CHECK-OUTLINE-O1-NEXT: stlxrb w11, w10, [x0] +; CHECK-OUTLINE-O1-NEXT: cbnz w11, LBB35_1 +; CHECK-OUTLINE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-OUTLINE-O1-NEXT: mov w0, w8 +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: atomicrmw_umin_i8: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 @@ -1936,6 +2726,42 @@ define i8 @atomicrmw_umin_i8(ptr %ptr, i8 %rhs) { ; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_umin_i8: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #48 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #32] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 48 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: str w1, [sp, #24] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: ldrb w0, [x0] +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #28] ; 4-byte Folded Spill +; 
CHECK-OUTLINE-O0-NEXT: b LBB35_1 +; CHECK-OUTLINE-O0-NEXT: LBB35_1: ; %atomicrmw.start +; CHECK-OUTLINE-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O0-NEXT: ldr w0, [sp, #28] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr x2, [sp, #16] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr w8, [sp, #24] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #8] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: and w9, w0, #0xff +; CHECK-OUTLINE-O0-NEXT: subs w9, w9, w8, uxtb +; CHECK-OUTLINE-O0-NEXT: csel w1, w0, w8, ls +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas1_acq_rel +; CHECK-OUTLINE-O0-NEXT: ldr w9, [sp, #8] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #12] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: and w8, w0, #0xff +; CHECK-OUTLINE-O0-NEXT: subs w8, w8, w9, uxtb +; CHECK-OUTLINE-O0-NEXT: cset w8, eq +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #28] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: tbz w8, #0, LBB35_1 +; CHECK-OUTLINE-O0-NEXT: b LBB35_2 +; CHECK-OUTLINE-O0-NEXT: LBB35_2: ; %atomicrmw.end +; CHECK-OUTLINE-O0-NEXT: ldr w0, [sp, #12] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #48 +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: atomicrmw_umin_i8: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: lduminalb w1, w0, [x0] @@ -1965,6 +2791,21 @@ define i8 @atomicrmw_umax_i8(ptr %ptr, i8 %rhs) { ; CHECK-NOLSE-O1-NEXT: mov w0, w8 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_umax_i8: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: and w9, w1, #0xff +; CHECK-OUTLINE-O1-NEXT: LBB36_1: ; %atomicrmw.start +; CHECK-OUTLINE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O1-NEXT: ldxrb w8, [x0] +; CHECK-OUTLINE-O1-NEXT: and w10, w8, #0xff +; CHECK-OUTLINE-O1-NEXT: cmp w10, w9 +; CHECK-OUTLINE-O1-NEXT: csel w10, w10, w9, hi +; CHECK-OUTLINE-O1-NEXT: stxrb w11, w10, [x0] +; CHECK-OUTLINE-O1-NEXT: cbnz w11, LBB36_1 +; CHECK-OUTLINE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-OUTLINE-O1-NEXT: mov w0, w8 +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: atomicrmw_umax_i8: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 @@ -2007,6 +2848,42 @@ define i8 @atomicrmw_umax_i8(ptr %ptr, i8 %rhs) { ; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_umax_i8: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #48 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #32] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 48 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: str w1, [sp, #24] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: ldrb w0, [x0] +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #28] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: b LBB36_1 +; CHECK-OUTLINE-O0-NEXT: LBB36_1: ; %atomicrmw.start +; CHECK-OUTLINE-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O0-NEXT: ldr w0, [sp, #28] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr x2, [sp, #16] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr w8, [sp, #24] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #8] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: and w9, w0, #0xff +; CHECK-OUTLINE-O0-NEXT: subs w9, w9, w8, uxtb +; CHECK-OUTLINE-O0-NEXT: csel w1, w0, w8, hi +; CHECK-OUTLINE-O0-NEXT: 
bl ___aarch64_cas1_relax +; CHECK-OUTLINE-O0-NEXT: ldr w9, [sp, #8] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #12] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: and w8, w0, #0xff +; CHECK-OUTLINE-O0-NEXT: subs w8, w8, w9, uxtb +; CHECK-OUTLINE-O0-NEXT: cset w8, eq +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #28] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: tbz w8, #0, LBB36_1 +; CHECK-OUTLINE-O0-NEXT: b LBB36_2 +; CHECK-OUTLINE-O0-NEXT: LBB36_2: ; %atomicrmw.end +; CHECK-OUTLINE-O0-NEXT: ldr w0, [sp, #12] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #48 +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: atomicrmw_umax_i8: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: ldumaxb w1, w0, [x0] @@ -2033,6 +2910,19 @@ define i16 @atomicrmw_add_i16(ptr %ptr, i16 %rhs) { ; CHECK-NOLSE-O1-NEXT: mov w0, w8 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_add_i16: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O1-NEXT: mov x2, x0 +; CHECK-OUTLINE-O1-NEXT: mov w0, w1 +; CHECK-OUTLINE-O1-NEXT: mov x1, x2 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldadd2_acq_rel +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: atomicrmw_add_i16: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 @@ -2073,6 +2963,21 @@ define i16 @atomicrmw_add_i16(ptr %ptr, i16 %rhs) { ; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_add_i16: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov w0, w1 +; CHECK-OUTLINE-O0-NEXT: ldr x1, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_ldadd2_acq_rel +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: atomicrmw_add_i16: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: ldaddalh w1, w0, [x0] @@ -2099,6 +3004,19 @@ define i16 @atomicrmw_xchg_i16(ptr %ptr, i16 %rhs) { ; CHECK-NOLSE-O1-NEXT: mov w0, w8 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_xchg_i16: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! 
; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O1-NEXT: mov x2, x0 +; CHECK-OUTLINE-O1-NEXT: mov w0, w1 +; CHECK-OUTLINE-O1-NEXT: mov x1, x2 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_swp2_relax +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: atomicrmw_xchg_i16: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 @@ -2138,6 +3056,21 @@ define i16 @atomicrmw_xchg_i16(ptr %ptr, i16 %rhs) { ; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_xchg_i16: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov w0, w1 +; CHECK-OUTLINE-O0-NEXT: ldr x1, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_swp2_relax +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: atomicrmw_xchg_i16: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: swph w1, w0, [x0] @@ -2164,6 +3097,19 @@ define i16 @atomicrmw_sub_i16(ptr %ptr, i16 %rhs) { ; CHECK-NOLSE-O1-NEXT: mov w0, w8 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_sub_i16: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O1-NEXT: mov x2, x0 +; CHECK-OUTLINE-O1-NEXT: neg w0, w1 +; CHECK-OUTLINE-O1-NEXT: mov x1, x2 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldadd2_acq +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: atomicrmw_sub_i16: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 @@ -2204,6 +3150,23 @@ define i16 @atomicrmw_sub_i16(ptr %ptr, i16 %rhs) { ; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_sub_i16: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov w9, w1 +; CHECK-OUTLINE-O0-NEXT: ldr x1, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: mov w8, wzr +; CHECK-OUTLINE-O0-NEXT: subs w0, w8, w9 +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_ldadd2_acq +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: atomicrmw_sub_i16: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: neg w8, w1 @@ -2232,6 +3195,20 @@ define i16 @atomicrmw_and_i16(ptr %ptr, i16 %rhs) { ; CHECK-NOLSE-O1-NEXT: mov w0, w8 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_and_i16: +; CHECK-OUTLINE-O1: ; %bb.0: +; 
CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O1-NEXT: mov x2, x0 +; CHECK-OUTLINE-O1-NEXT: mov w8, #-1 ; =0xffffffff +; CHECK-OUTLINE-O1-NEXT: eor w0, w8, w1 +; CHECK-OUTLINE-O1-NEXT: mov x1, x2 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldclr2_rel +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: atomicrmw_and_i16: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 @@ -2272,6 +3249,23 @@ define i16 @atomicrmw_and_i16(ptr %ptr, i16 %rhs) { ; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_and_i16: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov w9, w1 +; CHECK-OUTLINE-O0-NEXT: ldr x1, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: mov w8, #-1 ; =0xffffffff +; CHECK-OUTLINE-O0-NEXT: eor w0, w8, w9 +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_ldclr2_rel +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: atomicrmw_and_i16: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: mvn w8, w1 @@ -2300,6 +3294,19 @@ define i16 @atomicrmw_or_i16(ptr %ptr, i16 %rhs) { ; CHECK-NOLSE-O1-NEXT: mov w0, w8 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_or_i16: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! 
; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O1-NEXT: mov x2, x0 +; CHECK-OUTLINE-O1-NEXT: mov w0, w1 +; CHECK-OUTLINE-O1-NEXT: mov x1, x2 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldset2_acq_rel +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: atomicrmw_or_i16: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 @@ -2340,6 +3347,21 @@ define i16 @atomicrmw_or_i16(ptr %ptr, i16 %rhs) { ; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_or_i16: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov w0, w1 +; CHECK-OUTLINE-O0-NEXT: ldr x1, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_ldset2_acq_rel +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: atomicrmw_or_i16: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: ldsetalh w1, w0, [x0] @@ -2366,6 +3388,19 @@ define i16 @atomicrmw_xor_i16(ptr %ptr, i16 %rhs) { ; CHECK-NOLSE-O1-NEXT: mov w0, w8 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_xor_i16: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O1-NEXT: mov x2, x0 +; CHECK-OUTLINE-O1-NEXT: mov w0, w1 +; CHECK-OUTLINE-O1-NEXT: mov x1, x2 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldeor2_relax +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: atomicrmw_xor_i16: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 @@ -2406,6 +3441,21 @@ define i16 @atomicrmw_xor_i16(ptr %ptr, i16 %rhs) { ; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_xor_i16: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov w0, w1 +; CHECK-OUTLINE-O0-NEXT: ldr x1, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_ldeor2_relax +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: atomicrmw_xor_i16: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: ldeorh w1, w0, [x0] @@ -2434,6 +3484,20 @@ define i16 @atomicrmw_min_i16(ptr %ptr, i16 %rhs) { ; CHECK-NOLSE-O1-NEXT: mov w0, w8 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_min_i16: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: LBB43_1: ; %atomicrmw.start +; 
CHECK-OUTLINE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O1-NEXT: ldaxrh w8, [x0] +; CHECK-OUTLINE-O1-NEXT: sxth w9, w8 +; CHECK-OUTLINE-O1-NEXT: cmp w9, w1, sxth +; CHECK-OUTLINE-O1-NEXT: csel w9, w8, w1, le +; CHECK-OUTLINE-O1-NEXT: stxrh w10, w9, [x0] +; CHECK-OUTLINE-O1-NEXT: cbnz w10, LBB43_1 +; CHECK-OUTLINE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-OUTLINE-O1-NEXT: mov w0, w8 +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: atomicrmw_min_i16: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 @@ -2476,6 +3540,42 @@ define i16 @atomicrmw_min_i16(ptr %ptr, i16 %rhs) { ; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_min_i16: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #48 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #32] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 48 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: str w1, [sp, #24] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: ldrh w0, [x0] +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #28] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: b LBB43_1 +; CHECK-OUTLINE-O0-NEXT: LBB43_1: ; %atomicrmw.start +; CHECK-OUTLINE-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O0-NEXT: ldr w0, [sp, #28] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr x2, [sp, #16] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr w8, [sp, #24] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #8] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: sxth w9, w0 +; CHECK-OUTLINE-O0-NEXT: subs w9, w9, w8, sxth +; CHECK-OUTLINE-O0-NEXT: csel w1, w0, w8, le +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas2_acq +; CHECK-OUTLINE-O0-NEXT: ldr w8, [sp, #8] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #12] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: uxth w8, w8 +; CHECK-OUTLINE-O0-NEXT: subs w8, w8, w0, uxth +; CHECK-OUTLINE-O0-NEXT: cset w8, eq +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #28] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: tbz w8, #0, LBB43_1 +; CHECK-OUTLINE-O0-NEXT: b LBB43_2 +; CHECK-OUTLINE-O0-NEXT: LBB43_2: ; %atomicrmw.end +; CHECK-OUTLINE-O0-NEXT: ldr w0, [sp, #12] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #48 +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: atomicrmw_min_i16: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: ldsminah w1, w0, [x0] @@ -2504,6 +3604,20 @@ define i16 @atomicrmw_max_i16(ptr %ptr, i16 %rhs) { ; CHECK-NOLSE-O1-NEXT: mov w0, w8 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_max_i16: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: LBB44_1: ; %atomicrmw.start +; CHECK-OUTLINE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O1-NEXT: ldxrh w8, [x0] +; CHECK-OUTLINE-O1-NEXT: sxth w9, w8 +; CHECK-OUTLINE-O1-NEXT: cmp w9, w1, sxth +; CHECK-OUTLINE-O1-NEXT: csel w9, w8, w1, gt +; CHECK-OUTLINE-O1-NEXT: stlxrh w10, w9, [x0] +; CHECK-OUTLINE-O1-NEXT: cbnz w10, LBB44_1 +; CHECK-OUTLINE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-OUTLINE-O1-NEXT: mov w0, w8 +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: atomicrmw_max_i16: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 @@ -2546,6 +3660,42 @@ define i16 @atomicrmw_max_i16(ptr %ptr, i16 %rhs) { ; 
CHECK-NOLSE-O0-NEXT: add sp, sp, #32 ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_max_i16: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #48 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #32] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 48 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: str w1, [sp, #24] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: ldrh w0, [x0] +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #28] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: b LBB44_1 +; CHECK-OUTLINE-O0-NEXT: LBB44_1: ; %atomicrmw.start +; CHECK-OUTLINE-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O0-NEXT: ldr w0, [sp, #28] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr x2, [sp, #16] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr w8, [sp, #24] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #8] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: sxth w9, w0 +; CHECK-OUTLINE-O0-NEXT: subs w9, w9, w8, sxth +; CHECK-OUTLINE-O0-NEXT: csel w1, w0, w8, gt +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas2_rel +; CHECK-OUTLINE-O0-NEXT: ldr w8, [sp, #8] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #12] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: uxth w8, w8 +; CHECK-OUTLINE-O0-NEXT: subs w8, w8, w0, uxth +; CHECK-OUTLINE-O0-NEXT: cset w8, eq +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #28] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: tbz w8, #0, LBB44_1 +; CHECK-OUTLINE-O0-NEXT: b LBB44_2 +; CHECK-OUTLINE-O0-NEXT: LBB44_2: ; %atomicrmw.end +; CHECK-OUTLINE-O0-NEXT: ldr w0, [sp, #12] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #48 +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: atomicrmw_max_i16: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: ldsmaxlh w1, w0, [x0] @@ -2575,6 +3725,21 @@ define i16 @atomicrmw_umin_i16(ptr %ptr, i16 %rhs) { ; CHECK-NOLSE-O1-NEXT: mov w0, w8 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_umin_i16: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: and w9, w1, #0xffff +; CHECK-OUTLINE-O1-NEXT: LBB45_1: ; %atomicrmw.start +; CHECK-OUTLINE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O1-NEXT: ldaxrh w8, [x0] +; CHECK-OUTLINE-O1-NEXT: and w10, w8, #0xffff +; CHECK-OUTLINE-O1-NEXT: cmp w10, w9 +; CHECK-OUTLINE-O1-NEXT: csel w10, w10, w9, ls +; CHECK-OUTLINE-O1-NEXT: stlxrh w11, w10, [x0] +; CHECK-OUTLINE-O1-NEXT: cbnz w11, LBB45_1 +; CHECK-OUTLINE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-OUTLINE-O1-NEXT: mov w0, w8 +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: atomicrmw_umin_i16: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 @@ -2617,6 +3782,42 @@ define i16 @atomicrmw_umin_i16(ptr %ptr, i16 %rhs) { ; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_umin_i16: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #48 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #32] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 48 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: str w1, [sp, #24] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: ldrh w0, [x0] +; 
CHECK-OUTLINE-O0-NEXT: str w0, [sp, #28] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: b LBB45_1 +; CHECK-OUTLINE-O0-NEXT: LBB45_1: ; %atomicrmw.start +; CHECK-OUTLINE-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O0-NEXT: ldr w0, [sp, #28] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr x2, [sp, #16] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr w8, [sp, #24] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #8] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: uxth w9, w0 +; CHECK-OUTLINE-O0-NEXT: subs w9, w9, w8, uxth +; CHECK-OUTLINE-O0-NEXT: csel w1, w0, w8, ls +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas2_acq_rel +; CHECK-OUTLINE-O0-NEXT: ldr w8, [sp, #8] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #12] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: uxth w8, w8 +; CHECK-OUTLINE-O0-NEXT: subs w8, w8, w0, uxth +; CHECK-OUTLINE-O0-NEXT: cset w8, eq +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #28] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: tbz w8, #0, LBB45_1 +; CHECK-OUTLINE-O0-NEXT: b LBB45_2 +; CHECK-OUTLINE-O0-NEXT: LBB45_2: ; %atomicrmw.end +; CHECK-OUTLINE-O0-NEXT: ldr w0, [sp, #12] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #48 +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: atomicrmw_umin_i16: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: lduminalh w1, w0, [x0] @@ -2646,6 +3847,21 @@ define i16 @atomicrmw_umax_i16(ptr %ptr, i16 %rhs) { ; CHECK-NOLSE-O1-NEXT: mov w0, w8 ; CHECK-NOLSE-O1-NEXT: ret ; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_umax_i16: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: and w9, w1, #0xffff +; CHECK-OUTLINE-O1-NEXT: LBB46_1: ; %atomicrmw.start +; CHECK-OUTLINE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O1-NEXT: ldxrh w8, [x0] +; CHECK-OUTLINE-O1-NEXT: and w10, w8, #0xffff +; CHECK-OUTLINE-O1-NEXT: cmp w10, w9 +; CHECK-OUTLINE-O1-NEXT: csel w10, w10, w9, hi +; CHECK-OUTLINE-O1-NEXT: stxrh w11, w10, [x0] +; CHECK-OUTLINE-O1-NEXT: cbnz w11, LBB46_1 +; CHECK-OUTLINE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-OUTLINE-O1-NEXT: mov w0, w8 +; CHECK-OUTLINE-O1-NEXT: ret +; ; CHECK-NOLSE-O0-LABEL: atomicrmw_umax_i16: ; CHECK-NOLSE-O0: ; %bb.0: ; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 @@ -2688,6 +3904,42 @@ define i16 @atomicrmw_umax_i16(ptr %ptr, i16 %rhs) { ; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 ; CHECK-NOLSE-O0-NEXT: ret ; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_umax_i16: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #48 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #32] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 48 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: str w1, [sp, #24] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: ldrh w0, [x0] +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #28] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: b LBB46_1 +; CHECK-OUTLINE-O0-NEXT: LBB46_1: ; %atomicrmw.start +; CHECK-OUTLINE-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O0-NEXT: ldr w0, [sp, #28] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr x2, [sp, #16] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr w8, [sp, #24] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #8] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: uxth w9, w0 +; CHECK-OUTLINE-O0-NEXT: subs w9, w9, w8, uxth +; 
CHECK-OUTLINE-O0-NEXT: csel w1, w0, w8, hi +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas2_relax +; CHECK-OUTLINE-O0-NEXT: ldr w8, [sp, #8] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #12] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: uxth w8, w8 +; CHECK-OUTLINE-O0-NEXT: subs w8, w8, w0, uxth +; CHECK-OUTLINE-O0-NEXT: cset w8, eq +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #28] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: tbz w8, #0, LBB46_1 +; CHECK-OUTLINE-O0-NEXT: b LBB46_2 +; CHECK-OUTLINE-O0-NEXT: LBB46_2: ; %atomicrmw.end +; CHECK-OUTLINE-O0-NEXT: ldr w0, [sp, #12] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #48 +; CHECK-OUTLINE-O0-NEXT: ret +; ; CHECK-LSE-O1-LABEL: atomicrmw_umax_i16: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: ldumaxh w1, w0, [x0] @@ -2701,145 +3953,2576 @@ define i16 @atomicrmw_umax_i16(ptr %ptr, i16 %rhs) { ret i16 %res } -define { i8, i1 } @cmpxchg_i8(ptr %ptr, i8 %desired, i8 %new) { -; CHECK-NOLSE-O1-LABEL: cmpxchg_i8: +define i32 @atomicrmw_add_i32(ptr %ptr, i32 %rhs) { +; CHECK-NOLSE-O1-LABEL: atomicrmw_add_i32: ; CHECK-NOLSE-O1: ; %bb.0: -; CHECK-NOLSE-O1-NEXT: mov x8, x0 -; CHECK-NOLSE-O1-NEXT: ; kill: def $w2 killed $w2 def $x2 -; CHECK-NOLSE-O1-NEXT: LBB47_1: ; %cmpxchg.start +; CHECK-NOLSE-O1-NEXT: LBB47_1: ; %atomicrmw.start ; CHECK-NOLSE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NOLSE-O1-NEXT: ldxrb w0, [x8] -; CHECK-NOLSE-O1-NEXT: and w9, w0, #0xff -; CHECK-NOLSE-O1-NEXT: cmp w9, w1, uxtb -; CHECK-NOLSE-O1-NEXT: b.ne LBB47_4 -; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %cmpxchg.trystore -; CHECK-NOLSE-O1-NEXT: ; in Loop: Header=BB47_1 Depth=1 -; CHECK-NOLSE-O1-NEXT: stxrb w9, w2, [x8] -; CHECK-NOLSE-O1-NEXT: cbnz w9, LBB47_1 -; CHECK-NOLSE-O1-NEXT: ; %bb.3: -; CHECK-NOLSE-O1-NEXT: mov w1, #1 ; =0x1 -; CHECK-NOLSE-O1-NEXT: ; kill: def $w0 killed $w0 killed $x0 -; CHECK-NOLSE-O1-NEXT: ret -; CHECK-NOLSE-O1-NEXT: LBB47_4: ; %cmpxchg.nostore -; CHECK-NOLSE-O1-NEXT: mov w1, wzr -; CHECK-NOLSE-O1-NEXT: clrex -; CHECK-NOLSE-O1-NEXT: ; kill: def $w0 killed $w0 killed $x0 +; CHECK-NOLSE-O1-NEXT: ldaxr w8, [x0] +; CHECK-NOLSE-O1-NEXT: add w9, w8, w1 +; CHECK-NOLSE-O1-NEXT: stlxr w10, w9, [x0] +; CHECK-NOLSE-O1-NEXT: cbnz w10, LBB47_1 +; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-NOLSE-O1-NEXT: mov w0, w8 ; CHECK-NOLSE-O1-NEXT: ret ; -; CHECK-NOLSE-O0-LABEL: cmpxchg_i8: +; CHECK-OUTLINE-O1-LABEL: atomicrmw_add_i32: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! 
; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O1-NEXT: mov x2, x0 +; CHECK-OUTLINE-O1-NEXT: mov w0, w1 +; CHECK-OUTLINE-O1-NEXT: mov x1, x2 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldadd4_acq_rel +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; +; CHECK-NOLSE-O0-LABEL: atomicrmw_add_i32: ; CHECK-NOLSE-O0: ; %bb.0: -; CHECK-NOLSE-O0-NEXT: mov x9, x0 -; CHECK-NOLSE-O0-NEXT: LBB47_1: ; =>This Inner Loop Header: Depth=1 -; CHECK-NOLSE-O0-NEXT: ldaxrb w0, [x9] -; CHECK-NOLSE-O0-NEXT: cmp w0, w1, uxtb -; CHECK-NOLSE-O0-NEXT: b.ne LBB47_3 -; CHECK-NOLSE-O0-NEXT: ; %bb.2: ; in Loop: Header=BB47_1 Depth=1 -; CHECK-NOLSE-O0-NEXT: stlxrb w8, w2, [x9] -; CHECK-NOLSE-O0-NEXT: cbnz w8, LBB47_1 -; CHECK-NOLSE-O0-NEXT: LBB47_3: -; CHECK-NOLSE-O0-NEXT: and w8, w0, #0xff -; CHECK-NOLSE-O0-NEXT: subs w8, w8, w1, uxtb -; CHECK-NOLSE-O0-NEXT: cset w1, eq +; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NOLSE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: str w1, [sp, #24] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: ldr w8, [x0] +; CHECK-NOLSE-O0-NEXT: str w8, [sp, #28] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: b LBB47_1 +; CHECK-NOLSE-O0-NEXT: LBB47_1: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; =>This Loop Header: Depth=1 +; CHECK-NOLSE-O0-NEXT: ; Child Loop BB47_2 Depth 2 +; CHECK-NOLSE-O0-NEXT: ldr w8, [sp, #28] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x11, [sp, #16] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr w9, [sp, #24] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: add w12, w8, w9 +; CHECK-NOLSE-O0-NEXT: LBB47_2: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; Parent Loop BB47_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: ; => This Inner Loop Header: Depth=2 +; CHECK-NOLSE-O0-NEXT: ldaxr w9, [x11] +; CHECK-NOLSE-O0-NEXT: cmp w9, w8 +; CHECK-NOLSE-O0-NEXT: b.ne LBB47_4 +; CHECK-NOLSE-O0-NEXT: ; %bb.3: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB47_2 Depth=2 +; CHECK-NOLSE-O0-NEXT: stlxr w10, w12, [x11] +; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB47_2 +; CHECK-NOLSE-O0-NEXT: LBB47_4: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB47_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: subs w8, w9, w8 +; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #28] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB47_1 +; CHECK-NOLSE-O0-NEXT: b LBB47_5 +; CHECK-NOLSE-O0-NEXT: LBB47_5: ; %atomicrmw.end +; CHECK-NOLSE-O0-NEXT: ldr w0, [sp, #12] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 ; CHECK-NOLSE-O0-NEXT: ret ; -; CHECK-LSE-O1-LABEL: cmpxchg_i8: +; CHECK-OUTLINE-O0-LABEL: atomicrmw_add_i32: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov w0, w1 +; CHECK-OUTLINE-O0-NEXT: ldr x1, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_ldadd4_acq_rel +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; 
CHECK-OUTLINE-O0-NEXT: ret +; +; CHECK-LSE-O1-LABEL: atomicrmw_add_i32: ; CHECK-LSE-O1: ; %bb.0: -; CHECK-LSE-O1-NEXT: mov x8, x1 -; CHECK-LSE-O1-NEXT: casb w8, w2, [x0] -; CHECK-LSE-O1-NEXT: and w9, w8, #0xff -; CHECK-LSE-O1-NEXT: cmp w9, w1, uxtb -; CHECK-LSE-O1-NEXT: cset w1, eq -; CHECK-LSE-O1-NEXT: mov x0, x8 +; CHECK-LSE-O1-NEXT: ldaddal w1, w0, [x0] ; CHECK-LSE-O1-NEXT: ret ; -; CHECK-LSE-O0-LABEL: cmpxchg_i8: +; CHECK-LSE-O0-LABEL: atomicrmw_add_i32: ; CHECK-LSE-O0: ; %bb.0: -; CHECK-LSE-O0-NEXT: mov x8, x0 -; CHECK-LSE-O0-NEXT: mov x0, x1 -; CHECK-LSE-O0-NEXT: casb w0, w2, [x8] -; CHECK-LSE-O0-NEXT: and w8, w0, #0xff -; CHECK-LSE-O0-NEXT: subs w8, w8, w1, uxtb -; CHECK-LSE-O0-NEXT: cset w1, eq +; CHECK-LSE-O0-NEXT: ldaddal w1, w0, [x0] ; CHECK-LSE-O0-NEXT: ret - %res = cmpxchg ptr %ptr, i8 %desired, i8 %new monotonic monotonic - ret { i8, i1 } %res + %res = atomicrmw add ptr %ptr, i32 %rhs seq_cst + ret i32 %res } -define { i16, i1 } @cmpxchg_i16(ptr %ptr, i16 %desired, i16 %new) { -; CHECK-NOLSE-O1-LABEL: cmpxchg_i16: +define i32 @atomicrmw_xchg_i32(ptr %ptr, i32 %rhs) { +; CHECK-NOLSE-O1-LABEL: atomicrmw_xchg_i32: ; CHECK-NOLSE-O1: ; %bb.0: ; CHECK-NOLSE-O1-NEXT: mov x8, x0 -; CHECK-NOLSE-O1-NEXT: ; kill: def $w2 killed $w2 def $x2 -; CHECK-NOLSE-O1-NEXT: LBB48_1: ; %cmpxchg.start +; CHECK-NOLSE-O1-NEXT: LBB48_1: ; %atomicrmw.start ; CHECK-NOLSE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NOLSE-O1-NEXT: ldxrh w0, [x8] -; CHECK-NOLSE-O1-NEXT: and w9, w0, #0xffff -; CHECK-NOLSE-O1-NEXT: cmp w9, w1, uxth -; CHECK-NOLSE-O1-NEXT: b.ne LBB48_4 -; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %cmpxchg.trystore -; CHECK-NOLSE-O1-NEXT: ; in Loop: Header=BB48_1 Depth=1 -; CHECK-NOLSE-O1-NEXT: stxrh w9, w2, [x8] +; CHECK-NOLSE-O1-NEXT: ldxr w0, [x8] +; CHECK-NOLSE-O1-NEXT: stxr w9, w1, [x8] ; CHECK-NOLSE-O1-NEXT: cbnz w9, LBB48_1 -; CHECK-NOLSE-O1-NEXT: ; %bb.3: -; CHECK-NOLSE-O1-NEXT: mov w1, #1 ; =0x1 -; CHECK-NOLSE-O1-NEXT: ; kill: def $w0 killed $w0 killed $x0 -; CHECK-NOLSE-O1-NEXT: ret -; CHECK-NOLSE-O1-NEXT: LBB48_4: ; %cmpxchg.nostore -; CHECK-NOLSE-O1-NEXT: mov w1, wzr -; CHECK-NOLSE-O1-NEXT: clrex +; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %atomicrmw.end ; CHECK-NOLSE-O1-NEXT: ; kill: def $w0 killed $w0 killed $x0 ; CHECK-NOLSE-O1-NEXT: ret ; -; CHECK-NOLSE-O0-LABEL: cmpxchg_i16: +; CHECK-OUTLINE-O1-LABEL: atomicrmw_xchg_i32: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! 
; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O1-NEXT: mov x2, x0 +; CHECK-OUTLINE-O1-NEXT: mov w0, w1 +; CHECK-OUTLINE-O1-NEXT: mov x1, x2 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_swp4_relax +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; +; CHECK-NOLSE-O0-LABEL: atomicrmw_xchg_i32: ; CHECK-NOLSE-O0: ; %bb.0: -; CHECK-NOLSE-O0-NEXT: mov x9, x0 -; CHECK-NOLSE-O0-NEXT: LBB48_1: ; =>This Inner Loop Header: Depth=1 -; CHECK-NOLSE-O0-NEXT: ldaxrh w0, [x9] -; CHECK-NOLSE-O0-NEXT: cmp w0, w1, uxth -; CHECK-NOLSE-O0-NEXT: b.ne LBB48_3 -; CHECK-NOLSE-O0-NEXT: ; %bb.2: ; in Loop: Header=BB48_1 Depth=1 -; CHECK-NOLSE-O0-NEXT: stlxrh w8, w2, [x9] -; CHECK-NOLSE-O0-NEXT: cbnz w8, LBB48_1 -; CHECK-NOLSE-O0-NEXT: LBB48_3: -; CHECK-NOLSE-O0-NEXT: and w8, w0, #0xffff -; CHECK-NOLSE-O0-NEXT: subs w8, w8, w1, uxth -; CHECK-NOLSE-O0-NEXT: cset w1, eq +; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NOLSE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: str w1, [sp, #24] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: ldr w8, [x0] +; CHECK-NOLSE-O0-NEXT: str w8, [sp, #28] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: b LBB48_1 +; CHECK-NOLSE-O0-NEXT: LBB48_1: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; =>This Loop Header: Depth=1 +; CHECK-NOLSE-O0-NEXT: ; Child Loop BB48_2 Depth 2 +; CHECK-NOLSE-O0-NEXT: ldr w8, [sp, #28] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x11, [sp, #16] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr w12, [sp, #24] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: LBB48_2: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; Parent Loop BB48_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: ; => This Inner Loop Header: Depth=2 +; CHECK-NOLSE-O0-NEXT: ldaxr w9, [x11] +; CHECK-NOLSE-O0-NEXT: cmp w9, w8 +; CHECK-NOLSE-O0-NEXT: b.ne LBB48_4 +; CHECK-NOLSE-O0-NEXT: ; %bb.3: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB48_2 Depth=2 +; CHECK-NOLSE-O0-NEXT: stlxr w10, w12, [x11] +; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB48_2 +; CHECK-NOLSE-O0-NEXT: LBB48_4: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB48_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: subs w8, w9, w8 +; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #28] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB48_1 +; CHECK-NOLSE-O0-NEXT: b LBB48_5 +; CHECK-NOLSE-O0-NEXT: LBB48_5: ; %atomicrmw.end +; CHECK-NOLSE-O0-NEXT: ldr w0, [sp, #12] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 ; CHECK-NOLSE-O0-NEXT: ret ; -; CHECK-LSE-O1-LABEL: cmpxchg_i16: +; CHECK-OUTLINE-O0-LABEL: atomicrmw_xchg_i32: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov w0, w1 +; CHECK-OUTLINE-O0-NEXT: ldr x1, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_swp4_relax +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; +; CHECK-LSE-O1-LABEL: 
atomicrmw_xchg_i32: ; CHECK-LSE-O1: ; %bb.0: -; CHECK-LSE-O1-NEXT: mov x8, x1 -; CHECK-LSE-O1-NEXT: cash w8, w2, [x0] -; CHECK-LSE-O1-NEXT: and w9, w8, #0xffff -; CHECK-LSE-O1-NEXT: cmp w9, w1, uxth -; CHECK-LSE-O1-NEXT: cset w1, eq -; CHECK-LSE-O1-NEXT: mov x0, x8 +; CHECK-LSE-O1-NEXT: swp w1, w0, [x0] ; CHECK-LSE-O1-NEXT: ret ; -; CHECK-LSE-O0-LABEL: cmpxchg_i16: +; CHECK-LSE-O0-LABEL: atomicrmw_xchg_i32: ; CHECK-LSE-O0: ; %bb.0: -; CHECK-LSE-O0-NEXT: mov x8, x0 -; CHECK-LSE-O0-NEXT: mov x0, x1 -; CHECK-LSE-O0-NEXT: cash w0, w2, [x8] -; CHECK-LSE-O0-NEXT: and w8, w0, #0xffff -; CHECK-LSE-O0-NEXT: subs w8, w8, w1, uxth -; CHECK-LSE-O0-NEXT: cset w1, eq +; CHECK-LSE-O0-NEXT: swp w1, w0, [x0] ; CHECK-LSE-O0-NEXT: ret - %res = cmpxchg ptr %ptr, i16 %desired, i16 %new monotonic monotonic - ret { i16, i1 } %res + %res = atomicrmw xchg ptr %ptr, i32 %rhs monotonic + ret i32 %res } -define internal double @bitcast_to_double(ptr %ptr) { -; CHECK-NOLSE-LABEL: bitcast_to_double: -; CHECK-NOLSE: ; %bb.0: -; CHECK-NOLSE-NEXT: ldar x8, [x0] -; CHECK-NOLSE-NEXT: fmov d0, x8 -; CHECK-NOLSE-NEXT: ret +define i32 @atomicrmw_sub_i32(ptr %ptr, i32 %rhs) { +; CHECK-NOLSE-O1-LABEL: atomicrmw_sub_i32: +; CHECK-NOLSE-O1: ; %bb.0: +; CHECK-NOLSE-O1-NEXT: LBB49_1: ; %atomicrmw.start +; CHECK-NOLSE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NOLSE-O1-NEXT: ldaxr w8, [x0] +; CHECK-NOLSE-O1-NEXT: sub w9, w8, w1 +; CHECK-NOLSE-O1-NEXT: stxr w10, w9, [x0] +; CHECK-NOLSE-O1-NEXT: cbnz w10, LBB49_1 +; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-NOLSE-O1-NEXT: mov w0, w8 +; CHECK-NOLSE-O1-NEXT: ret ; -; CHECK-LSE-O1-LABEL: bitcast_to_double: +; CHECK-OUTLINE-O1-LABEL: atomicrmw_sub_i32: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! 
; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O1-NEXT: mov x2, x0 +; CHECK-OUTLINE-O1-NEXT: neg w0, w1 +; CHECK-OUTLINE-O1-NEXT: mov x1, x2 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldadd4_acq +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; +; CHECK-NOLSE-O0-LABEL: atomicrmw_sub_i32: +; CHECK-NOLSE-O0: ; %bb.0: +; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NOLSE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: str w1, [sp, #24] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: ldr w8, [x0] +; CHECK-NOLSE-O0-NEXT: str w8, [sp, #28] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: b LBB49_1 +; CHECK-NOLSE-O0-NEXT: LBB49_1: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; =>This Loop Header: Depth=1 +; CHECK-NOLSE-O0-NEXT: ; Child Loop BB49_2 Depth 2 +; CHECK-NOLSE-O0-NEXT: ldr w8, [sp, #28] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x11, [sp, #16] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr w9, [sp, #24] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: subs w12, w8, w9 +; CHECK-NOLSE-O0-NEXT: LBB49_2: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; Parent Loop BB49_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: ; => This Inner Loop Header: Depth=2 +; CHECK-NOLSE-O0-NEXT: ldaxr w9, [x11] +; CHECK-NOLSE-O0-NEXT: cmp w9, w8 +; CHECK-NOLSE-O0-NEXT: b.ne LBB49_4 +; CHECK-NOLSE-O0-NEXT: ; %bb.3: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB49_2 Depth=2 +; CHECK-NOLSE-O0-NEXT: stlxr w10, w12, [x11] +; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB49_2 +; CHECK-NOLSE-O0-NEXT: LBB49_4: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB49_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: subs w8, w9, w8 +; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #28] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB49_1 +; CHECK-NOLSE-O0-NEXT: b LBB49_5 +; CHECK-NOLSE-O0-NEXT: LBB49_5: ; %atomicrmw.end +; CHECK-NOLSE-O0-NEXT: ldr w0, [sp, #12] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: ret +; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_sub_i32: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov w9, w1 +; CHECK-OUTLINE-O0-NEXT: ldr x1, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: mov w8, wzr +; CHECK-OUTLINE-O0-NEXT: subs w0, w8, w9 +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_ldadd4_acq +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; +; CHECK-LSE-O1-LABEL: atomicrmw_sub_i32: ; CHECK-LSE-O1: ; %bb.0: -; CHECK-LSE-O1-NEXT: ldar x8, [x0] -; CHECK-LSE-O1-NEXT: fmov d0, x8 +; CHECK-LSE-O1-NEXT: neg w8, w1 +; CHECK-LSE-O1-NEXT: ldadda w8, w0, [x0] +; CHECK-LSE-O1-NEXT: ret +; +; CHECK-LSE-O0-LABEL: atomicrmw_sub_i32: +; CHECK-LSE-O0: ; %bb.0: +; CHECK-LSE-O0-NEXT: neg w8, w1 +; CHECK-LSE-O0-NEXT: ldadda w8, w0, [x0] +; CHECK-LSE-O0-NEXT: ret + %res = atomicrmw sub ptr %ptr, i32 %rhs acquire + ret i32 
%res +} + +define i32 @atomicrmw_and_i32(ptr %ptr, i32 %rhs) { +; CHECK-NOLSE-O1-LABEL: atomicrmw_and_i32: +; CHECK-NOLSE-O1: ; %bb.0: +; CHECK-NOLSE-O1-NEXT: LBB50_1: ; %atomicrmw.start +; CHECK-NOLSE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NOLSE-O1-NEXT: ldxr w8, [x0] +; CHECK-NOLSE-O1-NEXT: and w9, w8, w1 +; CHECK-NOLSE-O1-NEXT: stlxr w10, w9, [x0] +; CHECK-NOLSE-O1-NEXT: cbnz w10, LBB50_1 +; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-NOLSE-O1-NEXT: mov w0, w8 +; CHECK-NOLSE-O1-NEXT: ret +; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_and_i32: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O1-NEXT: mov x2, x0 +; CHECK-OUTLINE-O1-NEXT: mov w8, #-1 ; =0xffffffff +; CHECK-OUTLINE-O1-NEXT: eor w0, w8, w1 +; CHECK-OUTLINE-O1-NEXT: mov x1, x2 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldclr4_rel +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; +; CHECK-NOLSE-O0-LABEL: atomicrmw_and_i32: +; CHECK-NOLSE-O0: ; %bb.0: +; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NOLSE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: str w1, [sp, #24] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: ldr w8, [x0] +; CHECK-NOLSE-O0-NEXT: str w8, [sp, #28] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: b LBB50_1 +; CHECK-NOLSE-O0-NEXT: LBB50_1: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; =>This Loop Header: Depth=1 +; CHECK-NOLSE-O0-NEXT: ; Child Loop BB50_2 Depth 2 +; CHECK-NOLSE-O0-NEXT: ldr w8, [sp, #28] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x11, [sp, #16] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr w9, [sp, #24] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: and w12, w8, w9 +; CHECK-NOLSE-O0-NEXT: LBB50_2: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; Parent Loop BB50_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: ; => This Inner Loop Header: Depth=2 +; CHECK-NOLSE-O0-NEXT: ldaxr w9, [x11] +; CHECK-NOLSE-O0-NEXT: cmp w9, w8 +; CHECK-NOLSE-O0-NEXT: b.ne LBB50_4 +; CHECK-NOLSE-O0-NEXT: ; %bb.3: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB50_2 Depth=2 +; CHECK-NOLSE-O0-NEXT: stlxr w10, w12, [x11] +; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB50_2 +; CHECK-NOLSE-O0-NEXT: LBB50_4: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB50_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: subs w8, w9, w8 +; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #28] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB50_1 +; CHECK-NOLSE-O0-NEXT: b LBB50_5 +; CHECK-NOLSE-O0-NEXT: LBB50_5: ; %atomicrmw.end +; CHECK-NOLSE-O0-NEXT: ldr w0, [sp, #12] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: ret +; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_and_i32: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov w9, w1 +; CHECK-OUTLINE-O0-NEXT: ldr x1, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: mov w8, #-1 ; 
=0xffffffff +; CHECK-OUTLINE-O0-NEXT: eor w0, w8, w9 +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_ldclr4_rel +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; +; CHECK-LSE-O1-LABEL: atomicrmw_and_i32: +; CHECK-LSE-O1: ; %bb.0: +; CHECK-LSE-O1-NEXT: mvn w8, w1 +; CHECK-LSE-O1-NEXT: ldclrl w8, w0, [x0] +; CHECK-LSE-O1-NEXT: ret +; +; CHECK-LSE-O0-LABEL: atomicrmw_and_i32: +; CHECK-LSE-O0: ; %bb.0: +; CHECK-LSE-O0-NEXT: mvn w8, w1 +; CHECK-LSE-O0-NEXT: ldclrl w8, w0, [x0] +; CHECK-LSE-O0-NEXT: ret + %res = atomicrmw and ptr %ptr, i32 %rhs release + ret i32 %res +} + +define i32 @atomicrmw_or_i32(ptr %ptr, i32 %rhs) { +; CHECK-NOLSE-O1-LABEL: atomicrmw_or_i32: +; CHECK-NOLSE-O1: ; %bb.0: +; CHECK-NOLSE-O1-NEXT: LBB51_1: ; %atomicrmw.start +; CHECK-NOLSE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NOLSE-O1-NEXT: ldaxr w8, [x0] +; CHECK-NOLSE-O1-NEXT: orr w9, w8, w1 +; CHECK-NOLSE-O1-NEXT: stlxr w10, w9, [x0] +; CHECK-NOLSE-O1-NEXT: cbnz w10, LBB51_1 +; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-NOLSE-O1-NEXT: mov w0, w8 +; CHECK-NOLSE-O1-NEXT: ret +; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_or_i32: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O1-NEXT: mov x2, x0 +; CHECK-OUTLINE-O1-NEXT: mov w0, w1 +; CHECK-OUTLINE-O1-NEXT: mov x1, x2 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldset4_acq_rel +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; +; CHECK-NOLSE-O0-LABEL: atomicrmw_or_i32: +; CHECK-NOLSE-O0: ; %bb.0: +; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NOLSE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: str w1, [sp, #24] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: ldr w8, [x0] +; CHECK-NOLSE-O0-NEXT: str w8, [sp, #28] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: b LBB51_1 +; CHECK-NOLSE-O0-NEXT: LBB51_1: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; =>This Loop Header: Depth=1 +; CHECK-NOLSE-O0-NEXT: ; Child Loop BB51_2 Depth 2 +; CHECK-NOLSE-O0-NEXT: ldr w8, [sp, #28] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x11, [sp, #16] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr w9, [sp, #24] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: orr w12, w8, w9 +; CHECK-NOLSE-O0-NEXT: LBB51_2: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; Parent Loop BB51_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: ; => This Inner Loop Header: Depth=2 +; CHECK-NOLSE-O0-NEXT: ldaxr w9, [x11] +; CHECK-NOLSE-O0-NEXT: cmp w9, w8 +; CHECK-NOLSE-O0-NEXT: b.ne LBB51_4 +; CHECK-NOLSE-O0-NEXT: ; %bb.3: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB51_2 Depth=2 +; CHECK-NOLSE-O0-NEXT: stlxr w10, w12, [x11] +; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB51_2 +; CHECK-NOLSE-O0-NEXT: LBB51_4: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB51_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: subs w8, w9, w8 +; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #28] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB51_1 +; CHECK-NOLSE-O0-NEXT: b LBB51_5 +; CHECK-NOLSE-O0-NEXT: LBB51_5: ; %atomicrmw.end +; CHECK-NOLSE-O0-NEXT: ldr w0, [sp, #12] ; 4-byte Folded Reload +; 
CHECK-NOLSE-O0-NEXT: add sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: ret +; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_or_i32: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov w0, w1 +; CHECK-OUTLINE-O0-NEXT: ldr x1, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_ldset4_acq_rel +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; +; CHECK-LSE-O1-LABEL: atomicrmw_or_i32: +; CHECK-LSE-O1: ; %bb.0: +; CHECK-LSE-O1-NEXT: ldsetal w1, w0, [x0] +; CHECK-LSE-O1-NEXT: ret +; +; CHECK-LSE-O0-LABEL: atomicrmw_or_i32: +; CHECK-LSE-O0: ; %bb.0: +; CHECK-LSE-O0-NEXT: ldsetal w1, w0, [x0] +; CHECK-LSE-O0-NEXT: ret + %res = atomicrmw or ptr %ptr, i32 %rhs seq_cst + ret i32 %res +} + +define i32 @atomicrmw_xor_i32(ptr %ptr, i32 %rhs) { +; CHECK-NOLSE-O1-LABEL: atomicrmw_xor_i32: +; CHECK-NOLSE-O1: ; %bb.0: +; CHECK-NOLSE-O1-NEXT: LBB52_1: ; %atomicrmw.start +; CHECK-NOLSE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NOLSE-O1-NEXT: ldxr w8, [x0] +; CHECK-NOLSE-O1-NEXT: eor w9, w8, w1 +; CHECK-NOLSE-O1-NEXT: stxr w10, w9, [x0] +; CHECK-NOLSE-O1-NEXT: cbnz w10, LBB52_1 +; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-NOLSE-O1-NEXT: mov w0, w8 +; CHECK-NOLSE-O1-NEXT: ret +; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_xor_i32: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O1-NEXT: mov x2, x0 +; CHECK-OUTLINE-O1-NEXT: mov w0, w1 +; CHECK-OUTLINE-O1-NEXT: mov x1, x2 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldeor4_relax +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; +; CHECK-NOLSE-O0-LABEL: atomicrmw_xor_i32: +; CHECK-NOLSE-O0: ; %bb.0: +; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NOLSE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: str w1, [sp, #24] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: ldr w8, [x0] +; CHECK-NOLSE-O0-NEXT: str w8, [sp, #28] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: b LBB52_1 +; CHECK-NOLSE-O0-NEXT: LBB52_1: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; =>This Loop Header: Depth=1 +; CHECK-NOLSE-O0-NEXT: ; Child Loop BB52_2 Depth 2 +; CHECK-NOLSE-O0-NEXT: ldr w8, [sp, #28] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x11, [sp, #16] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr w9, [sp, #24] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: eor w12, w8, w9 +; CHECK-NOLSE-O0-NEXT: LBB52_2: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; Parent Loop BB52_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: ; => This Inner Loop Header: Depth=2 +; CHECK-NOLSE-O0-NEXT: ldaxr w9, [x11] +; CHECK-NOLSE-O0-NEXT: cmp w9, w8 +; CHECK-NOLSE-O0-NEXT: b.ne LBB52_4 +; CHECK-NOLSE-O0-NEXT: ; %bb.3: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB52_2 Depth=2 +; CHECK-NOLSE-O0-NEXT: stlxr w10, w12, [x11] +; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB52_2 +; CHECK-NOLSE-O0-NEXT: LBB52_4: ; %atomicrmw.start +; 
CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB52_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: subs w8, w9, w8 +; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #28] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB52_1 +; CHECK-NOLSE-O0-NEXT: b LBB52_5 +; CHECK-NOLSE-O0-NEXT: LBB52_5: ; %atomicrmw.end +; CHECK-NOLSE-O0-NEXT: ldr w0, [sp, #12] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: ret +; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_xor_i32: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov w0, w1 +; CHECK-OUTLINE-O0-NEXT: ldr x1, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_ldeor4_relax +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; +; CHECK-LSE-O1-LABEL: atomicrmw_xor_i32: +; CHECK-LSE-O1: ; %bb.0: +; CHECK-LSE-O1-NEXT: ldeor w1, w0, [x0] +; CHECK-LSE-O1-NEXT: ret +; +; CHECK-LSE-O0-LABEL: atomicrmw_xor_i32: +; CHECK-LSE-O0: ; %bb.0: +; CHECK-LSE-O0-NEXT: ldeor w1, w0, [x0] +; CHECK-LSE-O0-NEXT: ret + %res = atomicrmw xor ptr %ptr, i32 %rhs monotonic + ret i32 %res +} + +define i32 @atomicrmw_min_i32(ptr %ptr, i32 %rhs) { +; CHECK-NOLSE-O1-LABEL: atomicrmw_min_i32: +; CHECK-NOLSE-O1: ; %bb.0: +; CHECK-NOLSE-O1-NEXT: LBB53_1: ; %atomicrmw.start +; CHECK-NOLSE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NOLSE-O1-NEXT: ldaxr w8, [x0] +; CHECK-NOLSE-O1-NEXT: cmp w8, w1 +; CHECK-NOLSE-O1-NEXT: csel w9, w8, w1, le +; CHECK-NOLSE-O1-NEXT: stxr w10, w9, [x0] +; CHECK-NOLSE-O1-NEXT: cbnz w10, LBB53_1 +; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-NOLSE-O1-NEXT: mov w0, w8 +; CHECK-NOLSE-O1-NEXT: ret +; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_min_i32: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: LBB53_1: ; %atomicrmw.start +; CHECK-OUTLINE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O1-NEXT: ldaxr w8, [x0] +; CHECK-OUTLINE-O1-NEXT: cmp w8, w1 +; CHECK-OUTLINE-O1-NEXT: csel w9, w8, w1, le +; CHECK-OUTLINE-O1-NEXT: stxr w10, w9, [x0] +; CHECK-OUTLINE-O1-NEXT: cbnz w10, LBB53_1 +; CHECK-OUTLINE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-OUTLINE-O1-NEXT: mov w0, w8 +; CHECK-OUTLINE-O1-NEXT: ret +; +; CHECK-NOLSE-O0-LABEL: atomicrmw_min_i32: +; CHECK-NOLSE-O0: ; %bb.0: +; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NOLSE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: str w1, [sp, #24] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: ldr w8, [x0] +; CHECK-NOLSE-O0-NEXT: str w8, [sp, #28] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: b LBB53_1 +; CHECK-NOLSE-O0-NEXT: LBB53_1: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; =>This Loop Header: Depth=1 +; CHECK-NOLSE-O0-NEXT: ; Child Loop BB53_2 Depth 2 +; CHECK-NOLSE-O0-NEXT: ldr w8, [sp, #28] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x11, [sp, #16] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr w9, [sp, #24] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: subs w10, w8, w9 +; CHECK-NOLSE-O0-NEXT: csel w12, w8, w9, le +; CHECK-NOLSE-O0-NEXT: LBB53_2: ; 
%atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; Parent Loop BB53_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: ; => This Inner Loop Header: Depth=2 +; CHECK-NOLSE-O0-NEXT: ldaxr w9, [x11] +; CHECK-NOLSE-O0-NEXT: cmp w9, w8 +; CHECK-NOLSE-O0-NEXT: b.ne LBB53_4 +; CHECK-NOLSE-O0-NEXT: ; %bb.3: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB53_2 Depth=2 +; CHECK-NOLSE-O0-NEXT: stlxr w10, w12, [x11] +; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB53_2 +; CHECK-NOLSE-O0-NEXT: LBB53_4: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB53_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: subs w8, w9, w8 +; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #28] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB53_1 +; CHECK-NOLSE-O0-NEXT: b LBB53_5 +; CHECK-NOLSE-O0-NEXT: LBB53_5: ; %atomicrmw.end +; CHECK-NOLSE-O0-NEXT: ldr w0, [sp, #12] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: ret +; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_min_i32: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #48 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #32] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 48 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: str w1, [sp, #24] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: ldr w0, [x0] +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #28] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: b LBB53_1 +; CHECK-OUTLINE-O0-NEXT: LBB53_1: ; %atomicrmw.start +; CHECK-OUTLINE-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O0-NEXT: ldr w0, [sp, #28] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr x2, [sp, #16] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr w8, [sp, #24] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #8] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: subs w9, w0, w8 +; CHECK-OUTLINE-O0-NEXT: csel w1, w0, w8, le +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas4_acq +; CHECK-OUTLINE-O0-NEXT: ldr w8, [sp, #8] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #12] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: subs w8, w0, w8 +; CHECK-OUTLINE-O0-NEXT: cset w8, eq +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #28] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: tbz w8, #0, LBB53_1 +; CHECK-OUTLINE-O0-NEXT: b LBB53_2 +; CHECK-OUTLINE-O0-NEXT: LBB53_2: ; %atomicrmw.end +; CHECK-OUTLINE-O0-NEXT: ldr w0, [sp, #12] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #48 +; CHECK-OUTLINE-O0-NEXT: ret +; +; CHECK-LSE-O1-LABEL: atomicrmw_min_i32: +; CHECK-LSE-O1: ; %bb.0: +; CHECK-LSE-O1-NEXT: ldsmina w1, w0, [x0] +; CHECK-LSE-O1-NEXT: ret +; +; CHECK-LSE-O0-LABEL: atomicrmw_min_i32: +; CHECK-LSE-O0: ; %bb.0: +; CHECK-LSE-O0-NEXT: ldsmina w1, w0, [x0] +; CHECK-LSE-O0-NEXT: ret + %res = atomicrmw min ptr %ptr, i32 %rhs acquire + ret i32 %res +} + +define i32 @atomicrmw_max_i32(ptr %ptr, i32 %rhs) { +; CHECK-NOLSE-O1-LABEL: atomicrmw_max_i32: +; CHECK-NOLSE-O1: ; %bb.0: +; CHECK-NOLSE-O1-NEXT: LBB54_1: ; %atomicrmw.start +; CHECK-NOLSE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NOLSE-O1-NEXT: ldxr w8, [x0] +; CHECK-NOLSE-O1-NEXT: cmp w8, w1 +; CHECK-NOLSE-O1-NEXT: csel w9, w8, w1, gt +; CHECK-NOLSE-O1-NEXT: stlxr w10, w9, [x0] +; CHECK-NOLSE-O1-NEXT: cbnz w10, 
LBB54_1 +; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-NOLSE-O1-NEXT: mov w0, w8 +; CHECK-NOLSE-O1-NEXT: ret +; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_max_i32: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: LBB54_1: ; %atomicrmw.start +; CHECK-OUTLINE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O1-NEXT: ldxr w8, [x0] +; CHECK-OUTLINE-O1-NEXT: cmp w8, w1 +; CHECK-OUTLINE-O1-NEXT: csel w9, w8, w1, gt +; CHECK-OUTLINE-O1-NEXT: stlxr w10, w9, [x0] +; CHECK-OUTLINE-O1-NEXT: cbnz w10, LBB54_1 +; CHECK-OUTLINE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-OUTLINE-O1-NEXT: mov w0, w8 +; CHECK-OUTLINE-O1-NEXT: ret +; +; CHECK-NOLSE-O0-LABEL: atomicrmw_max_i32: +; CHECK-NOLSE-O0: ; %bb.0: +; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NOLSE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: str w1, [sp, #24] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: ldr w8, [x0] +; CHECK-NOLSE-O0-NEXT: str w8, [sp, #28] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: b LBB54_1 +; CHECK-NOLSE-O0-NEXT: LBB54_1: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; =>This Loop Header: Depth=1 +; CHECK-NOLSE-O0-NEXT: ; Child Loop BB54_2 Depth 2 +; CHECK-NOLSE-O0-NEXT: ldr w8, [sp, #28] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x11, [sp, #16] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr w9, [sp, #24] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: subs w10, w8, w9 +; CHECK-NOLSE-O0-NEXT: csel w12, w8, w9, gt +; CHECK-NOLSE-O0-NEXT: LBB54_2: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; Parent Loop BB54_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: ; => This Inner Loop Header: Depth=2 +; CHECK-NOLSE-O0-NEXT: ldaxr w9, [x11] +; CHECK-NOLSE-O0-NEXT: cmp w9, w8 +; CHECK-NOLSE-O0-NEXT: b.ne LBB54_4 +; CHECK-NOLSE-O0-NEXT: ; %bb.3: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB54_2 Depth=2 +; CHECK-NOLSE-O0-NEXT: stlxr w10, w12, [x11] +; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB54_2 +; CHECK-NOLSE-O0-NEXT: LBB54_4: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB54_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: subs w8, w9, w8 +; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #28] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB54_1 +; CHECK-NOLSE-O0-NEXT: b LBB54_5 +; CHECK-NOLSE-O0-NEXT: LBB54_5: ; %atomicrmw.end +; CHECK-NOLSE-O0-NEXT: ldr w0, [sp, #12] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: ret +; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_max_i32: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #48 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #32] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 48 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: str w1, [sp, #24] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: ldr w0, [x0] +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #28] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: b LBB54_1 +; CHECK-OUTLINE-O0-NEXT: LBB54_1: ; %atomicrmw.start +; CHECK-OUTLINE-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O0-NEXT: ldr w0, [sp, #28] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr x2, [sp, #16] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr w8, [sp, #24] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #8] ; 4-byte Folded Spill 
+; CHECK-OUTLINE-O0-NEXT: subs w9, w0, w8 +; CHECK-OUTLINE-O0-NEXT: csel w1, w0, w8, gt +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas4_rel +; CHECK-OUTLINE-O0-NEXT: ldr w8, [sp, #8] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #12] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: subs w8, w0, w8 +; CHECK-OUTLINE-O0-NEXT: cset w8, eq +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #28] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: tbz w8, #0, LBB54_1 +; CHECK-OUTLINE-O0-NEXT: b LBB54_2 +; CHECK-OUTLINE-O0-NEXT: LBB54_2: ; %atomicrmw.end +; CHECK-OUTLINE-O0-NEXT: ldr w0, [sp, #12] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #48 +; CHECK-OUTLINE-O0-NEXT: ret +; +; CHECK-LSE-O1-LABEL: atomicrmw_max_i32: +; CHECK-LSE-O1: ; %bb.0: +; CHECK-LSE-O1-NEXT: ldsmaxl w1, w0, [x0] +; CHECK-LSE-O1-NEXT: ret +; +; CHECK-LSE-O0-LABEL: atomicrmw_max_i32: +; CHECK-LSE-O0: ; %bb.0: +; CHECK-LSE-O0-NEXT: ldsmaxl w1, w0, [x0] +; CHECK-LSE-O0-NEXT: ret + %res = atomicrmw max ptr %ptr, i32 %rhs release + ret i32 %res +} + +define i32 @atomicrmw_umin_i32(ptr %ptr, i32 %rhs) { +; CHECK-NOLSE-O1-LABEL: atomicrmw_umin_i32: +; CHECK-NOLSE-O1: ; %bb.0: +; CHECK-NOLSE-O1-NEXT: LBB55_1: ; %atomicrmw.start +; CHECK-NOLSE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NOLSE-O1-NEXT: ldaxr w8, [x0] +; CHECK-NOLSE-O1-NEXT: cmp w8, w1 +; CHECK-NOLSE-O1-NEXT: csel w9, w8, w1, ls +; CHECK-NOLSE-O1-NEXT: stlxr w10, w9, [x0] +; CHECK-NOLSE-O1-NEXT: cbnz w10, LBB55_1 +; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-NOLSE-O1-NEXT: mov w0, w8 +; CHECK-NOLSE-O1-NEXT: ret +; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_umin_i32: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: LBB55_1: ; %atomicrmw.start +; CHECK-OUTLINE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O1-NEXT: ldaxr w8, [x0] +; CHECK-OUTLINE-O1-NEXT: cmp w8, w1 +; CHECK-OUTLINE-O1-NEXT: csel w9, w8, w1, ls +; CHECK-OUTLINE-O1-NEXT: stlxr w10, w9, [x0] +; CHECK-OUTLINE-O1-NEXT: cbnz w10, LBB55_1 +; CHECK-OUTLINE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-OUTLINE-O1-NEXT: mov w0, w8 +; CHECK-OUTLINE-O1-NEXT: ret +; +; CHECK-NOLSE-O0-LABEL: atomicrmw_umin_i32: +; CHECK-NOLSE-O0: ; %bb.0: +; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NOLSE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: str w1, [sp, #24] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: ldr w8, [x0] +; CHECK-NOLSE-O0-NEXT: str w8, [sp, #28] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: b LBB55_1 +; CHECK-NOLSE-O0-NEXT: LBB55_1: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; =>This Loop Header: Depth=1 +; CHECK-NOLSE-O0-NEXT: ; Child Loop BB55_2 Depth 2 +; CHECK-NOLSE-O0-NEXT: ldr w8, [sp, #28] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x11, [sp, #16] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr w9, [sp, #24] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: subs w10, w8, w9 +; CHECK-NOLSE-O0-NEXT: csel w12, w8, w9, ls +; CHECK-NOLSE-O0-NEXT: LBB55_2: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; Parent Loop BB55_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: ; => This Inner Loop Header: Depth=2 +; CHECK-NOLSE-O0-NEXT: ldaxr w9, [x11] +; CHECK-NOLSE-O0-NEXT: cmp w9, w8 +; CHECK-NOLSE-O0-NEXT: b.ne LBB55_4 +; CHECK-NOLSE-O0-NEXT: ; %bb.3: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB55_2 Depth=2 +; CHECK-NOLSE-O0-NEXT: stlxr w10, w12, [x11] +; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB55_2 +; 
CHECK-NOLSE-O0-NEXT: LBB55_4: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB55_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: subs w8, w9, w8 +; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #28] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB55_1 +; CHECK-NOLSE-O0-NEXT: b LBB55_5 +; CHECK-NOLSE-O0-NEXT: LBB55_5: ; %atomicrmw.end +; CHECK-NOLSE-O0-NEXT: ldr w0, [sp, #12] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: ret +; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_umin_i32: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #48 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #32] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 48 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: str w1, [sp, #24] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: ldr w0, [x0] +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #28] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: b LBB55_1 +; CHECK-OUTLINE-O0-NEXT: LBB55_1: ; %atomicrmw.start +; CHECK-OUTLINE-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O0-NEXT: ldr w0, [sp, #28] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr x2, [sp, #16] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr w8, [sp, #24] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #8] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: subs w9, w0, w8 +; CHECK-OUTLINE-O0-NEXT: csel w1, w0, w8, ls +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas4_acq_rel +; CHECK-OUTLINE-O0-NEXT: ldr w8, [sp, #8] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #12] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: subs w8, w0, w8 +; CHECK-OUTLINE-O0-NEXT: cset w8, eq +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #28] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: tbz w8, #0, LBB55_1 +; CHECK-OUTLINE-O0-NEXT: b LBB55_2 +; CHECK-OUTLINE-O0-NEXT: LBB55_2: ; %atomicrmw.end +; CHECK-OUTLINE-O0-NEXT: ldr w0, [sp, #12] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #48 +; CHECK-OUTLINE-O0-NEXT: ret +; +; CHECK-LSE-O1-LABEL: atomicrmw_umin_i32: +; CHECK-LSE-O1: ; %bb.0: +; CHECK-LSE-O1-NEXT: lduminal w1, w0, [x0] +; CHECK-LSE-O1-NEXT: ret +; +; CHECK-LSE-O0-LABEL: atomicrmw_umin_i32: +; CHECK-LSE-O0: ; %bb.0: +; CHECK-LSE-O0-NEXT: lduminal w1, w0, [x0] +; CHECK-LSE-O0-NEXT: ret + %res = atomicrmw umin ptr %ptr, i32 %rhs seq_cst + ret i32 %res +} + +define i32 @atomicrmw_umax_i32(ptr %ptr, i32 %rhs) { +; CHECK-NOLSE-O1-LABEL: atomicrmw_umax_i32: +; CHECK-NOLSE-O1: ; %bb.0: +; CHECK-NOLSE-O1-NEXT: LBB56_1: ; %atomicrmw.start +; CHECK-NOLSE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NOLSE-O1-NEXT: ldxr w8, [x0] +; CHECK-NOLSE-O1-NEXT: cmp w8, w1 +; CHECK-NOLSE-O1-NEXT: csel w9, w8, w1, hi +; CHECK-NOLSE-O1-NEXT: stxr w10, w9, [x0] +; CHECK-NOLSE-O1-NEXT: cbnz w10, LBB56_1 +; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-NOLSE-O1-NEXT: mov w0, w8 +; CHECK-NOLSE-O1-NEXT: ret +; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_umax_i32: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: LBB56_1: ; %atomicrmw.start +; CHECK-OUTLINE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O1-NEXT: ldxr w8, [x0] +; CHECK-OUTLINE-O1-NEXT: cmp w8, w1 +; CHECK-OUTLINE-O1-NEXT: csel w9, w8, w1, 
hi +; CHECK-OUTLINE-O1-NEXT: stxr w10, w9, [x0] +; CHECK-OUTLINE-O1-NEXT: cbnz w10, LBB56_1 +; CHECK-OUTLINE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-OUTLINE-O1-NEXT: mov w0, w8 +; CHECK-OUTLINE-O1-NEXT: ret +; +; CHECK-NOLSE-O0-LABEL: atomicrmw_umax_i32: +; CHECK-NOLSE-O0: ; %bb.0: +; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NOLSE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: str w1, [sp, #24] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: ldr w8, [x0] +; CHECK-NOLSE-O0-NEXT: str w8, [sp, #28] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: b LBB56_1 +; CHECK-NOLSE-O0-NEXT: LBB56_1: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; =>This Loop Header: Depth=1 +; CHECK-NOLSE-O0-NEXT: ; Child Loop BB56_2 Depth 2 +; CHECK-NOLSE-O0-NEXT: ldr w8, [sp, #28] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x11, [sp, #16] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr w9, [sp, #24] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: subs w10, w8, w9 +; CHECK-NOLSE-O0-NEXT: csel w12, w8, w9, hi +; CHECK-NOLSE-O0-NEXT: LBB56_2: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; Parent Loop BB56_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: ; => This Inner Loop Header: Depth=2 +; CHECK-NOLSE-O0-NEXT: ldaxr w9, [x11] +; CHECK-NOLSE-O0-NEXT: cmp w9, w8 +; CHECK-NOLSE-O0-NEXT: b.ne LBB56_4 +; CHECK-NOLSE-O0-NEXT: ; %bb.3: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB56_2 Depth=2 +; CHECK-NOLSE-O0-NEXT: stlxr w10, w12, [x11] +; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB56_2 +; CHECK-NOLSE-O0-NEXT: LBB56_4: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB56_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: subs w8, w9, w8 +; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #28] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB56_1 +; CHECK-NOLSE-O0-NEXT: b LBB56_5 +; CHECK-NOLSE-O0-NEXT: LBB56_5: ; %atomicrmw.end +; CHECK-NOLSE-O0-NEXT: ldr w0, [sp, #12] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: ret +; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_umax_i32: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #48 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #32] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 48 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: str w1, [sp, #24] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: ldr w0, [x0] +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #28] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: b LBB56_1 +; CHECK-OUTLINE-O0-NEXT: LBB56_1: ; %atomicrmw.start +; CHECK-OUTLINE-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O0-NEXT: ldr w0, [sp, #28] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr x2, [sp, #16] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr w8, [sp, #24] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #8] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: subs w9, w0, w8 +; CHECK-OUTLINE-O0-NEXT: csel w1, w0, w8, hi +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas4_relax +; CHECK-OUTLINE-O0-NEXT: ldr w8, [sp, #8] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #12] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: subs w8, w0, w8 +; CHECK-OUTLINE-O0-NEXT: cset w8, eq +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #28] ; 4-byte Folded Spill +; 
CHECK-OUTLINE-O0-NEXT: tbz w8, #0, LBB56_1 +; CHECK-OUTLINE-O0-NEXT: b LBB56_2 +; CHECK-OUTLINE-O0-NEXT: LBB56_2: ; %atomicrmw.end +; CHECK-OUTLINE-O0-NEXT: ldr w0, [sp, #12] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #48 +; CHECK-OUTLINE-O0-NEXT: ret +; +; CHECK-LSE-O1-LABEL: atomicrmw_umax_i32: +; CHECK-LSE-O1: ; %bb.0: +; CHECK-LSE-O1-NEXT: ldumax w1, w0, [x0] +; CHECK-LSE-O1-NEXT: ret +; +; CHECK-LSE-O0-LABEL: atomicrmw_umax_i32: +; CHECK-LSE-O0: ; %bb.0: +; CHECK-LSE-O0-NEXT: ldumax w1, w0, [x0] +; CHECK-LSE-O0-NEXT: ret + %res = atomicrmw umax ptr %ptr, i32 %rhs monotonic + ret i32 %res +} + +define i64 @atomicrmw_add_i64(ptr %ptr, i64 %rhs) { +; CHECK-NOLSE-O1-LABEL: atomicrmw_add_i64: +; CHECK-NOLSE-O1: ; %bb.0: +; CHECK-NOLSE-O1-NEXT: LBB57_1: ; %atomicrmw.start +; CHECK-NOLSE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NOLSE-O1-NEXT: ldaxr x8, [x0] +; CHECK-NOLSE-O1-NEXT: add x9, x8, x1 +; CHECK-NOLSE-O1-NEXT: stlxr w10, x9, [x0] +; CHECK-NOLSE-O1-NEXT: cbnz w10, LBB57_1 +; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-NOLSE-O1-NEXT: mov x0, x8 +; CHECK-NOLSE-O1-NEXT: ret +; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_add_i64: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O1-NEXT: mov x2, x0 +; CHECK-OUTLINE-O1-NEXT: mov x0, x1 +; CHECK-OUTLINE-O1-NEXT: mov x1, x2 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldadd8_acq_rel +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; +; CHECK-NOLSE-O0-LABEL: atomicrmw_add_i64: +; CHECK-NOLSE-O0: ; %bb.0: +; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NOLSE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: str x1, [sp, #16] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: ldr x8, [x0] +; CHECK-NOLSE-O0-NEXT: str x8, [sp, #24] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: b LBB57_1 +; CHECK-NOLSE-O0-NEXT: LBB57_1: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; =>This Loop Header: Depth=1 +; CHECK-NOLSE-O0-NEXT: ; Child Loop BB57_2 Depth 2 +; CHECK-NOLSE-O0-NEXT: ldr x8, [sp, #24] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x11, [sp, #8] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x9, [sp, #16] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: add x12, x8, x9 +; CHECK-NOLSE-O0-NEXT: LBB57_2: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; Parent Loop BB57_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: ; => This Inner Loop Header: Depth=2 +; CHECK-NOLSE-O0-NEXT: ldaxr x9, [x11] +; CHECK-NOLSE-O0-NEXT: cmp x9, x8 +; CHECK-NOLSE-O0-NEXT: b.ne LBB57_4 +; CHECK-NOLSE-O0-NEXT: ; %bb.3: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB57_2 Depth=2 +; CHECK-NOLSE-O0-NEXT: stlxr w10, x12, [x11] +; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB57_2 +; CHECK-NOLSE-O0-NEXT: LBB57_4: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB57_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: str x9, [sp] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: subs x8, x9, x8 +; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str x9, [sp, #24] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB57_1 +; CHECK-NOLSE-O0-NEXT: b LBB57_5 +; CHECK-NOLSE-O0-NEXT: LBB57_5: ; %atomicrmw.end +; CHECK-NOLSE-O0-NEXT: ldr x0, [sp] ; 8-byte 
Folded Reload +; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: ret +; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_add_i64: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov x0, x1 +; CHECK-OUTLINE-O0-NEXT: ldr x1, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_ldadd8_acq_rel +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; +; CHECK-LSE-O1-LABEL: atomicrmw_add_i64: +; CHECK-LSE-O1: ; %bb.0: +; CHECK-LSE-O1-NEXT: ldaddal x1, x0, [x0] +; CHECK-LSE-O1-NEXT: ret +; +; CHECK-LSE-O0-LABEL: atomicrmw_add_i64: +; CHECK-LSE-O0: ; %bb.0: +; CHECK-LSE-O0-NEXT: ldaddal x1, x0, [x0] +; CHECK-LSE-O0-NEXT: ret + %res = atomicrmw add ptr %ptr, i64 %rhs seq_cst + ret i64 %res +} + +define i64 @atomicrmw_xchg_i64(ptr %ptr, i64 %rhs) { +; CHECK-NOLSE-O1-LABEL: atomicrmw_xchg_i64: +; CHECK-NOLSE-O1: ; %bb.0: +; CHECK-NOLSE-O1-NEXT: LBB58_1: ; %atomicrmw.start +; CHECK-NOLSE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NOLSE-O1-NEXT: ldxr x8, [x0] +; CHECK-NOLSE-O1-NEXT: stxr w9, x1, [x0] +; CHECK-NOLSE-O1-NEXT: cbnz w9, LBB58_1 +; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-NOLSE-O1-NEXT: mov x0, x8 +; CHECK-NOLSE-O1-NEXT: ret +; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_xchg_i64: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O1-NEXT: mov x2, x0 +; CHECK-OUTLINE-O1-NEXT: mov x0, x1 +; CHECK-OUTLINE-O1-NEXT: mov x1, x2 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_swp8_relax +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; +; CHECK-NOLSE-O0-LABEL: atomicrmw_xchg_i64: +; CHECK-NOLSE-O0: ; %bb.0: +; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NOLSE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: str x1, [sp, #16] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: ldr x8, [x0] +; CHECK-NOLSE-O0-NEXT: str x8, [sp, #24] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: b LBB58_1 +; CHECK-NOLSE-O0-NEXT: LBB58_1: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; =>This Loop Header: Depth=1 +; CHECK-NOLSE-O0-NEXT: ; Child Loop BB58_2 Depth 2 +; CHECK-NOLSE-O0-NEXT: ldr x8, [sp, #24] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x11, [sp, #8] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x12, [sp, #16] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: LBB58_2: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; Parent Loop BB58_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: ; => This Inner Loop Header: Depth=2 +; CHECK-NOLSE-O0-NEXT: ldaxr x9, [x11] +; CHECK-NOLSE-O0-NEXT: cmp x9, x8 +; CHECK-NOLSE-O0-NEXT: b.ne LBB58_4 +; CHECK-NOLSE-O0-NEXT: ; %bb.3: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB58_2 Depth=2 +; CHECK-NOLSE-O0-NEXT: stlxr w10, x12, [x11] +; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB58_2 +; CHECK-NOLSE-O0-NEXT: LBB58_4: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB58_1 Depth=1 +; 
CHECK-NOLSE-O0-NEXT: str x9, [sp] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: subs x8, x9, x8 +; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str x9, [sp, #24] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB58_1 +; CHECK-NOLSE-O0-NEXT: b LBB58_5 +; CHECK-NOLSE-O0-NEXT: LBB58_5: ; %atomicrmw.end +; CHECK-NOLSE-O0-NEXT: ldr x0, [sp] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: ret +; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_xchg_i64: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov x0, x1 +; CHECK-OUTLINE-O0-NEXT: ldr x1, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_swp8_relax +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; +; CHECK-LSE-O1-LABEL: atomicrmw_xchg_i64: +; CHECK-LSE-O1: ; %bb.0: +; CHECK-LSE-O1-NEXT: swp x1, x0, [x0] +; CHECK-LSE-O1-NEXT: ret +; +; CHECK-LSE-O0-LABEL: atomicrmw_xchg_i64: +; CHECK-LSE-O0: ; %bb.0: +; CHECK-LSE-O0-NEXT: swp x1, x0, [x0] +; CHECK-LSE-O0-NEXT: ret + %res = atomicrmw xchg ptr %ptr, i64 %rhs monotonic + ret i64 %res +} + +define i64 @atomicrmw_sub_i64(ptr %ptr, i64 %rhs) { +; CHECK-NOLSE-O1-LABEL: atomicrmw_sub_i64: +; CHECK-NOLSE-O1: ; %bb.0: +; CHECK-NOLSE-O1-NEXT: LBB59_1: ; %atomicrmw.start +; CHECK-NOLSE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NOLSE-O1-NEXT: ldaxr x8, [x0] +; CHECK-NOLSE-O1-NEXT: sub x9, x8, x1 +; CHECK-NOLSE-O1-NEXT: stxr w10, x9, [x0] +; CHECK-NOLSE-O1-NEXT: cbnz w10, LBB59_1 +; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-NOLSE-O1-NEXT: mov x0, x8 +; CHECK-NOLSE-O1-NEXT: ret +; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_sub_i64: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! 
; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O1-NEXT: mov x2, x0 +; CHECK-OUTLINE-O1-NEXT: neg x0, x1 +; CHECK-OUTLINE-O1-NEXT: mov x1, x2 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldadd8_acq +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; +; CHECK-NOLSE-O0-LABEL: atomicrmw_sub_i64: +; CHECK-NOLSE-O0: ; %bb.0: +; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NOLSE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: str x1, [sp, #16] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: ldr x8, [x0] +; CHECK-NOLSE-O0-NEXT: str x8, [sp, #24] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: b LBB59_1 +; CHECK-NOLSE-O0-NEXT: LBB59_1: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; =>This Loop Header: Depth=1 +; CHECK-NOLSE-O0-NEXT: ; Child Loop BB59_2 Depth 2 +; CHECK-NOLSE-O0-NEXT: ldr x8, [sp, #24] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x11, [sp, #8] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x9, [sp, #16] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: subs x12, x8, x9 +; CHECK-NOLSE-O0-NEXT: LBB59_2: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; Parent Loop BB59_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: ; => This Inner Loop Header: Depth=2 +; CHECK-NOLSE-O0-NEXT: ldaxr x9, [x11] +; CHECK-NOLSE-O0-NEXT: cmp x9, x8 +; CHECK-NOLSE-O0-NEXT: b.ne LBB59_4 +; CHECK-NOLSE-O0-NEXT: ; %bb.3: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB59_2 Depth=2 +; CHECK-NOLSE-O0-NEXT: stlxr w10, x12, [x11] +; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB59_2 +; CHECK-NOLSE-O0-NEXT: LBB59_4: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB59_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: str x9, [sp] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: subs x8, x9, x8 +; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str x9, [sp, #24] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB59_1 +; CHECK-NOLSE-O0-NEXT: b LBB59_5 +; CHECK-NOLSE-O0-NEXT: LBB59_5: ; %atomicrmw.end +; CHECK-NOLSE-O0-NEXT: ldr x0, [sp] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: ret +; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_sub_i64: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov x9, x1 +; CHECK-OUTLINE-O0-NEXT: ldr x1, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: mov x8, xzr +; CHECK-OUTLINE-O0-NEXT: subs x0, x8, x9 +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_ldadd8_acq +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; +; CHECK-LSE-O1-LABEL: atomicrmw_sub_i64: +; CHECK-LSE-O1: ; %bb.0: +; CHECK-LSE-O1-NEXT: neg x8, x1 +; CHECK-LSE-O1-NEXT: ldadda x8, x0, [x0] +; CHECK-LSE-O1-NEXT: ret +; +; CHECK-LSE-O0-LABEL: atomicrmw_sub_i64: +; CHECK-LSE-O0: ; %bb.0: +; CHECK-LSE-O0-NEXT: neg x8, x1 +; CHECK-LSE-O0-NEXT: ldadda x8, x0, [x0] +; CHECK-LSE-O0-NEXT: ret + %res = atomicrmw sub ptr %ptr, i64 %rhs acquire + ret i64 %res +} + +define i64 @atomicrmw_and_i64(ptr %ptr, i64 %rhs) { +; 
CHECK-NOLSE-O1-LABEL: atomicrmw_and_i64: +; CHECK-NOLSE-O1: ; %bb.0: +; CHECK-NOLSE-O1-NEXT: LBB60_1: ; %atomicrmw.start +; CHECK-NOLSE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NOLSE-O1-NEXT: ldxr x8, [x0] +; CHECK-NOLSE-O1-NEXT: and x9, x8, x1 +; CHECK-NOLSE-O1-NEXT: stlxr w10, x9, [x0] +; CHECK-NOLSE-O1-NEXT: cbnz w10, LBB60_1 +; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-NOLSE-O1-NEXT: mov x0, x8 +; CHECK-NOLSE-O1-NEXT: ret +; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_and_i64: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O1-NEXT: mov x2, x0 +; CHECK-OUTLINE-O1-NEXT: mov x8, #-1 ; =0xffffffffffffffff +; CHECK-OUTLINE-O1-NEXT: eor x0, x8, x1 +; CHECK-OUTLINE-O1-NEXT: mov x1, x2 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldclr8_rel +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; +; CHECK-NOLSE-O0-LABEL: atomicrmw_and_i64: +; CHECK-NOLSE-O0: ; %bb.0: +; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NOLSE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: str x1, [sp, #16] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: ldr x8, [x0] +; CHECK-NOLSE-O0-NEXT: str x8, [sp, #24] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: b LBB60_1 +; CHECK-NOLSE-O0-NEXT: LBB60_1: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; =>This Loop Header: Depth=1 +; CHECK-NOLSE-O0-NEXT: ; Child Loop BB60_2 Depth 2 +; CHECK-NOLSE-O0-NEXT: ldr x8, [sp, #24] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x11, [sp, #8] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x9, [sp, #16] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: and x12, x8, x9 +; CHECK-NOLSE-O0-NEXT: LBB60_2: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; Parent Loop BB60_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: ; => This Inner Loop Header: Depth=2 +; CHECK-NOLSE-O0-NEXT: ldaxr x9, [x11] +; CHECK-NOLSE-O0-NEXT: cmp x9, x8 +; CHECK-NOLSE-O0-NEXT: b.ne LBB60_4 +; CHECK-NOLSE-O0-NEXT: ; %bb.3: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB60_2 Depth=2 +; CHECK-NOLSE-O0-NEXT: stlxr w10, x12, [x11] +; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB60_2 +; CHECK-NOLSE-O0-NEXT: LBB60_4: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB60_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: str x9, [sp] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: subs x8, x9, x8 +; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str x9, [sp, #24] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB60_1 +; CHECK-NOLSE-O0-NEXT: b LBB60_5 +; CHECK-NOLSE-O0-NEXT: LBB60_5: ; %atomicrmw.end +; CHECK-NOLSE-O0-NEXT: ldr x0, [sp] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: ret +; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_and_i64: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov x9, x1 +; CHECK-OUTLINE-O0-NEXT: ldr x1, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: mov x8, #-1 ; =0xffffffffffffffff +; CHECK-OUTLINE-O0-NEXT: eor x0, x8, x9 +; 
CHECK-OUTLINE-O0-NEXT: bl ___aarch64_ldclr8_rel +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; +; CHECK-LSE-O1-LABEL: atomicrmw_and_i64: +; CHECK-LSE-O1: ; %bb.0: +; CHECK-LSE-O1-NEXT: mvn x8, x1 +; CHECK-LSE-O1-NEXT: ldclrl x8, x0, [x0] +; CHECK-LSE-O1-NEXT: ret +; +; CHECK-LSE-O0-LABEL: atomicrmw_and_i64: +; CHECK-LSE-O0: ; %bb.0: +; CHECK-LSE-O0-NEXT: mvn x8, x1 +; CHECK-LSE-O0-NEXT: ldclrl x8, x0, [x0] +; CHECK-LSE-O0-NEXT: ret + %res = atomicrmw and ptr %ptr, i64 %rhs release + ret i64 %res +} + +define i64 @atomicrmw_or_i64(ptr %ptr, i64 %rhs) { +; CHECK-NOLSE-O1-LABEL: atomicrmw_or_i64: +; CHECK-NOLSE-O1: ; %bb.0: +; CHECK-NOLSE-O1-NEXT: LBB61_1: ; %atomicrmw.start +; CHECK-NOLSE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NOLSE-O1-NEXT: ldaxr x8, [x0] +; CHECK-NOLSE-O1-NEXT: orr x9, x8, x1 +; CHECK-NOLSE-O1-NEXT: stlxr w10, x9, [x0] +; CHECK-NOLSE-O1-NEXT: cbnz w10, LBB61_1 +; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-NOLSE-O1-NEXT: mov x0, x8 +; CHECK-NOLSE-O1-NEXT: ret +; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_or_i64: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O1-NEXT: mov x2, x0 +; CHECK-OUTLINE-O1-NEXT: mov x0, x1 +; CHECK-OUTLINE-O1-NEXT: mov x1, x2 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldset8_acq_rel +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; +; CHECK-NOLSE-O0-LABEL: atomicrmw_or_i64: +; CHECK-NOLSE-O0: ; %bb.0: +; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NOLSE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: str x1, [sp, #16] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: ldr x8, [x0] +; CHECK-NOLSE-O0-NEXT: str x8, [sp, #24] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: b LBB61_1 +; CHECK-NOLSE-O0-NEXT: LBB61_1: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; =>This Loop Header: Depth=1 +; CHECK-NOLSE-O0-NEXT: ; Child Loop BB61_2 Depth 2 +; CHECK-NOLSE-O0-NEXT: ldr x8, [sp, #24] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x11, [sp, #8] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x9, [sp, #16] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: orr x12, x8, x9 +; CHECK-NOLSE-O0-NEXT: LBB61_2: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; Parent Loop BB61_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: ; => This Inner Loop Header: Depth=2 +; CHECK-NOLSE-O0-NEXT: ldaxr x9, [x11] +; CHECK-NOLSE-O0-NEXT: cmp x9, x8 +; CHECK-NOLSE-O0-NEXT: b.ne LBB61_4 +; CHECK-NOLSE-O0-NEXT: ; %bb.3: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB61_2 Depth=2 +; CHECK-NOLSE-O0-NEXT: stlxr w10, x12, [x11] +; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB61_2 +; CHECK-NOLSE-O0-NEXT: LBB61_4: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB61_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: str x9, [sp] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: subs x8, x9, x8 +; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str x9, [sp, #24] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB61_1 +; CHECK-NOLSE-O0-NEXT: b LBB61_5 +; CHECK-NOLSE-O0-NEXT: LBB61_5: ; %atomicrmw.end +; CHECK-NOLSE-O0-NEXT: ldr x0, [sp] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: ret +; +; 
CHECK-OUTLINE-O0-LABEL: atomicrmw_or_i64: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov x0, x1 +; CHECK-OUTLINE-O0-NEXT: ldr x1, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_ldset8_acq_rel +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; +; CHECK-LSE-O1-LABEL: atomicrmw_or_i64: +; CHECK-LSE-O1: ; %bb.0: +; CHECK-LSE-O1-NEXT: ldsetal x1, x0, [x0] +; CHECK-LSE-O1-NEXT: ret +; +; CHECK-LSE-O0-LABEL: atomicrmw_or_i64: +; CHECK-LSE-O0: ; %bb.0: +; CHECK-LSE-O0-NEXT: ldsetal x1, x0, [x0] +; CHECK-LSE-O0-NEXT: ret + %res = atomicrmw or ptr %ptr, i64 %rhs seq_cst + ret i64 %res +} + +define i64 @atomicrmw_xor_i64(ptr %ptr, i64 %rhs) { +; CHECK-NOLSE-O1-LABEL: atomicrmw_xor_i64: +; CHECK-NOLSE-O1: ; %bb.0: +; CHECK-NOLSE-O1-NEXT: LBB62_1: ; %atomicrmw.start +; CHECK-NOLSE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NOLSE-O1-NEXT: ldxr x8, [x0] +; CHECK-NOLSE-O1-NEXT: eor x9, x8, x1 +; CHECK-NOLSE-O1-NEXT: stxr w10, x9, [x0] +; CHECK-NOLSE-O1-NEXT: cbnz w10, LBB62_1 +; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-NOLSE-O1-NEXT: mov x0, x8 +; CHECK-NOLSE-O1-NEXT: ret +; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_xor_i64: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O1-NEXT: mov x2, x0 +; CHECK-OUTLINE-O1-NEXT: mov x0, x1 +; CHECK-OUTLINE-O1-NEXT: mov x1, x2 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldeor8_relax +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; +; CHECK-NOLSE-O0-LABEL: atomicrmw_xor_i64: +; CHECK-NOLSE-O0: ; %bb.0: +; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NOLSE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: str x1, [sp, #16] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: ldr x8, [x0] +; CHECK-NOLSE-O0-NEXT: str x8, [sp, #24] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: b LBB62_1 +; CHECK-NOLSE-O0-NEXT: LBB62_1: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; =>This Loop Header: Depth=1 +; CHECK-NOLSE-O0-NEXT: ; Child Loop BB62_2 Depth 2 +; CHECK-NOLSE-O0-NEXT: ldr x8, [sp, #24] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x11, [sp, #8] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x9, [sp, #16] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: eor x12, x8, x9 +; CHECK-NOLSE-O0-NEXT: LBB62_2: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; Parent Loop BB62_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: ; => This Inner Loop Header: Depth=2 +; CHECK-NOLSE-O0-NEXT: ldaxr x9, [x11] +; CHECK-NOLSE-O0-NEXT: cmp x9, x8 +; CHECK-NOLSE-O0-NEXT: b.ne LBB62_4 +; CHECK-NOLSE-O0-NEXT: ; %bb.3: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB62_2 Depth=2 +; CHECK-NOLSE-O0-NEXT: stlxr w10, x12, [x11] +; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB62_2 +; CHECK-NOLSE-O0-NEXT: LBB62_4: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB62_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: str x9, 
[sp] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: subs x8, x9, x8 +; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str x9, [sp, #24] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB62_1 +; CHECK-NOLSE-O0-NEXT: b LBB62_5 +; CHECK-NOLSE-O0-NEXT: LBB62_5: ; %atomicrmw.end +; CHECK-NOLSE-O0-NEXT: ldr x0, [sp] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: ret +; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_xor_i64: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov x0, x1 +; CHECK-OUTLINE-O0-NEXT: ldr x1, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_ldeor8_relax +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; +; CHECK-LSE-O1-LABEL: atomicrmw_xor_i64: +; CHECK-LSE-O1: ; %bb.0: +; CHECK-LSE-O1-NEXT: ldeor x1, x0, [x0] +; CHECK-LSE-O1-NEXT: ret +; +; CHECK-LSE-O0-LABEL: atomicrmw_xor_i64: +; CHECK-LSE-O0: ; %bb.0: +; CHECK-LSE-O0-NEXT: ldeor x1, x0, [x0] +; CHECK-LSE-O0-NEXT: ret + %res = atomicrmw xor ptr %ptr, i64 %rhs monotonic + ret i64 %res +} + +define i64 @atomicrmw_min_i64(ptr %ptr, i64 %rhs) { +; CHECK-NOLSE-O1-LABEL: atomicrmw_min_i64: +; CHECK-NOLSE-O1: ; %bb.0: +; CHECK-NOLSE-O1-NEXT: LBB63_1: ; %atomicrmw.start +; CHECK-NOLSE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NOLSE-O1-NEXT: ldaxr x8, [x0] +; CHECK-NOLSE-O1-NEXT: cmp x8, x1 +; CHECK-NOLSE-O1-NEXT: csel x9, x8, x1, le +; CHECK-NOLSE-O1-NEXT: stxr w10, x9, [x0] +; CHECK-NOLSE-O1-NEXT: cbnz w10, LBB63_1 +; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-NOLSE-O1-NEXT: mov x0, x8 +; CHECK-NOLSE-O1-NEXT: ret +; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_min_i64: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: LBB63_1: ; %atomicrmw.start +; CHECK-OUTLINE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O1-NEXT: ldaxr x8, [x0] +; CHECK-OUTLINE-O1-NEXT: cmp x8, x1 +; CHECK-OUTLINE-O1-NEXT: csel x9, x8, x1, le +; CHECK-OUTLINE-O1-NEXT: stxr w10, x9, [x0] +; CHECK-OUTLINE-O1-NEXT: cbnz w10, LBB63_1 +; CHECK-OUTLINE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-OUTLINE-O1-NEXT: mov x0, x8 +; CHECK-OUTLINE-O1-NEXT: ret +; +; CHECK-NOLSE-O0-LABEL: atomicrmw_min_i64: +; CHECK-NOLSE-O0: ; %bb.0: +; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NOLSE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: str x1, [sp, #16] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: ldr x8, [x0] +; CHECK-NOLSE-O0-NEXT: str x8, [sp, #24] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: b LBB63_1 +; CHECK-NOLSE-O0-NEXT: LBB63_1: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; =>This Loop Header: Depth=1 +; CHECK-NOLSE-O0-NEXT: ; Child Loop BB63_2 Depth 2 +; CHECK-NOLSE-O0-NEXT: ldr x8, [sp, #24] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x11, [sp, #8] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x9, [sp, #16] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: subs x10, x8, x9 +; CHECK-NOLSE-O0-NEXT: csel x12, x8, x9, le +; CHECK-NOLSE-O0-NEXT: LBB63_2: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; Parent Loop BB63_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: ; => 
This Inner Loop Header: Depth=2 +; CHECK-NOLSE-O0-NEXT: ldaxr x9, [x11] +; CHECK-NOLSE-O0-NEXT: cmp x9, x8 +; CHECK-NOLSE-O0-NEXT: b.ne LBB63_4 +; CHECK-NOLSE-O0-NEXT: ; %bb.3: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB63_2 Depth=2 +; CHECK-NOLSE-O0-NEXT: stlxr w10, x12, [x11] +; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB63_2 +; CHECK-NOLSE-O0-NEXT: LBB63_4: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB63_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: str x9, [sp] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: subs x8, x9, x8 +; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str x9, [sp, #24] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB63_1 +; CHECK-NOLSE-O0-NEXT: b LBB63_5 +; CHECK-NOLSE-O0-NEXT: LBB63_5: ; %atomicrmw.end +; CHECK-NOLSE-O0-NEXT: ldr x0, [sp] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: ret +; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_min_i64: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #64 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #48] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 64 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #24] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: str x1, [sp, #32] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: ldr x0, [x0] +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #40] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: b LBB63_1 +; CHECK-OUTLINE-O0-NEXT: LBB63_1: ; %atomicrmw.start +; CHECK-OUTLINE-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O0-NEXT: ldr x0, [sp, #40] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr x2, [sp, #24] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr x8, [sp, #32] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: subs x9, x0, x8 +; CHECK-OUTLINE-O0-NEXT: csel x1, x0, x8, le +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas8_acq +; CHECK-OUTLINE-O0-NEXT: ldr x8, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: subs x8, x0, x8 +; CHECK-OUTLINE-O0-NEXT: cset w8, eq +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #40] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: tbz w8, #0, LBB63_1 +; CHECK-OUTLINE-O0-NEXT: b LBB63_2 +; CHECK-OUTLINE-O0-NEXT: LBB63_2: ; %atomicrmw.end +; CHECK-OUTLINE-O0-NEXT: ldr x0, [sp, #16] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #48] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #64 +; CHECK-OUTLINE-O0-NEXT: ret +; +; CHECK-LSE-O1-LABEL: atomicrmw_min_i64: +; CHECK-LSE-O1: ; %bb.0: +; CHECK-LSE-O1-NEXT: ldsmina x1, x0, [x0] +; CHECK-LSE-O1-NEXT: ret +; +; CHECK-LSE-O0-LABEL: atomicrmw_min_i64: +; CHECK-LSE-O0: ; %bb.0: +; CHECK-LSE-O0-NEXT: ldsmina x1, x0, [x0] +; CHECK-LSE-O0-NEXT: ret + %res = atomicrmw min ptr %ptr, i64 %rhs acquire + ret i64 %res +} + +define i64 @atomicrmw_max_i64(ptr %ptr, i64 %rhs) { +; CHECK-NOLSE-O1-LABEL: atomicrmw_max_i64: +; CHECK-NOLSE-O1: ; %bb.0: +; CHECK-NOLSE-O1-NEXT: LBB64_1: ; %atomicrmw.start +; CHECK-NOLSE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NOLSE-O1-NEXT: ldxr x8, [x0] +; CHECK-NOLSE-O1-NEXT: cmp x8, x1 +; CHECK-NOLSE-O1-NEXT: csel x9, x8, x1, gt +; CHECK-NOLSE-O1-NEXT: stlxr w10, x9, [x0] +; CHECK-NOLSE-O1-NEXT: cbnz w10, LBB64_1 +; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-NOLSE-O1-NEXT: mov x0, x8 +; 
CHECK-NOLSE-O1-NEXT: ret +; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_max_i64: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: LBB64_1: ; %atomicrmw.start +; CHECK-OUTLINE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O1-NEXT: ldxr x8, [x0] +; CHECK-OUTLINE-O1-NEXT: cmp x8, x1 +; CHECK-OUTLINE-O1-NEXT: csel x9, x8, x1, gt +; CHECK-OUTLINE-O1-NEXT: stlxr w10, x9, [x0] +; CHECK-OUTLINE-O1-NEXT: cbnz w10, LBB64_1 +; CHECK-OUTLINE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-OUTLINE-O1-NEXT: mov x0, x8 +; CHECK-OUTLINE-O1-NEXT: ret +; +; CHECK-NOLSE-O0-LABEL: atomicrmw_max_i64: +; CHECK-NOLSE-O0: ; %bb.0: +; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NOLSE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: str x1, [sp, #16] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: ldr x8, [x0] +; CHECK-NOLSE-O0-NEXT: str x8, [sp, #24] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: b LBB64_1 +; CHECK-NOLSE-O0-NEXT: LBB64_1: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; =>This Loop Header: Depth=1 +; CHECK-NOLSE-O0-NEXT: ; Child Loop BB64_2 Depth 2 +; CHECK-NOLSE-O0-NEXT: ldr x8, [sp, #24] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x11, [sp, #8] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x9, [sp, #16] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: subs x10, x8, x9 +; CHECK-NOLSE-O0-NEXT: csel x12, x8, x9, gt +; CHECK-NOLSE-O0-NEXT: LBB64_2: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; Parent Loop BB64_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: ; => This Inner Loop Header: Depth=2 +; CHECK-NOLSE-O0-NEXT: ldaxr x9, [x11] +; CHECK-NOLSE-O0-NEXT: cmp x9, x8 +; CHECK-NOLSE-O0-NEXT: b.ne LBB64_4 +; CHECK-NOLSE-O0-NEXT: ; %bb.3: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB64_2 Depth=2 +; CHECK-NOLSE-O0-NEXT: stlxr w10, x12, [x11] +; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB64_2 +; CHECK-NOLSE-O0-NEXT: LBB64_4: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB64_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: str x9, [sp] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: subs x8, x9, x8 +; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str x9, [sp, #24] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB64_1 +; CHECK-NOLSE-O0-NEXT: b LBB64_5 +; CHECK-NOLSE-O0-NEXT: LBB64_5: ; %atomicrmw.end +; CHECK-NOLSE-O0-NEXT: ldr x0, [sp] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: ret +; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_max_i64: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #64 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #48] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 64 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #24] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: str x1, [sp, #32] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: ldr x0, [x0] +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #40] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: b LBB64_1 +; CHECK-OUTLINE-O0-NEXT: LBB64_1: ; %atomicrmw.start +; CHECK-OUTLINE-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O0-NEXT: ldr x0, [sp, #40] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr x2, [sp, #24] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr x8, [sp, #32] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: subs x9, x0, x8 +; CHECK-OUTLINE-O0-NEXT: csel x1, x0, x8, gt +; 
CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas8_rel +; CHECK-OUTLINE-O0-NEXT: ldr x8, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: subs x8, x0, x8 +; CHECK-OUTLINE-O0-NEXT: cset w8, eq +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #40] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: tbz w8, #0, LBB64_1 +; CHECK-OUTLINE-O0-NEXT: b LBB64_2 +; CHECK-OUTLINE-O0-NEXT: LBB64_2: ; %atomicrmw.end +; CHECK-OUTLINE-O0-NEXT: ldr x0, [sp, #16] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #48] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #64 +; CHECK-OUTLINE-O0-NEXT: ret +; +; CHECK-LSE-O1-LABEL: atomicrmw_max_i64: +; CHECK-LSE-O1: ; %bb.0: +; CHECK-LSE-O1-NEXT: ldsmaxl x1, x0, [x0] +; CHECK-LSE-O1-NEXT: ret +; +; CHECK-LSE-O0-LABEL: atomicrmw_max_i64: +; CHECK-LSE-O0: ; %bb.0: +; CHECK-LSE-O0-NEXT: ldsmaxl x1, x0, [x0] +; CHECK-LSE-O0-NEXT: ret + %res = atomicrmw max ptr %ptr, i64 %rhs release + ret i64 %res +} + +define i64 @atomicrmw_umin_i64(ptr %ptr, i64 %rhs) { +; CHECK-NOLSE-O1-LABEL: atomicrmw_umin_i64: +; CHECK-NOLSE-O1: ; %bb.0: +; CHECK-NOLSE-O1-NEXT: LBB65_1: ; %atomicrmw.start +; CHECK-NOLSE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NOLSE-O1-NEXT: ldaxr x8, [x0] +; CHECK-NOLSE-O1-NEXT: cmp x8, x1 +; CHECK-NOLSE-O1-NEXT: csel x9, x8, x1, ls +; CHECK-NOLSE-O1-NEXT: stlxr w10, x9, [x0] +; CHECK-NOLSE-O1-NEXT: cbnz w10, LBB65_1 +; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-NOLSE-O1-NEXT: mov x0, x8 +; CHECK-NOLSE-O1-NEXT: ret +; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_umin_i64: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: LBB65_1: ; %atomicrmw.start +; CHECK-OUTLINE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O1-NEXT: ldaxr x8, [x0] +; CHECK-OUTLINE-O1-NEXT: cmp x8, x1 +; CHECK-OUTLINE-O1-NEXT: csel x9, x8, x1, ls +; CHECK-OUTLINE-O1-NEXT: stlxr w10, x9, [x0] +; CHECK-OUTLINE-O1-NEXT: cbnz w10, LBB65_1 +; CHECK-OUTLINE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-OUTLINE-O1-NEXT: mov x0, x8 +; CHECK-OUTLINE-O1-NEXT: ret +; +; CHECK-NOLSE-O0-LABEL: atomicrmw_umin_i64: +; CHECK-NOLSE-O0: ; %bb.0: +; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NOLSE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: str x1, [sp, #16] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: ldr x8, [x0] +; CHECK-NOLSE-O0-NEXT: str x8, [sp, #24] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: b LBB65_1 +; CHECK-NOLSE-O0-NEXT: LBB65_1: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; =>This Loop Header: Depth=1 +; CHECK-NOLSE-O0-NEXT: ; Child Loop BB65_2 Depth 2 +; CHECK-NOLSE-O0-NEXT: ldr x8, [sp, #24] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x11, [sp, #8] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x9, [sp, #16] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: subs x10, x8, x9 +; CHECK-NOLSE-O0-NEXT: csel x12, x8, x9, ls +; CHECK-NOLSE-O0-NEXT: LBB65_2: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; Parent Loop BB65_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: ; => This Inner Loop Header: Depth=2 +; CHECK-NOLSE-O0-NEXT: ldaxr x9, [x11] +; CHECK-NOLSE-O0-NEXT: cmp x9, x8 +; CHECK-NOLSE-O0-NEXT: b.ne LBB65_4 +; CHECK-NOLSE-O0-NEXT: ; %bb.3: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB65_2 Depth=2 +; CHECK-NOLSE-O0-NEXT: stlxr w10, x12, [x11] +; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB65_2 +; CHECK-NOLSE-O0-NEXT: LBB65_4: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB65_1 Depth=1 
+; CHECK-NOLSE-O0-NEXT: str x9, [sp] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: subs x8, x9, x8 +; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str x9, [sp, #24] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB65_1 +; CHECK-NOLSE-O0-NEXT: b LBB65_5 +; CHECK-NOLSE-O0-NEXT: LBB65_5: ; %atomicrmw.end +; CHECK-NOLSE-O0-NEXT: ldr x0, [sp] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: ret +; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_umin_i64: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #64 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #48] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 64 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #24] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: str x1, [sp, #32] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: ldr x0, [x0] +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #40] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: b LBB65_1 +; CHECK-OUTLINE-O0-NEXT: LBB65_1: ; %atomicrmw.start +; CHECK-OUTLINE-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O0-NEXT: ldr x0, [sp, #40] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr x2, [sp, #24] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr x8, [sp, #32] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: subs x9, x0, x8 +; CHECK-OUTLINE-O0-NEXT: csel x1, x0, x8, ls +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas8_acq_rel +; CHECK-OUTLINE-O0-NEXT: ldr x8, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: subs x8, x0, x8 +; CHECK-OUTLINE-O0-NEXT: cset w8, eq +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #40] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: tbz w8, #0, LBB65_1 +; CHECK-OUTLINE-O0-NEXT: b LBB65_2 +; CHECK-OUTLINE-O0-NEXT: LBB65_2: ; %atomicrmw.end +; CHECK-OUTLINE-O0-NEXT: ldr x0, [sp, #16] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #48] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #64 +; CHECK-OUTLINE-O0-NEXT: ret +; +; CHECK-LSE-O1-LABEL: atomicrmw_umin_i64: +; CHECK-LSE-O1: ; %bb.0: +; CHECK-LSE-O1-NEXT: lduminal x1, x0, [x0] +; CHECK-LSE-O1-NEXT: ret +; +; CHECK-LSE-O0-LABEL: atomicrmw_umin_i64: +; CHECK-LSE-O0: ; %bb.0: +; CHECK-LSE-O0-NEXT: lduminal x1, x0, [x0] +; CHECK-LSE-O0-NEXT: ret + %res = atomicrmw umin ptr %ptr, i64 %rhs seq_cst + ret i64 %res +} + +define i64 @atomicrmw_umax_i64(ptr %ptr, i64 %rhs) { +; CHECK-NOLSE-O1-LABEL: atomicrmw_umax_i64: +; CHECK-NOLSE-O1: ; %bb.0: +; CHECK-NOLSE-O1-NEXT: LBB66_1: ; %atomicrmw.start +; CHECK-NOLSE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NOLSE-O1-NEXT: ldxr x8, [x0] +; CHECK-NOLSE-O1-NEXT: cmp x8, x1 +; CHECK-NOLSE-O1-NEXT: csel x9, x8, x1, hi +; CHECK-NOLSE-O1-NEXT: stxr w10, x9, [x0] +; CHECK-NOLSE-O1-NEXT: cbnz w10, LBB66_1 +; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-NOLSE-O1-NEXT: mov x0, x8 +; CHECK-NOLSE-O1-NEXT: ret +; +; CHECK-OUTLINE-O1-LABEL: atomicrmw_umax_i64: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: LBB66_1: ; %atomicrmw.start +; CHECK-OUTLINE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O1-NEXT: ldxr x8, [x0] +; CHECK-OUTLINE-O1-NEXT: cmp x8, x1 +; CHECK-OUTLINE-O1-NEXT: csel x9, x8, x1, hi +; CHECK-OUTLINE-O1-NEXT: stxr w10, x9, [x0] +; CHECK-OUTLINE-O1-NEXT: cbnz w10, LBB66_1 +; 
CHECK-OUTLINE-O1-NEXT: ; %bb.2: ; %atomicrmw.end +; CHECK-OUTLINE-O1-NEXT: mov x0, x8 +; CHECK-OUTLINE-O1-NEXT: ret +; +; CHECK-NOLSE-O0-LABEL: atomicrmw_umax_i64: +; CHECK-NOLSE-O0: ; %bb.0: +; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NOLSE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: str x1, [sp, #16] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: ldr x8, [x0] +; CHECK-NOLSE-O0-NEXT: str x8, [sp, #24] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: b LBB66_1 +; CHECK-NOLSE-O0-NEXT: LBB66_1: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; =>This Loop Header: Depth=1 +; CHECK-NOLSE-O0-NEXT: ; Child Loop BB66_2 Depth 2 +; CHECK-NOLSE-O0-NEXT: ldr x8, [sp, #24] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x11, [sp, #8] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x9, [sp, #16] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: subs x10, x8, x9 +; CHECK-NOLSE-O0-NEXT: csel x12, x8, x9, hi +; CHECK-NOLSE-O0-NEXT: LBB66_2: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; Parent Loop BB66_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: ; => This Inner Loop Header: Depth=2 +; CHECK-NOLSE-O0-NEXT: ldaxr x9, [x11] +; CHECK-NOLSE-O0-NEXT: cmp x9, x8 +; CHECK-NOLSE-O0-NEXT: b.ne LBB66_4 +; CHECK-NOLSE-O0-NEXT: ; %bb.3: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB66_2 Depth=2 +; CHECK-NOLSE-O0-NEXT: stlxr w10, x12, [x11] +; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB66_2 +; CHECK-NOLSE-O0-NEXT: LBB66_4: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB66_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: str x9, [sp] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: subs x8, x9, x8 +; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str x9, [sp, #24] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB66_1 +; CHECK-NOLSE-O0-NEXT: b LBB66_5 +; CHECK-NOLSE-O0-NEXT: LBB66_5: ; %atomicrmw.end +; CHECK-NOLSE-O0-NEXT: ldr x0, [sp] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 +; CHECK-NOLSE-O0-NEXT: ret +; +; CHECK-OUTLINE-O0-LABEL: atomicrmw_umax_i64: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #64 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #48] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 64 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #24] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: str x1, [sp, #32] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: ldr x0, [x0] +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #40] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: b LBB66_1 +; CHECK-OUTLINE-O0-NEXT: LBB66_1: ; %atomicrmw.start +; CHECK-OUTLINE-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-OUTLINE-O0-NEXT: ldr x0, [sp, #40] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr x2, [sp, #24] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldr x8, [sp, #32] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: subs x9, x0, x8 +; CHECK-OUTLINE-O0-NEXT: csel x1, x0, x8, hi +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas8_relax +; CHECK-OUTLINE-O0-NEXT: ldr x8, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: subs x8, x0, x8 +; CHECK-OUTLINE-O0-NEXT: cset w8, eq +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #40] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: tbz w8, #0, LBB66_1 +; CHECK-OUTLINE-O0-NEXT: b LBB66_2 +; CHECK-OUTLINE-O0-NEXT: LBB66_2: ; 
%atomicrmw.end +; CHECK-OUTLINE-O0-NEXT: ldr x0, [sp, #16] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #48] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #64 +; CHECK-OUTLINE-O0-NEXT: ret +; +; CHECK-LSE-O1-LABEL: atomicrmw_umax_i64: +; CHECK-LSE-O1: ; %bb.0: +; CHECK-LSE-O1-NEXT: ldumax x1, x0, [x0] +; CHECK-LSE-O1-NEXT: ret +; +; CHECK-LSE-O0-LABEL: atomicrmw_umax_i64: +; CHECK-LSE-O0: ; %bb.0: +; CHECK-LSE-O0-NEXT: ldumax x1, x0, [x0] +; CHECK-LSE-O0-NEXT: ret + %res = atomicrmw umax ptr %ptr, i64 %rhs monotonic + ret i64 %res +} + +define { i8, i1 } @cmpxchg_i8(ptr %ptr, i8 %desired, i8 %new) { +; CHECK-NOLSE-O1-LABEL: cmpxchg_i8: +; CHECK-NOLSE-O1: ; %bb.0: +; CHECK-NOLSE-O1-NEXT: mov x8, x0 +; CHECK-NOLSE-O1-NEXT: ; kill: def $w2 killed $w2 def $x2 +; CHECK-NOLSE-O1-NEXT: LBB67_1: ; %cmpxchg.start +; CHECK-NOLSE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NOLSE-O1-NEXT: ldxrb w0, [x8] +; CHECK-NOLSE-O1-NEXT: and w9, w0, #0xff +; CHECK-NOLSE-O1-NEXT: cmp w9, w1, uxtb +; CHECK-NOLSE-O1-NEXT: b.ne LBB67_4 +; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %cmpxchg.trystore +; CHECK-NOLSE-O1-NEXT: ; in Loop: Header=BB67_1 Depth=1 +; CHECK-NOLSE-O1-NEXT: stxrb w9, w2, [x8] +; CHECK-NOLSE-O1-NEXT: cbnz w9, LBB67_1 +; CHECK-NOLSE-O1-NEXT: ; %bb.3: +; CHECK-NOLSE-O1-NEXT: mov w1, #1 ; =0x1 +; CHECK-NOLSE-O1-NEXT: ; kill: def $w0 killed $w0 killed $x0 +; CHECK-NOLSE-O1-NEXT: ret +; CHECK-NOLSE-O1-NEXT: LBB67_4: ; %cmpxchg.nostore +; CHECK-NOLSE-O1-NEXT: mov w1, wzr +; CHECK-NOLSE-O1-NEXT: clrex +; CHECK-NOLSE-O1-NEXT: ; kill: def $w0 killed $w0 killed $x0 +; CHECK-NOLSE-O1-NEXT: ret +; +; CHECK-OUTLINE-O1-LABEL: cmpxchg_i8: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w19, -24 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w20, -32 +; CHECK-OUTLINE-O1-NEXT: mov x3, x0 +; CHECK-OUTLINE-O1-NEXT: mov w19, w1 +; CHECK-OUTLINE-O1-NEXT: mov w1, w2 +; CHECK-OUTLINE-O1-NEXT: mov w0, w19 +; CHECK-OUTLINE-O1-NEXT: mov x2, x3 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_cas1_relax +; CHECK-OUTLINE-O1-NEXT: and w8, w0, #0xff +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: cmp w8, w19, uxtb +; CHECK-OUTLINE-O1-NEXT: cset w1, eq +; CHECK-OUTLINE-O1-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; +; CHECK-NOLSE-O0-LABEL: cmpxchg_i8: +; CHECK-NOLSE-O0: ; %bb.0: +; CHECK-NOLSE-O0-NEXT: mov x9, x0 +; CHECK-NOLSE-O0-NEXT: LBB67_1: ; =>This Inner Loop Header: Depth=1 +; CHECK-NOLSE-O0-NEXT: ldaxrb w0, [x9] +; CHECK-NOLSE-O0-NEXT: cmp w0, w1, uxtb +; CHECK-NOLSE-O0-NEXT: b.ne LBB67_3 +; CHECK-NOLSE-O0-NEXT: ; %bb.2: ; in Loop: Header=BB67_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: stlxrb w8, w2, [x9] +; CHECK-NOLSE-O0-NEXT: cbnz w8, LBB67_1 +; CHECK-NOLSE-O0-NEXT: LBB67_3: +; CHECK-NOLSE-O0-NEXT: and w8, w0, #0xff +; CHECK-NOLSE-O0-NEXT: subs w8, w8, w1, uxtb +; CHECK-NOLSE-O0-NEXT: cset w1, eq +; CHECK-NOLSE-O0-NEXT: ret +; +; CHECK-OUTLINE-O0-LABEL: cmpxchg_i8: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; 
CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov w0, w1 +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #12] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov w1, w2 +; CHECK-OUTLINE-O0-NEXT: ldr x2, [sp] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas1_relax +; CHECK-OUTLINE-O0-NEXT: ldr w1, [sp, #12] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: and w8, w0, #0xff +; CHECK-OUTLINE-O0-NEXT: subs w8, w8, w1, uxtb +; CHECK-OUTLINE-O0-NEXT: cset w1, eq +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; +; CHECK-LSE-O1-LABEL: cmpxchg_i8: +; CHECK-LSE-O1: ; %bb.0: +; CHECK-LSE-O1-NEXT: mov x8, x1 +; CHECK-LSE-O1-NEXT: casb w8, w2, [x0] +; CHECK-LSE-O1-NEXT: and w9, w8, #0xff +; CHECK-LSE-O1-NEXT: cmp w9, w1, uxtb +; CHECK-LSE-O1-NEXT: cset w1, eq +; CHECK-LSE-O1-NEXT: mov x0, x8 +; CHECK-LSE-O1-NEXT: ret +; +; CHECK-LSE-O0-LABEL: cmpxchg_i8: +; CHECK-LSE-O0: ; %bb.0: +; CHECK-LSE-O0-NEXT: mov x8, x0 +; CHECK-LSE-O0-NEXT: mov x0, x1 +; CHECK-LSE-O0-NEXT: casb w0, w2, [x8] +; CHECK-LSE-O0-NEXT: and w8, w0, #0xff +; CHECK-LSE-O0-NEXT: subs w8, w8, w1, uxtb +; CHECK-LSE-O0-NEXT: cset w1, eq +; CHECK-LSE-O0-NEXT: ret + %res = cmpxchg ptr %ptr, i8 %desired, i8 %new monotonic monotonic + ret { i8, i1 } %res +} + +define { i16, i1 } @cmpxchg_i16(ptr %ptr, i16 %desired, i16 %new) { +; CHECK-NOLSE-O1-LABEL: cmpxchg_i16: +; CHECK-NOLSE-O1: ; %bb.0: +; CHECK-NOLSE-O1-NEXT: mov x8, x0 +; CHECK-NOLSE-O1-NEXT: ; kill: def $w2 killed $w2 def $x2 +; CHECK-NOLSE-O1-NEXT: LBB68_1: ; %cmpxchg.start +; CHECK-NOLSE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NOLSE-O1-NEXT: ldxrh w0, [x8] +; CHECK-NOLSE-O1-NEXT: and w9, w0, #0xffff +; CHECK-NOLSE-O1-NEXT: cmp w9, w1, uxth +; CHECK-NOLSE-O1-NEXT: b.ne LBB68_4 +; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %cmpxchg.trystore +; CHECK-NOLSE-O1-NEXT: ; in Loop: Header=BB68_1 Depth=1 +; CHECK-NOLSE-O1-NEXT: stxrh w9, w2, [x8] +; CHECK-NOLSE-O1-NEXT: cbnz w9, LBB68_1 +; CHECK-NOLSE-O1-NEXT: ; %bb.3: +; CHECK-NOLSE-O1-NEXT: mov w1, #1 ; =0x1 +; CHECK-NOLSE-O1-NEXT: ; kill: def $w0 killed $w0 killed $x0 +; CHECK-NOLSE-O1-NEXT: ret +; CHECK-NOLSE-O1-NEXT: LBB68_4: ; %cmpxchg.nostore +; CHECK-NOLSE-O1-NEXT: mov w1, wzr +; CHECK-NOLSE-O1-NEXT: clrex +; CHECK-NOLSE-O1-NEXT: ; kill: def $w0 killed $w0 killed $x0 +; CHECK-NOLSE-O1-NEXT: ret +; +; CHECK-OUTLINE-O1-LABEL: cmpxchg_i16: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x20, x19, [sp, #-32]! 
; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w19, -24 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w20, -32 +; CHECK-OUTLINE-O1-NEXT: mov x3, x0 +; CHECK-OUTLINE-O1-NEXT: mov w19, w1 +; CHECK-OUTLINE-O1-NEXT: mov w1, w2 +; CHECK-OUTLINE-O1-NEXT: mov w0, w19 +; CHECK-OUTLINE-O1-NEXT: mov x2, x3 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_cas2_relax +; CHECK-OUTLINE-O1-NEXT: and w8, w0, #0xffff +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: cmp w8, w19, uxth +; CHECK-OUTLINE-O1-NEXT: cset w1, eq +; CHECK-OUTLINE-O1-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; +; CHECK-NOLSE-O0-LABEL: cmpxchg_i16: +; CHECK-NOLSE-O0: ; %bb.0: +; CHECK-NOLSE-O0-NEXT: mov x9, x0 +; CHECK-NOLSE-O0-NEXT: LBB68_1: ; =>This Inner Loop Header: Depth=1 +; CHECK-NOLSE-O0-NEXT: ldaxrh w0, [x9] +; CHECK-NOLSE-O0-NEXT: cmp w0, w1, uxth +; CHECK-NOLSE-O0-NEXT: b.ne LBB68_3 +; CHECK-NOLSE-O0-NEXT: ; %bb.2: ; in Loop: Header=BB68_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: stlxrh w8, w2, [x9] +; CHECK-NOLSE-O0-NEXT: cbnz w8, LBB68_1 +; CHECK-NOLSE-O0-NEXT: LBB68_3: +; CHECK-NOLSE-O0-NEXT: and w8, w0, #0xffff +; CHECK-NOLSE-O0-NEXT: subs w8, w8, w1, uxth +; CHECK-NOLSE-O0-NEXT: cset w1, eq +; CHECK-NOLSE-O0-NEXT: ret +; +; CHECK-OUTLINE-O0-LABEL: cmpxchg_i16: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov w0, w1 +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #12] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov w1, w2 +; CHECK-OUTLINE-O0-NEXT: ldr x2, [sp] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas2_relax +; CHECK-OUTLINE-O0-NEXT: ldr w1, [sp, #12] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: and w8, w0, #0xffff +; CHECK-OUTLINE-O0-NEXT: subs w8, w8, w1, uxth +; CHECK-OUTLINE-O0-NEXT: cset w1, eq +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; +; CHECK-LSE-O1-LABEL: cmpxchg_i16: +; CHECK-LSE-O1: ; %bb.0: +; CHECK-LSE-O1-NEXT: mov x8, x1 +; CHECK-LSE-O1-NEXT: cash w8, w2, [x0] +; CHECK-LSE-O1-NEXT: and w9, w8, #0xffff +; CHECK-LSE-O1-NEXT: cmp w9, w1, uxth +; CHECK-LSE-O1-NEXT: cset w1, eq +; CHECK-LSE-O1-NEXT: mov x0, x8 +; CHECK-LSE-O1-NEXT: ret +; +; CHECK-LSE-O0-LABEL: cmpxchg_i16: +; CHECK-LSE-O0: ; %bb.0: +; CHECK-LSE-O0-NEXT: mov x8, x0 +; CHECK-LSE-O0-NEXT: mov x0, x1 +; CHECK-LSE-O0-NEXT: cash w0, w2, [x8] +; CHECK-LSE-O0-NEXT: and w8, w0, #0xffff +; CHECK-LSE-O0-NEXT: subs w8, w8, w1, uxth +; CHECK-LSE-O0-NEXT: cset w1, eq +; CHECK-LSE-O0-NEXT: ret + %res = cmpxchg ptr %ptr, i16 %desired, i16 %new monotonic monotonic + ret { i16, i1 } %res +} + +define { i32, i1 } @cmpxchg_i32(ptr %ptr, i32 %desired, i32 %new) { +; CHECK-NOLSE-O1-LABEL: cmpxchg_i32: +; CHECK-NOLSE-O1: ; %bb.0: +; CHECK-NOLSE-O1-NEXT: mov x8, x0 +; CHECK-NOLSE-O1-NEXT: LBB69_1: ; %cmpxchg.start +; CHECK-NOLSE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NOLSE-O1-NEXT: ldxr w0, [x8] +; 
CHECK-NOLSE-O1-NEXT: cmp w0, w1 +; CHECK-NOLSE-O1-NEXT: b.ne LBB69_4 +; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %cmpxchg.trystore +; CHECK-NOLSE-O1-NEXT: ; in Loop: Header=BB69_1 Depth=1 +; CHECK-NOLSE-O1-NEXT: stxr w9, w2, [x8] +; CHECK-NOLSE-O1-NEXT: cbnz w9, LBB69_1 +; CHECK-NOLSE-O1-NEXT: ; %bb.3: +; CHECK-NOLSE-O1-NEXT: mov w1, #1 ; =0x1 +; CHECK-NOLSE-O1-NEXT: ; kill: def $w0 killed $w0 killed $x0 +; CHECK-NOLSE-O1-NEXT: ret +; CHECK-NOLSE-O1-NEXT: LBB69_4: ; %cmpxchg.nostore +; CHECK-NOLSE-O1-NEXT: mov w1, wzr +; CHECK-NOLSE-O1-NEXT: clrex +; CHECK-NOLSE-O1-NEXT: ; kill: def $w0 killed $w0 killed $x0 +; CHECK-NOLSE-O1-NEXT: ret +; +; CHECK-OUTLINE-O1-LABEL: cmpxchg_i32: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w19, -24 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w20, -32 +; CHECK-OUTLINE-O1-NEXT: mov x3, x0 +; CHECK-OUTLINE-O1-NEXT: mov w19, w1 +; CHECK-OUTLINE-O1-NEXT: mov w1, w2 +; CHECK-OUTLINE-O1-NEXT: mov w0, w19 +; CHECK-OUTLINE-O1-NEXT: mov x2, x3 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_cas4_relax +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: cmp w0, w19 +; CHECK-OUTLINE-O1-NEXT: cset w1, eq +; CHECK-OUTLINE-O1-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; +; CHECK-NOLSE-O0-LABEL: cmpxchg_i32: +; CHECK-NOLSE-O0: ; %bb.0: +; CHECK-NOLSE-O0-NEXT: mov x9, x0 +; CHECK-NOLSE-O0-NEXT: LBB69_1: ; =>This Inner Loop Header: Depth=1 +; CHECK-NOLSE-O0-NEXT: ldaxr w0, [x9] +; CHECK-NOLSE-O0-NEXT: cmp w0, w1 +; CHECK-NOLSE-O0-NEXT: b.ne LBB69_3 +; CHECK-NOLSE-O0-NEXT: ; %bb.2: ; in Loop: Header=BB69_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: stlxr w8, w2, [x9] +; CHECK-NOLSE-O0-NEXT: cbnz w8, LBB69_1 +; CHECK-NOLSE-O0-NEXT: LBB69_3: +; CHECK-NOLSE-O0-NEXT: subs w8, w0, w1 +; CHECK-NOLSE-O0-NEXT: cset w1, eq +; CHECK-NOLSE-O0-NEXT: ret +; +; CHECK-OUTLINE-O0-LABEL: cmpxchg_i32: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov w0, w1 +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #12] ; 4-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov w1, w2 +; CHECK-OUTLINE-O0-NEXT: ldr x2, [sp] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas4_relax +; CHECK-OUTLINE-O0-NEXT: ldr w1, [sp, #12] ; 4-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: subs w8, w0, w1 +; CHECK-OUTLINE-O0-NEXT: cset w1, eq +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; +; CHECK-LSE-O1-LABEL: cmpxchg_i32: +; CHECK-LSE-O1: ; %bb.0: +; CHECK-LSE-O1-NEXT: mov x8, x1 +; CHECK-LSE-O1-NEXT: cas w8, w2, [x0] +; CHECK-LSE-O1-NEXT: cmp w8, w1 +; CHECK-LSE-O1-NEXT: cset w1, eq +; CHECK-LSE-O1-NEXT: mov x0, x8 +; CHECK-LSE-O1-NEXT: ret +; +; CHECK-LSE-O0-LABEL: cmpxchg_i32: +; CHECK-LSE-O0: ; %bb.0: +; CHECK-LSE-O0-NEXT: mov x8, x0 +; CHECK-LSE-O0-NEXT: mov x0, x1 +; CHECK-LSE-O0-NEXT: cas w0, w2, [x8] +; CHECK-LSE-O0-NEXT: subs 
w8, w0, w1 +; CHECK-LSE-O0-NEXT: cset w1, eq +; CHECK-LSE-O0-NEXT: ret + %res = cmpxchg ptr %ptr, i32 %desired, i32 %new monotonic monotonic + ret { i32, i1 } %res +} + +define { i64, i1 } @cmpxchg_i64(ptr %ptr, i64 %desired, i64 %new) { +; CHECK-NOLSE-O1-LABEL: cmpxchg_i64: +; CHECK-NOLSE-O1: ; %bb.0: +; CHECK-NOLSE-O1-NEXT: mov x8, x0 +; CHECK-NOLSE-O1-NEXT: LBB70_1: ; %cmpxchg.start +; CHECK-NOLSE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NOLSE-O1-NEXT: ldxr x0, [x8] +; CHECK-NOLSE-O1-NEXT: cmp x0, x1 +; CHECK-NOLSE-O1-NEXT: b.ne LBB70_4 +; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %cmpxchg.trystore +; CHECK-NOLSE-O1-NEXT: ; in Loop: Header=BB70_1 Depth=1 +; CHECK-NOLSE-O1-NEXT: stxr w9, x2, [x8] +; CHECK-NOLSE-O1-NEXT: cbnz w9, LBB70_1 +; CHECK-NOLSE-O1-NEXT: ; %bb.3: +; CHECK-NOLSE-O1-NEXT: mov w1, #1 ; =0x1 +; CHECK-NOLSE-O1-NEXT: ret +; CHECK-NOLSE-O1-NEXT: LBB70_4: ; %cmpxchg.nostore +; CHECK-NOLSE-O1-NEXT: mov w1, wzr +; CHECK-NOLSE-O1-NEXT: clrex +; CHECK-NOLSE-O1-NEXT: ret +; +; CHECK-OUTLINE-O1-LABEL: cmpxchg_i64: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w19, -24 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w20, -32 +; CHECK-OUTLINE-O1-NEXT: mov x3, x0 +; CHECK-OUTLINE-O1-NEXT: mov x19, x1 +; CHECK-OUTLINE-O1-NEXT: mov x1, x2 +; CHECK-OUTLINE-O1-NEXT: mov x0, x19 +; CHECK-OUTLINE-O1-NEXT: mov x2, x3 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_cas8_relax +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: cmp x0, x19 +; CHECK-OUTLINE-O1-NEXT: cset w1, eq +; CHECK-OUTLINE-O1-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; +; CHECK-NOLSE-O0-LABEL: cmpxchg_i64: +; CHECK-NOLSE-O0: ; %bb.0: +; CHECK-NOLSE-O0-NEXT: mov x9, x0 +; CHECK-NOLSE-O0-NEXT: LBB70_1: ; =>This Inner Loop Header: Depth=1 +; CHECK-NOLSE-O0-NEXT: ldaxr x0, [x9] +; CHECK-NOLSE-O0-NEXT: cmp x0, x1 +; CHECK-NOLSE-O0-NEXT: b.ne LBB70_3 +; CHECK-NOLSE-O0-NEXT: ; %bb.2: ; in Loop: Header=BB70_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: stlxr w8, x2, [x9] +; CHECK-NOLSE-O0-NEXT: cbnz w8, LBB70_1 +; CHECK-NOLSE-O0-NEXT: LBB70_3: +; CHECK-NOLSE-O0-NEXT: subs x8, x0, x1 +; CHECK-NOLSE-O0-NEXT: cset w1, eq +; CHECK-NOLSE-O0-NEXT: ret +; +; CHECK-OUTLINE-O0-LABEL: cmpxchg_i64: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov x0, x1 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov x1, x2 +; CHECK-OUTLINE-O0-NEXT: ldr x2, [sp] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas8_relax +; CHECK-OUTLINE-O0-NEXT: ldr x1, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: subs x8, x0, x1 +; CHECK-OUTLINE-O0-NEXT: cset w1, eq +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; +; CHECK-LSE-O1-LABEL: cmpxchg_i64: +; CHECK-LSE-O1: ; %bb.0: +; CHECK-LSE-O1-NEXT: mov x8, x1 
+; CHECK-LSE-O1-NEXT: cas x8, x2, [x0] +; CHECK-LSE-O1-NEXT: cmp x8, x1 +; CHECK-LSE-O1-NEXT: cset w1, eq +; CHECK-LSE-O1-NEXT: mov x0, x8 +; CHECK-LSE-O1-NEXT: ret +; +; CHECK-LSE-O0-LABEL: cmpxchg_i64: +; CHECK-LSE-O0: ; %bb.0: +; CHECK-LSE-O0-NEXT: mov x8, x0 +; CHECK-LSE-O0-NEXT: mov x0, x1 +; CHECK-LSE-O0-NEXT: cas x0, x2, [x8] +; CHECK-LSE-O0-NEXT: subs x8, x0, x1 +; CHECK-LSE-O0-NEXT: cset w1, eq +; CHECK-LSE-O0-NEXT: ret + %res = cmpxchg ptr %ptr, i64 %desired, i64 %new monotonic monotonic + ret { i64, i1 } %res +} + +define { ptr, i1 } @cmpxchg_ptr(ptr %ptr, ptr %desired, ptr %new) { +; CHECK-NOLSE-O1-LABEL: cmpxchg_ptr: +; CHECK-NOLSE-O1: ; %bb.0: +; CHECK-NOLSE-O1-NEXT: mov x8, x0 +; CHECK-NOLSE-O1-NEXT: LBB71_1: ; %cmpxchg.start +; CHECK-NOLSE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NOLSE-O1-NEXT: ldxr x0, [x8] +; CHECK-NOLSE-O1-NEXT: cmp x0, x1 +; CHECK-NOLSE-O1-NEXT: b.ne LBB71_4 +; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %cmpxchg.trystore +; CHECK-NOLSE-O1-NEXT: ; in Loop: Header=BB71_1 Depth=1 +; CHECK-NOLSE-O1-NEXT: stxr w9, x2, [x8] +; CHECK-NOLSE-O1-NEXT: cbnz w9, LBB71_1 +; CHECK-NOLSE-O1-NEXT: ; %bb.3: +; CHECK-NOLSE-O1-NEXT: mov w1, #1 ; =0x1 +; CHECK-NOLSE-O1-NEXT: ret +; CHECK-NOLSE-O1-NEXT: LBB71_4: ; %cmpxchg.nostore +; CHECK-NOLSE-O1-NEXT: mov w1, wzr +; CHECK-NOLSE-O1-NEXT: clrex +; CHECK-NOLSE-O1-NEXT: ret +; +; CHECK-OUTLINE-O1-LABEL: cmpxchg_ptr: +; CHECK-OUTLINE-O1: ; %bb.0: +; CHECK-OUTLINE-O1-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O1-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w19, -24 +; CHECK-OUTLINE-O1-NEXT: .cfi_offset w20, -32 +; CHECK-OUTLINE-O1-NEXT: mov x3, x0 +; CHECK-OUTLINE-O1-NEXT: mov x19, x1 +; CHECK-OUTLINE-O1-NEXT: mov x1, x2 +; CHECK-OUTLINE-O1-NEXT: mov x0, x19 +; CHECK-OUTLINE-O1-NEXT: mov x2, x3 +; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_cas8_relax +; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: cmp x0, x19 +; CHECK-OUTLINE-O1-NEXT: cset w1, eq +; CHECK-OUTLINE-O1-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload +; CHECK-OUTLINE-O1-NEXT: ret +; +; CHECK-NOLSE-O0-LABEL: cmpxchg_ptr: +; CHECK-NOLSE-O0: ; %bb.0: +; CHECK-NOLSE-O0-NEXT: mov x9, x0 +; CHECK-NOLSE-O0-NEXT: LBB71_1: ; =>This Inner Loop Header: Depth=1 +; CHECK-NOLSE-O0-NEXT: ldaxr x0, [x9] +; CHECK-NOLSE-O0-NEXT: cmp x0, x1 +; CHECK-NOLSE-O0-NEXT: b.ne LBB71_3 +; CHECK-NOLSE-O0-NEXT: ; %bb.2: ; in Loop: Header=BB71_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: stlxr w8, x2, [x9] +; CHECK-NOLSE-O0-NEXT: cbnz w8, LBB71_1 +; CHECK-NOLSE-O0-NEXT: LBB71_3: +; CHECK-NOLSE-O0-NEXT: subs x8, x0, x1 +; CHECK-NOLSE-O0-NEXT: cset w1, eq +; CHECK-NOLSE-O0-NEXT: ret +; +; CHECK-OUTLINE-O0-LABEL: cmpxchg_ptr: +; CHECK-OUTLINE-O0: ; %bb.0: +; CHECK-OUTLINE-O0-NEXT: sub sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: .cfi_def_cfa_offset 32 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w30, -8 +; CHECK-OUTLINE-O0-NEXT: .cfi_offset w29, -16 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov x0, x1 +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-OUTLINE-O0-NEXT: mov x1, x2 +; CHECK-OUTLINE-O0-NEXT: ldr x2, [sp] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas8_relax +; CHECK-OUTLINE-O0-NEXT: 
ldr x1, [sp, #8] ; 8-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: subs x8, x0, x1 +; CHECK-OUTLINE-O0-NEXT: cset w1, eq +; CHECK-OUTLINE-O0-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OUTLINE-O0-NEXT: add sp, sp, #32 +; CHECK-OUTLINE-O0-NEXT: ret +; +; CHECK-LSE-O1-LABEL: cmpxchg_ptr: +; CHECK-LSE-O1: ; %bb.0: +; CHECK-LSE-O1-NEXT: mov x8, x1 +; CHECK-LSE-O1-NEXT: cas x8, x2, [x0] +; CHECK-LSE-O1-NEXT: cmp x8, x1 +; CHECK-LSE-O1-NEXT: cset w1, eq +; CHECK-LSE-O1-NEXT: mov x0, x8 +; CHECK-LSE-O1-NEXT: ret +; +; CHECK-LSE-O0-LABEL: cmpxchg_ptr: +; CHECK-LSE-O0: ; %bb.0: +; CHECK-LSE-O0-NEXT: mov x8, x0 +; CHECK-LSE-O0-NEXT: mov x0, x1 +; CHECK-LSE-O0-NEXT: cas x0, x2, [x8] +; CHECK-LSE-O0-NEXT: subs x8, x0, x1 +; CHECK-LSE-O0-NEXT: cset w1, eq +; CHECK-LSE-O0-NEXT: ret + %res = cmpxchg ptr %ptr, ptr %desired, ptr %new monotonic monotonic + ret { ptr, i1 } %res +} + +define internal double @bitcast_to_double(ptr %ptr) { +; CHECK-NOLSE-LABEL: bitcast_to_double: +; CHECK-NOLSE: ; %bb.0: +; CHECK-NOLSE-NEXT: ldar x8, [x0] +; CHECK-NOLSE-NEXT: fmov d0, x8 +; CHECK-NOLSE-NEXT: ret +; +; CHECK-OUTLINE-LABEL: bitcast_to_double: +; CHECK-OUTLINE: ; %bb.0: +; CHECK-OUTLINE-NEXT: ldar x8, [x0] +; CHECK-OUTLINE-NEXT: fmov d0, x8 +; CHECK-OUTLINE-NEXT: ret +; +; CHECK-LSE-O1-LABEL: bitcast_to_double: +; CHECK-LSE-O1: ; %bb.0: +; CHECK-LSE-O1-NEXT: ldar x8, [x0] +; CHECK-LSE-O1-NEXT: fmov d0, x8 ; CHECK-LSE-O1-NEXT: ret ; ; CHECK-LSE-O0-LABEL: bitcast_to_double: @@ -2859,6 +6542,12 @@ define internal float @bitcast_to_float(ptr %ptr) { ; CHECK-NOLSE-NEXT: fmov s0, w8 ; CHECK-NOLSE-NEXT: ret ; +; CHECK-OUTLINE-LABEL: bitcast_to_float: +; CHECK-OUTLINE: ; %bb.0: +; CHECK-OUTLINE-NEXT: ldar w8, [x0] +; CHECK-OUTLINE-NEXT: fmov s0, w8 +; CHECK-OUTLINE-NEXT: ret +; ; CHECK-LSE-O1-LABEL: bitcast_to_float: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: ldar w8, [x0] @@ -2883,6 +6572,13 @@ define internal half @bitcast_to_half(ptr %ptr) { ; CHECK-NOLSE-NEXT: ; kill: def $h0 killed $h0 killed $s0 ; CHECK-NOLSE-NEXT: ret ; +; CHECK-OUTLINE-LABEL: bitcast_to_half: +; CHECK-OUTLINE: ; %bb.0: +; CHECK-OUTLINE-NEXT: ldarh w8, [x0] +; CHECK-OUTLINE-NEXT: fmov s0, w8 +; CHECK-OUTLINE-NEXT: ; kill: def $h0 killed $h0 killed $s0 +; CHECK-OUTLINE-NEXT: ret +; ; CHECK-LSE-O1-LABEL: bitcast_to_half: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: ldarh w8, [x0] @@ -2907,6 +6603,11 @@ define internal ptr @inttoptr(ptr %ptr) { ; CHECK-NOLSE-NEXT: ldar x0, [x0] ; CHECK-NOLSE-NEXT: ret ; +; CHECK-OUTLINE-LABEL: inttoptr: +; CHECK-OUTLINE: ; %bb.0: +; CHECK-OUTLINE-NEXT: ldar x0, [x0] +; CHECK-OUTLINE-NEXT: ret +; ; CHECK-LSE-O1-LABEL: inttoptr: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: ldar x0, [x0] @@ -2927,6 +6628,11 @@ define internal ptr @load_ptr(ptr %ptr) { ; CHECK-NOLSE-NEXT: ldar x0, [x0] ; CHECK-NOLSE-NEXT: ret ; +; CHECK-OUTLINE-LABEL: load_ptr: +; CHECK-OUTLINE: ; %bb.0: +; CHECK-OUTLINE-NEXT: ldar x0, [x0] +; CHECK-OUTLINE-NEXT: ret +; ; CHECK-LSE-O1-LABEL: load_ptr: ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: ldar x0, [x0] diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index 9940f439dd714..df1c5ff040430 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -226,7 +226,6 @@ # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: .. 
imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_ATOMICRMW_MIN (opcode {{[0-9]+}}): 2 type indices, 0 imm indices -# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_ATOMICRMW_UMAX (opcode {{[0-9]+}}): 2 type indices, 0 imm indices From ff0c1f20a744b93ab8a9f8b378c5225cf3f4f68f Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Thu, 4 Jan 2024 18:45:55 +0800 Subject: [PATCH 215/313] [CodeGen] Remove unused variables in TargetLoweringBase.cpp (NFC) llvm-project/llvm/lib/CodeGen/TargetLoweringBase.cpp:570:12: error: unused variable 'ModeN' [-Werror,-Wunused-variable] 570 | unsigned ModeN, ModelN; | ^~~~~ llvm-project/llvm/lib/CodeGen/TargetLoweringBase.cpp:570:19: error: unused variable 'ModelN' [-Werror,-Wunused-variable] 570 | unsigned ModeN, ModelN; | ^~~~~~ 2 errors generated. --- llvm/lib/CodeGen/TargetLoweringBase.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index acbbfd9ddaf52..e92e3cd043918 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -567,7 +567,6 @@ RTLIB::Libcall RTLIB::getOutlineAtomicHelper(const Libcall (&LC)[5][4], RTLIB::Libcall RTLIB::getOUTLINE_ATOMIC(unsigned Opc, AtomicOrdering Order, MVT VT) { - unsigned ModeN, ModelN; if (!VT.isScalarInteger()) return UNKNOWN_LIBCALL; uint64_t MemSize = VT.getScalarSizeInBits() / 8; From 72db578d7149bc9941d45a800fb8be4f79ba24e2 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 3 Jan 2024 13:57:40 +0000 Subject: [PATCH 216/313] [DAG] Fix typo in VSELECT SimplifyDemandedVectorElts handling. NFC. Rename UndefZero -> UndefSel (undefined elements from Sel operand). --- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 66cdd75259087..c28e8acd57a14 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -3353,8 +3353,8 @@ bool TargetLowering::SimplifyDemandedVectorElts( // Try to transform the select condition based on the current demanded // elements. - APInt UndefSel, UndefZero; - if (SimplifyDemandedVectorElts(Sel, DemandedElts, UndefSel, UndefZero, TLO, + APInt UndefSel, ZeroSel; + if (SimplifyDemandedVectorElts(Sel, DemandedElts, UndefSel, ZeroSel, TLO, Depth + 1)) return true; @@ -3377,7 +3377,7 @@ bool TargetLowering::SimplifyDemandedVectorElts( // select value element. APInt DemandedSel = DemandedElts & ~KnownZero; if (DemandedSel != DemandedElts) - if (SimplifyDemandedVectorElts(Sel, DemandedSel, UndefSel, UndefZero, TLO, + if (SimplifyDemandedVectorElts(Sel, DemandedSel, UndefSel, ZeroSel, TLO, Depth + 1)) return true; From 6bb8d69f2a66da179efec8e0e69fbf55a3296d1c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 3 Jan 2024 15:04:35 +0000 Subject: [PATCH 217/313] [X86] combineLoad - pull out repeated cast calls. NFC. 
--- llvm/lib/Target/X86/X86ISelLowering.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index cd56529bfa0fd..e0679f5f27d8c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -49959,18 +49959,17 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, SDValue Ptr = Ld->getBasePtr(); SDValue Chain = Ld->getChain(); for (SDNode *User : Chain->uses()) { - if (User != N && + auto *UserLd = dyn_cast(User); + if (User != N && UserLd && (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD || User->getOpcode() == X86ISD::VBROADCAST_LOAD || ISD::isNormalLoad(User)) && - cast(User)->getChain() == Chain && - !User->hasAnyUseOfValue(1) && + UserLd->getChain() == Chain && !User->hasAnyUseOfValue(1) && User->getValueSizeInBits(0).getFixedValue() > RegVT.getFixedSizeInBits()) { if (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD && - cast(User)->getBasePtr() == Ptr && - cast(User)->getMemoryVT().getSizeInBits() == - MemVT.getSizeInBits()) { + UserLd->getBasePtr() == Ptr && + UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits()) { SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N), RegVT.getSizeInBits()); Extract = DAG.getBitcast(RegVT, Extract); @@ -49989,7 +49988,7 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, // See if we are loading a constant that matches in the lower // bits of a longer constant (but from a different constant pool ptr). EVT UserVT = User->getValueType(0); - SDValue UserPtr = cast(User)->getBasePtr(); + SDValue UserPtr = UserLd->getBasePtr(); const Constant *LdC = getTargetConstantFromBasePtr(Ptr); const Constant *UserC = getTargetConstantFromBasePtr(UserPtr); if (LdC && UserC && UserPtr != Ptr) { From 43e0723899e909cb2502b34da2003a5774ffb394 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 3 Jan 2024 17:24:47 +0000 Subject: [PATCH 218/313] [DAG] BaseIndexOffset::computeAliasing - early out on failed matches. NFCI. Don't wait to test that all base ptr matches have succeeded --- .../CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp index 39a1e09e83c59..558706d3d3c56 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp @@ -91,10 +91,13 @@ bool BaseIndexOffset::computeAliasing(const SDNode *Op0, const SelectionDAG &DAG, bool &IsAlias) { BaseIndexOffset BasePtr0 = match(Op0, DAG); - BaseIndexOffset BasePtr1 = match(Op1, DAG); + if (!BasePtr0.getBase().getNode()) + return false; - if (!(BasePtr0.getBase().getNode() && BasePtr1.getBase().getNode())) + BaseIndexOffset BasePtr1 = match(Op1, DAG); + if (!BasePtr1.getBase().getNode()) return false; + int64_t PtrDiff; if (NumBytes0 && NumBytes1 && BasePtr0.equalBaseIndex(BasePtr1, DAG, PtrDiff)) { From 5b38ecff6e9d7ef84ba8fd9b1b1e4c9b229dbdb5 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 4 Jan 2024 10:50:15 +0000 Subject: [PATCH 219/313] [DAG] BaseIndexOffset::equalBaseIndex - early out on failed matches. NFCI. If we successfully cast only the first base node as GlobalAddressSDNode / ConstantPoolSDNode / FrameIndexSDNode then we can early out as we know that base won't cast as a later type. 
Noticed while investigating profiles for potential compile time improvements. --- .../SelectionDAG/SelectionDAGAddressAnalysis.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp index 558706d3d3c56..66825d845c191 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp @@ -38,15 +38,18 @@ bool BaseIndexOffset::equalBaseIndex(const BaseIndexOffset &Other, return true; // Match GlobalAddresses - if (auto *A = dyn_cast(Base)) + if (auto *A = dyn_cast(Base)) { if (auto *B = dyn_cast(Other.Base)) if (A->getGlobal() == B->getGlobal()) { Off += B->getOffset() - A->getOffset(); return true; } + return false; + } + // Match Constants - if (auto *A = dyn_cast(Base)) + if (auto *A = dyn_cast(Base)) { if (auto *B = dyn_cast(Other.Base)) { bool IsMatch = A->isMachineConstantPoolEntry() == B->isMachineConstantPoolEntry(); @@ -62,7 +65,8 @@ bool BaseIndexOffset::equalBaseIndex(const BaseIndexOffset &Other, } } - const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + return false; + } // Match FrameIndexes. if (auto *A = dyn_cast(Base)) @@ -73,6 +77,7 @@ bool BaseIndexOffset::equalBaseIndex(const BaseIndexOffset &Other, // Non-equal FrameIndexes - If both frame indices are fixed // we know their relative offsets and can compare them. Otherwise // we must be conservative. + const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); if (MFI.isFixedObjectIndex(A->getIndex()) && MFI.isFixedObjectIndex(B->getIndex())) { Off += MFI.getObjectOffset(B->getIndex()) - @@ -81,6 +86,7 @@ bool BaseIndexOffset::equalBaseIndex(const BaseIndexOffset &Other, } } } + return false; } From f45b75949d8ccc9890241ecf9b9ad11349e1d036 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 4 Jan 2024 10:53:45 +0000 Subject: [PATCH 220/313] [DAG] SimplifyDemandedBits - call demanded elts variant directly for SELECT/SELECT_CC nodes. Don't rebuild the demanded elts mask every time. 
--- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index c28e8acd57a14..f8400e8e94df6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1645,11 +1645,11 @@ bool TargetLowering::SimplifyDemandedBits( break; } case ISD::SELECT: - if (SimplifyDemandedBits(Op.getOperand(2), DemandedBits, Known, TLO, - Depth + 1)) + if (SimplifyDemandedBits(Op.getOperand(2), DemandedBits, DemandedElts, + Known, TLO, Depth + 1)) return true; - if (SimplifyDemandedBits(Op.getOperand(1), DemandedBits, Known2, TLO, - Depth + 1)) + if (SimplifyDemandedBits(Op.getOperand(1), DemandedBits, DemandedElts, + Known2, TLO, Depth + 1)) return true; assert(!Known.hasConflict() && "Bits known to be one AND zero?"); assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); @@ -1675,11 +1675,11 @@ bool TargetLowering::SimplifyDemandedBits( Known = Known.intersectWith(Known2); break; case ISD::SELECT_CC: - if (SimplifyDemandedBits(Op.getOperand(3), DemandedBits, Known, TLO, - Depth + 1)) + if (SimplifyDemandedBits(Op.getOperand(3), DemandedBits, DemandedElts, + Known, TLO, Depth + 1)) return true; - if (SimplifyDemandedBits(Op.getOperand(2), DemandedBits, Known2, TLO, - Depth + 1)) + if (SimplifyDemandedBits(Op.getOperand(2), DemandedBits, DemandedElts, + Known2, TLO, Depth + 1)) return true; assert(!Known.hasConflict() && "Bits known to be one AND zero?"); assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); From f5efa74961560070a1e6f127214bcf6b570fef98 Mon Sep 17 00:00:00 2001 From: Ilya Biryukov Date: Thu, 4 Jan 2024 11:57:53 +0100 Subject: [PATCH 221/313] [Sema] When checking for constraint equivalence, do not calculate satisfaction (#74490) ... and only look at equivalence of substituted expressions, not results of constraint satisfaction. This is required by the standard when matching redeclarations. Fixes #74314. There is already some existing machinery for that in `TemplateInstantiator` and `Sema` exposed separate functions for substituting expressions with intention to do that: - `Sema::SubstExpr` should not evaluate constraints. - `Sema::SubstConstraintExpr` should. However, both functions used to be equivalent. Introduce a new function that does not evaluate constraint and use it when matching declarations. Also change implementation of `SubstConstraintExpr` to call `SubstExpr` directly so it's obvious they behave in the same way and add a FIXME to call out that we might need to revamp this approach in the future. --- clang/include/clang/Sema/Sema.h | 6 ++-- clang/include/clang/Sema/Template.h | 1 + clang/lib/Sema/SemaConcept.cpp | 17 +++++----- clang/lib/Sema/SemaTemplateInstantiate.cpp | 22 +++++++++++-- .../SemaTemplate/concepts-out-of-line-def.cpp | 32 +++++++++++++++++++ 5 files changed, 66 insertions(+), 12 deletions(-) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 5e3b57ea33220..8f44adef38159 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -10263,11 +10263,13 @@ class Sema final { ~ConstraintEvalRAII() { TI.setEvaluateConstraints(OldValue); } }; - // Unlike the above, this evaluates constraints, which should only happen at - // 'constraint checking' time. + // Must be used instead of SubstExpr at 'constraint checking' time. 
ExprResult SubstConstraintExpr(Expr *E, const MultiLevelTemplateArgumentList &TemplateArgs); + // Unlike the above, this does not evaluates constraints. + ExprResult SubstConstraintExprWithoutSatisfaction( + Expr *E, const MultiLevelTemplateArgumentList &TemplateArgs); /// Substitute the given template arguments into a list of /// expressions, expanding pack expansions if required. diff --git a/clang/include/clang/Sema/Template.h b/clang/include/clang/Sema/Template.h index 2a553054a0ce5..ce44aca797b0f 100644 --- a/clang/include/clang/Sema/Template.h +++ b/clang/include/clang/Sema/Template.h @@ -564,6 +564,7 @@ enum class TemplateSubstitutionKind : char { const MultiLevelTemplateArgumentList &TemplateArgs; Sema::LateInstantiatedAttrVec* LateAttrs = nullptr; LocalInstantiationScope *StartingScope = nullptr; + // Whether to evaluate the C++20 constraints or simply substitute into them. bool EvaluateConstraints = true; /// A list of out-of-line class template partial diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index 719c6aab74e01..acfc00f412540 100644 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -771,10 +771,9 @@ namespace { }; } // namespace -static const Expr * -SubstituteConstraintExpression(Sema &S, - const Sema::TemplateCompareNewDeclInfo &DeclInfo, - const Expr *ConstrExpr) { +static const Expr *SubstituteConstraintExpressionWithoutSatisfaction( + Sema &S, const Sema::TemplateCompareNewDeclInfo &DeclInfo, + const Expr *ConstrExpr) { MultiLevelTemplateArgumentList MLTAL = S.getTemplateInstantiationArgs( DeclInfo.getDecl(), DeclInfo.getLexicalDeclContext(), /*Final=*/false, /*Innermost=*/nullptr, @@ -797,8 +796,8 @@ SubstituteConstraintExpression(Sema &S, std::optional ThisScope; if (auto *RD = dyn_cast(DeclInfo.getDeclContext())) ThisScope.emplace(S, const_cast(RD), Qualifiers()); - ExprResult SubstConstr = - S.SubstConstraintExpr(const_cast(ConstrExpr), MLTAL); + ExprResult SubstConstr = S.SubstConstraintExprWithoutSatisfaction( + const_cast(ConstrExpr), MLTAL); if (SFINAE.hasErrorOccurred() || !SubstConstr.isUsable()) return nullptr; return SubstConstr.get(); @@ -814,12 +813,14 @@ bool Sema::AreConstraintExpressionsEqual(const NamedDecl *Old, if (Old && !New.isInvalid() && !New.ContainsDecl(Old) && Old->getLexicalDeclContext() != New.getLexicalDeclContext()) { if (const Expr *SubstConstr = - SubstituteConstraintExpression(*this, Old, OldConstr)) + SubstituteConstraintExpressionWithoutSatisfaction(*this, Old, + OldConstr)) OldConstr = SubstConstr; else return false; if (const Expr *SubstConstr = - SubstituteConstraintExpression(*this, New, NewConstr)) + SubstituteConstraintExpressionWithoutSatisfaction(*this, New, + NewConstr)) NewConstr = SubstConstr; else return false; diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index df6b40999e645..37e5b9cad08bc 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -1190,6 +1190,7 @@ namespace { const MultiLevelTemplateArgumentList &TemplateArgs; SourceLocation Loc; DeclarationName Entity; + // Whether to evaluate the C++20 constraints or simply substitute into them. 
bool EvaluateConstraints = true; public: @@ -2499,6 +2500,17 @@ TemplateInstantiator::TransformNestedRequirement( Req->getConstraintExpr()->getBeginLoc(), Req, Sema::InstantiatingTemplate::ConstraintsCheck{}, Req->getConstraintExpr()->getSourceRange()); + if (!getEvaluateConstraints()) { + ExprResult TransConstraint = TransformExpr(Req->getConstraintExpr()); + if (TransConstraint.isInvalid() || !TransConstraint.get()) + return nullptr; + if (TransConstraint.get()->isInstantiationDependent()) + return new (SemaRef.Context) + concepts::NestedRequirement(TransConstraint.get()); + ConstraintSatisfaction Satisfaction; + return new (SemaRef.Context) concepts::NestedRequirement( + SemaRef.Context, TransConstraint.get(), Satisfaction); + } ExprResult TransConstraint; ConstraintSatisfaction Satisfaction; @@ -4093,13 +4105,19 @@ Sema::SubstExpr(Expr *E, const MultiLevelTemplateArgumentList &TemplateArgs) { ExprResult Sema::SubstConstraintExpr(Expr *E, const MultiLevelTemplateArgumentList &TemplateArgs) { + // FIXME: should call SubstExpr directly if this function is equivalent or + // should it be different? + return SubstExpr(E, TemplateArgs); +} + +ExprResult Sema::SubstConstraintExprWithoutSatisfaction( + Expr *E, const MultiLevelTemplateArgumentList &TemplateArgs) { if (!E) return E; - // This is where we need to make sure we 'know' constraint checking needs to - // happen. TemplateInstantiator Instantiator(*this, TemplateArgs, SourceLocation(), DeclarationName()); + Instantiator.setEvaluateConstraints(false); return Instantiator.TransformExpr(E); } diff --git a/clang/test/SemaTemplate/concepts-out-of-line-def.cpp b/clang/test/SemaTemplate/concepts-out-of-line-def.cpp index f134394615fb2..c4e8e6f720c49 100644 --- a/clang/test/SemaTemplate/concepts-out-of-line-def.cpp +++ b/clang/test/SemaTemplate/concepts-out-of-line-def.cpp @@ -504,3 +504,35 @@ struct bar { bar x; } // namespace GH61763 + +namespace GH74314 { +template constexpr bool is_same_v = __is_same(T, U); +template constexpr bool is_not_same_v = !__is_same(T, U); + +template +concept something_interesting = requires { + true; + requires is_same_v; +}; + +template +struct X { + void foo() requires requires { requires is_not_same_v; }; + void bar(decltype(requires { requires is_not_same_v; })); +}; + +template +void X::foo() requires requires { requires something_interesting; } {} +// expected-error@-1{{definition of 'foo' does not match any declaration}} +// expected-note@*{{}} + +template +void X::foo() requires requires { requires is_not_same_v; } {} // ok + +template +void X::bar(decltype(requires { requires something_interesting; })) {} +// expected-error@-1{{definition of 'bar' does not match any declaration}} + +template +void X::bar(decltype(requires { requires is_not_same_v; })) {} +} // namespace GH74314 From 2336f792bc5a1d9195c1bd995b6040c13e73d4e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?kadir=20=C3=A7etinkaya?= Date: Thu, 4 Jan 2024 12:57:42 +0100 Subject: [PATCH 222/313] [clangd] Dont require confirmation for include-cleaner batch-fixes (#76826) False negative/positive rate has decreased to the degree that these extra confirmations no longer provide any value, but only create friction in the happy case. When we have false analysis, people usually need to apply the fixes and run the builds to discover the failure. 
At that point they can add relevant IWYU pragmas to guide analysis, and will be more likely to create bug reports due to extra annoyance :) --- clang-tools-extra/clangd/IncludeCleaner.cpp | 32 +-------- .../test/include-cleaner-batch-fix.test | 68 ------------------- 2 files changed, 1 insertion(+), 99 deletions(-) diff --git a/clang-tools-extra/clangd/IncludeCleaner.cpp b/clang-tools-extra/clangd/IncludeCleaner.cpp index 563a1f5d22f5b..f86a121340f7f 100644 --- a/clang-tools-extra/clangd/IncludeCleaner.cpp +++ b/clang-tools-extra/clangd/IncludeCleaner.cpp @@ -48,7 +48,6 @@ #include #include #include -#include #include #include #include @@ -237,18 +236,6 @@ removeAllUnusedIncludes(llvm::ArrayRef UnusedIncludes) { Diag.Fixes.front().Edits.begin(), Diag.Fixes.front().Edits.end()); } - - // TODO(hokein): emit a suitable text for the label. - ChangeAnnotation Annotation = {/*label=*/"", - /*needsConfirmation=*/true, - /*description=*/""}; - static const ChangeAnnotationIdentifier RemoveAllUnusedID = - "RemoveAllUnusedIncludes"; - for (unsigned I = 0; I < RemoveAll.Edits.size(); ++I) { - ChangeAnnotationIdentifier ID = RemoveAllUnusedID + std::to_string(I); - RemoveAll.Edits[I].annotationId = ID; - RemoveAll.Annotations.push_back({ID, Annotation}); - } return RemoveAll; } @@ -268,20 +255,8 @@ addAllMissingIncludes(llvm::ArrayRef MissingIncludeDiags) { Edits.try_emplace(Edit.newText, Edit); } } - // FIXME(hokein): emit used symbol reference in the annotation. - ChangeAnnotation Annotation = {/*label=*/"", - /*needsConfirmation=*/true, - /*description=*/""}; - static const ChangeAnnotationIdentifier AddAllMissingID = - "AddAllMissingIncludes"; - unsigned I = 0; - for (auto &It : Edits) { - ChangeAnnotationIdentifier ID = AddAllMissingID + std::to_string(I++); + for (auto &It : Edits) AddAllMissing.Edits.push_back(std::move(It.second)); - AddAllMissing.Edits.back().annotationId = ID; - - AddAllMissing.Annotations.push_back({ID, Annotation}); - } return AddAllMissing; } Fix fixAll(const Fix &RemoveAllUnused, const Fix &AddAllMissing) { @@ -292,11 +267,6 @@ Fix fixAll(const Fix &RemoveAllUnused, const Fix &AddAllMissing) { FixAll.Edits.push_back(F); for (const auto &F : AddAllMissing.Edits) FixAll.Edits.push_back(F); - - for (const auto &A : RemoveAllUnused.Annotations) - FixAll.Annotations.push_back(A); - for (const auto &A : AddAllMissing.Annotations) - FixAll.Annotations.push_back(A); return FixAll; } diff --git a/clang-tools-extra/clangd/test/include-cleaner-batch-fix.test b/clang-tools-extra/clangd/test/include-cleaner-batch-fix.test index af7cdba5b05f4..07ebe1009a78f 100644 --- a/clang-tools-extra/clangd/test/include-cleaner-batch-fix.test +++ b/clang-tools-extra/clangd/test/include-cleaner-batch-fix.test @@ -157,21 +157,10 @@ # CHECK-NEXT: { # CHECK-NEXT: "arguments": [ # CHECK-NEXT: { -# CHECK-NEXT: "changeAnnotations": { -# CHECK-NEXT: "AddAllMissingIncludes0": { -# CHECK-NEXT: "label": "", -# CHECK-NEXT: "needsConfirmation": true -# CHECK-NEXT: }, -# CHECK-NEXT: "AddAllMissingIncludes1": { -# CHECK-NEXT: "label": "", -# CHECK-NEXT: "needsConfirmation": true -# CHECK-NEXT: } -# CHECK-NEXT: }, # CHECK-NEXT: "documentChanges": [ # CHECK-NEXT: { # CHECK-NEXT: "edits": [ # CHECK-NEXT: { -# CHECK-NEXT: "annotationId": "AddAllMissingIncludes0", # CHECK-NEXT: "newText": "#include {{.*}}bar.h{{.*}}", # CHECK-NEXT: "range": { # CHECK-NEXT: "end": { @@ -185,7 +174,6 @@ # CHECK-NEXT: } # CHECK-NEXT: }, # CHECK-NEXT: { -# CHECK-NEXT: "annotationId": "AddAllMissingIncludes1", # CHECK-NEXT: "newText": 
"#include {{.*}}foo.h{{.*}}", # CHECK-NEXT: "range": { # CHECK-NEXT: "end": { @@ -213,29 +201,10 @@ # CHECK-NEXT: { # CHECK-NEXT: "arguments": [ # CHECK-NEXT: { -# CHECK-NEXT: "changeAnnotations": { -# CHECK-NEXT: "AddAllMissingIncludes0": { -# CHECK-NEXT: "label": "", -# CHECK-NEXT: "needsConfirmation": true -# CHECK-NEXT: }, -# CHECK-NEXT: "AddAllMissingIncludes1": { -# CHECK-NEXT: "label": "", -# CHECK-NEXT: "needsConfirmation": true -# CHECK-NEXT: }, -# CHECK-NEXT: "RemoveAllUnusedIncludes0": { -# CHECK-NEXT: "label": "", -# CHECK-NEXT: "needsConfirmation": true -# CHECK-NEXT: }, -# CHECK-NEXT: "RemoveAllUnusedIncludes1": { -# CHECK-NEXT: "label": "", -# CHECK-NEXT: "needsConfirmation": true -# CHECK-NEXT: } -# CHECK-NEXT: }, # CHECK-NEXT: "documentChanges": [ # CHECK-NEXT: { # CHECK-NEXT: "edits": [ # CHECK-NEXT: { -# CHECK-NEXT: "annotationId": "RemoveAllUnusedIncludes0", # CHECK-NEXT: "newText": "", # CHECK-NEXT: "range": { # CHECK-NEXT: "end": { @@ -249,7 +218,6 @@ # CHECK-NEXT: } # CHECK-NEXT: }, # CHECK-NEXT: { -# CHECK-NEXT: "annotationId": "RemoveAllUnusedIncludes1", # CHECK-NEXT: "newText": "", # CHECK-NEXT: "range": { # CHECK-NEXT: "end": { @@ -263,7 +231,6 @@ # CHECK-NEXT: } # CHECK-NEXT: }, # CHECK-NEXT: { -# CHECK-NEXT: "annotationId": "AddAllMissingIncludes0", # CHECK-NEXT: "newText": "#include {{.*}}bar.h{{.*}}", # CHECK-NEXT: "range": { # CHECK-NEXT: "end": { @@ -277,7 +244,6 @@ # CHECK-NEXT: } # CHECK-NEXT: }, # CHECK-NEXT: { -# CHECK-NEXT: "annotationId": "AddAllMissingIncludes1", # CHECK-NEXT: "newText": "#include {{.*}}foo.h{{.*}}", # CHECK-NEXT: "range": { # CHECK-NEXT: "end": { @@ -342,21 +308,10 @@ # CHECK-NEXT: { # CHECK-NEXT: "arguments": [ # CHECK-NEXT: { -# CHECK-NEXT: "changeAnnotations": { -# CHECK-NEXT: "RemoveAllUnusedIncludes0": { -# CHECK-NEXT: "label": "", -# CHECK-NEXT: "needsConfirmation": true -# CHECK-NEXT: }, -# CHECK-NEXT: "RemoveAllUnusedIncludes1": { -# CHECK-NEXT: "label": "", -# CHECK-NEXT: "needsConfirmation": true -# CHECK-NEXT: } -# CHECK-NEXT: }, # CHECK-NEXT: "documentChanges": [ # CHECK-NEXT: { # CHECK-NEXT: "edits": [ # CHECK-NEXT: { -# CHECK-NEXT: "annotationId": "RemoveAllUnusedIncludes0", # CHECK-NEXT: "newText": "", # CHECK-NEXT: "range": { # CHECK-NEXT: "end": { @@ -370,7 +325,6 @@ # CHECK-NEXT: } # CHECK-NEXT: }, # CHECK-NEXT: { -# CHECK-NEXT: "annotationId": "RemoveAllUnusedIncludes1", # CHECK-NEXT: "newText": "", # CHECK-NEXT: "range": { # CHECK-NEXT: "end": { @@ -398,29 +352,10 @@ # CHECK-NEXT: { # CHECK-NEXT: "arguments": [ # CHECK-NEXT: { -# CHECK-NEXT: "changeAnnotations": { -# CHECK-NEXT: "AddAllMissingIncludes0": { -# CHECK-NEXT: "label": "", -# CHECK-NEXT: "needsConfirmation": true -# CHECK-NEXT: }, -# CHECK-NEXT: "AddAllMissingIncludes1": { -# CHECK-NEXT: "label": "", -# CHECK-NEXT: "needsConfirmation": true -# CHECK-NEXT: }, -# CHECK-NEXT: "RemoveAllUnusedIncludes0": { -# CHECK-NEXT: "label": "", -# CHECK-NEXT: "needsConfirmation": true -# CHECK-NEXT: }, -# CHECK-NEXT: "RemoveAllUnusedIncludes1": { -# CHECK-NEXT: "label": "", -# CHECK-NEXT: "needsConfirmation": true -# CHECK-NEXT: } -# CHECK-NEXT: }, # CHECK-NEXT: "documentChanges": [ # CHECK-NEXT: { # CHECK-NEXT: "edits": [ # CHECK-NEXT: { -# CHECK-NEXT: "annotationId": "RemoveAllUnusedIncludes0", # CHECK-NEXT: "newText": "", # CHECK-NEXT: "range": { # CHECK-NEXT: "end": { @@ -434,7 +369,6 @@ # CHECK-NEXT: } # CHECK-NEXT: }, # CHECK-NEXT: { -# CHECK-NEXT: "annotationId": "RemoveAllUnusedIncludes1", # CHECK-NEXT: "newText": "", # CHECK-NEXT: "range": { # CHECK-NEXT: 
"end": { @@ -448,7 +382,6 @@ # CHECK-NEXT: } # CHECK-NEXT: }, # CHECK-NEXT: { -# CHECK-NEXT: "annotationId": "AddAllMissingIncludes0", # CHECK-NEXT: "newText": "#include {{.*}}bar.h{{.*}}", # CHECK-NEXT: "range": { # CHECK-NEXT: "end": { @@ -462,7 +395,6 @@ # CHECK-NEXT: } # CHECK-NEXT: }, # CHECK-NEXT: { -# CHECK-NEXT: "annotationId": "AddAllMissingIncludes1", # CHECK-NEXT: "newText": "#include {{.*}}foo.h{{.*}}", # CHECK-NEXT: "range": { # CHECK-NEXT: "end": { From 202a4c0dfb19823a0c0fc737e32d205efaffb7ff Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Thu, 4 Jan 2024 13:17:20 +0100 Subject: [PATCH 223/313] [mlir][bazel] Fix BUILD after 6ae7f66ff5169ddc5a7b9ab545707042c77e036c --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 2a72bf965e544..7a4495e28caed 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -7629,10 +7629,10 @@ cc_library( ":Pass", ":Rewrite", ":RuntimeVerifiableOpInterface", - ":SideEffectInterfaces", ":Support", ":TransformUtils", ":TransformsPassIncGen", + ":config", "//llvm:Support", ], ) From e147dcbcbc8f92b7f4973eaebe800308f480dd84 Mon Sep 17 00:00:00 2001 From: HaohaiWen Date: Thu, 4 Jan 2024 20:49:11 +0800 Subject: [PATCH 224/313] [SEH] Add test to track EHa register liveness verification (#76921) This test tracks bug of MachineVerifier to check live range segment for EHa. Async exception can happen at any place within seh scope, not only the call instruction. Need to teach MachineVerifier to know that. --- .../X86/windows-seh-EHa-RegisterLiveness.ll | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 llvm/test/CodeGen/X86/windows-seh-EHa-RegisterLiveness.ll diff --git a/llvm/test/CodeGen/X86/windows-seh-EHa-RegisterLiveness.ll b/llvm/test/CodeGen/X86/windows-seh-EHa-RegisterLiveness.ll new file mode 100644 index 0000000000000..ff07f4ddf0054 --- /dev/null +++ b/llvm/test/CodeGen/X86/windows-seh-EHa-RegisterLiveness.ll @@ -0,0 +1,66 @@ +; XFAIL: * +; RUN: llc --verify-machineinstrs < %s +source_filename = "test.cpp" +target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-windows-msvc19.12.0" + +$"?test@Test@@Plugin@@Host@@@Z" = comdat any + +declare i32 @__CxxFrameHandler3(...) 
+ +; Function Attrs: nounwind memory(none) +declare void @llvm.seh.scope.begin() #1 + +; Function Attrs: nobuiltin allocsize(0) +declare ptr @"??2@Test@Z"(i64) #1 + +; Function Attrs: nounwind memory(none) +declare void @llvm.seh.scope.end() #0 + +; Function Attrs: nobuiltin nounwind +declare void @"??3@YAXPEAX@Z"(ptr) #2 + +; Function Attrs: mustprogress uwtable +define ptr @"?test@Test@@Plugin@@Host@@@Z"(ptr %this, ptr %host) #3 comdat align 2 personality ptr @__CxxFrameHandler3 { +entry: + %host.addr = alloca ptr, align 8 + %this.addr = alloca ptr, align 8 + store ptr %host, ptr %host.addr, align 8 + store ptr %this, ptr %this.addr, align 8 + %this1 = load ptr, ptr %this.addr, align 8 + %call = call noalias ptr @"??2@Test@Z"(i64 152) #5 + invoke void @llvm.seh.scope.begin() + to label %invoke.cont unwind label %ehcleanup + +invoke.cont: ; preds = %entry + %call3 = invoke ptr @"??Test@?A0x2749C4FD@@QEAA@Test@Test@@@Z"(ptr %call, ptr %this1) + to label %invoke.cont2 unwind label %ehcleanup + +invoke.cont2: ; preds = %invoke.cont + invoke void @llvm.seh.scope.end() + to label %invoke.cont4 unwind label %ehcleanup + +invoke.cont4: ; preds = %invoke.cont2 + ret ptr %call + +ehcleanup: ; preds = %invoke.cont2, %invoke.cont, %entry + %0 = cleanuppad within none [] + call void @"??3@YAXPEAX@Z"(ptr %call) #6 [ "funclet"(token %0) ] + cleanupret from %0 unwind to caller +} + +; Function Attrs: uwtable +declare hidden ptr @"??Test@?A0x2749C4FD@@QEAA@Test@Test@@@Z"(ptr, ptr) #4 align 2 + +attributes #0 = { nounwind memory(none) } +attributes #1 = { nobuiltin allocsize(0) "target-cpu"="x86-64" "target-features"="+cmov,+crc32,+cx8,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "tune-cpu"="generic" } +attributes #2 = { nobuiltin nounwind "target-cpu"="x86-64" "target-features"="+cmov,+crc32,+cx8,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "tune-cpu"="generic" } +attributes #3 = { mustprogress uwtable "target-cpu"="x86-64" "target-features"="+cmov,+crc32,+cx8,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "tune-cpu"="generic" } +attributes #4 = { uwtable "target-cpu"="x86-64" "target-features"="+cmov,+crc32,+cx8,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "tune-cpu"="generic" } +attributes #5 = { builtin allocsize(0) } +attributes #6 = { builtin nounwind } + +!llvm.module.flags = !{!1, !2} + +!1 = !{i32 2, !"eh-asynch", i32 1} +!2 = !{i32 7, !"uwtable", i32 2} From ca5d34ec7186f2b5750c7e67dcb8b2d0dc865d8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Thu, 4 Jan 2024 12:54:16 +0000 Subject: [PATCH 225/313] [mlir][TD] Fix the order of return handles (#76929) Replace (in tests and docs): %forall, %tiled = transform.structured.tile_using_forall with (updated order of return handles): %tiled, %forall = transform.structured.tile_using_forall Similar change is applied to (in the TD tutorial): transform.structured.fuse_into_containing_op This update makes sure that the tests/documentation are consistent with the Op specifications. Follow-up for #67320 which updated the order of the return handles for `tile_using_forall`. 
--- mlir/docs/Tutorials/transform/Ch1.md | 16 ++++++++-------- .../Linalg/TransformOps/LinalgTransformOps.td | 4 ++-- mlir/test/Dialect/GPU/transform-gpu-failing.mlir | 2 +- mlir/test/Dialect/Linalg/tile-to-forall.mlir | 4 ++-- .../Examples/transform/Ch1/invalidation-1.mlir | 4 ++-- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/mlir/docs/Tutorials/transform/Ch1.md b/mlir/docs/Tutorials/transform/Ch1.md index 0df25a5fbbdca..8d9dfac18b531 100644 --- a/mlir/docs/Tutorials/transform/Ch1.md +++ b/mlir/docs/Tutorials/transform/Ch1.md @@ -261,7 +261,7 @@ transform.sequence failures(propagate) { // The actual tiling transformation takes tile sizes as attributes. It // produces a handle to the loop generated during tiling. - %loop, %tiled_max = + %tiled_max, %loop = transform.structured.tile_using_forall %max tile_sizes [8, 32] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) @@ -271,12 +271,12 @@ transform.sequence failures(propagate) { // important. We could also use "transform.merge_handles" to obtain a single // handle to all operations and give it to `fuse_into_containing_op` that // would take care of the ordering in this case. - %loop_0, %tiled_add = + %add_fused, %loop_0 = transform.structured.fuse_into_containing_op %add into %loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - %loop_1, %tiled_matmul = - transform.structured.fuse_into_containing_op %arg1 into %loop + %matmul_fused, %loop_1 = + transform.structured.fuse_into_containing_op %arg1 into %loop_0 : (!transform.op<"linalg.matmul">, !transform.any_op) -> (!transform.any_op, !transform.any_op) @@ -304,7 +304,7 @@ transform.sequence failures(propagate) { // The actual tiling transformation takes tile sizes as attributes. It // produces a handle to the loop generated during tiling. - %loop, %tiled = transform.structured.tile_using_forall %max tile_sizes [8, 32] + %tiled, %loop = transform.structured.tile_using_forall %max tile_sizes [8, 32] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) // We can now fuse the other operations into the loop. Here, we fuse @@ -318,7 +318,7 @@ transform.sequence failures(propagate) { : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) %matmul_fused, %loop_1 = - transform.structured.fuse_into_containing_op %arg1 into %loop + transform.structured.fuse_into_containing_op %arg1 into %loop_0 : (!transform.op<"linalg.matmul">, !transform.any_op) -> (!transform.any_op, !transform.any_op) @@ -327,7 +327,7 @@ transform.sequence failures(propagate) { // "max" operation. This illustrates the precise targeting with the transform // dialect. Otherwise, it is difficult to differentiate "add" and "max", both // of which having the same kind. - %loop_2, %tiled_2 = + %tiled_2, %loop_2 = transform.structured.tile_using_forall %add_fused tile_sizes [4, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) %matmul_fused_2, %loop_3 = @@ -338,7 +338,7 @@ transform.sequence failures(propagate) { // Since outlining is currently only implemented for region-holding operations // such as loops, use tiling to size 1 to materialize the outer loop that is // going to be outlined. 
- %outline_target, %_ = + %_, %outline_target = transform.structured.tile_using_forall %tiled_2 tile_sizes [1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) transform.structured.fuse_into_containing_op %matmul_fused_2 diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td index 77ed9db5e71bd..bc257d17483e3 100644 --- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td +++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td @@ -1911,8 +1911,8 @@ def TileUsingForallOp : tiled operations, which can all be empty. These two returned handles point to: - - the new scf.forall op, - - the tiled op that implements TilingInterface. + - the tiled op that implements TilingInterface, + - the new scf.forall op. #### Example using `num_threads` diff --git a/mlir/test/Dialect/GPU/transform-gpu-failing.mlir b/mlir/test/Dialect/GPU/transform-gpu-failing.mlir index f81f8b64afdfc..8d7a1aa2a55fc 100644 --- a/mlir/test/Dialect/GPU/transform-gpu-failing.mlir +++ b/mlir/test/Dialect/GPU/transform-gpu-failing.mlir @@ -144,7 +144,7 @@ func.func @map_nested_forall_to_threads_not_buffer(%x: tensor<32x32xf32>, %y: te module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg0 : (!transform.any_op) -> !transform.any_op - %forall, %tiled = transform.structured.tile_using_forall %matmul num_threads [2, 3, 1] (mapping = [ #gpu.thread, #gpu.thread, #gpu.thread ] ) + %tiled, %forall = transform.structured.tile_using_forall %matmul num_threads [2, 3, 1] (mapping = [ #gpu.thread, #gpu.thread, #gpu.thread ] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op // expected-error @below {{only bufferized scf.forall can be mapped}} diff --git a/mlir/test/Dialect/Linalg/tile-to-forall.mlir b/mlir/test/Dialect/Linalg/tile-to-forall.mlir index 38742028e4810..2192d160b1150 100644 --- a/mlir/test/Dialect/Linalg/tile-to-forall.mlir +++ b/mlir/test/Dialect/Linalg/tile-to-forall.mlir @@ -389,7 +389,7 @@ module attributes {transform.with_named_sequence} { module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %forall, %tiled_generic = transform.structured.tile_using_forall %0 num_threads [7] + %tiled_generic, %forall = transform.structured.tile_using_forall %0 num_threads [7] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) transform.yield } @@ -445,7 +445,7 @@ module attributes {transform.with_named_sequence} { module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%IN_MAT2: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.generic"]} in %IN_MAT2 : (!transform.any_op) -> !transform.any_op - %forall, %tiled_generic = transform.structured.tile_using_forall %0 num_threads [4] + %tiled_generic, %forall = transform.structured.tile_using_forall %0 num_threads [4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) transform.yield } diff --git a/mlir/test/Examples/transform/Ch1/invalidation-1.mlir 
b/mlir/test/Examples/transform/Ch1/invalidation-1.mlir index 1b71cfbe9b360..825e2abe48229 100644 --- a/mlir/test/Examples/transform/Ch1/invalidation-1.mlir +++ b/mlir/test/Examples/transform/Ch1/invalidation-1.mlir @@ -19,7 +19,7 @@ transform.sequence failures(propagate) { %arg2: !transform.op<"linalg.elemwise_binary">): // The actual tiling transformation takes tile sizes as attributes. // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}} - %loop, %tiled = transform.structured.tile_using_forall %arg1 tile_sizes [4, 32] + %tiled, %loop = transform.structured.tile_using_forall %arg1 tile_sizes [4, 32] : (!transform.op<"linalg.matmul">) -> (!transform.any_op, !transform.any_op) // This is trying to use an invalidated handle leading to undefined behavior. @@ -64,7 +64,7 @@ transform.sequence failures(propagate) { // The actual tiling transformation takes tile sizes as attributes. // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}} - %loop, %tiled = transform.structured.tile_using_forall %arg1 tile_sizes [4, 32] + %tiled, %loop = transform.structured.tile_using_forall %arg1 tile_sizes [4, 32] : (!transform.op<"linalg.matmul">) -> (!transform.any_op, !transform.any_op) // Consuming an operand invalidates the consumed handle and any other handle that is From 71b3ead870107e39e998f6480e545eb01d9d28be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 4 Jan 2024 14:59:49 +0200 Subject: [PATCH 226/313] [clang][dataflow] Remove a redundant trailing semicolon. NFC. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This silences the following warning with GCC: llvm-project/llvm/tools/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp:89:4: warning: extra ‘;’ [-Wpedantic] 89 | }; | ^ | - --- .../Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp index 8d481788af208..fe5ba5ab5426f 100644 --- a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp @@ -86,7 +86,7 @@ class DataflowAnalysisTest : public Test { const std::optional &MaybeState = BlockStates[Block->getBlockID()]; assert(MaybeState.has_value()); return *MaybeState; - }; + } std::unique_ptr AST; std::unique_ptr CFCtx; From f8122518750e3563a79df22d72c26c3c922f63f2 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 4 Jan 2024 14:04:15 +0100 Subject: [PATCH 227/313] [ConstraintElim] Use SCEV to check for multiples (#76925) When adding constraints for induction variables, if the step is not one, we need to make sure that (end-start) is a multiple of step, otherwise we might step over the end value. Currently this only supports one specific pattern for pointers, where the end is a gep of the start with an appropriate offset. Generalize this by using SCEV to check for multiples, which also makes this work for integer IVs. 
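(Illustrative note, not part of the upstream patch: the snippet below is a hypothetical source-level sketch of the case this generalization covers, namely an integer induction variable with a non-unit step. The function name and the precondition comment are made up for illustration; the underlying point, taken from the description above, is that the facts derived from the IV are only sound when (end - start) is a multiple of the step, which the patch now proves through ScalarEvolution's getConstantMultiple() instead of pattern-matching an inbounds GEP on the start pointer.)

    // Hypothetical sketch: an integer IV that steps by 4. Constraint
    // elimination may assume roughly start <= iv < end inside the loop,
    // but with a non-unit step that is only sound when (end - start) is
    // a multiple of the step; otherwise iv can step over the end value.
    void zero_every_fourth(char *p, long start, long end) {
      // Assumed precondition: (end - start) % 4 == 0.
      for (long iv = start; iv != end; iv += 4)
        p[iv] = 0;
    }

That divisibility is also what allows the "icmp ult %iv, %end" checks in the updated monotonic-int-phis-multiples.ll tests below to fold to an unconditional true branch.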
--- .../Scalar/ConstraintElimination.cpp | 21 ++++--------------- .../monotonic-int-phis-multiples.ll | 15 +++++-------- 2 files changed, 9 insertions(+), 27 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index cc93c8617c05e..5e57aa7817516 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -968,23 +968,10 @@ void State::addInfoForInductions(BasicBlock &BB) { return; if (!StepOffset.isOne()) { - auto *UpperGEP = dyn_cast(B); - if (!UpperGEP || UpperGEP->getPointerOperand() != StartValue || - !UpperGEP->isInBounds()) - return; - - MapVector UpperVariableOffsets; - APInt UpperConstantOffset(StepOffset.getBitWidth(), 0); - const DataLayout &DL = BB.getModule()->getDataLayout(); - if (!UpperGEP->collectOffset(DL, StepOffset.getBitWidth(), - UpperVariableOffsets, UpperConstantOffset)) - return; - // All variable offsets and the constant offset have to be a multiple of the - // step. - if (!UpperConstantOffset.urem(StepOffset).isZero() || - any_of(UpperVariableOffsets, [&StepOffset](const auto &P) { - return !P.second.urem(StepOffset).isZero(); - })) + // Check whether B-Start is known to be a multiple of StepOffset. + const SCEV *BMinusStart = SE.getMinusSCEV(SE.getSCEV(B), StartSCEV); + if (isa(BMinusStart) || + !SE.getConstantMultiple(BMinusStart).urem(StepOffset).isZero()) return; } diff --git a/llvm/test/Transforms/ConstraintElimination/monotonic-int-phis-multiples.ll b/llvm/test/Transforms/ConstraintElimination/monotonic-int-phis-multiples.ll index 035cea4d73246..a952c4048b450 100644 --- a/llvm/test/Transforms/ConstraintElimination/monotonic-int-phis-multiples.ll +++ b/llvm/test/Transforms/ConstraintElimination/monotonic-int-phis-multiples.ll @@ -13,8 +13,7 @@ define void @multiple_pow2(i64 %count) { ; CHECK-NEXT: [[CMP_I_NOT:%.*]] = icmp eq i64 [[IV]], [[END]] ; CHECK-NEXT: br i1 [[CMP_I_NOT]], label [[EXIT:%.*]], label [[LOOP_LATCH]] ; CHECK: loop.latch: -; CHECK-NEXT: [[CMP2_I_I:%.*]] = icmp ult i64 [[IV]], [[END]] -; CHECK-NEXT: br i1 [[CMP2_I_I]], label [[LOOP]], label [[EXIT]] +; CHECK-NEXT: br i1 true, label [[LOOP]], label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -48,8 +47,7 @@ define void @multiple_pow2_larger_than_needed(i64 %count) { ; CHECK-NEXT: [[CMP_I_NOT:%.*]] = icmp eq i64 [[IV]], [[END]] ; CHECK-NEXT: br i1 [[CMP_I_NOT]], label [[EXIT:%.*]], label [[LOOP_LATCH]] ; CHECK: loop.latch: -; CHECK-NEXT: [[CMP2_I_I:%.*]] = icmp ult i64 [[IV]], [[END]] -; CHECK-NEXT: br i1 [[CMP2_I_I]], label [[LOOP]], label [[EXIT]] +; CHECK-NEXT: br i1 true, label [[LOOP]], label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -119,8 +117,7 @@ define void @multiple_pow2_start_offset(i64 %count) { ; CHECK-NEXT: [[CMP_I_NOT:%.*]] = icmp eq i64 [[IV]], [[END]] ; CHECK-NEXT: br i1 [[CMP_I_NOT]], label [[EXIT]], label [[LOOP_LATCH]] ; CHECK: loop.latch: -; CHECK-NEXT: [[CMP2_I_I:%.*]] = icmp ult i64 [[IV]], [[END]] -; CHECK-NEXT: br i1 [[CMP2_I_I]], label [[LOOP]], label [[EXIT]] +; CHECK-NEXT: br i1 true, label [[LOOP]], label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -194,8 +191,7 @@ define void @multiple_pow2_start_offset_dynamic(i64 %count) { ; CHECK-NEXT: [[CMP_I_NOT:%.*]] = icmp eq i64 [[IV]], [[END]] ; CHECK-NEXT: br i1 [[CMP_I_NOT]], label [[EXIT]], label [[LOOP_LATCH]] ; CHECK: loop.latch: -; CHECK-NEXT: [[CMP2_I_I:%.*]] = icmp ult i64 [[IV]], [[END]] -; CHECK-NEXT: br i1 [[CMP2_I_I]], label [[LOOP]], 
label [[EXIT]] +; CHECK-NEXT: br i1 true, label [[LOOP]], label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -231,8 +227,7 @@ define void @multiple_non_pow2_nuw(i64 %count) { ; CHECK-NEXT: [[CMP_I_NOT:%.*]] = icmp eq i64 [[IV]], [[END]] ; CHECK-NEXT: br i1 [[CMP_I_NOT]], label [[EXIT:%.*]], label [[LOOP_LATCH]] ; CHECK: loop.latch: -; CHECK-NEXT: [[CMP2_I_I:%.*]] = icmp ult i64 [[IV]], [[END]] -; CHECK-NEXT: br i1 [[CMP2_I_I]], label [[LOOP]], label [[EXIT]] +; CHECK-NEXT: br i1 true, label [[LOOP]], label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: ret void ; From 475890cd2e65d0e9fbd37a912cd359f12f1f7668 Mon Sep 17 00:00:00 2001 From: Shih-Po Hung Date: Thu, 4 Jan 2024 21:04:36 +0800 Subject: [PATCH 228/313] [RISCV][CostModel] Add getRISCVInstructionCost() to TTI for CostKind (#76793) Instruction cost for CodeSize and Latency/RecipThroughput can be very different. Considering the diversity of CostKind and vendor-specific cost, and how they are spread across various TTI functions, it's becoming quite a challenge to handle. This patch adds an interface getRISCVInstructionCost to address it. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 12 +- llvm/lib/Target/RISCV/RISCVISelLowering.h | 3 +- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 114 ++++++++++-- .../Target/RISCV/RISCVTargetTransformInfo.h | 3 + .../Analysis/CostModel/RISCV/rvv-shuffle.ll | 60 +++++++ .../CostModel/RISCV/shuffle-broadcast.ll | 99 +++++++++++ .../CostModel/RISCV/shuffle-insert.ll | 36 ++++ .../RISCV/shuffle-insert_subvector.ll | 166 ++++++++++++++++++ .../CostModel/RISCV/shuffle-permute.ll | 50 ++++++ .../CostModel/RISCV/shuffle-reverse.ll | 26 +++ .../CostModel/RISCV/shuffle-select.ll | 20 +++ .../CostModel/RISCV/shuffle-transpose.ll | 97 ++++++++++ llvm/test/Analysis/CostModel/RISCV/splice.ll | 53 ++++++ 13 files changed, 723 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index c8a94adcd91c6..ae0e6ef101329 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2711,11 +2711,19 @@ InstructionCost RISCVTargetLowering::getVRGatherVICost(MVT VT) const { return getLMULCost(VT); } -/// Return the cost of a vslidedown.vi/vx or vslideup.vi/vx instruction +/// Return the cost of a vslidedown.vx or vslideup.vx instruction /// for the type VT. (This does not cover the vslide1up or vslide1down /// variants.) Slides may be linear in the number of vregs implied by LMUL, /// or may track the vrgather.vv cost. It is implementation-dependent. -InstructionCost RISCVTargetLowering::getVSlideCost(MVT VT) const { +InstructionCost RISCVTargetLowering::getVSlideVXCost(MVT VT) const { + return getLMULCost(VT); +} + +/// Return the cost of a vslidedown.vi or vslideup.vi instruction +/// for the type VT. (This does not cover the vslide1up or vslide1down +/// variants.) Slides may be linear in the number of vregs implied by LMUL, +/// or may track the vrgather.vv cost. It is implementation-dependent. 
+InstructionCost RISCVTargetLowering::getVSlideVICost(MVT VT) const { return getLMULCost(VT); } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index eacae8a700584..226b51ddda6e4 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -525,7 +525,8 @@ class RISCVTargetLowering : public TargetLowering { InstructionCost getVRGatherVVCost(MVT VT) const; InstructionCost getVRGatherVICost(MVT VT) const; - InstructionCost getVSlideCost(MVT VT) const; + InstructionCost getVSlideVXCost(MVT VT) const; + InstructionCost getVSlideVICost(MVT VT) const; // Provide custom lowering hooks for some operations. SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 4614446b2150b..b3916c9870051 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -34,6 +34,65 @@ static cl::opt SLPMaxVF( "exclusively by SLP vectorizer."), cl::Hidden); +InstructionCost +RISCVTTIImpl::getRISCVInstructionCost(ArrayRef OpCodes, MVT VT, + TTI::TargetCostKind CostKind) { + size_t NumInstr = OpCodes.size(); + if (CostKind == TTI::TCK_CodeSize) + return NumInstr; + InstructionCost LMULCost = TLI->getLMULCost(VT); + if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency)) + return LMULCost * NumInstr; + InstructionCost Cost = 0; + for (auto Op : OpCodes) { + switch (Op) { + case RISCV::VRGATHER_VI: + Cost += TLI->getVRGatherVICost(VT); + break; + case RISCV::VRGATHER_VV: + Cost += TLI->getVRGatherVVCost(VT); + break; + case RISCV::VSLIDEUP_VI: + case RISCV::VSLIDEDOWN_VI: + Cost += TLI->getVSlideVICost(VT); + break; + case RISCV::VSLIDEUP_VX: + case RISCV::VSLIDEDOWN_VX: + Cost += TLI->getVSlideVXCost(VT); + break; + case RISCV::VREDMAX_VS: + case RISCV::VREDMIN_VS: + case RISCV::VREDMAXU_VS: + case RISCV::VREDMINU_VS: + case RISCV::VREDSUM_VS: + case RISCV::VREDAND_VS: + case RISCV::VREDOR_VS: + case RISCV::VREDXOR_VS: + case RISCV::VFREDMAX_VS: + case RISCV::VFREDMIN_VS: + case RISCV::VFREDUSUM_VS: { + unsigned VL = VT.getVectorMinNumElements(); + if (!VT.isFixedLengthVector()) + VL *= *getVScaleForTuning(); + Cost += Log2_32_Ceil(VL); + break; + } + case RISCV::VFREDOSUM_VS: { + unsigned VL = VT.getVectorMinNumElements(); + if (!VT.isFixedLengthVector()) + VL *= *getVScaleForTuning(); + Cost += VL; + break; + } + case RISCV::VMV_S_X: + // FIXME: VMV_S_X doesn't use LMUL, the cost should be 1 + default: + Cost += LMULCost; + } + } + return Cost; +} + InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) { assert(Ty->isIntegerTy() && @@ -281,7 +340,8 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, // Example sequence: // vnsrl.wi v10, v8, 0 if (equal(DeinterleaveMask, Mask)) - return LT.first * TLI->getLMULCost(LT.second); + return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI, + LT.second, CostKind); } } } @@ -292,7 +352,8 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, LT.second.getVectorNumElements() <= 256)) { VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, Tp->getContext()); InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind); - return IndexCost + TLI->getVRGatherVVCost(LT.second); + return IndexCost + + getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind); } 
[[fallthrough]]; } @@ -310,7 +371,10 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC); InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind); InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind); - return 2 * IndexCost + 2 * TLI->getVRGatherVVCost(LT.second) + MaskCost; + return 2 * IndexCost + + getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV}, + LT.second, CostKind) + + MaskCost; } [[fallthrough]]; } @@ -365,19 +429,24 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, // Example sequence: // vsetivli zero, 4, e8, mf2, tu, ma (ignored) // vslidedown.vi v8, v9, 2 - return LT.first * TLI->getVSlideCost(LT.second); + return LT.first * + getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind); case TTI::SK_InsertSubvector: // Example sequence: // vsetivli zero, 4, e8, mf2, tu, ma (ignored) // vslideup.vi v8, v9, 2 - return LT.first * TLI->getVSlideCost(LT.second); + return LT.first * + getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind); case TTI::SK_Select: { // Example sequence: // li a0, 90 // vsetivli zero, 8, e8, mf2, ta, ma (ignored) // vmv.s.x v0, a0 // vmerge.vvm v8, v9, v8, v0 - return LT.first * 3 * TLI->getLMULCost(LT.second); + return LT.first * + (TLI->getLMULCost(LT.second) + // FIXME: should be 1 for li + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM}, + LT.second, CostKind)); } case TTI::SK_Broadcast: { bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) == @@ -389,7 +458,10 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, // vsetivli zero, 2, e8, mf8, ta, ma (ignored) // vmv.v.x v8, a0 // vmsne.vi v0, v8, 0 - return LT.first * TLI->getLMULCost(LT.second) * 3; + return LT.first * + (TLI->getLMULCost(LT.second) + // FIXME: should be 1 for andi + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI}, + LT.second, CostKind)); } // Example sequence: // vsetivli zero, 2, e8, mf8, ta, mu (ignored) @@ -400,24 +472,38 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, // vmv.v.x v8, a0 // vmsne.vi v0, v8, 0 - return LT.first * TLI->getLMULCost(LT.second) * 6; + return LT.first * + (TLI->getLMULCost(LT.second) + // FIXME: this should be 1 for andi + TLI->getLMULCost( + LT.second) + // FIXME: vmv.x.s is the same as extractelement + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM, + RISCV::VMV_V_X, RISCV::VMSNE_VI}, + LT.second, CostKind)); } if (HasScalar) { // Example sequence: // vmv.v.x v8, a0 - return LT.first * TLI->getLMULCost(LT.second); + return LT.first * + getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind); } // Example sequence: // vrgather.vi v9, v8, 0 - return LT.first * TLI->getVRGatherVICost(LT.second); + return LT.first * + getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind); } - case TTI::SK_Splice: + case TTI::SK_Splice: { // vslidedown+vslideup. // TODO: Multiplying by LT.first implies this legalizes into multiple copies // of similar code, but I think we expand through memory. 
- return 2 * LT.first * TLI->getVSlideCost(LT.second); + unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX}; + if (Index >= 0 && Index < 32) + Opcodes[0] = RISCV::VSLIDEDOWN_VI; + else if (Index < 0 && Index > -32) + Opcodes[1] = RISCV::VSLIDEUP_VI; + return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind); + } case TTI::SK_Reverse: { // TODO: Cases to improve here: // * Illegal vector types @@ -437,7 +523,9 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, if (LT.second.isFixedLengthVector()) // vrsub.vi has a 5 bit immediate field, otherwise an li suffices LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1; - InstructionCost GatherCost = 2 + TLI->getVRGatherVVCost(LT.second); + // FIXME: replace the constant `2` below with cost of {VID_V,VRSUB_VX} + InstructionCost GatherCost = + 2 + getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind); // Mask operation additionally required extend and truncate InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0; return LT.first * (LenCost + GatherCost + ExtendCost); diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 4c955744b37df..7e5dbddb5b519 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -48,6 +48,9 @@ class RISCVTTIImpl : public BasicTTIImplBase { /// actual target hardware. unsigned getEstimatedVLFor(VectorType *Ty); + InstructionCost getRISCVInstructionCost(ArrayRef OpCodes, MVT VT, + TTI::TargetCostKind CostKind); + /// Return the cost of accessing a constant pool entry of the specified /// type. InstructionCost getConstantPoolLoadCost(Type *Ty, diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll index e6e0a4c7ae8fb..bd9f6af89a5cd 100644 --- a/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll +++ b/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll @@ -2,6 +2,7 @@ ; Check getShuffleCost for scalable vector ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=riscv64 -mattr=+m,+v < %s | FileCheck %s +; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=riscv64 -mattr=+m,+v -cost-kind=code-size < %s | FileCheck %s --check-prefix=SIZE define void @vector_broadcast() { ; CHECK-LABEL: 'vector_broadcast' @@ -18,6 +19,21 @@ define void @vector_broadcast() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %10 = shufflevector undef, undef, zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %11 = shufflevector undef, undef, zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SIZE-LABEL: 'vector_broadcast' +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %zero = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %6 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %8 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %9 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %10 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %11 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %zero = shufflevector undef, undef, zeroinitializer %1 = shufflevector undef, undef, zeroinitializer @@ -41,6 +57,13 @@ define void @vector_insert_extract( %v0, % ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %extract_scalable_from_scalable = call @llvm.vector.extract.nxv4i32.nxv16i32( %v1, i64 0) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert_scalable_into_scalable = call @llvm.vector.insert.nxv16i32.nxv4i32( %v1, %v0, i64 0) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SIZE-LABEL: 'vector_insert_extract' +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32( %v0, i64 0) +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert_fixed_into_scalable = call @llvm.vector.insert.nxv4i32.v16i32( %v0, <16 x i32> %v2, i64 0) +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %extract_scalable_from_scalable = call @llvm.vector.extract.nxv4i32.nxv16i32( %v1, i64 0) +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert_scalable_into_scalable = call @llvm.vector.insert.nxv16i32.nxv4i32( %v1, %v0, i64 0) +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32( %v0, i64 0) %insert_fixed_into_scalable = call @llvm.vector.insert.nxv4i32.v16i32( %v0, <16 x i32> %v2, i64 0) @@ -73,6 +96,26 @@ define void @vector_reverse() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv4i1 = call @llvm.experimental.vector.reverse.nxv4i1( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv2i1 = call @llvm.experimental.vector.reverse.nxv2i1( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SIZE-LABEL: 'vector_reverse' +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i8 = call @llvm.experimental.vector.reverse.nxv16i8( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv32i8 = call @llvm.experimental.vector.reverse.nxv32i8( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i16 = call @llvm.experimental.vector.reverse.nxv2i16( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i16 = call @llvm.experimental.vector.reverse.nxv4i16( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i16 = call @llvm.experimental.vector.reverse.nxv8i16( undef) +; SIZE-NEXT: Cost 
Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i16 = call @llvm.experimental.vector.reverse.nxv16i16( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i32 = call @llvm.experimental.vector.reverse.nxv4i32( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i32 = call @llvm.experimental.vector.reverse.nxv8i32( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i64 = call @llvm.experimental.vector.reverse.nxv2i64( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i64 = call @llvm.experimental.vector.reverse.nxv4i64( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i64 = call @llvm.experimental.vector.reverse.nxv8i64( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %reverse_nxv16i64 = call @llvm.experimental.vector.reverse.nxv16i64( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %reverse_nxv32i64 = call @llvm.experimental.vector.reverse.nxv32i64( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv16i1 = call @llvm.experimental.vector.reverse.nxv16i1( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv8i1 = call @llvm.experimental.vector.reverse.nxv8i1( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv4i1 = call @llvm.experimental.vector.reverse.nxv4i1( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv2i1 = call @llvm.experimental.vector.reverse.nxv2i1( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %reverse_nxv16i8 = call @llvm.experimental.vector.reverse.nxv16i8( undef) %reverse_nxv32i8 = call @llvm.experimental.vector.reverse.nxv32i8( undef) @@ -130,6 +173,23 @@ define void @vector_splice() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i1 = call @llvm.experimental.vector.splice.nxv4i1( zeroinitializer, zeroinitializer, i32 1) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i1 = call @llvm.experimental.vector.splice.nxv2i1( zeroinitializer, zeroinitializer, i32 1) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SIZE-LABEL: 'vector_splice' +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16i8 = call @llvm.experimental.vector.splice.nxv16i8( zeroinitializer, zeroinitializer, i32 1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv32i8 = call @llvm.experimental.vector.splice.nxv32i8( zeroinitializer, zeroinitializer, i32 1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i16 = call @llvm.experimental.vector.splice.nxv2i16( zeroinitializer, zeroinitializer, i32 1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i16 = call @llvm.experimental.vector.splice.nxv4i16( zeroinitializer, zeroinitializer, i32 1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i16 = call @llvm.experimental.vector.splice.nxv8i16( zeroinitializer, zeroinitializer, i32 1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16i16 = call @llvm.experimental.vector.splice.nxv16i16( zeroinitializer, zeroinitializer, i32 1) 
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i32 = call @llvm.experimental.vector.splice.nxv4i32( zeroinitializer, zeroinitializer, i32 1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i32 = call @llvm.experimental.vector.splice.nxv8i32( zeroinitializer, zeroinitializer, i32 1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i64 = call @llvm.experimental.vector.splice.nxv2i64( zeroinitializer, zeroinitializer, i32 1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i64 = call @llvm.experimental.vector.splice.nxv4i64( zeroinitializer, zeroinitializer, i32 1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16i1 = call @llvm.experimental.vector.splice.nxv16i1( zeroinitializer, zeroinitializer, i32 1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i1 = call @llvm.experimental.vector.splice.nxv8i1( zeroinitializer, zeroinitializer, i32 1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i1 = call @llvm.experimental.vector.splice.nxv4i1( zeroinitializer, zeroinitializer, i32 1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i1 = call @llvm.experimental.vector.splice.nxv2i1( zeroinitializer, zeroinitializer, i32 1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %splice_nxv16i8 = call @llvm.experimental.vector.splice.nxv16i8( zeroinitializer, zeroinitializer, i32 1) %splice_nxv32i8 = call @llvm.experimental.vector.splice.nxv32i8( zeroinitializer, zeroinitializer, i32 1) diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-broadcast.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-broadcast.ll index e281e2f8cc018..432b90d9305af 100644 --- a/llvm/test/Analysis/CostModel/RISCV/shuffle-broadcast.ll +++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-broadcast.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py ; RUN: opt < %s -passes="print" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh | FileCheck %s +; RUN: opt < %s -passes="print" -cost-kind=code-size 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh | FileCheck %s --check-prefix=SIZE define void @broadcast_scalable() #0{ ; CHECK-LABEL: 'broadcast_scalable' @@ -48,6 +49,53 @@ define void @broadcast_scalable() #0{ ; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %42 = shufflevector undef, undef, zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %43 = shufflevector undef, undef, zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SIZE-LABEL: 'broadcast_scalable' +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %zero = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%5 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %24 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %25 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %26 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %29 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %30 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %31 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %32 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %33 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %34 = shufflevector 
undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %35 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %36 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %37 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %38 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %39 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %40 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %41 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %42 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %43 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %zero = shufflevector undef, undef, zeroinitializer %1 = shufflevector undef, undef, zeroinitializer @@ -154,6 +202,57 @@ define void @broadcast_fixed() #0{ ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ins2 = insertelement <2 x i8> poison, i8 3, i32 0 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %45 = shufflevector <2 x i8> %ins2, <2 x i8> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SIZE-LABEL: 'broadcast_fixed' +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %zero = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = shufflevector <32 x half> undef, <32 x half> undef, <32 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = shufflevector <64 x half> undef, <64 x half> undef, <64 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = shufflevector <16 x float> undef, <16 x float> undef, <16 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = shufflevector <32 x float> undef, <32 x float> undef, <32 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = 
shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = shufflevector <16 x double> undef, <16 x double> undef, <16 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = shufflevector <128 x i8> undef, <128 x i8> undef, <128 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %24 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %25 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %26 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %29 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %30 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %31 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %33 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %34 = shufflevector 
<4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %35 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %36 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %37 = shufflevector <2 x i1> undef, <2 x i1> undef, <2 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %38 = shufflevector <4 x i1> undef, <4 x i1> undef, <4 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %39 = shufflevector <8 x i1> undef, <8 x i1> undef, <8 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %40 = shufflevector <16 x i1> undef, <16 x i1> undef, <16 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %41 = shufflevector <32 x i1> undef, <32 x i1> undef, <32 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %42 = shufflevector <64 x i1> undef, <64 x i1> undef, <64 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %43 = shufflevector <128 x i1> undef, <128 x i1> undef, <128 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ins1 = insertelement <128 x i1> poison, i1 poison, i32 0 +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %44 = shufflevector <128 x i1> %ins1, <128 x i1> poison, <128 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ins2 = insertelement <2 x i8> poison, i8 3, i32 0 +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %45 = shufflevector <2 x i8> %ins2, <2 x i8> undef, <2 x i32> zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %zero = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> zeroinitializer %1 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> zeroinitializer diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-insert.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-insert.ll index f5ec662519615..0f0c37cb6ae43 100644 --- a/llvm/test/Analysis/CostModel/RISCV/shuffle-insert.ll +++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-insert.ll @@ -1,11 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 2 ; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v | FileCheck %s -check-prefixes=CHECK,RV32 ; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v | FileCheck %s -check-prefixes=CHECK,RV64 +; RUN: opt < %s -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv32 -mattr=+v | FileCheck %s -check-prefixes=CHECK-SIZE,RV32-SIZE +; RUN: opt < %s -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv64 -mattr=+v | FileCheck %s -check-prefixes=CHECK-SIZE,RV64-SIZE define <8 x i8> @insert_subvector_middle_v8i8(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: 'insert_subvector_middle_v8i8' ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %res +; +; 
CHECK-SIZE-LABEL: 'insert_subvector_middle_v8i8' +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %res ; %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -15,6 +21,10 @@ define <8 x i8> @insert_subvector_end_v8i8(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: 'insert_subvector_end_v8i8' ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %res +; +; CHECK-SIZE-LABEL: 'insert_subvector_end_v8i8' +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %res ; %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -24,6 +34,10 @@ define <8 x i8> @insert_subvector_end_swapped_v8i8(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: 'insert_subvector_end_swapped_v8i8' ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %res +; +; CHECK-SIZE-LABEL: 'insert_subvector_end_swapped_v8i8' +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %res ; %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -33,6 +47,10 @@ define <8 x i8> @insert_subvector_short_v8i8(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: 'insert_subvector_short_v8i8' ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %res +; +; CHECK-SIZE-LABEL: 'insert_subvector_short_v8i8' +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %res ; %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -42,6 +60,10 @@ define <8 x i8> @insert_subvector_offset_1_v8i8(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: 'insert_subvector_offset_1_v8i8' ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %res +; +; CHECK-SIZE-LABEL: 'insert_subvector_offset_1_v8i8' +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %res ; %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -51,6 +73,10 @@ define <8 x i64> @insert_subvector_offset_1_v8i64(<8 x i64> %v, <8 x i64> %w) { ; CHECK-LABEL: 'insert_subvector_offset_1_v8i64' ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> ; CHECK-NEXT: Cost Model: 
Found an estimated cost of 0 for instruction: ret <8 x i64> %res +; +; CHECK-SIZE-LABEL: 'insert_subvector_offset_1_v8i64' +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %res ; %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> ret <8 x i64> %res @@ -61,6 +87,10 @@ define <12 x i8> @insert_subvector_concat_v6i8(<6 x i8> %x, <6 x i8> %y) { ; CHECK-LABEL: 'insert_subvector_concat_v6i8' ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = shufflevector <6 x i8> %x, <6 x i8> %y, <12 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <12 x i8> %a +; +; CHECK-SIZE-LABEL: 'insert_subvector_concat_v6i8' +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = shufflevector <6 x i8> %x, <6 x i8> %y, <12 x i32> +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <12 x i8> %a ; %a = shufflevector <6 x i8> %x, <6 x i8> %y, <12 x i32> ret <12 x i8> %a @@ -71,6 +101,10 @@ define <8 x i8> @insert_subvector_concat_v8i8(<4 x i8> %x, <4 x i8> %y) { ; CHECK-LABEL: 'insert_subvector_concat_v8i8' ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = shufflevector <4 x i8> %x, <4 x i8> %y, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %a +; +; CHECK-SIZE-LABEL: 'insert_subvector_concat_v8i8' +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = shufflevector <4 x i8> %x, <4 x i8> %y, <8 x i32> +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %a ; %a = shufflevector <4 x i8> %x, <4 x i8> %y, <8 x i32> ret <8 x i8> %a @@ -78,4 +112,6 @@ define <8 x i8> @insert_subvector_concat_v8i8(<4 x i8> %x, <4 x i8> %y) { ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; RV32: {{.*}} +; RV32-SIZE: {{.*}} ; RV64: {{.*}} +; RV64-SIZE: {{.*}} diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-insert_subvector.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-insert_subvector.ll index d0e2a7c71a107..af656b4d7976d 100644 --- a/llvm/test/Analysis/CostModel/RISCV/shuffle-insert_subvector.ll +++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-insert_subvector.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py ; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfh -riscv-v-vector-bits-min=-1 | FileCheck %s +; RUN: opt < %s -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfh -riscv-v-vector-bits-min=-1 | FileCheck %s --check-prefix=SIZE ; Check that we don't crash querying costs when vectors are not enabled. 
; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=riscv32 @@ -17,6 +18,20 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SIZE-LABEL: 'test_vXf64' +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <2 x double> %src128, <2 x double> undef, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <2 x double> %src128, <2 x double> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %src128_256 = shufflevector <2 x double> %src128, <2 x double> undef, <4 x i32> %src128_512 = shufflevector <2 x double> %src128, <2 x double> undef, <8 x i32> @@ -47,6 +62,20 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SIZE-LABEL: 'test_vXi64' +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <2 x i64> %src128, <2 x i64> undef, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <2 x i64> %src128, <2 x i64> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %src128_256 = shufflevector <2 x i64> %src128, <2 x i64> undef, <4 x i32> %src128_512 = shufflevector <2 x i64> %src128, <2 x i64> undef, <8 x i32> @@ -94,6 +123,37 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SIZE-LABEL: 'test_vXf32' +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, 
<8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> @@ -160,6 +220,37 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256, ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SIZE-LABEL: 'test_vXi32' +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x i32> %src64, <2 x 
i32> undef, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x i32> %src64, <2 x i32> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x i32> %src64, <2 x i32> undef, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x i32> %src128, <4 x i32> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x i32> %src128, <4 x i32> undef, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = 
shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %src64_128 = shufflevector <2 x i32> %src64, <2 x i32> undef, <4 x i32> %src64_256 = shufflevector <2 x i32> %src64, <2 x i32> undef, <8 x i32> @@ -230,6 +321,41 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, < ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SIZE-LABEL: 'test_vXi16' +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src32_64 = shufflevector <2 x i16> %src32, <2 x i16> undef, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src32_128 = shufflevector <2 x i16> %src32, <2 x i16> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src32_256 = shufflevector <2 x i16> %src32, <2 x i16> undef, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src32_512 = shufflevector <2 x i16> %src32, <2 x i16> undef, <32 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <4 x i16> %src64, <4 x i16> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <4 x i16> %src64, <4 x i16> undef, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <4 x i16> %src64, <4 x i16> undef, <32 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <8 x i16> %src128, <8 x i16> undef, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <8 x i16> %src128, <8 x i16> undef, <32 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <16 x i16> %src256, <16 x i16> undef, <32 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> %src32_64, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> %src32_64, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %src32_64 = shufflevector <2 x i16> %src32, <2 x i16> undef, <4 x i32> %src32_128 = shufflevector <2 x i16> %src32, <2 x i16> undef, <8 x i32> @@ -310,6 +436,46 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16x i ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> %src64_128, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> %src64_128, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SIZE-LABEL: 'test_vXi8' +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src16_32 = shufflevector <2 x i8> %src16, <2 x i8> undef, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src16_64 = shufflevector <2 x i8> %src16, <2 x i8> undef, <8 x 
i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src16_128 = shufflevector <2 x i8> %src16, <2 x i8> undef, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src16_256 = shufflevector <2 x i8> %src16, <2 x i8> undef, <32 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src16_512 = shufflevector <2 x i8> %src16, <2 x i8> undef, <64 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src32_64 = shufflevector <4 x i8> %src32, <4 x i8> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src32_128 = shufflevector <4 x i8> %src32, <4 x i8> undef, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src32_256 = shufflevector <4 x i8> %src32, <4 x i8> undef, <32 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src32_512 = shufflevector <4 x i8> %src32, <4 x i8> undef, <64 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <8 x i8> %src64, <8 x i8> undef, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <8 x i8> %src64, <8 x i8> undef, <32 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <8 x i8> %src64, <8 x i8> undef, <64 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <16 x i8> %src128, <16 x i8> undef, <32 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <16 x i8> %src128, <16 x i8> undef, <64 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <32 x i8> %src256, <32 x i8> undef, <64 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32_01 = shufflevector <4 x i8> %src32, <4 x i8> %src16_32, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32_23 = shufflevector <4 x i8> %src32, <4 x i8> %src16_32, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> %src32_64, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> %src32_64, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> %src32_64, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> %src32_64, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> %src32_64, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> %src32_64, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> %src16_128, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> %src16_128, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> %src16_128, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = 
shufflevector <16 x i8> %src128, <16 x i8> %src16_128, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> %src16_128, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> %src16_128, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> %src16_128, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> %src16_128, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> %src32_128, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> %src32_128, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> %src32_128, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> %src32_128, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> %src64_128, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> %src64_128, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %src16_32 = shufflevector <2 x i8> %src16, <2 x i8> undef, <4 x i32> %src16_64 = shufflevector <2 x i8> %src16, <2 x i8> undef, <8 x i32> diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-permute.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-permute.ll index 8f8ec20451f2b..8e98d6e3b60fb 100644 --- a/llvm/test/Analysis/CostModel/RISCV/shuffle-permute.ll +++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-permute.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py ; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfh | FileCheck %s +; RUN: opt < %s -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfh | FileCheck %s --check-prefix=SIZE ; Check that we don't crash querying costs when vectors are not enabled. 
; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=riscv32 @@ -23,6 +24,24 @@ define void @general_permute_single_source() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SIZE-LABEL: 'general_permute_single_source' +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> @@ -81,6 +100,37 @@ define void @general_permute_two_source() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %v8double = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 139 for instruction: %v16double = shufflevector <16 x double> undef, <16 x double> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SIZE-LABEL: 'general_permute_two_source' +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 
11 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2half = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4half = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8half = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v16half = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2float = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4float = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v8float = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v16float = shufflevector <16 x float> undef, <16 x float> undef, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2double = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4double = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> +; SIZE-NEXT: Cost Model: Found an 
estimated cost of 11 for instruction: %v8double = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v16double = shufflevector <16 x double> undef, <16 x double> undef, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll index 4393505d3b28d..17deeb2cfafa6 100644 --- a/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll +++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py ; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfh -riscv-v-vector-bits-min=-1 | FileCheck %s +; RUN: opt < %s -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfh -riscv-v-vector-bits-min=-1 | FileCheck %s --check-prefixes=SIZE ; Check that we don't crash querying costs when vectors are not enabled. ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=riscv32 @@ -31,6 +32,31 @@ define void @reverse() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SIZE-LABEL: 'reverse' +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> 
undef, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-select.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-select.ll index 8f47c481fff48..81454cc826e14 100644 --- a/llvm/test/Analysis/CostModel/RISCV/shuffle-select.ll +++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-select.ll @@ -1,11 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 2 ; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v | FileCheck %s -check-prefixes=CHECK,RV32 ; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v | FileCheck %s -check-prefixes=CHECK,RV64 +; RUN: opt < %s -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv32 -mattr=+v | FileCheck %s -check-prefixes=CHECK-SIZE,RV32-SIZE +; RUN: opt < %s -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv64 -mattr=+v | FileCheck %s -check-prefixes=CHECK-SIZE,RV64-SIZE define <8 x i8> @select_start_v8i8(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: 'select_start_v8i8' ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %res +; +; CHECK-SIZE-LABEL: 'select_start_v8i8' +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %res ; %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -15,6 +21,10 @@ define <8 x i8> @select_non_contiguous_v8i8(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: 'select_non_contiguous_v8i8' ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %res +; +; CHECK-SIZE-LABEL: 'select_non_contiguous_v8i8' +; 
CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %res ; %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -24,6 +34,10 @@ define <8 x i64> @select_start_v8i64(<8 x i64> %v, <8 x i64> %w) { ; CHECK-LABEL: 'select_start_v8i64' ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %res +; +; CHECK-SIZE-LABEL: 'select_start_v8i64' +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %res ; %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> ret <8 x i64> %res @@ -33,10 +47,16 @@ define <8 x i64> @select_non_contiguous_v8i64(<8 x i64> %v, <8 x i64> %w) { ; CHECK-LABEL: 'select_non_contiguous_v8i64' ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %res +; +; CHECK-SIZE-LABEL: 'select_non_contiguous_v8i64' +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %res ; %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> ret <8 x i64> %res } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; RV32: {{.*}} +; RV32-SIZE: {{.*}} ; RV64: {{.*}} +; RV64-SIZE: {{.*}} diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-transpose.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-transpose.ll index 8d7457ee5de67..c3f20c858ba5e 100644 --- a/llvm/test/Analysis/CostModel/RISCV/shuffle-transpose.ll +++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-transpose.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py ; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfh -riscv-v-vector-bits-min=-1 | FileCheck %s +; RUN: opt < %s -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfh -riscv-v-vector-bits-min=-1 | FileCheck %s --check-prefix=SIZE ; Check that we don't crash querying costs when vectors are not enabled. 
; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=riscv32 @@ -9,6 +10,10 @@ define <8 x i8> @trn1.v8i8(<8 x i8> %v0, <8 x i8> %v1) { ; CHECK-LABEL: 'trn1.v8i8' ; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %tmp0 +; +; SIZE-LABEL: 'trn1.v8i8' +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %tmp0 ; %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> ret <8 x i8> %tmp0 @@ -18,6 +23,10 @@ define <8 x i8> @trn2.v8i8(<8 x i8> %v0, <8 x i8> %v1) { ; CHECK-LABEL: 'trn2.v8i8' ; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %tmp0 +; +; SIZE-LABEL: 'trn2.v8i8' +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %tmp0 ; %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> ret <8 x i8> %tmp0 @@ -27,6 +36,10 @@ define <16 x i8> @trn1.v16i8(<16 x i8> %v0, <16 x i8> %v1) { ; CHECK-LABEL: 'trn1.v16i8' ; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %tmp0 +; +; SIZE-LABEL: 'trn1.v16i8' +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %tmp0 ; %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ret <16 x i8> %tmp0 @@ -36,6 +49,10 @@ define <16 x i8> @trn2.v16i8(<16 x i8> %v0, <16 x i8> %v1) { ; CHECK-LABEL: 'trn2.v16i8' ; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %tmp0 +; +; SIZE-LABEL: 'trn2.v16i8' +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %tmp0 ; %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ret <16 x i8> %tmp0 @@ -45,6 +62,10 @@ define <4 x i16> @trn1.v4i16(<4 x i16> %v0, <4 x i16> %v1) { ; CHECK-LABEL: 'trn1.v4i16' ; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %tmp0 +; +; SIZE-LABEL: 'trn1.v4i16' +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i16> %tmp0 ; %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> ret <4 x i16> %tmp0 @@ -54,6 +75,10 @@ define <4 x i16> @trn2.v4i16(<4 x i16> %v0, <4 x i16> %v1) { ; CHECK-LABEL: 'trn2.v4i16' ; CHECK-NEXT: Cost 
Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %tmp0 +; +; SIZE-LABEL: 'trn2.v4i16' +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i16> %tmp0 ; %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> ret <4 x i16> %tmp0 @@ -63,6 +88,10 @@ define <8 x i16> @trn1.v8i16(<8 x i16> %v0, <8 x i16> %v1) { ; CHECK-LABEL: 'trn1.v8i16' ; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %tmp0 +; +; SIZE-LABEL: 'trn1.v8i16' +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %tmp0 ; %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> ret <8 x i16> %tmp0 @@ -72,6 +101,10 @@ define <8 x i16> @trn2.v8i16(<8 x i16> %v0, <8 x i16> %v1) { ; CHECK-LABEL: 'trn2.v8i16' ; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %tmp0 +; +; SIZE-LABEL: 'trn2.v8i16' +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %tmp0 ; %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> ret <8 x i16> %tmp0 @@ -81,6 +114,10 @@ define <2 x i32> @trn1.v2i32(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: 'trn1.v2i32' ; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %tmp0 +; +; SIZE-LABEL: 'trn1.v2i32' +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %tmp0 ; %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> ret <2 x i32> %tmp0 @@ -90,6 +127,10 @@ define <2 x i32> @trn2.v2i32(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: 'trn2.v2i32' ; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %tmp0 +; +; SIZE-LABEL: 'trn2.v2i32' +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %tmp0 ; %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> ret <2 x i32> %tmp0 @@ -99,6 +140,10 @@ define <4 x i32> @trn1.v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: 'trn1.v4i32' ; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost 
of 0 for instruction: ret <4 x i32> %tmp0 +; +; SIZE-LABEL: 'trn1.v4i32' +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %tmp0 ; %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> ret <4 x i32> %tmp0 @@ -108,6 +153,10 @@ define <4 x i32> @trn2.v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: 'trn2.v4i32' ; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %tmp0 +; +; SIZE-LABEL: 'trn2.v4i32' +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %tmp0 ; %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> ret <4 x i32> %tmp0 @@ -117,6 +166,10 @@ define <2 x i64> @trn1.v2i64(<2 x i64> %v0, <2 x i64> %v1) { ; CHECK-LABEL: 'trn1.v2i64' ; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %tmp0 +; +; SIZE-LABEL: 'trn1.v2i64' +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %tmp0 ; %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> ret <2 x i64> %tmp0 @@ -126,6 +179,10 @@ define <2 x i64> @trn2.v2i64(<2 x i64> %v0, <2 x i64> %v1) { ; CHECK-LABEL: 'trn2.v2i64' ; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %tmp0 +; +; SIZE-LABEL: 'trn2.v2i64' +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %tmp0 ; %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> ret <2 x i64> %tmp0 @@ -135,6 +192,10 @@ define <2 x float> @trn1.v2f32(<2 x float> %v0, <2 x float> %v1) { ; CHECK-LABEL: 'trn1.v2f32' ; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %tmp0 +; +; SIZE-LABEL: 'trn1.v2f32' +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %tmp0 ; %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> ret <2 x float> %tmp0 @@ -144,6 +205,10 @@ define <2 x float> @trn2.v2f32(<2 x float> %v0, <2 x float> %v1) { ; CHECK-LABEL: 'trn2.v2f32' ; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %tmp0 +; +; SIZE-LABEL: 'trn2.v2f32' +; SIZE-NEXT: Cost Model: Found an estimated cost 
of 11 for instruction: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %tmp0 ; %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> ret <2 x float> %tmp0 @@ -153,6 +218,10 @@ define <4 x float> @trn1.v4f32(<4 x float> %v0, <4 x float> %v1) { ; CHECK-LABEL: 'trn1.v4f32' ; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %tmp0 +; +; SIZE-LABEL: 'trn1.v4f32' +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %tmp0 ; %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> ret <4 x float> %tmp0 @@ -162,6 +231,10 @@ define <4 x float> @trn2.v4f32(<4 x float> %v0, <4 x float> %v1) { ; CHECK-LABEL: 'trn2.v4f32' ; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %tmp0 +; +; SIZE-LABEL: 'trn2.v4f32' +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %tmp0 ; %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> ret <4 x float> %tmp0 @@ -171,6 +244,10 @@ define <2 x double> @trn1.v2f64(<2 x double> %v0, <2 x double> %v1) { ; CHECK-LABEL: 'trn1.v2f64' ; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %tmp0 +; +; SIZE-LABEL: 'trn1.v2f64' +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %tmp0 ; %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> ret <2 x double> %tmp0 @@ -180,6 +257,10 @@ define <2 x double> @trn2.v2f64(<2 x double> %v0, <2 x double> %v1) { ; CHECK-LABEL: 'trn2.v2f64' ; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %tmp0 +; +; SIZE-LABEL: 'trn2.v2f64' +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %tmp0 ; %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> ret <2 x double> %tmp0 @@ -189,6 +270,10 @@ define <4 x half> @trn1.v4f16(<4 x half> %v0, <4 x half> %v1) { ; CHECK-LABEL: 'trn1.v4f16' ; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x half> %tmp0 +; +; SIZE-LABEL: 'trn1.v4f16' +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for 
instruction: %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x half> %tmp0 ; %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> ret <4 x half> %tmp0 @@ -198,6 +283,10 @@ define <4 x half> @trn2.v4f16(<4 x half> %v0, <4 x half> %v1) { ; CHECK-LABEL: 'trn2.v4f16' ; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x half> %tmp0 +; +; SIZE-LABEL: 'trn2.v4f16' +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x half> %tmp0 ; %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> ret <4 x half> %tmp0 @@ -207,6 +296,10 @@ define <8 x half> @trn1.v8f16(<8 x half> %v0, <8 x half> %v1) { ; CHECK-LABEL: 'trn1.v8f16' ; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x half> %tmp0 +; +; SIZE-LABEL: 'trn1.v8f16' +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x half> %tmp0 ; %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> ret <8 x half> %tmp0 @@ -216,6 +309,10 @@ define <8 x half> @trn2.v8f16(<8 x half> %v0, <8 x half> %v1) { ; CHECK-LABEL: 'trn2.v8f16' ; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x half> %tmp0 +; +; SIZE-LABEL: 'trn2.v8f16' +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x half> %tmp0 ; %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> ret <8 x half> %tmp0 diff --git a/llvm/test/Analysis/CostModel/RISCV/splice.ll b/llvm/test/Analysis/CostModel/RISCV/splice.ll index 89015c48c6c47..c70c879dba5ab 100644 --- a/llvm/test/Analysis/CostModel/RISCV/splice.ll +++ b/llvm/test/Analysis/CostModel/RISCV/splice.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py ; RUN: opt < %s -passes="print" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh | FileCheck %s +; RUN: opt < %s -passes="print" -cost-kind=code-size 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh | FileCheck %s --check-prefix=SIZE define void @vector_splice() { ; CHECK-LABEL: 'vector_splice' @@ -53,6 +54,58 @@ define void @vector_splice() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %splice.nxv32f64 = call @llvm.experimental.vector.splice.nxv32f64( zeroinitializer, zeroinitializer, i32 -1) ; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %splice.nxv64f64 = call @llvm.experimental.vector.splice.nxv64f64( zeroinitializer, zeroinitializer, i32 -1) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SIZE-LABEL: 'vector_splice' +; SIZE-NEXT: 
Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i8 = call @llvm.experimental.vector.splice.nxv1i8( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i8 = call @llvm.experimental.vector.splice.nxv2i8( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i8 = call @llvm.experimental.vector.splice.nxv4i8( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i8 = call @llvm.experimental.vector.splice.nxv8i8( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16i8 = call @llvm.experimental.vector.splice.nxv16i8( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv32i8 = call @llvm.experimental.vector.splice.nxv32i8( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv64i8 = call @llvm.experimental.vector.splice.nxv64i8( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i16 = call @llvm.experimental.vector.splice.nxv1i16( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i16 = call @llvm.experimental.vector.splice.nxv2i16( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i16 = call @llvm.experimental.vector.splice.nxv4i16( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i16 = call @llvm.experimental.vector.splice.nxv8i16( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16i16 = call @llvm.experimental.vector.splice.nxv16i16( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv32i16 = call @llvm.experimental.vector.splice.nxv32i16( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv64i16 = call @llvm.experimental.vector.splice.nxv64i16( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i32 = call @llvm.experimental.vector.splice.nxv1i32( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i32 = call @llvm.experimental.vector.splice.nxv2i32( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i32 = call @llvm.experimental.vector.splice.nxv4i32( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i32 = call @llvm.experimental.vector.splice.nxv8i32( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16i32 = call @llvm.experimental.vector.splice.nxv16i32( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv32i32 = call @llvm.experimental.vector.splice.nxv32i32( zeroinitializer, 
zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv64i32 = call @llvm.experimental.vector.splice.nxv64i32( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i64 = call @llvm.experimental.vector.splice.nxv1i64( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i64 = call @llvm.experimental.vector.splice.nxv2i64( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i64 = call @llvm.experimental.vector.splice.nxv4i64( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i64 = call @llvm.experimental.vector.splice.nxv8i64( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv16i64 = call @llvm.experimental.vector.splice.nxv16i64( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv32i64 = call @llvm.experimental.vector.splice.nxv32i64( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv64i64 = call @llvm.experimental.vector.splice.nxv64i64( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f16 = call @llvm.experimental.vector.splice.nxv1f16( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f16 = call @llvm.experimental.vector.splice.nxv2f16( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f16 = call @llvm.experimental.vector.splice.nxv4f16( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8f16 = call @llvm.experimental.vector.splice.nxv8f16( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16f16 = call @llvm.experimental.vector.splice.nxv16f16( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv32f16 = call @llvm.experimental.vector.splice.nxv32f16( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv64f16 = call @llvm.experimental.vector.splice.nxv64f16( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f32 = call @llvm.experimental.vector.splice.nxv1f32( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f32 = call @llvm.experimental.vector.splice.nxv2f32( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f32 = call @llvm.experimental.vector.splice.nxv4f32( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8f32 = call @llvm.experimental.vector.splice.nxv8f32( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16f32 = call 
@llvm.experimental.vector.splice.nxv16f32( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv32f32 = call @llvm.experimental.vector.splice.nxv32f32( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv64f32 = call @llvm.experimental.vector.splice.nxv64f32( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f64 = call @llvm.experimental.vector.splice.nxv1f64( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f64 = call @llvm.experimental.vector.splice.nxv2f64( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f64 = call @llvm.experimental.vector.splice.nxv4f64( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8f64 = call @llvm.experimental.vector.splice.nxv8f64( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv16f64 = call @llvm.experimental.vector.splice.nxv16f64( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv32f64 = call @llvm.experimental.vector.splice.nxv32f64( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv64f64 = call @llvm.experimental.vector.splice.nxv64f64( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %splice.nxv1i8 = call @llvm.experimental.vector.splice.nxv1i8( zeroinitializer, zeroinitializer, i32 -1) %splice.nxv2i8 = call @llvm.experimental.vector.splice.nxv2i8( zeroinitializer, zeroinitializer, i32 -1) From 26ff659c39a67fe60517b45d4a954231f77c1350 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 4 Jan 2024 13:16:35 +0000 Subject: [PATCH 229/313] [AMDGPU] Remove some unused check prefixes --- .../GlobalISel/extractelement-stack-lower.ll | 278 ------------------ 1 file changed, 278 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll index ae78f4ee02f37..e132f7a0ec757 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll @@ -31,86 +31,6 @@ define i32 @v_extract_v64i32_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; GFX12-NEXT: global_load_b32 v0, v[0:1], off ; GFX12-NEXT: s_waitcnt vmcnt(0) ; GFX12-NEXT: s_setpc_b64 s[30:31] -; GCN-GFX12-LABEL: v_extract_v64i32_varidx: -; GCN-GFX12: ; %bb.0: -; GCN-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-GFX12-NEXT: s_wait_expcnt 0x0 -; GCN-GFX12-NEXT: s_wait_storecnt 0x0 -; GCN-GFX12-NEXT: s_wait_samplecnt 0x0 -; GCN-GFX12-NEXT: s_wait_bvhcnt 0x0 -; GCN-GFX12-NEXT: s_wait_kmcnt 0x0 -; GCN-GFX12-NEXT: s_clause 0xf -; GCN-GFX12-NEXT: global_load_b128 v[3:6], v[0:1], off -; GCN-GFX12-NEXT: global_load_b128 v[7:10], v[0:1], off offset:16 -; GCN-GFX12-NEXT: global_load_b128 v[11:14], v[0:1], off offset:32 -; GCN-GFX12-NEXT: global_load_b128 v[15:18], v[0:1], off offset:48 -; GCN-GFX12-NEXT: global_load_b128 v[19:22], v[0:1], off offset:64 -; GCN-GFX12-NEXT: global_load_b128 v[23:26], v[0:1], off 
offset:80 -; GCN-GFX12-NEXT: global_load_b128 v[27:30], v[0:1], off offset:96 -; GCN-GFX12-NEXT: global_load_b128 v[31:34], v[0:1], off offset:112 -; GCN-GFX12-NEXT: global_load_b128 v[35:38], v[0:1], off offset:128 -; GCN-GFX12-NEXT: global_load_b128 v[48:51], v[0:1], off offset:144 -; GCN-GFX12-NEXT: global_load_b128 v[52:55], v[0:1], off offset:160 -; GCN-GFX12-NEXT: global_load_b128 v[64:67], v[0:1], off offset:176 -; GCN-GFX12-NEXT: global_load_b128 v[68:71], v[0:1], off offset:192 -; GCN-GFX12-NEXT: global_load_b128 v[80:83], v[0:1], off offset:208 -; GCN-GFX12-NEXT: global_load_b128 v[84:87], v[0:1], off offset:224 -; GCN-GFX12-NEXT: global_load_b128 v[96:99], v[0:1], off offset:240 -; GCN-GFX12-NEXT: v_and_b32_e32 v0, 63, v2 -; GCN-GFX12-NEXT: s_mov_b32 s0, s33 -; GCN-GFX12-NEXT: s_add_co_i32 s33, s32, 0xff -; GCN-GFX12-NEXT: s_addk_co_i32 s32, 0x300 -; GCN-GFX12-NEXT: s_and_b32 s33, s33, 0xffffff00 -; GCN-GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-GFX12-NEXT: s_addk_co_i32 s32, 0xfd00 -; GCN-GFX12-NEXT: s_wait_loadcnt 0xf -; GCN-GFX12-NEXT: scratch_store_b128 off, v[3:6], s33 -; GCN-GFX12-NEXT: s_wait_loadcnt 0xe -; GCN-GFX12-NEXT: scratch_store_b128 off, v[7:10], s33 offset:16 -; GCN-GFX12-NEXT: s_wait_loadcnt 0xd -; GCN-GFX12-NEXT: scratch_store_b128 off, v[11:14], s33 offset:32 -; GCN-GFX12-NEXT: s_wait_loadcnt 0xc -; GCN-GFX12-NEXT: scratch_store_b128 off, v[15:18], s33 offset:48 -; GCN-GFX12-NEXT: s_wait_loadcnt 0xb -; GCN-GFX12-NEXT: scratch_store_b128 off, v[19:22], s33 offset:64 -; GCN-GFX12-NEXT: s_wait_loadcnt 0xa -; GCN-GFX12-NEXT: scratch_store_b128 off, v[23:26], s33 offset:80 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x9 -; GCN-GFX12-NEXT: scratch_store_b128 off, v[27:30], s33 offset:96 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x8 -; GCN-GFX12-NEXT: scratch_store_b128 off, v[31:34], s33 offset:112 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x7 -; GCN-GFX12-NEXT: scratch_store_b128 off, v[35:38], s33 offset:128 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x6 -; GCN-GFX12-NEXT: scratch_store_b128 off, v[48:51], s33 offset:144 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x5 -; GCN-GFX12-NEXT: scratch_store_b128 off, v[52:55], s33 offset:160 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x4 -; GCN-GFX12-NEXT: scratch_store_b128 off, v[64:67], s33 offset:176 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x3 -; GCN-GFX12-NEXT: scratch_store_b128 off, v[68:71], s33 offset:192 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x2 -; GCN-GFX12-NEXT: scratch_store_b128 off, v[80:83], s33 offset:208 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x1 -; GCN-GFX12-NEXT: scratch_store_b128 off, v[84:87], s33 offset:224 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x0 -; GCN-GFX12-NEXT: scratch_store_b128 off, v[96:99], s33 offset:240 -; GCN-GFX12-NEXT: scratch_load_b32 v0, v0, s33 -; GCN-GFX12-NEXT: s_mov_b32 s33, s0 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x0 -; GCN-GFX12-NEXT: s_wait_storecnt 0x0 -; GCN-GFX12-NEXT: s_setpc_b64 s[30:31] -; GCN-LABEL: v_extract_v64i32_varidx: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 63, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v2 -; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: global_load_dword v0, v[0:1], off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] %vec = load <64 x i32>, ptr addrspace(1) %ptr %elt = extractelement <64 x i32> %vec, i32 %idx ret i32 %elt @@ -142,124 +62,6 @@ define i16 @v_extract_v128i16_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; GFX12-NEXT: 
global_load_u16 v0, v[0:1], off ; GFX12-NEXT: s_waitcnt vmcnt(0) ; GFX12-NEXT: s_setpc_b64 s[30:31] -; GCN-GFX12-LABEL: v_extract_v128i16_varidx: -; GCN-GFX12: ; %bb.0: -; GCN-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-GFX12-NEXT: s_wait_expcnt 0x0 -; GCN-GFX12-NEXT: s_wait_storecnt 0x0 -; GCN-GFX12-NEXT: s_wait_samplecnt 0x0 -; GCN-GFX12-NEXT: s_wait_bvhcnt 0x0 -; GCN-GFX12-NEXT: s_wait_kmcnt 0x0 -; GCN-GFX12-NEXT: s_mov_b32 s0, s33 -; GCN-GFX12-NEXT: s_add_co_i32 s33, s32, 0xff -; GCN-GFX12-NEXT: s_addk_co_i32 s32, 0x400 -; GCN-GFX12-NEXT: s_and_b32 s33, s33, 0xffffff00 -; GCN-GFX12-NEXT: s_clause 0xf -; GCN-GFX12-NEXT: scratch_store_b32 off, v40, s33 offset:60 -; GCN-GFX12-NEXT: scratch_store_b32 off, v41, s33 offset:56 -; GCN-GFX12-NEXT: scratch_store_b32 off, v42, s33 offset:52 -; GCN-GFX12-NEXT: scratch_store_b32 off, v43, s33 offset:48 -; GCN-GFX12-NEXT: scratch_store_b32 off, v44, s33 offset:44 -; GCN-GFX12-NEXT: scratch_store_b32 off, v45, s33 offset:40 -; GCN-GFX12-NEXT: scratch_store_b32 off, v46, s33 offset:36 -; GCN-GFX12-NEXT: scratch_store_b32 off, v47, s33 offset:32 -; GCN-GFX12-NEXT: scratch_store_b32 off, v56, s33 offset:28 -; GCN-GFX12-NEXT: scratch_store_b32 off, v57, s33 offset:24 -; GCN-GFX12-NEXT: scratch_store_b32 off, v58, s33 offset:20 -; GCN-GFX12-NEXT: scratch_store_b32 off, v59, s33 offset:16 -; GCN-GFX12-NEXT: scratch_store_b32 off, v60, s33 offset:12 -; GCN-GFX12-NEXT: scratch_store_b32 off, v61, s33 offset:8 -; GCN-GFX12-NEXT: scratch_store_b32 off, v62, s33 offset:4 -; GCN-GFX12-NEXT: scratch_store_b32 off, v63, s33 -; GCN-GFX12-NEXT: s_clause 0xf -; GCN-GFX12-NEXT: global_load_b128 v[16:19], v[0:1], off -; GCN-GFX12-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16 -; GCN-GFX12-NEXT: global_load_b128 v[8:11], v[0:1], off offset:32 -; GCN-GFX12-NEXT: global_load_b128 v[12:15], v[0:1], off offset:48 -; GCN-GFX12-NEXT: global_load_b128 v[32:35], v[0:1], off offset:64 -; GCN-GFX12-NEXT: global_load_b128 v[28:31], v[0:1], off offset:80 -; GCN-GFX12-NEXT: global_load_b128 v[24:27], v[0:1], off offset:96 -; GCN-GFX12-NEXT: global_load_b128 v[20:23], v[0:1], off offset:112 -; GCN-GFX12-NEXT: global_load_b128 v[60:63], v[0:1], off offset:128 -; GCN-GFX12-NEXT: global_load_b128 v[56:59], v[0:1], off offset:144 -; GCN-GFX12-NEXT: global_load_b128 v[52:55], v[0:1], off offset:160 -; GCN-GFX12-NEXT: global_load_b128 v[44:47], v[0:1], off offset:176 -; GCN-GFX12-NEXT: global_load_b128 v[36:39], v[0:1], off offset:192 -; GCN-GFX12-NEXT: global_load_b128 v[40:43], v[0:1], off offset:208 -; GCN-GFX12-NEXT: global_load_b128 v[48:51], v[0:1], off offset:224 -; GCN-GFX12-NEXT: global_load_b128 v[64:67], v[0:1], off offset:240 -; GCN-GFX12-NEXT: v_bfe_u32 v0, v2, 1, 6 -; GCN-GFX12-NEXT: v_and_b32_e32 v1, 1, v2 -; GCN-GFX12-NEXT: s_addk_co_i32 s32, 0xfc00 -; GCN-GFX12-NEXT: s_wait_loadcnt 0xf -; GCN-GFX12-NEXT: scratch_store_b128 off, v[16:19], s33 offset:256 -; GCN-GFX12-NEXT: s_wait_loadcnt 0xe -; GCN-GFX12-NEXT: scratch_store_b128 off, v[4:7], s33 offset:272 -; GCN-GFX12-NEXT: s_wait_loadcnt 0xd -; GCN-GFX12-NEXT: scratch_store_b128 off, v[8:11], s33 offset:288 -; GCN-GFX12-NEXT: s_wait_loadcnt 0xc -; GCN-GFX12-NEXT: scratch_store_b128 off, v[12:15], s33 offset:304 -; GCN-GFX12-NEXT: s_wait_loadcnt 0xb -; GCN-GFX12-NEXT: scratch_store_b128 off, v[32:35], s33 offset:320 -; GCN-GFX12-NEXT: s_wait_loadcnt 0xa -; GCN-GFX12-NEXT: scratch_store_b128 off, v[28:31], s33 offset:336 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x9 -; GCN-GFX12-NEXT: scratch_store_b128 off, v[24:27], s33 
offset:352 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x8 -; GCN-GFX12-NEXT: scratch_store_b128 off, v[20:23], s33 offset:368 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x7 -; GCN-GFX12-NEXT: scratch_store_b128 off, v[60:63], s33 offset:384 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x6 -; GCN-GFX12-NEXT: scratch_store_b128 off, v[56:59], s33 offset:400 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x5 -; GCN-GFX12-NEXT: scratch_store_b128 off, v[52:55], s33 offset:416 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x4 -; GCN-GFX12-NEXT: scratch_store_b128 off, v[44:47], s33 offset:432 -; GCN-GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-GFX12-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x3 -; GCN-GFX12-NEXT: scratch_store_b128 off, v[36:39], s33 offset:448 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x2 -; GCN-GFX12-NEXT: scratch_store_b128 off, v[40:43], s33 offset:464 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x1 -; GCN-GFX12-NEXT: scratch_store_b128 off, v[48:51], s33 offset:480 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x0 -; GCN-GFX12-NEXT: scratch_store_b128 off, v[64:67], s33 offset:496 -; GCN-GFX12-NEXT: scratch_load_b32 v0, v0, s33 offset:256 -; GCN-GFX12-NEXT: s_clause 0xf -; GCN-GFX12-NEXT: scratch_load_b32 v63, off, s33 -; GCN-GFX12-NEXT: scratch_load_b32 v62, off, s33 offset:4 -; GCN-GFX12-NEXT: scratch_load_b32 v61, off, s33 offset:8 -; GCN-GFX12-NEXT: scratch_load_b32 v60, off, s33 offset:12 -; GCN-GFX12-NEXT: scratch_load_b32 v59, off, s33 offset:16 -; GCN-GFX12-NEXT: scratch_load_b32 v58, off, s33 offset:20 -; GCN-GFX12-NEXT: scratch_load_b32 v57, off, s33 offset:24 -; GCN-GFX12-NEXT: scratch_load_b32 v56, off, s33 offset:28 -; GCN-GFX12-NEXT: scratch_load_b32 v47, off, s33 offset:32 -; GCN-GFX12-NEXT: scratch_load_b32 v46, off, s33 offset:36 -; GCN-GFX12-NEXT: scratch_load_b32 v45, off, s33 offset:40 -; GCN-GFX12-NEXT: scratch_load_b32 v44, off, s33 offset:44 -; GCN-GFX12-NEXT: scratch_load_b32 v43, off, s33 offset:48 -; GCN-GFX12-NEXT: scratch_load_b32 v42, off, s33 offset:52 -; GCN-GFX12-NEXT: scratch_load_b32 v41, off, s33 offset:56 -; GCN-GFX12-NEXT: scratch_load_b32 v40, off, s33 offset:60 -; GCN-GFX12-NEXT: s_mov_b32 s33, s0 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x10 -; GCN-GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v0 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x0 -; GCN-GFX12-NEXT: s_wait_storecnt 0x0 -; GCN-GFX12-NEXT: s_setpc_b64 s[30:31] -; GCN-LABEL: v_extract_v128i16_varidx: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0x7f, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 1, v2 -; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: global_load_ushort v0, v[0:1], off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] %vec = load <128 x i16>, ptr addrspace(1) %ptr %elt = extractelement <128 x i16> %vec, i32 %idx ret i16 %elt @@ -291,86 +93,6 @@ define i64 @v_extract_v32i64_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; GFX12-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX12-NEXT: s_waitcnt vmcnt(0) ; GFX12-NEXT: s_setpc_b64 s[30:31] -; GCN-GFX12-LABEL: v_extract_v32i64_varidx: -; GCN-GFX12: ; %bb.0: -; GCN-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-GFX12-NEXT: s_wait_expcnt 0x0 -; GCN-GFX12-NEXT: s_wait_storecnt 0x0 -; GCN-GFX12-NEXT: s_wait_samplecnt 0x0 -; GCN-GFX12-NEXT: s_wait_bvhcnt 0x0 -; GCN-GFX12-NEXT: s_wait_kmcnt 0x0 -; GCN-GFX12-NEXT: s_clause 0xf -; GCN-GFX12-NEXT: global_load_b128 v[3:6], v[0:1], off -; GCN-GFX12-NEXT: global_load_b128 v[7:10], v[0:1], off 
offset:16 -; GCN-GFX12-NEXT: global_load_b128 v[11:14], v[0:1], off offset:32 -; GCN-GFX12-NEXT: global_load_b128 v[15:18], v[0:1], off offset:48 -; GCN-GFX12-NEXT: global_load_b128 v[19:22], v[0:1], off offset:64 -; GCN-GFX12-NEXT: global_load_b128 v[23:26], v[0:1], off offset:80 -; GCN-GFX12-NEXT: global_load_b128 v[27:30], v[0:1], off offset:96 -; GCN-GFX12-NEXT: global_load_b128 v[31:34], v[0:1], off offset:112 -; GCN-GFX12-NEXT: global_load_b128 v[35:38], v[0:1], off offset:128 -; GCN-GFX12-NEXT: global_load_b128 v[48:51], v[0:1], off offset:144 -; GCN-GFX12-NEXT: global_load_b128 v[52:55], v[0:1], off offset:160 -; GCN-GFX12-NEXT: global_load_b128 v[64:67], v[0:1], off offset:176 -; GCN-GFX12-NEXT: global_load_b128 v[68:71], v[0:1], off offset:192 -; GCN-GFX12-NEXT: global_load_b128 v[80:83], v[0:1], off offset:208 -; GCN-GFX12-NEXT: global_load_b128 v[84:87], v[0:1], off offset:224 -; GCN-GFX12-NEXT: global_load_b128 v[96:99], v[0:1], off offset:240 -; GCN-GFX12-NEXT: v_and_b32_e32 v0, 31, v2 -; GCN-GFX12-NEXT: s_mov_b32 s0, s33 -; GCN-GFX12-NEXT: s_add_co_i32 s33, s32, 0xff -; GCN-GFX12-NEXT: s_addk_co_i32 s32, 0x300 -; GCN-GFX12-NEXT: s_and_b32 s33, s33, 0xffffff00 -; GCN-GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GCN-GFX12-NEXT: s_addk_co_i32 s32, 0xfd00 -; GCN-GFX12-NEXT: s_wait_loadcnt 0xf -; GCN-GFX12-NEXT: scratch_store_b128 off, v[3:6], s33 -; GCN-GFX12-NEXT: s_wait_loadcnt 0xe -; GCN-GFX12-NEXT: scratch_store_b128 off, v[7:10], s33 offset:16 -; GCN-GFX12-NEXT: s_wait_loadcnt 0xd -; GCN-GFX12-NEXT: scratch_store_b128 off, v[11:14], s33 offset:32 -; GCN-GFX12-NEXT: s_wait_loadcnt 0xc -; GCN-GFX12-NEXT: scratch_store_b128 off, v[15:18], s33 offset:48 -; GCN-GFX12-NEXT: s_wait_loadcnt 0xb -; GCN-GFX12-NEXT: scratch_store_b128 off, v[19:22], s33 offset:64 -; GCN-GFX12-NEXT: s_wait_loadcnt 0xa -; GCN-GFX12-NEXT: scratch_store_b128 off, v[23:26], s33 offset:80 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x9 -; GCN-GFX12-NEXT: scratch_store_b128 off, v[27:30], s33 offset:96 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x8 -; GCN-GFX12-NEXT: scratch_store_b128 off, v[31:34], s33 offset:112 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x7 -; GCN-GFX12-NEXT: scratch_store_b128 off, v[35:38], s33 offset:128 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x6 -; GCN-GFX12-NEXT: scratch_store_b128 off, v[48:51], s33 offset:144 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x5 -; GCN-GFX12-NEXT: scratch_store_b128 off, v[52:55], s33 offset:160 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x4 -; GCN-GFX12-NEXT: scratch_store_b128 off, v[64:67], s33 offset:176 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x3 -; GCN-GFX12-NEXT: scratch_store_b128 off, v[68:71], s33 offset:192 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x2 -; GCN-GFX12-NEXT: scratch_store_b128 off, v[80:83], s33 offset:208 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x1 -; GCN-GFX12-NEXT: scratch_store_b128 off, v[84:87], s33 offset:224 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x0 -; GCN-GFX12-NEXT: scratch_store_b128 off, v[96:99], s33 offset:240 -; GCN-GFX12-NEXT: scratch_load_b64 v[0:1], v0, s33 -; GCN-GFX12-NEXT: s_mov_b32 s33, s0 -; GCN-GFX12-NEXT: s_wait_loadcnt 0x0 -; GCN-GFX12-NEXT: s_wait_storecnt 0x0 -; GCN-GFX12-NEXT: s_setpc_b64 s[30:31] -; GCN-LABEL: v_extract_v32i64_varidx: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 31, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] %vec = load <32 x i64>, ptr addrspace(1) %ptr %elt = extractelement <32 x i64> %vec, i32 %idx ret i64 %elt From 4de971c230e8202cfd54d4d6c36719db628d35ce Mon Sep 17 00:00:00 2001 From: Qizhi Hu <836744285@qq.com> Date: Thu, 4 Jan 2024 21:21:53 +0800 Subject: [PATCH 230/313] [clang][ASTImporter] import InstantiatedFromMember of ClassTemplateSpecializationDecl (#76493) import of `ClassTemplateSpecializationDecl` didn't set `InstantiatedFromMember` and this makes ast-dump crash. import and set `InstantiatedFromMember`. fix [issue](https://github.com/llvm/llvm-project/issues/76469) Co-authored-by: huqizhi <836744285@qq.com> --- clang/lib/AST/ASTImporter.cpp | 5 ++++ clang/unittests/AST/ASTImporterTest.cpp | 33 +++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index b61180c4f3491..9ffae72346f2a 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -6141,6 +6141,11 @@ ExpectedDecl ASTNodeImporter::VisitClassTemplateSpecializationDecl( InsertPos)) // Add this partial specialization to the class template. ClassTemplate->AddPartialSpecialization(PartSpec2, InsertPos); + if (Expected ToInstOrErr = + import(PartialSpec->getInstantiatedFromMember())) + PartSpec2->setInstantiatedFromMember(*ToInstOrErr); + else + return ToInstOrErr.takeError(); updateLookupTableForTemplateParameters(*ToTPList); } else { // Not a partial specialization. diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp index ed8ecb080e268..e4bd0d646cc9d 100644 --- a/clang/unittests/AST/ASTImporterTest.cpp +++ b/clang/unittests/AST/ASTImporterTest.cpp @@ -9342,6 +9342,39 @@ TEST_P(ASTImporterOptionSpecificTestBase, ImportConflictTypeAliasTemplate) { EXPECT_FALSE(ImportedCallable); } +AST_MATCHER(ClassTemplateSpecializationDecl, hasInstantiatedFromMember) { + if (auto Instantiate = Node.getInstantiatedFrom()) { + if (auto *FromPartialSpecialization = + Instantiate.get()) { + return nullptr != FromPartialSpecialization->getInstantiatedFromMember(); + } + } + return false; +} + +TEST_P(ASTImporterOptionSpecificTestBase, ImportInstantiatedFromMember) { + const char *Code = + R"( + template struct B { + template union D; + template union D {}; + D d; + }; + B b; + )"; + Decl *FromTU = getTuDecl(Code, Lang_CXX11); + auto *FromD = FirstDeclMatcher().match( + FromTU, classTemplateSpecializationDecl(hasName("D"), + hasInstantiatedFromMember())); + auto *FromPartialSpecialization = + cast( + FromD->getInstantiatedFrom()); + ASSERT_TRUE(FromPartialSpecialization->getInstantiatedFromMember()); + auto *ImportedPartialSpecialization = + Import(FromPartialSpecialization, Lang_CXX11); + EXPECT_TRUE(ImportedPartialSpecialization->getInstantiatedFromMember()); +} + INSTANTIATE_TEST_SUITE_P(ParameterizedTests, ASTImporterLookupTableTest, DefaultTestValuesForRunOptions); From 62144969bc03490908d46675f3d6645cbe248d25 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 4 Jan 2024 14:26:06 +0100 Subject: [PATCH 231/313] [ConstraintElim] Add debug output for failed preconditions Print debug output if a constraint does not get added due to a failed precondition. 
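For example, a skipped fact now produces a -debug line of roughly this shape, where each <...> placeholder stands for the compare printed by dumpUnpackedICmp (the concrete values are hypothetical):

  Not adding fact <fact compare> because precondition <precondition compare> does not hold.
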
--- .../Scalar/ConstraintElimination.cpp | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 5e57aa7817516..9a814ba9fd738 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -1642,15 +1642,14 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI, DFSInStack); } - LLVM_DEBUG(dbgs() << "Processing "); - // For a block, check if any CmpInsts become known based on the current set // of constraints. if (CB.isCheck()) { Instruction *Inst = CB.getInstructionToSimplify(); if (!Inst) continue; - LLVM_DEBUG(dbgs() << "condition to simplify: " << *Inst << "\n"); + LLVM_DEBUG(dbgs() << "Processing condition to simplify: " << *Inst + << "\n"); if (auto *II = dyn_cast(Inst)) { Changed |= tryToSimplifyOverflowMath(II, Info, ToRemove); } else if (auto *Cmp = dyn_cast(Inst)) { @@ -1669,7 +1668,7 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI, } auto AddFact = [&](CmpInst::Predicate Pred, Value *A, Value *B) { - LLVM_DEBUG(dbgs() << "fact to add to the system: "; + LLVM_DEBUG(dbgs() << "Processing fact to add to the system: "; dumpUnpackedICmp(dbgs(), Pred, A, B); dbgs() << "\n"); if (Info.getCS(CmpInst::isSigned(Pred)).size() > MaxRows) { LLVM_DEBUG( @@ -1718,8 +1717,17 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI, A = CB.Cond.Op0; B = CB.Cond.Op1; if (CB.DoesHold.Pred != CmpInst::BAD_ICMP_PREDICATE && - !Info.doesHold(CB.DoesHold.Pred, CB.DoesHold.Op0, CB.DoesHold.Op1)) + !Info.doesHold(CB.DoesHold.Pred, CB.DoesHold.Op0, CB.DoesHold.Op1)) { + LLVM_DEBUG({ + dbgs() << "Not adding fact "; + dumpUnpackedICmp(dbgs(), Pred, A, B); + dbgs() << " because precondition "; + dumpUnpackedICmp(dbgs(), CB.DoesHold.Pred, CB.DoesHold.Op0, + CB.DoesHold.Op1); + dbgs() << " does not hold.\n"; + }); continue; + } } else { bool Matched = match(CB.Inst, m_Intrinsic( m_ICmp(Pred, m_Value(A), m_Value(B)))); From 9803de0e8e3abbbc94a4265d5847db435897a384 Mon Sep 17 00:00:00 2001 From: Chaitanya Date: Thu, 4 Jan 2024 19:05:12 +0530 Subject: [PATCH 232/313] [AMDGPU] Add dynamic LDS size implicit kernel argument to CO-v5 (#65273) "hidden_dynamic_lds_size" argument will be added in the reserved section at offset 120 of the implicit argument layout. Add "isDynamicLDSUsed" flag to AMDGPUMachineFunction to identify if a function uses dynamic LDS. hidden argument will be added in below cases: - LDS global is used in the kernel. - Kernel calls a function which uses LDS global. - LDS pointer is passed as argument to kernel itself. 
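As a rough sketch of the first case (modeled on the new tests below; the kernel name is only an example), a kernel that touches a dynamic LDS global under code object v5 looks like:

  @lds = external hidden addrspace(3) global [0 x i32], align 4

  define amdgpu_kernel void @uses_dyn_lds() {
  entry:
    store i32 0, ptr addrspacecast (ptr addrspace(3) @lds to ptr), align 4
    ret void
  }

  !llvm.module.flags = !{!0}
  !0 = !{i32 1, !"amdgpu_code_object_version", i32 500}

Its kernarg metadata then carries a 4-byte hidden_dynamic_lds_size entry at offset 120 of the implicit-argument layout, as checked by the tests added below.
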
--- llvm/docs/AMDGPUUsage.rst | 3 + .../BinaryFormat/AMDGPUMetadataVerifier.cpp | 1 + .../AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 10 +- .../Target/AMDGPU/AMDGPUMachineFunction.cpp | 39 ++++-- .../lib/Target/AMDGPU/AMDGPUMachineFunction.h | 7 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 1 + ...hsa-metadata-dynlds-func-hidden-args-v5.ll | 124 +++++++++++++++++ ...-metadata-dynlds-funcarg-hidden-args-v5.ll | 124 +++++++++++++++++ ...-metadata-dynlds-kernarg-hidden-args-v5.ll | 125 ++++++++++++++++++ .../AMDGPU/hsa-metadata-hidden-args-v5.ll | 7 +- 10 files changed, 430 insertions(+), 11 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-func-hidden-args-v5.ll create mode 100644 llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-funcarg-hidden-args-v5.ll create mode 100644 llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-kernarg-hidden-args-v5.ll diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 371e583c22a83..e05f7fc3e7662 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -4114,6 +4114,9 @@ Code object V5 metadata is the same as buffer that conforms to the requirements of the malloc/free device library V1 version implementation. + "hidden_dynamic_lds_size" + Size of the dynamically allocated LDS memory is passed in the kernarg. + "hidden_private_base" The high 32 bits of the flat addressing private aperture base. Only used by GFX8 to allow conversion between private segment diff --git a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp index dda3380c04ea9..33eed07c46292 100644 --- a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp +++ b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp @@ -134,6 +134,7 @@ bool MetadataVerifier::verifyKernelArgs(msgpack::DocNode &Node) { .Case("hidden_default_queue", true) .Case("hidden_completion_action", true) .Case("hidden_multigrid_sync_arg", true) + .Case("hidden_dynamic_lds_size", true) .Case("hidden_private_base", true) .Case("hidden_shared_base", true) .Case("hidden_queue_ptr", true) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index b51a876750b58..74e9cd7d09654 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -646,7 +646,15 @@ void MetadataStreamerMsgPackV5::emitHiddenKernelArgs( Offset += 8; // Skipped. } - Offset += 72; // Reserved. + // Emit argument for hidden dynamic lds size + if (MFI.isDynamicLDSUsed()) { + emitKernelArg(DL, Int32Ty, Align(4), "hidden_dynamic_lds_size", Offset, + Args); + } else { + Offset += 4; // skipped + } + + Offset += 68; // Reserved. // hidden_private_base and hidden_shared_base are only when the subtarget has // ApertureRegs. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 323462e60a29f..31777295b4f8f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -19,6 +19,26 @@ using namespace llvm; +static const GlobalVariable * +getKernelDynLDSGlobalFromFunction(const Function &F) { + const Module *M = F.getParent(); + SmallString<64> KernelDynLDSName("llvm.amdgcn."); + KernelDynLDSName += F.getName(); + KernelDynLDSName += ".dynlds"; + return M->getNamedGlobal(KernelDynLDSName); +} + +static bool hasLDSKernelArgument(const Function &F) { + for (const Argument &Arg : F.args()) { + Type *ArgTy = Arg.getType(); + if (auto PtrTy = dyn_cast(ArgTy)) { + if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) + return true; + } + } + return false; +} + AMDGPUMachineFunction::AMDGPUMachineFunction(const Function &F, const AMDGPUSubtarget &ST) : IsEntryFunction(AMDGPU::isEntryFunctionCC(F.getCallingConv())), @@ -65,6 +85,10 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const Function &F, Attribute NSZAttr = F.getFnAttribute("no-signed-zeros-fp-math"); NoSignedZerosFPMath = NSZAttr.isStringAttribute() && NSZAttr.getValueAsString() == "true"; + + const GlobalVariable *DynLdsGlobal = getKernelDynLDSGlobalFromFunction(F); + if (DynLdsGlobal || hasLDSKernelArgument(F)) + UsesDynamicLDS = true; } unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL, @@ -139,15 +163,6 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL, return Offset; } -static const GlobalVariable * -getKernelDynLDSGlobalFromFunction(const Function &F) { - const Module *M = F.getParent(); - std::string KernelDynLDSName = "llvm.amdgcn."; - KernelDynLDSName += F.getName(); - KernelDynLDSName += ".dynlds"; - return M->getNamedGlobal(KernelDynLDSName); -} - std::optional AMDGPUMachineFunction::getLDSKernelIdMetadata(const Function &F) { // TODO: Would be more consistent with the abs symbols to use a range @@ -210,3 +225,9 @@ void AMDGPUMachineFunction::setDynLDSAlign(const Function &F, } } } + +void AMDGPUMachineFunction::setUsesDynamicLDS(bool DynLDS) { + UsesDynamicLDS = DynLDS; +} + +bool AMDGPUMachineFunction::isDynamicLDSUsed() const { return UsesDynamicLDS; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h index 248ee26a47eb1..7efb7f825348e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -46,6 +46,9 @@ class AMDGPUMachineFunction : public MachineFunctionInfo { /// stages. Align DynLDSAlign; + // Flag to check dynamic LDS usage by kernel. + bool UsesDynamicLDS = false; + // Kernels + shaders. i.e. functions called by the hardware and not called // by other functions. bool IsEntryFunction = false; @@ -119,6 +122,10 @@ class AMDGPUMachineFunction : public MachineFunctionInfo { Align getDynLDSAlign() const { return DynLDSAlign; } void setDynLDSAlign(const Function &F, const GlobalVariable &GV); + + void setUsesDynamicLDS(bool DynLDS); + + bool isDynamicLDSUsed() const; }; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 0e857e6ac71b6..b481ae43e8215 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6890,6 +6890,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, // Adjust alignment for that dynamic shared memory array. 
Function &F = DAG.getMachineFunction().getFunction(); MFI->setDynLDSAlign(F, *cast(GV)); + MFI->setUsesDynamicLDS(true); return SDValue( DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0); } diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-func-hidden-args-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-func-hidden-args-v5.ll new file mode 100644 index 0000000000000..cb15ff9fcb1bc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-func-hidden-args-v5.ll @@ -0,0 +1,124 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s + +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=CHECK %s + + +; CHECK: amdhsa.kernels: +; CHECK-NEXT: - .args: +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .name: r +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .name: a +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .name: b +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: - .offset: 24 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_x +; CHECK-NEXT: - .offset: 28 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_y +; CHECK-NEXT: - .offset: 32 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_z +; CHECK-NEXT: - .offset: 36 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_x +; CHECK-NEXT: - .offset: 38 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_y +; CHECK-NEXT: - .offset: 40 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_z +; CHECK-NEXT: - .offset: 42 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_x +; CHECK-NEXT: - .offset: 44 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_y +; CHECK-NEXT: - .offset: 46 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_z +; CHECK-NEXT: - .offset: 64 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_x +; CHECK-NEXT: - .offset: 72 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_y +; CHECK-NEXT: - .offset: 80 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_z +; CHECK-NEXT: - .offset: 88 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_grid_dims +; CHECK-NEXT: - .offset: 96 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_printf_buffer +; CHECK-NEXT: - .offset: 104 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_hostcall_buffer +; CHECK-NEXT: - .offset: 112 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_multigrid_sync_arg +; CHECK-NEXT: - .offset: 120 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_heap_v1 +; CHECK-NEXT: - .offset: 128 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: 
hidden_default_queue +; CHECK-NEXT: - .offset: 136 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_completion_action +; CHECK: - .offset: 144 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_dynamic_lds_size +; CHECK: - .offset: 224 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_queue_ptr + +; CHECK: .name: test_v5 +; CHECK: .symbol: test_v5.kd + +; CHECK: amdhsa.version: +; CHECK-NEXT: - 1 +; CHECK-NEXT: - 2 +@lds = external hidden addrspace(3) global [0 x i32], align 4 + +define void @funcs_dyn_lds() { + store i32 1234, ptr addrspacecast (ptr addrspace(3) @lds to ptr), align 4 + ret void +} + +define amdgpu_kernel void @test_v5( + ptr addrspace(1) %r, + ptr addrspace(1) %a, + ptr addrspace(1) %b) #0 { +entry: + %a.val = load half, ptr addrspace(1) %a + %b.val = load half, ptr addrspace(1) %b + %r.val = fadd half %a.val, %b.val + store half %r.val, ptr addrspace(1) %r + call void @funcs_dyn_lds() + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} +!llvm.printf.fmts = !{!1, !2} +!1 = !{!"1:1:4:%d\5Cn"} +!2 = !{!"2:1:8:%g\5Cn"} + +attributes #0 = { optnone noinline } + diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-funcarg-hidden-args-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-funcarg-hidden-args-v5.ll new file mode 100644 index 0000000000000..16bfe5f019683 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-funcarg-hidden-args-v5.ll @@ -0,0 +1,124 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s + +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=CHECK %s + + +; CHECK: amdhsa.kernels: +; CHECK-NEXT: - .args: +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .name: r +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .name: a +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .name: b +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: - .offset: 24 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_x +; CHECK-NEXT: - .offset: 28 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_y +; CHECK-NEXT: - .offset: 32 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_z +; CHECK-NEXT: - .offset: 36 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_x +; CHECK-NEXT: - .offset: 38 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_y +; CHECK-NEXT: - .offset: 40 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_z +; CHECK-NEXT: - .offset: 42 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_x +; CHECK-NEXT: - .offset: 44 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_y +; CHECK-NEXT: - .offset: 46 +; 
CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_z +; CHECK-NEXT: - .offset: 64 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_x +; CHECK-NEXT: - .offset: 72 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_y +; CHECK-NEXT: - .offset: 80 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_z +; CHECK-NEXT: - .offset: 88 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_grid_dims +; CHECK-NEXT: - .offset: 96 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_printf_buffer +; CHECK-NEXT: - .offset: 104 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_hostcall_buffer +; CHECK-NEXT: - .offset: 112 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_multigrid_sync_arg +; CHECK-NEXT: - .offset: 120 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_heap_v1 +; CHECK-NEXT: - .offset: 128 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_default_queue +; CHECK-NEXT: - .offset: 136 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_completion_action +; CHECK: - .offset: 144 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_dynamic_lds_size +; CHECK: - .offset: 224 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_queue_ptr + +; CHECK: .name: test_v5 +; CHECK: .symbol: test_v5.kd + +; CHECK: amdhsa.version: +; CHECK-NEXT: - 1 +; CHECK-NEXT: - 2 +@lds = external hidden addrspace(3) global [0 x i32], align 4 + +define void @funcs_dyn_lds(ptr addrspace(3) %lds_ptr) { + store i32 1234, ptr addrspace(3) %lds_ptr, align 4 + ret void +} + +define amdgpu_kernel void @test_v5( + ptr addrspace(1) %r, + ptr addrspace(1) %a, + ptr addrspace(1) %b) #0 { +entry: + %a.val = load half, ptr addrspace(1) %a + %b.val = load half, ptr addrspace(1) %b + %r.val = fadd half %a.val, %b.val + store half %r.val, ptr addrspace(1) %r + call void @funcs_dyn_lds(ptr addrspace(3) @lds) + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} +!llvm.printf.fmts = !{!1, !2} +!1 = !{!"1:1:4:%d\5Cn"} +!2 = !{!"2:1:8:%g\5Cn"} + +attributes #0 = { optnone noinline } + diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-kernarg-hidden-args-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-kernarg-hidden-args-v5.ll new file mode 100644 index 0000000000000..d457c61b8d408 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-kernarg-hidden-args-v5.ll @@ -0,0 +1,125 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s + +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=CHECK %s + + +; CHECK: amdhsa.kernels: +; CHECK-NEXT: - .args: +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .name: r +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .name: a +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: global_buffer +; 
CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .name: b +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: - .address_space: local +; CHECK-NEXT: .name: lds_ptr +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .pointee_align: 1 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: dynamic_shared_pointer +; CHECK-NEXT: - .offset: 32 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_x +; CHECK-NEXT: - .offset: 36 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_y +; CHECK-NEXT: - .offset: 40 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_z +; CHECK-NEXT: - .offset: 44 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_x +; CHECK-NEXT: - .offset: 46 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_y +; CHECK-NEXT: - .offset: 48 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_z +; CHECK-NEXT: - .offset: 50 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_x +; CHECK-NEXT: - .offset: 52 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_y +; CHECK-NEXT: - .offset: 54 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_z +; CHECK-NEXT: - .offset: 72 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_x +; CHECK-NEXT: - .offset: 80 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_y +; CHECK-NEXT: - .offset: 88 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_z +; CHECK-NEXT: - .offset: 96 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_grid_dims +; CHECK-NEXT: - .offset: 104 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_printf_buffer +; CHECK-NEXT: - .offset: 112 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_hostcall_buffer +; CHECK-NEXT: - .offset: 120 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_multigrid_sync_arg +; CHECK-NEXT: - .offset: 128 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_heap_v1 +; CHECK-NEXT: - .offset: 136 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_default_queue +; CHECK-NEXT: - .offset: 144 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_completion_action +; CHECK: - .offset: 152 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_dynamic_lds_size +; CHECK: - .offset: 232 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_queue_ptr + +; CHECK: .name: test_v5 +; CHECK: .symbol: test_v5.kd + +; CHECK: amdhsa.version: +; CHECK-NEXT: - 1 +; CHECK-NEXT: - 2 + +define amdgpu_kernel void @test_v5( + ptr addrspace(1) %r, + ptr addrspace(1) %a, + ptr addrspace(1) %b, + ptr addrspace(3) %lds_ptr) #0 { +entry: + %a.val = load half, ptr addrspace(1) %a + %b.val = load half, ptr addrspace(1) %b + %r.val = fadd half %a.val, %b.val + store half %r.val, ptr addrspace(1) %r + store i32 1234, ptr addrspace(3) %lds_ptr, align 4 + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} +!llvm.printf.fmts = !{!1, !2} +!1 = !{!"1:1:4:%d\5Cn"} +!2 = !{!"2:1:8:%g\5Cn"} + +attributes #0 = { optnone noinline } + diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll index cb30d668674c3..1a2ce636c733c 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll @@ -81,13 +81,16 @@ ; CHECK-NEXT: - .offset: 136 ; 
CHECK-NEXT: .size: 8 ; CHECK-NEXT: .value_kind: hidden_completion_action +; CHECK: - .offset: 144 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_dynamic_lds_size ; GFX8-NEXT: - .offset: 216 ; GFX8-NEXT: .size: 4 ; GFX8-NEXT: .value_kind: hidden_private_base ; GFX8-NEXT: - .offset: 220 ; GFX8-NEXT: .size: 4 ; GFX8-NEXT: .value_kind: hidden_shared_base -; CHECK: - .offset: 224 +; CHECK: - .offset: 224 ; CHECK-NEXT: .size: 8 ; CHECK-NEXT: .value_kind: hidden_queue_ptr @@ -97,6 +100,7 @@ ; CHECK: amdhsa.version: ; CHECK-NEXT: - 1 ; CHECK-NEXT: - 2 +@lds = external hidden addrspace(3) global [0 x i32], align 4 define amdgpu_kernel void @test_v5( ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -106,6 +110,7 @@ entry: %b.val = load half, ptr addrspace(1) %b %r.val = fadd half %a.val, %b.val store half %r.val, ptr addrspace(1) %r + store i32 1234, ptr addrspacecast (ptr addrspace(3) @lds to ptr), align 4 ret void } From 3fd081f71eb6f0cda842995959df86077c2d6559 Mon Sep 17 00:00:00 2001 From: alekuz01 Date: Thu, 4 Jan 2024 14:02:25 +0000 Subject: [PATCH 233/313] [llvm][doc][NFC] Fix typo in documentation in CMake.rst (#76836) Fix for #71941 --- llvm/docs/CMake.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/docs/CMake.rst b/llvm/docs/CMake.rst index 7dd3fd26022e5..13d1912ceb2ab 100644 --- a/llvm/docs/CMake.rst +++ b/llvm/docs/CMake.rst @@ -1118,10 +1118,10 @@ And then changing ``//CMakeLists.txt`` to When you are done developing your pass, you may wish to integrate it into the LLVM source tree. You can achieve it in two easy steps: -#. Copying ```` folder into ``/lib/Transform`` directory. +#. Copying ```` folder into ``/lib/Transforms`` directory. #. Adding ``add_subdirectory()`` line into - ``/lib/Transform/CMakeLists.txt``. + ``/lib/Transforms/CMakeLists.txt``. 
Compiler/Platform-specific topics ================================= From db34a94710bc93970b5a873b1eef5c7ae2a3e046 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 4 Jan 2024 15:10:56 +0100 Subject: [PATCH 234/313] [ConstraintElim] Add tests for shl nsw decomposition (NFC) --- .../Transforms/ConstraintElimination/shl.ll | 87 +++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/llvm/test/Transforms/ConstraintElimination/shl.ll b/llvm/test/Transforms/ConstraintElimination/shl.ll index 9fe8c147017b0..8a00eb9b2830b 100644 --- a/llvm/test/Transforms/ConstraintElimination/shl.ll +++ b/llvm/test/Transforms/ConstraintElimination/shl.ll @@ -1274,3 +1274,90 @@ entry: %shl.cmp = icmp uge i256 %shl.ub, 1 ret i1 %shl.cmp } + +define i1 @shl_nsw_x8_slt_x7(i8 %start, i8 %high) { +; CHECK-LABEL: @shl_nsw_x8_slt_x7( +; CHECK-NEXT: [[C_0:%.*]] = icmp sge i8 [[HIGH:%.*]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[C_0]]) +; CHECK-NEXT: [[START_SHL_3:%.*]] = shl nsw i8 [[START:%.*]], 3 +; CHECK-NEXT: [[C_1:%.*]] = icmp slt i8 [[START_SHL_3]], [[HIGH]] +; CHECK-NEXT: call void @llvm.assume(i1 [[C_1]]) +; CHECK-NEXT: [[START_MUL_7:%.*]] = mul nsw i8 [[START]], 7 +; CHECK-NEXT: [[T_1:%.*]] = icmp slt i8 [[START_MUL_7]], [[HIGH]] +; CHECK-NEXT: ret i1 [[T_1]] +; + %c.0 = icmp sge i8 %high, 0 + call void @llvm.assume(i1 %c.0) + + %start.shl.3 = shl nsw i8 %start, 3 + %c.1 = icmp slt i8 %start.shl.3, %high + call void @llvm.assume(i1 %c.1) + + %start.mul.7 = mul nsw i8 %start, 7 + %t.1 = icmp slt i8 %start.mul.7, %high + ret i1 %t.1 +} + +define i1 @shl_nsw_x8_not_slt_x9(i8 %start, i8 %high) { +; CHECK-LABEL: @shl_nsw_x8_not_slt_x9( +; CHECK-NEXT: [[C_0:%.*]] = icmp sge i8 [[HIGH:%.*]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[C_0]]) +; CHECK-NEXT: [[START_SHL_3:%.*]] = shl nsw i8 [[START:%.*]], 3 +; CHECK-NEXT: [[C_1:%.*]] = icmp slt i8 [[START_SHL_3]], [[HIGH]] +; CHECK-NEXT: call void @llvm.assume(i1 [[C_1]]) +; CHECK-NEXT: [[START_MUL_9:%.*]] = mul nsw i8 [[START]], 9 +; CHECK-NEXT: [[T_1:%.*]] = icmp slt i8 [[START_MUL_9]], [[HIGH]] +; CHECK-NEXT: ret i1 [[T_1]] +; + %c.0 = icmp sge i8 %high, 0 + call void @llvm.assume(i1 %c.0) + + %start.shl.3 = shl nsw i8 %start, 3 + %c.1 = icmp slt i8 %start.shl.3, %high + call void @llvm.assume(i1 %c.1) + + %start.mul.9 = mul nsw i8 %start, 9 + %t.1 = icmp slt i8 %start.mul.9, %high + ret i1 %t.1 +} + +define i1 @shl_nsw_sign_implication(i8 %x) { +; CHECK-LABEL: @shl_nsw_sign_implication( +; CHECK-NEXT: [[SHL:%.*]] = shl nsw i8 [[X:%.*]], 2 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i8 [[X]], 0 +; CHECK-NEXT: br i1 [[CMP1]], label [[IF:%.*]], label [[ELSE:%.*]] +; CHECK: if: +; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i8 [[SHL]], 0 +; CHECK-NEXT: ret i1 [[CMP2]] +; CHECK: else: +; CHECK-NEXT: [[CMP3:%.*]] = icmp sge i8 [[SHL]], 0 +; CHECK-NEXT: ret i1 [[CMP3]] +; + %shl = shl nsw i8 %x, 2 + %cmp1 = icmp slt i8 %x, 0 + br i1 %cmp1, label %if, label %else + +if: + %cmp2 = icmp slt i8 %shl, 0 + ret i1 %cmp2 + +else: + %cmp3 = icmp sge i8 %shl, 0 + ret i1 %cmp3 +} + +define i1 @shl_nsw_by_bw_minus_1(i64 %x) { +; CHECK-LABEL: @shl_nsw_by_bw_minus_1( +; CHECK-NEXT: [[X_SHL:%.*]] = shl nsw i64 [[X:%.*]], 63 +; CHECK-NEXT: [[C_1:%.*]] = icmp slt i64 [[X_SHL]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[C_1]]) +; CHECK-NEXT: [[T_1:%.*]] = icmp slt i64 [[X]], 0 +; CHECK-NEXT: ret i1 [[T_1]] +; + %x.shl = shl nsw i64 %x, 63 + %c.1 = icmp slt i64 %x.shl, 0 + call void @llvm.assume(i1 %c.1) + + %t.1 = icmp slt i64 %x, 0 + ret i1 %t.1 +} From 
4e281e2cb717c9bc6af2c01f56857d352adbe10e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mauro=20Balad=C3=A9s?= Date: Thu, 4 Jan 2024 15:22:54 +0100 Subject: [PATCH 235/313] [NFC][Clang] Avoid copying Param and Constr (#65488) From e947b63516c8f9884e84bca000f7e9d9db7701c1 Mon Sep 17 00:00:00 2001 From: Mirko Brkusanin Date: Thu, 4 Jan 2024 15:33:35 +0100 Subject: [PATCH 236/313] [AMDGPU][NFC] Update alias test Alias test should contain alternative names in check lines --- llvm/test/MC/AMDGPU/gfx12_asm_vimage_alias.s | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vimage_alias.s b/llvm/test/MC/AMDGPU/gfx12_asm_vimage_alias.s index 70020949faa81..bf70545ff23c6 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vimage_alias.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vimage_alias.s @@ -1,25 +1,25 @@ ; RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s image_atomic_add v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D -// GFX12: encoding: [0x00,0x00,0x43,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +// GFX12: image_atomic_add_uint v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x00,0x43,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] image_atomic_sub v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D -// GFX12: encoding: [0x00,0x40,0x43,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +// GFX12: image_atomic_sub_uint v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x40,0x43,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] image_atomic_smin v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D -// GFX12: encoding: [0x00,0x80,0x43,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +// GFX12: image_atomic_min_int v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x80,0x43,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] image_atomic_umin v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D -// GFX12: encoding: [0x00,0xc0,0x43,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +// GFX12: image_atomic_min_uint v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0xc0,0x43,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] image_atomic_smax v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D -// GFX12: encoding: [0x00,0x00,0x44,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +// GFX12: image_atomic_max_int v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x00,0x44,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] image_atomic_umax v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D -// GFX12: encoding: [0x00,0x40,0x44,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +// GFX12: image_atomic_max_uint v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x40,0x44,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] image_atomic_inc v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D -// GFX12: encoding: [0x00,0x40,0x45,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +// GFX12: image_atomic_inc_uint v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x40,0x45,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] image_atomic_dec v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D -// GFX12: encoding: [0x00,0x80,0x45,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +// GFX12: image_atomic_dec_uint v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x80,0x45,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] From 901a8160a8cb3bda3558fffb8e109485c42affe5 Mon Sep 17 00:00:00 2001 From: Pol M Date: Thu, 4 Jan 2024 15:42:35 +0100 Subject: [PATCH 237/313] [clang] Optimize castToDeclContext for 2% improvement in build times (#76825) Optimize 
castToDeclContext for 2% improvement in build times castToDeclContext is a heavily used function, and as such, it needs to be kept as slim as feasible to preserve as much performance as possible. To this end, it was observed that the function was generating suboptimal assembly code, and putting the most common execution path in the longest sequence of instructions. This patch addresses this by guiding the compiler towards generating a lookup table of offsets, which can be used to perform an addition on the pointer. This results in a 1-2% improvement on debug builds (and a negligible improvement on release). To achieve this, the switch was simplified to flatten the if statements in the default branch. In order to make the values of the switch more compact, encouraging LLVM to generate a look-up table instead of a jump table, the AST TableGen generator was modified so it can take order priority based on class inheritance. This combination allowed for a more optimal generation of the function. Of note, 2 other functions with an equivalent structure also needed to be modified. Fixes #76824 --- clang/include/clang/AST/DeclCXX.h | 8 ++ clang/lib/AST/DeclBase.cpp | 50 +++------ clang/utils/TableGen/ClangASTNodesEmitter.cpp | 103 +++++++++++------- clang/utils/TableGen/TableGen.cpp | 3 +- clang/utils/TableGen/TableGenBackends.h | 10 +- 5 files changed, 97 insertions(+), 77 deletions(-) diff --git a/clang/include/clang/AST/DeclCXX.h b/clang/include/clang/AST/DeclCXX.h index 432293583576b..984a4d8bab5e7 100644 --- a/clang/include/clang/AST/DeclCXX.h +++ b/clang/include/clang/AST/DeclCXX.h @@ -2044,6 +2044,14 @@ class RequiresExprBodyDecl : public Decl, public DeclContext { // Implement isa/cast/dyncast/etc. static bool classof(const Decl *D) { return classofKind(D->getKind()); } static bool classofKind(Kind K) { return K == RequiresExprBody; } + + static DeclContext *castToDeclContext(const RequiresExprBodyDecl *D) { + return static_cast(const_cast(D)); + } + + static RequiresExprBodyDecl *castFromDeclContext(const DeclContext *DC) { + return static_cast(const_cast(DC)); + } }; /// Represents a static or instance method of a struct/union/class. 
diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp index 5e03f0223d311..b1733c2d052a6 100644 --- a/clang/lib/AST/DeclBase.cpp +++ b/clang/lib/AST/DeclBase.cpp @@ -930,20 +930,14 @@ const AttrVec &Decl::getAttrs() const { Decl *Decl::castFromDeclContext (const DeclContext *D) { Decl::Kind DK = D->getDeclKind(); - switch(DK) { -#define DECL(NAME, BASE) -#define DECL_CONTEXT(NAME) \ - case Decl::NAME: \ - return static_cast(const_cast(D)); -#define DECL_CONTEXT_BASE(NAME) -#include "clang/AST/DeclNodes.inc" - default: + switch (DK) { #define DECL(NAME, BASE) -#define DECL_CONTEXT_BASE(NAME) \ - if (DK >= first##NAME && DK <= last##NAME) \ - return static_cast(const_cast(D)); +#define DECL_CONTEXT(NAME) \ + case Decl::NAME: \ + return static_cast(const_cast(D)); #include "clang/AST/DeclNodes.inc" - llvm_unreachable("a decl that inherits DeclContext isn't handled"); + default: + llvm_unreachable("a decl that inherits DeclContext isn't handled"); } } @@ -951,18 +945,12 @@ DeclContext *Decl::castToDeclContext(const Decl *D) { Decl::Kind DK = D->getKind(); switch(DK) { #define DECL(NAME, BASE) -#define DECL_CONTEXT(NAME) \ - case Decl::NAME: \ - return static_cast(const_cast(D)); -#define DECL_CONTEXT_BASE(NAME) +#define DECL_CONTEXT(NAME) \ + case Decl::NAME: \ + return static_cast(const_cast(D)); #include "clang/AST/DeclNodes.inc" - default: -#define DECL(NAME, BASE) -#define DECL_CONTEXT_BASE(NAME) \ - if (DK >= first##NAME && DK <= last##NAME) \ - return static_cast(const_cast(D)); -#include "clang/AST/DeclNodes.inc" - llvm_unreachable("a decl that inherits DeclContext isn't handled"); + default: + llvm_unreachable("a decl that inherits DeclContext isn't handled"); } } @@ -1129,20 +1117,14 @@ DeclContext::DeclContext(Decl::Kind K) { } bool DeclContext::classof(const Decl *D) { - switch (D->getKind()) { + Decl::Kind DK = D->getKind(); + switch (DK) { #define DECL(NAME, BASE) #define DECL_CONTEXT(NAME) case Decl::NAME: -#define DECL_CONTEXT_BASE(NAME) #include "clang/AST/DeclNodes.inc" - return true; - default: -#define DECL(NAME, BASE) -#define DECL_CONTEXT_BASE(NAME) \ - if (D->getKind() >= Decl::first##NAME && \ - D->getKind() <= Decl::last##NAME) \ - return true; -#include "clang/AST/DeclNodes.inc" - return false; + return true; + default: + return false; } } diff --git a/clang/utils/TableGen/ClangASTNodesEmitter.cpp b/clang/utils/TableGen/ClangASTNodesEmitter.cpp index 16a1c74b9d91a..07ddafce32916 100644 --- a/clang/utils/TableGen/ClangASTNodesEmitter.cpp +++ b/clang/utils/TableGen/ClangASTNodesEmitter.cpp @@ -33,6 +33,7 @@ class ClangASTNodesEmitter { typedef std::multimap ChildMap; typedef ChildMap::const_iterator ChildIterator; + std::set PrioritizedClasses; RecordKeeper &Records; ASTNode Root; const std::string &NodeClassName; @@ -70,8 +71,16 @@ class ClangASTNodesEmitter { std::pair EmitNode(raw_ostream& OS, ASTNode Base); public: explicit ClangASTNodesEmitter(RecordKeeper &R, const std::string &N, - const std::string &S) - : Records(R), NodeClassName(N), BaseSuffix(S) {} + const std::string &S, + std::string_view PriorizeIfSubclassOf) + : Records(R), NodeClassName(N), BaseSuffix(S) { + auto vecPrioritized = + PriorizeIfSubclassOf.empty() + ? 
std::vector{} + : R.getAllDerivedDefinitions(PriorizeIfSubclassOf); + PrioritizedClasses = + std::set(vecPrioritized.begin(), vecPrioritized.end()); + } // run - Output the .inc file contents void run(raw_ostream &OS); @@ -95,8 +104,23 @@ std::pair ClangASTNodesEmitter::EmitNode(raw_ostream &OS, if (!Base.isAbstract()) First = Last = Base; + auto comp = [this](ASTNode LHS, ASTNode RHS) { + auto LHSPrioritized = PrioritizedClasses.count(LHS) > 0; + auto RHSPrioritized = PrioritizedClasses.count(RHS) > 0; + if (LHSPrioritized && !RHSPrioritized) + return true; + if (!LHSPrioritized && RHSPrioritized) + return false; + + return LHS.getName() > RHS.getName(); + }; + auto SortedChildren = std::set(comp); + for (; i != e; ++i) { - ASTNode Child = i->second; + SortedChildren.insert(i->second); + } + + for (const auto &Child : SortedChildren) { bool Abstract = Child.isAbstract(); std::string NodeName = macroName(std::string(Child.getName())); @@ -148,9 +172,7 @@ void ClangASTNodesEmitter::deriveChildTree() { const std::vector Stmts = Records.getAllDerivedDefinitions(NodeClassName); - for (unsigned i = 0, e = Stmts.size(); i != e; ++i) { - Record *R = Stmts[i]; - + for (auto *R : Stmts) { if (auto B = R->getValueAsOptionalDef(BaseFieldName)) Tree.insert(std::make_pair(B, R)); else if (Root) @@ -182,9 +204,9 @@ void ClangASTNodesEmitter::run(raw_ostream &OS) { OS << "#endif\n\n"; OS << "#ifndef LAST_" << macroHierarchyName() << "_RANGE\n"; - OS << "# define LAST_" - << macroHierarchyName() << "_RANGE(Base, First, Last) " - << macroHierarchyName() << "_RANGE(Base, First, Last)\n"; + OS << "# define LAST_" << macroHierarchyName() + << "_RANGE(Base, First, Last) " << macroHierarchyName() + << "_RANGE(Base, First, Last)\n"; OS << "#endif\n\n"; EmitNode(OS, Root); @@ -196,8 +218,20 @@ void ClangASTNodesEmitter::run(raw_ostream &OS) { } void clang::EmitClangASTNodes(RecordKeeper &RK, raw_ostream &OS, - const std::string &N, const std::string &S) { - ClangASTNodesEmitter(RK, N, S).run(OS); + const std::string &N, const std::string &S, + std::string_view PriorizeIfSubclassOf) { + ClangASTNodesEmitter(RK, N, S, PriorizeIfSubclassOf).run(OS); +} + +void printDeclContext(const std::multimap &Tree, + Record *DeclContext, raw_ostream &OS) { + if (!DeclContext->getValueAsBit(AbstractFieldName)) + OS << "DECL_CONTEXT(" << DeclContext->getName() << ")\n"; + auto i = Tree.lower_bound(DeclContext); + auto end = Tree.upper_bound(DeclContext); + for (; i != end; ++i) { + printDeclContext(Tree, i->second, OS); + } } // Emits and addendum to a .inc file to enumerate the clang declaration @@ -210,38 +244,25 @@ void clang::EmitClangDeclContext(RecordKeeper &Records, raw_ostream &OS) { OS << "#ifndef DECL_CONTEXT\n"; OS << "# define DECL_CONTEXT(DECL)\n"; OS << "#endif\n"; - - OS << "#ifndef DECL_CONTEXT_BASE\n"; - OS << "# define DECL_CONTEXT_BASE(DECL) DECL_CONTEXT(DECL)\n"; - OS << "#endif\n"; - - typedef std::set RecordSet; - typedef std::vector RecordVector; - - RecordVector DeclContextsVector - = Records.getAllDerivedDefinitions(DeclContextNodeClassName); - RecordVector Decls = Records.getAllDerivedDefinitions(DeclNodeClassName); - RecordSet DeclContexts (DeclContextsVector.begin(), DeclContextsVector.end()); - - for (RecordVector::iterator i = Decls.begin(), e = Decls.end(); i != e; ++i) { - Record *R = *i; - - if (Record *B = R->getValueAsOptionalDef(BaseFieldName)) { - if (DeclContexts.find(B) != DeclContexts.end()) { - OS << "DECL_CONTEXT_BASE(" << B->getName() << ")\n"; - DeclContexts.erase(B); - } - } + + 
std::vector DeclContextsVector = + Records.getAllDerivedDefinitions(DeclContextNodeClassName); + std::vector Decls = + Records.getAllDerivedDefinitions(DeclNodeClassName); + + std::multimap Tree; + + const std::vector Stmts = + Records.getAllDerivedDefinitions(DeclNodeClassName); + + for (auto *R : Stmts) { + if (auto *B = R->getValueAsOptionalDef(BaseFieldName)) + Tree.insert(std::make_pair(B, R)); } - // To keep identical order, RecordVector may be used - // instead of RecordSet. - for (RecordVector::iterator - i = DeclContextsVector.begin(), e = DeclContextsVector.end(); - i != e; ++i) - if (DeclContexts.find(*i) != DeclContexts.end()) - OS << "DECL_CONTEXT(" << (*i)->getName() << ")\n"; + for (auto *DeclContext : DeclContextsVector) { + printDeclContext(Tree, DeclContext, OS); + } OS << "#undef DECL_CONTEXT\n"; - OS << "#undef DECL_CONTEXT_BASE\n"; } diff --git a/clang/utils/TableGen/TableGen.cpp b/clang/utils/TableGen/TableGen.cpp index c1f2ca15b595c..3859555d647fd 100644 --- a/clang/utils/TableGen/TableGen.cpp +++ b/clang/utils/TableGen/TableGen.cpp @@ -398,7 +398,8 @@ bool ClangTableGenMain(raw_ostream &OS, RecordKeeper &Records) { EmitClangASTNodes(Records, OS, CommentNodeClassName, ""); break; case GenClangDeclNodes: - EmitClangASTNodes(Records, OS, DeclNodeClassName, "Decl"); + EmitClangASTNodes(Records, OS, DeclNodeClassName, "Decl", + DeclContextNodeClassName); EmitClangDeclContext(Records, OS); break; case GenClangStmtNodes: diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h index 35f2f04c1e818..faa0c5d2cff9e 100644 --- a/clang/utils/TableGen/TableGenBackends.h +++ b/clang/utils/TableGen/TableGenBackends.h @@ -25,8 +25,16 @@ class RecordKeeper; namespace clang { void EmitClangDeclContext(llvm::RecordKeeper &RK, llvm::raw_ostream &OS); +/** + @param PriorizeIfSubclassOf These classes should be prioritized in the output. + This is useful to force enum generation/jump tables/lookup tables to be more + compact in both size and surrounding code in hot functions. An example use is + in Decl for classes that inherit from DeclContext, for functions like + castFromDeclContext. + */ void EmitClangASTNodes(llvm::RecordKeeper &RK, llvm::raw_ostream &OS, - const std::string &N, const std::string &S); + const std::string &N, const std::string &S, + std::string_view PriorizeIfSubclassOf = ""); void EmitClangBasicReader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangBasicWriter(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangTypeNodes(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); From 96c23ebd3b28f034784eab66393ea9a46e45c6ee Mon Sep 17 00:00:00 2001 From: Simon Camphausen Date: Thu, 4 Jan 2024 15:43:33 +0100 Subject: [PATCH 238/313] [mlir][EmitC] Use declarative assembly format for opaque types and attributes (#76066) The parser and printer of string attributes were changed to handle escape sequences. Therefore, we no longer require a custom parser and printer. Verification is moved from the parser to the verifier accordingly. 
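For reference, the pattern the change moves to looks like the following minimal sketch. It is hypothetical (a made-up MyDialect_Type ODS base class and type name, not part of this patch), but it mirrors how the EmitC opaque attribute and type are declared after the change:

// Hypothetical ODS sketch; assumes the dialect defines a MyDialect_Type base.
def MyDialect_OpaqueType : MyDialect_Type<"Opaque", "opaque"> {
  let parameters = (ins StringRefParameter<"the opaque value">:$value);
  // Parser and printer are generated from this format string; string
  // escaping is handled by the generated code, so no custom C++ is needed.
  let assemblyFormat = "`<` $value `>`";
  // Declare ::verify() so checks such as "value must be non-empty" run on
  // every creation path (builders and parsing), not only in a parser.
  let genVerifyDecl = 1;
}

The diff below applies this to EmitC_OpaqueAttr and EmitC_OpaqueType and deletes the corresponding hand-written parse/print methods.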
--- .../mlir/Dialect/EmitC/IR/EmitCAttributes.td | 3 +- .../mlir/Dialect/EmitC/IR/EmitCTypes.td | 3 +- mlir/lib/Dialect/EmitC/IR/EmitC.cpp | 49 +++---------------- 3 files changed, 11 insertions(+), 44 deletions(-) diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitCAttributes.td b/mlir/include/mlir/Dialect/EmitC/IR/EmitCAttributes.td index ae843e49c6c5b..ea5e9efd5fa0b 100644 --- a/mlir/include/mlir/Dialect/EmitC/IR/EmitCAttributes.td +++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitCAttributes.td @@ -57,8 +57,7 @@ def EmitC_OpaqueAttr : EmitC_Attr<"Opaque", "opaque"> { }]; let parameters = (ins StringRefParameter<"the opaque value">:$value); - - let hasCustomAssemblyFormat = 1; + let assemblyFormat = "`<` $value `>`"; } def EmitC_OpaqueOrTypedAttr : AnyAttrOf<[EmitC_OpaqueAttr, TypedAttrInterface]>; diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitCTypes.td b/mlir/include/mlir/Dialect/EmitC/IR/EmitCTypes.td index 7874aa2c9e304..8818c049ed771 100644 --- a/mlir/include/mlir/Dialect/EmitC/IR/EmitCTypes.td +++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitCTypes.td @@ -42,7 +42,8 @@ def EmitC_OpaqueType : EmitC_Type<"Opaque", "opaque"> { }]; let parameters = (ins StringRefParameter<"the opaque value">:$value); - let hasCustomAssemblyFormat = 1; + let assemblyFormat = "`<` $value `>`"; + let genVerifyDecl = 1; } def EmitC_PointerType : EmitC_Type<"Pointer", "ptr"> { diff --git a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp index 8508d29d2306a..5f502f1f7a171 100644 --- a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp +++ b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp @@ -621,27 +621,6 @@ LogicalResult emitc::YieldOp::verify() { #define GET_ATTRDEF_CLASSES #include "mlir/Dialect/EmitC/IR/EmitCAttributes.cpp.inc" -Attribute emitc::OpaqueAttr::parse(AsmParser &parser, Type type) { - if (parser.parseLess()) - return Attribute(); - std::string value; - SMLoc loc = parser.getCurrentLocation(); - if (parser.parseOptionalString(&value)) { - parser.emitError(loc) << "expected string"; - return Attribute(); - } - if (parser.parseGreater()) - return Attribute(); - - return get(parser.getContext(), value); -} - -void emitc::OpaqueAttr::print(AsmPrinter &printer) const { - printer << "<\""; - llvm::printEscapedString(getValue(), printer.getStream()); - printer << "\">"; -} - //===----------------------------------------------------------------------===// // EmitC Types //===----------------------------------------------------------------------===// @@ -653,27 +632,15 @@ void emitc::OpaqueAttr::print(AsmPrinter &printer) const { // OpaqueType //===----------------------------------------------------------------------===// -Type emitc::OpaqueType::parse(AsmParser &parser) { - if (parser.parseLess()) - return Type(); - std::string value; - SMLoc loc = parser.getCurrentLocation(); - if (parser.parseOptionalString(&value) || value.empty()) { - parser.emitError(loc) << "expected non empty string in !emitc.opaque type"; - return Type(); +LogicalResult mlir::emitc::OpaqueType::verify( + llvm::function_ref emitError, + llvm::StringRef value) { + if (value.empty()) { + return emitError() << "expected non empty string in !emitc.opaque type"; } if (value.back() == '*') { - parser.emitError(loc) << "pointer not allowed as outer type with " - "!emitc.opaque, use !emitc.ptr instead"; - return Type(); + return emitError() << "pointer not allowed as outer type with " + "!emitc.opaque, use !emitc.ptr instead"; } - if (parser.parseGreater()) - return Type(); - return get(parser.getContext(), value); -} - -void 
emitc::OpaqueType::print(AsmPrinter &printer) const { - printer << "<\""; - llvm::printEscapedString(getValue(), printer.getStream()); - printer << "\">"; + return success(); } From 79e62315be5b762f399e98c0b638e682eac66322 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 4 Jan 2024 06:35:16 -0800 Subject: [PATCH 239/313] [SLP]Use revectorized value for extracts from buildvector, beeing vectorized. When trying to reuse the extractelement instruction, emitted for the insertelement instruction, need to check, if the this insertelement instruction was vectorized. In this case, need to use vectorized value, not the original insertelement. --- .../Transforms/Vectorize/SLPVectorizer.cpp | 6 ++- .../X86/gather_extract_from_vectorbuild.ll | 45 +++++++++++++++++++ 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index bd89ec09671a9..cbe767537a1d6 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -11905,8 +11905,10 @@ Value *BoUpSLP::vectorizeTree( if (!Ex) { // "Reuse" the existing extract to improve final codegen. if (auto *ES = dyn_cast(Scalar)) { - Ex = Builder.CreateExtractElement(ES->getOperand(0), - ES->getOperand(1)); + Value *V = ES->getVectorOperand(); + if (const TreeEntry *ETE = getTreeEntry(V)) + V = ETE->VectorizedValue; + Ex = Builder.CreateExtractElement(V, ES->getIndexOperand()); } else { Ex = Builder.CreateExtractElement(Vec, Lane); } diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather_extract_from_vectorbuild.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather_extract_from_vectorbuild.ll index b762c3a4f1858..dd5c52b99d00e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/gather_extract_from_vectorbuild.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gather_extract_from_vectorbuild.ll @@ -58,3 +58,48 @@ loop: %i5 = extractelement <2 x float> %ins1, i64 1 br label %loop } + +define void @test2() { +; CHECK-LABEL: define void @test2() { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[PH:%.*]] = phi float [ poison, [[BB2:%.*]] ], [ [[TMP3:%.*]], [[LOOP:%.*]] ] +; CHECK-NEXT: unreachable +; CHECK: bb2: +; CHECK-NEXT: br i1 poison, label [[BB3]], label [[BB1:%.*]] +; CHECK: bb3: +; CHECK-NEXT: br label [[LOOP]] +; CHECK: loop: +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[BB3]] ], [ [[TMP2:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x float> zeroinitializer, [[TMP0]] +; CHECK-NEXT: [[TMP2]] = select <2 x i1> zeroinitializer, <2 x float> [[TMP1]], <2 x float> zeroinitializer +; CHECK-NEXT: [[TMP3]] = extractelement <2 x float> [[TMP2]], i64 1 +; CHECK-NEXT: br i1 poison, label [[BB1]], label [[LOOP]] +; +entry: + br label %bb3 + +bb1: + %ph = phi float [ poison, %bb2 ], [ %i5, %loop ] + unreachable + +bb2: + br i1 poison, label %bb3, label %bb1 + +bb3: + br label %loop + +loop: + %ph0 = phi float [ 0.000000e+00, %bb3 ], [ %i4, %loop ] + %ph1 = phi float [ 0.000000e+00, %bb3 ], [ %i5, %loop ] + %i = fadd float 0.000000e+00, %ph0 + %i1 = fadd float 0.000000e+00, %ph1 + %i2 = select i1 false, float %i, float 0.000000e+00 + %i3 = select i1 false, float %i1, float 0.000000e+00 + %ins0 = insertelement <2 x float> zeroinitializer, float %i2, i64 0 + %ins1 = insertelement <2 x float> %ins0, float %i3, i64 1 + %i4 = extractelement <2 x float> %ins1, i64 0 + %i5 = extractelement <2 x float> %ins1, i64 1 + br i1 poison, label %bb1, 
label %loop +} From d02471ede56e727d7d16f61bcdecc4f99a2bda02 Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Thu, 4 Jan 2024 16:14:28 +0100 Subject: [PATCH 240/313] [libc][NFC] Simplify `FPBits` (#76835) This patch reduces the scope of `FPBits` exported variables and functions. It also moves storage up into `FPRep` and tries to make the default and specialized versions of `FPBits` more uniform. The next step is to move the specialization from `FPBits` to `FPRep` so we can manipulate floating point representations through `FPType` alone - that is - independently from the host architecture. --- libc/src/__support/FPUtil/FPBits.h | 204 +++++++++--------- libc/src/__support/FPUtil/fpbits_str.h | 2 +- .../__support/FPUtil/x86_64/LongDoubleBits.h | 140 ++++++------ libc/src/math/generic/log.cpp | 2 +- libc/src/math/generic/log10.cpp | 2 +- libc/src/math/generic/log10f.cpp | 2 +- libc/src/math/generic/log2.cpp | 2 +- libc/src/math/generic/log2f.cpp | 2 +- libc/src/math/generic/logf.cpp | 2 +- libc/test/src/stdlib/strtold_test.cpp | 2 +- libc/test/src/time/difftime_test.cpp | 2 +- 11 files changed, 172 insertions(+), 190 deletions(-) diff --git a/libc/src/__support/FPUtil/FPBits.h b/libc/src/__support/FPUtil/FPBits.h index 8304b76d3d8ad..63eeba85f15e5 100644 --- a/libc/src/__support/FPUtil/FPBits.h +++ b/libc/src/__support/FPUtil/FPBits.h @@ -33,12 +33,6 @@ enum class FPType { namespace internal { -// The type of encoding for supported floating point types. -enum class FPEncoding { - IEEE754, - X86_ExtendedPrecision, -}; - // Defines the layout (sign, exponent, significand) of a floating point type in // memory. It also defines its associated StorageType, i.e., the unsigned // integer type used to manipulate its representation. @@ -49,7 +43,6 @@ template <> struct FPLayout { LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1; LIBC_INLINE_VAR static constexpr int EXP_LEN = 5; LIBC_INLINE_VAR static constexpr int SIG_LEN = 10; - LIBC_INLINE_VAR static constexpr auto ENCODING = FPEncoding::IEEE754; }; template <> struct FPLayout { @@ -57,7 +50,6 @@ template <> struct FPLayout { LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1; LIBC_INLINE_VAR static constexpr int EXP_LEN = 8; LIBC_INLINE_VAR static constexpr int SIG_LEN = 23; - LIBC_INLINE_VAR static constexpr auto ENCODING = FPEncoding::IEEE754; }; template <> struct FPLayout { @@ -65,7 +57,6 @@ template <> struct FPLayout { LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1; LIBC_INLINE_VAR static constexpr int EXP_LEN = 11; LIBC_INLINE_VAR static constexpr int SIG_LEN = 52; - LIBC_INLINE_VAR static constexpr auto ENCODING = FPEncoding::IEEE754; }; template <> struct FPLayout { @@ -73,7 +64,6 @@ template <> struct FPLayout { LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1; LIBC_INLINE_VAR static constexpr int EXP_LEN = 15; LIBC_INLINE_VAR static constexpr int SIG_LEN = 112; - LIBC_INLINE_VAR static constexpr auto ENCODING = FPEncoding::IEEE754; }; template <> struct FPLayout { @@ -81,15 +71,13 @@ template <> struct FPLayout { LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1; LIBC_INLINE_VAR static constexpr int EXP_LEN = 15; LIBC_INLINE_VAR static constexpr int SIG_LEN = 64; - LIBC_INLINE_VAR static constexpr auto ENCODING = - FPEncoding::X86_ExtendedPrecision; }; } // namespace internal -// FPBaseMasksAndShifts derives useful constants from the FPLayout. +// FPRepBase derives useful constants from the FPLayout. 
template -struct FPBaseMasksAndShifts : public internal::FPLayout { +struct FPRepBase : public internal::FPLayout { private: using UP = internal::FPLayout; @@ -149,73 +137,44 @@ struct FPBaseMasksAndShifts : public internal::FPLayout { return StorageType(1) << position; } -public: + // Merge bits from 'a' and 'b' values according to 'mask'. + // Use 'a' bits when corresponding 'mask' bits are zeroes and 'b' bits when + // corresponding bits are ones. + LIBC_INLINE static constexpr StorageType merge(StorageType a, StorageType b, + StorageType mask) { + // https://graphics.stanford.edu/~seander/bithacks.html#MaskedMerge + return a ^ ((a ^ b) & mask); + } + +protected: // The number of bits after the decimal dot when the number is in normal form. LIBC_INLINE_VAR static constexpr int FRACTION_LEN = - UP::ENCODING == internal::FPEncoding::X86_ExtendedPrecision ? SIG_LEN - 1 - : SIG_LEN; + fp_type == FPType::X86_Binary80 ? SIG_LEN - 1 : SIG_LEN; LIBC_INLINE_VAR static constexpr uint32_t MANTISSA_PRECISION = FRACTION_LEN + 1; LIBC_INLINE_VAR static constexpr StorageType FRACTION_MASK = mask_trailing_ones(); -protected: // If a number x is a NAN, then it is a quiet NAN if: // QUIET_NAN_MASK & bits(x) != 0 LIBC_INLINE_VAR static constexpr StorageType QUIET_NAN_MASK = - UP::ENCODING == internal::FPEncoding::X86_ExtendedPrecision + fp_type == FPType::X86_Binary80 ? bit_at(SIG_LEN - 1) | bit_at(SIG_LEN - 2) // 0b1100... : bit_at(SIG_LEN - 1); // 0b1000... // If a number x is a NAN, then it is a signalling NAN if: // SIGNALING_NAN_MASK & bits(x) != 0 LIBC_INLINE_VAR static constexpr StorageType SIGNALING_NAN_MASK = - UP::ENCODING == internal::FPEncoding::X86_ExtendedPrecision + fp_type == FPType::X86_Binary80 ? bit_at(SIG_LEN - 1) | bit_at(SIG_LEN - 3) // 0b1010... : bit_at(SIG_LEN - 2); // 0b0100... -}; - -namespace internal { -// This is a temporary class to unify common methods and properties between -// FPBits and FPBits. -template struct FPRep : private FPBaseMasksAndShifts { - using UP = FPBaseMasksAndShifts; - using typename UP::StorageType; - using UP::TOTAL_LEN; - -protected: - using UP::EXP_SIG_MASK; - using UP::QUIET_NAN_MASK; + // The floating point number representation as an unsigned integer. + StorageType bits = 0; public: - using UP::EXP_BIAS; - using UP::EXP_LEN; - using UP::EXP_MASK; - using UP::EXP_MASK_SHIFT; - using UP::FP_MASK; - using UP::FRACTION_LEN; - using UP::FRACTION_MASK; - using UP::MANTISSA_PRECISION; - using UP::SIGN_MASK; - using UP::STORAGE_LEN; - - // Reinterpreting bits as an integer value and interpreting the bits of an - // integer value as a floating point value is used in tests. So, a convenient - // type is provided for such reinterpretations. 
- StorageType bits; - - LIBC_INLINE constexpr FPRep() : bits(0) {} - LIBC_INLINE explicit constexpr FPRep(StorageType bits) : bits(bits) {} - - LIBC_INLINE constexpr void set_mantissa(StorageType mantVal) { - mantVal &= FRACTION_MASK; - bits &= ~FRACTION_MASK; - bits |= mantVal; - } - - LIBC_INLINE constexpr StorageType get_mantissa() const { - return bits & FRACTION_MASK; + LIBC_INLINE constexpr bool get_sign() const { + return (bits & SIGN_MASK) != 0; } LIBC_INLINE constexpr void set_sign(bool signVal) { @@ -223,21 +182,22 @@ template struct FPRep : private FPBaseMasksAndShifts { bits ^= SIGN_MASK; } - LIBC_INLINE constexpr bool get_sign() const { - return (bits & SIGN_MASK) != 0; + LIBC_INLINE constexpr StorageType get_mantissa() const { + return bits & FRACTION_MASK; } - LIBC_INLINE constexpr void set_biased_exponent(StorageType biased) { - // clear exponent bits - bits &= ~EXP_MASK; - // set exponent bits - bits |= (biased << EXP_MASK_SHIFT) & EXP_MASK; + LIBC_INLINE constexpr void set_mantissa(StorageType mantVal) { + bits = merge(bits, mantVal, FRACTION_MASK); } LIBC_INLINE constexpr uint16_t get_biased_exponent() const { return uint16_t((bits & EXP_MASK) >> EXP_MASK_SHIFT); } + LIBC_INLINE constexpr void set_biased_exponent(StorageType biased) { + bits = merge(bits, biased << EXP_MASK_SHIFT, EXP_MASK); + } + LIBC_INLINE constexpr int get_exponent() const { return int(get_biased_exponent()) - EXP_BIAS; } @@ -266,6 +226,23 @@ template struct FPRep : private FPBaseMasksAndShifts { } }; +namespace internal { + +// Manipulates the representation of a floating point number defined by its +// FPType. This layer is architecture agnostic and does not handle C++ floating +// point types directly ('float', 'double' and 'long double'). Use the FPBits +// below if needed. +// +// TODO: Specialize this class for FPType::X86_Binary80 and remove ad-hoc logic +// from FPRepBase. +template struct FPRep : public FPRepBase { + using UP = FPRepBase; + using typename UP::StorageType; + using UP::FRACTION_LEN; + using UP::FRACTION_MASK; + using UP::MANTISSA_PRECISION; +}; + } // namespace internal // Returns the FPType corresponding to C++ type T on the host. @@ -311,14 +288,16 @@ template struct FPBits : public internal::FPRep()> { static_assert(cpp::is_floating_point_v, "FPBits instantiated with invalid type."); using UP = internal::FPRep()>; - using StorageType = typename UP::StorageType; - using UP::bits; private: using UP::EXP_SIG_MASK; using UP::QUIET_NAN_MASK; + using UP::SIG_LEN; + using UP::SIG_MASK; public: + using StorageType = typename UP::StorageType; + using UP::bits; using UP::EXP_BIAS; using UP::EXP_LEN; using UP::EXP_MASK; @@ -327,46 +306,47 @@ template struct FPBits : public internal::FPRep()> { using UP::FRACTION_MASK; using UP::SIGN_MASK; using UP::TOTAL_LEN; + using UP::UP; using UP::get_biased_exponent; using UP::is_zero; - - // The function return mantissa with the implicit bit set iff the current - // value is a valid normal number. - LIBC_INLINE constexpr StorageType get_explicit_mantissa() { - return ((get_biased_exponent() > 0 && !is_inf_or_nan()) - ? (FRACTION_MASK + 1) - : 0) | - (FRACTION_MASK & bits); - } - + // Constants. 
static constexpr int MAX_BIASED_EXPONENT = (1 << EXP_LEN) - 1; static constexpr StorageType MIN_SUBNORMAL = StorageType(1); static constexpr StorageType MAX_SUBNORMAL = FRACTION_MASK; static constexpr StorageType MIN_NORMAL = (StorageType(1) << FRACTION_LEN); static constexpr StorageType MAX_NORMAL = - ((StorageType(MAX_BIASED_EXPONENT) - 1) << FRACTION_LEN) | MAX_SUBNORMAL; - - // We don't want accidental type promotions/conversions, so we require exact - // type match. - template , int> = 0> - LIBC_INLINE constexpr explicit FPBits(XType x) - : UP(cpp::bit_cast(x)) {} + (StorageType(MAX_BIASED_EXPONENT - 1) << SIG_LEN) | SIG_MASK; - template , int> = 0> - LIBC_INLINE constexpr explicit FPBits(XType x) : UP(x) {} + // Constructors. + LIBC_INLINE constexpr FPBits() = default; - LIBC_INLINE constexpr FPBits() : UP() {} - - LIBC_INLINE constexpr void set_val(T value) { - bits = cpp::bit_cast(value); + template LIBC_INLINE constexpr explicit FPBits(XType x) { + using Unqual = typename cpp::remove_cv_t; + if constexpr (cpp::is_same_v) { + bits = cpp::bit_cast(x); + } else if constexpr (cpp::is_same_v) { + bits = x; + } else { + // We don't want accidental type promotions/conversions, so we require + // exact type match. + static_assert(cpp::always_false); + } } - + // Floating-point conversions. LIBC_INLINE constexpr T get_val() const { return cpp::bit_cast(bits); } LIBC_INLINE constexpr explicit operator T() const { return get_val(); } + // The function return mantissa with the implicit bit set iff the current + // value is a valid normal number. + LIBC_INLINE constexpr StorageType get_explicit_mantissa() { + return ((get_biased_exponent() > 0 && !is_inf_or_nan()) + ? (FRACTION_MASK + 1) + : 0) | + (FRACTION_MASK & bits); + } + LIBC_INLINE constexpr bool is_inf() const { return (bits & EXP_SIG_MASK) == EXP_MASK; } @@ -387,14 +367,22 @@ template struct FPBits : public internal::FPRep()> { return FPBits(bits & EXP_SIG_MASK); } + // Methods below this are used by tests. + LIBC_INLINE static constexpr T zero(bool sign = false) { - return FPBits(sign ? SIGN_MASK : StorageType(0)).get_val(); + StorageType rep = (sign ? SIGN_MASK : StorageType(0)) // sign + | 0 // exponent + | 0; // mantissa + return FPBits(rep).get_val(); } LIBC_INLINE static constexpr T neg_zero() { return zero(true); } LIBC_INLINE static constexpr T inf(bool sign = false) { - return FPBits((sign ? SIGN_MASK : StorageType(0)) | EXP_MASK).get_val(); + StorageType rep = (sign ? SIGN_MASK : StorageType(0)) // sign + | EXP_MASK // exponent + | 0; // mantissa + return FPBits(rep).get_val(); } LIBC_INLINE static constexpr T neg_inf() { return inf(true); } @@ -416,15 +404,24 @@ template struct FPBits : public internal::FPRep()> { } LIBC_INLINE static constexpr T build_nan(StorageType v) { - FPBits bits(inf()); - bits.set_mantissa(v); - return T(bits); + StorageType rep = 0 // sign + | EXP_MASK // exponent + | (v & FRACTION_MASK); // mantissa + return FPBits(rep).get_val(); } LIBC_INLINE static constexpr T build_quiet_nan(StorageType v) { return build_nan(QUIET_NAN_MASK | v); } + LIBC_INLINE static constexpr FPBits + create_value(bool sign, StorageType biased_exp, StorageType mantissa) { + StorageType rep = (sign ? 
SIGN_MASK : StorageType(0)) // sign + | ((biased_exp << EXP_MASK_SHIFT) & EXP_MASK) // exponent + | (mantissa & FRACTION_MASK); // mantissa + return FPBits(rep); + } + // The function convert integer number and unbiased exponent to proper float // T type: // Result = number * 2^(ep+1 - exponent_bias) @@ -452,15 +449,6 @@ template struct FPBits : public internal::FPRep()> { } return result; } - - LIBC_INLINE static constexpr FPBits - create_value(bool sign, StorageType biased_exp, StorageType mantissa) { - FPBits result; - result.set_sign(sign); - result.set_biased_exponent(biased_exp); - result.set_mantissa(mantissa); - return result; - } }; } // namespace fputil diff --git a/libc/src/__support/FPUtil/fpbits_str.h b/libc/src/__support/FPUtil/fpbits_str.h index ce368c89f95ef..50019f32b2c44 100644 --- a/libc/src/__support/FPUtil/fpbits_str.h +++ b/libc/src/__support/FPUtil/fpbits_str.h @@ -45,7 +45,7 @@ template LIBC_INLINE cpp::string str(fputil::FPBits x) { cpp::string s; - const details::ZeroPaddedHexFmt bits(x.bits); + const details::ZeroPaddedHexFmt bits(x.uintval()); s += bits.view(); s += " = (S: "; diff --git a/libc/src/__support/FPUtil/x86_64/LongDoubleBits.h b/libc/src/__support/FPUtil/x86_64/LongDoubleBits.h index 4dc5d25e26982..8abc0c87af0d2 100644 --- a/libc/src/__support/FPUtil/x86_64/LongDoubleBits.h +++ b/libc/src/__support/FPUtil/x86_64/LongDoubleBits.h @@ -30,70 +30,69 @@ template <> struct FPBits : public internal::FPRep { using UP = internal::FPRep; using StorageType = typename UP::StorageType; - using UP::bits; private: + using UP::bits; using UP::EXP_SIG_MASK; using UP::QUIET_NAN_MASK; public: - using UP::EXP_BIAS; - using UP::EXP_LEN; - using UP::EXP_MASK; - using UP::EXP_MASK_SHIFT; - using UP::FP_MASK; - using UP::FRACTION_LEN; - using UP::FRACTION_MASK; - using UP::SIGN_MASK; - using UP::TOTAL_LEN; - - static constexpr int MAX_BIASED_EXPONENT = 0x7FFF; + // Constants. + static constexpr int MAX_BIASED_EXPONENT = (1 << EXP_LEN) - 1; + // The x86 80 bit float represents the leading digit of the mantissa + // explicitly. This is the mask for that bit. + static constexpr StorageType EXPLICIT_BIT_MASK = StorageType(1) + << FRACTION_LEN; + // The X80 significand is made of an explicit bit and the fractional part. + static_assert((EXPLICIT_BIT_MASK & FRACTION_MASK) == 0, + "the explicit bit and the fractional part should not overlap"); + static_assert((EXPLICIT_BIT_MASK | FRACTION_MASK) == SIG_MASK, + "the explicit bit and the fractional part should cover the " + "whole significand"); static constexpr StorageType MIN_SUBNORMAL = StorageType(1); // Subnormal numbers include the implicit bit in x86 long double formats. - static constexpr StorageType MAX_SUBNORMAL = - (StorageType(1) << FRACTION_LEN) - 1; - static constexpr StorageType MIN_NORMAL = (StorageType(3) << FRACTION_LEN); + static constexpr StorageType MAX_SUBNORMAL = FRACTION_MASK; + static constexpr StorageType MIN_NORMAL = + (StorageType(1) << SIG_LEN) | EXPLICIT_BIT_MASK; static constexpr StorageType MAX_NORMAL = - (StorageType(MAX_BIASED_EXPONENT - 1) << (FRACTION_LEN + 1)) | - (StorageType(1) << FRACTION_LEN) | MAX_SUBNORMAL; - - LIBC_INLINE constexpr StorageType get_explicit_mantissa() const { - // The x86 80 bit float represents the leading digit of the mantissa - // explicitly. This is the mask for that bit. 
- constexpr StorageType EXPLICIT_BIT_MASK = StorageType(1) << FRACTION_LEN; - return bits & (FRACTION_MASK | EXPLICIT_BIT_MASK); + (StorageType(MAX_BIASED_EXPONENT - 1) << SIG_LEN) | SIG_MASK; + + // Constructors. + LIBC_INLINE constexpr FPBits() = default; + + template LIBC_INLINE constexpr explicit FPBits(XType x) { + using Unqual = typename cpp::remove_cv_t; + if constexpr (cpp::is_same_v) { + bits = cpp::bit_cast(x); + } else if constexpr (cpp::is_same_v) { + bits = x; + } else { + // We don't want accidental type promotions/conversions, so we require + // exact type match. + static_assert(cpp::always_false); + } } - LIBC_INLINE constexpr void set_implicit_bit(bool implicitVal) { - bits &= ~(StorageType(1) << FRACTION_LEN); - bits |= (StorageType(implicitVal) << FRACTION_LEN); + // Floating-point conversions. + LIBC_INLINE constexpr long double get_val() const { + return cpp::bit_cast(bits); } - LIBC_INLINE constexpr bool get_implicit_bit() const { - return bool((bits & (StorageType(1) << FRACTION_LEN)) >> FRACTION_LEN); + LIBC_INLINE constexpr operator long double() const { + return cpp::bit_cast(bits); } - LIBC_INLINE constexpr FPBits() : UP() {} - - template , int> = 0> - LIBC_INLINE constexpr explicit FPBits(XType x) - : UP(cpp::bit_cast(x)) { - // bits starts uninitialized, and setting it to a long double only - // overwrites the first 80 bits. This clears those upper bits. - bits = bits & ((StorageType(1) << 80) - 1); + LIBC_INLINE constexpr StorageType get_explicit_mantissa() const { + return bits & SIG_MASK; } - template , int> = 0> - LIBC_INLINE constexpr explicit FPBits(XType x) : UP(x) {} - - LIBC_INLINE constexpr operator long double() { - return cpp::bit_cast(bits); + LIBC_INLINE constexpr bool get_implicit_bit() const { + return bits & EXPLICIT_BIT_MASK; } - LIBC_INLINE constexpr long double get_val() const { - return cpp::bit_cast(bits); + LIBC_INLINE constexpr void set_implicit_bit(bool implicitVal) { + if (get_implicit_bit() != implicitVal) + bits ^= EXPLICIT_BIT_MASK; } LIBC_INLINE constexpr bool is_inf() const { @@ -117,34 +116,26 @@ struct FPBits : public internal::FPRep { // Methods below this are used by tests. - LIBC_INLINE static constexpr long double zero() { return 0.0l; } + LIBC_INLINE static constexpr long double zero(bool sign = false) { + StorageType rep = (sign ? SIGN_MASK : StorageType(0)) // sign + | 0 // exponent + | 0 // explicit bit + | 0; // mantissa + return FPBits(rep).get_val(); + } - LIBC_INLINE static constexpr long double neg_zero() { return -0.0l; } + LIBC_INLINE static constexpr long double neg_zero() { return zero(true); } LIBC_INLINE static constexpr long double inf(bool sign = false) { - FPBits bits(0.0l); - bits.set_biased_exponent(MAX_BIASED_EXPONENT); - bits.set_implicit_bit(1); - if (sign) { - bits.set_sign(true); - } - return bits.get_val(); + StorageType rep = (sign ? 
SIGN_MASK : StorageType(0)) // sign + | EXP_MASK // exponent + | EXPLICIT_BIT_MASK // explicit bit + | 0; // mantissa + return FPBits(rep).get_val(); } LIBC_INLINE static constexpr long double neg_inf() { return inf(true); } - LIBC_INLINE static constexpr long double build_nan(StorageType v) { - FPBits bits(0.0l); - bits.set_biased_exponent(MAX_BIASED_EXPONENT); - bits.set_implicit_bit(1); - bits.set_mantissa(v); - return bits; - } - - LIBC_INLINE static constexpr long double build_quiet_nan(StorageType v) { - return build_nan(QUIET_NAN_MASK | v); - } - LIBC_INLINE static constexpr long double min_normal() { return FPBits(MIN_NORMAL).get_val(); } @@ -161,13 +152,16 @@ struct FPBits : public internal::FPRep { return FPBits(MAX_SUBNORMAL).get_val(); } - LIBC_INLINE static constexpr FPBits - create_value(bool sign, StorageType biased_exp, StorageType mantissa) { - FPBits result; - result.set_sign(sign); - result.set_biased_exponent(biased_exp); - result.set_mantissa(mantissa); - return result; + LIBC_INLINE static constexpr long double build_nan(StorageType v) { + StorageType rep = 0 // sign + | EXP_MASK // exponent + | EXPLICIT_BIT_MASK // explicit bit + | (v & FRACTION_MASK); // mantissa + return FPBits(rep).get_val(); + } + + LIBC_INLINE static constexpr long double build_quiet_nan(StorageType v) { + return build_nan(QUIET_NAN_MASK | v); } }; diff --git a/libc/src/math/generic/log.cpp b/libc/src/math/generic/log.cpp index f82683bcc0554..8f22fdea93217 100644 --- a/libc/src/math/generic/log.cpp +++ b/libc/src/math/generic/log.cpp @@ -758,7 +758,7 @@ LLVM_LIBC_FUNCTION(double, log, (double x)) { return x; } // Normalize denormal inputs. - xbits.set_val(x * 0x1.0p52); + xbits = FPBits_t(x * 0x1.0p52); x_e -= 52; x_u = xbits.uintval(); } diff --git a/libc/src/math/generic/log10.cpp b/libc/src/math/generic/log10.cpp index d2b94f22687fb..df82f24dc6967 100644 --- a/libc/src/math/generic/log10.cpp +++ b/libc/src/math/generic/log10.cpp @@ -759,7 +759,7 @@ LLVM_LIBC_FUNCTION(double, log10, (double x)) { return x; } // Normalize denormal inputs. - xbits.set_val(x * 0x1.0p52); + xbits = FPBits_t(x * 0x1.0p52); x_e -= 52; x_u = xbits.uintval(); } diff --git a/libc/src/math/generic/log10f.cpp b/libc/src/math/generic/log10f.cpp index b70183958899c..da69f7f5ad4d9 100644 --- a/libc/src/math/generic/log10f.cpp +++ b/libc/src/math/generic/log10f.cpp @@ -177,7 +177,7 @@ LLVM_LIBC_FUNCTION(float, log10f, (float x)) { return x; } // Normalize denormal inputs. - xbits.set_val(xbits.get_val() * 0x1.0p23f); + xbits = FPBits(xbits.get_val() * 0x1.0p23f); m -= 23; x_u = xbits.uintval(); } diff --git a/libc/src/math/generic/log2.cpp b/libc/src/math/generic/log2.cpp index a333a075fe5af..1427d1934db90 100644 --- a/libc/src/math/generic/log2.cpp +++ b/libc/src/math/generic/log2.cpp @@ -880,7 +880,7 @@ LLVM_LIBC_FUNCTION(double, log2, (double x)) { return x; } // Normalize denormal inputs. - xbits.set_val(x * 0x1.0p52); + xbits = FPBits_t(x * 0x1.0p52); x_e -= 52; x_u = xbits.uintval(); } diff --git a/libc/src/math/generic/log2f.cpp b/libc/src/math/generic/log2f.cpp index e7aeda723b50a..07dedba85e627 100644 --- a/libc/src/math/generic/log2f.cpp +++ b/libc/src/math/generic/log2f.cpp @@ -83,7 +83,7 @@ LLVM_LIBC_FUNCTION(float, log2f, (float x)) { return x; } // Normalize denormal inputs. 
- xbits.set_val(xbits.get_val() * 0x1.0p23f); + xbits = FPBits(xbits.get_val() * 0x1.0p23f); m -= 23; } diff --git a/libc/src/math/generic/logf.cpp b/libc/src/math/generic/logf.cpp index 2ebdddbb2d16d..f1f93468479b1 100644 --- a/libc/src/math/generic/logf.cpp +++ b/libc/src/math/generic/logf.cpp @@ -87,7 +87,7 @@ LLVM_LIBC_FUNCTION(float, logf, (float x)) { return static_cast(FPBits::neg_inf()); } // Normalize denormal inputs. - xbits.set_val(xbits.get_val() * 0x1.0p23f); + xbits = FPBits(xbits.get_val() * 0x1.0p23f); m -= 23; x_u = xbits.uintval(); } diff --git a/libc/test/src/stdlib/strtold_test.cpp b/libc/test/src/stdlib/strtold_test.cpp index 37db385c959bf..86ac8a6e26d7c 100644 --- a/libc/test/src/stdlib/strtold_test.cpp +++ b/libc/test/src/stdlib/strtold_test.cpp @@ -87,7 +87,7 @@ class LlvmLibcStrToLDTest : public LIBC_NAMESPACE::testing::Test { EXPECT_EQ(str_end - inputString, expectedStrLen); - EXPECT_EQ(actual_fp.bits, expected_fp.bits); + EXPECT_EQ(actual_fp.uintval(), expected_fp.uintval()); EXPECT_EQ(actual_fp.get_sign(), expected_fp.get_sign()); EXPECT_EQ(actual_fp.get_exponent(), expected_fp.get_exponent()); EXPECT_EQ(actual_fp.get_mantissa(), expected_fp.get_mantissa()); diff --git a/libc/test/src/time/difftime_test.cpp b/libc/test/src/time/difftime_test.cpp index 187df4add1e72..9c3e8d3f30f2b 100644 --- a/libc/test/src/time/difftime_test.cpp +++ b/libc/test/src/time/difftime_test.cpp @@ -31,7 +31,7 @@ TEST(LlvmLibcDifftime, SmokeTest) { actual_fp = LIBC_NAMESPACE::fputil::FPBits( static_cast(result)); - EXPECT_EQ(actual_fp.bits, expected_fp.bits); + EXPECT_EQ(actual_fp.uintval(), expected_fp.uintval()); EXPECT_EQ(actual_fp.get_sign(), expected_fp.get_sign()); EXPECT_EQ(actual_fp.get_exponent(), expected_fp.get_exponent()); EXPECT_EQ(actual_fp.get_mantissa(), expected_fp.get_mantissa()); From 460ffcddd900b7a869a34790888e3075c6ec6549 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 4 Jan 2024 22:31:18 +0700 Subject: [PATCH 241/313] AMDGPU: Make bf16/v2bf16 legal types (#76215) There are some intrinsics are using i16 vectors in place of bfloat vectors. Move towards making bf16 vectors legal so these can migrate. Leave the larger vectors for a later change. 
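As a rough illustration of the effect (a hypothetical example, not one of the lit tests updated below), IR such as

define bfloat @sketch_fadd_bf16(bfloat %a, bfloat %b) {
  ; bf16 values now live in the 16/32-bit register classes added below; the
  ; fadd is promoted to f32 by the setOperationAction changes in
  ; SIISelLowering.cpp rather than bf16 being legalized away entirely.
  %r = fadd bfloat %a, %b
  ret bfloat %r
}

can now be selected with bf16 treated as a legal register type.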
Depends #76213 #76214 --- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 26 +- llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td | 26 +- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 1 + llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 37 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 98 +- llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 + llvm/lib/Target/AMDGPU/SIInstructions.td | 42 +- llvm/test/CodeGen/AMDGPU/bf16.ll | 17608 +++++++--------- llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll | 12 +- .../test/CodeGen/AMDGPU/fmed3-cast-combine.ll | 16 +- .../CodeGen/AMDGPU/fneg-modifier-casting.ll | 10 +- .../CodeGen/AMDGPU/function-args-inreg.ll | 4 +- llvm/test/CodeGen/AMDGPU/function-args.ll | 167 +- llvm/test/CodeGen/AMDGPU/function-returns.ll | 65 +- .../AMDGPU/gfx-callable-argument-types.ll | 128 +- .../isel-amdgpu-cs-chain-preserve-cc.ll | 16 +- llvm/test/CodeGen/AMDGPU/llvm.exp.ll | 25 +- llvm/test/CodeGen/AMDGPU/llvm.exp10.ll | 25 +- llvm/test/CodeGen/AMDGPU/llvm.exp2.ll | 58 +- .../CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll | 402 +- llvm/test/CodeGen/AMDGPU/llvm.log.ll | 11 +- llvm/test/CodeGen/AMDGPU/llvm.log10.ll | 11 +- llvm/test/CodeGen/AMDGPU/llvm.log2.ll | 62 +- llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll | 4 +- llvm/test/CodeGen/AMDGPU/select-undef.ll | 3 +- .../CodeGen/AMDGPU/vector_shuffle.packed.ll | 850 +- 26 files changed, 8957 insertions(+), 10751 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 4e317062cec49..296ed3a3c3dc1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -3199,7 +3199,16 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { return true; } break; - case ISD::FP_ROUND: + case ISD::FP_ROUND: { + EVT VT = Node->getValueType(0); + if (VT.getScalarType() == MVT::bf16) { + Results.push_back( + DAG.getNode(ISD::FP_TO_BF16, SDLoc(Node), VT, Node->getOperand(0))); + break; + } + + LLVM_FALLTHROUGH; + } case ISD::BITCAST: if ((Tmp1 = EmitStackConvert(Node->getOperand(0), Node->getValueType(0), Node->getValueType(0), dl))) @@ -3226,12 +3235,19 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { return true; } break; - case ISD::FP_EXTEND: - if ((Tmp1 = EmitStackConvert(Node->getOperand(0), - Node->getOperand(0).getValueType(), - Node->getValueType(0), dl))) + case ISD::FP_EXTEND: { + SDValue Op = Node->getOperand(0); + EVT SrcVT = Op.getValueType(); + EVT DstVT = Node->getValueType(0); + if (SrcVT.getScalarType() == MVT::bf16) { + Results.push_back(DAG.getNode(ISD::BF16_TO_FP, SDLoc(Node), DstVT, Op)); + break; + } + + if ((Tmp1 = EmitStackConvert(Op, SrcVT, DstVT, dl))) Results.push_back(Tmp1); break; + } case ISD::BF16_TO_FP: { // Always expand bf16 to f32 casts, they lower to ext + shift. 
// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td index 9036b26a6f6bc..c5207228dc913 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -22,28 +22,28 @@ def CC_SI_Gfx : CallingConv<[ // 32 is reserved for the stack pointer // 33 is reserved for the frame pointer // 34 is reserved for the base pointer - CCIfInReg>>, - CCIfNotInReg>>, - CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>> + CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1, bf16, v2bf16], CCAssignToStack<4, 4>> ]>; def RetCC_SI_Gfx : CallingConv<[ CCIfType<[i1], CCPromoteToType>, CCIfType<[i1, i16], CCIfExtend>>, - CCIfNotInReg>>, // 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs. - CCIfNotInReg>, // 32*4 + 4 is the minimum for a fetch shader with 32 outputs. - CCIfType<[f32, f16, v2f16] , CCAssignToReg<[ + CCIfType<[f32, f16, v2f16, bf16, v2bf16] , CCAssignToReg<[ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, @@ -188,23 +188,23 @@ def CC_AMDGPU_Func : CallingConv<[ CCIfType<[i1], CCPromoteToType>, CCIfType<[i8, i16], CCIfExtend>>, - CCIfInReg("SGPR"#i)) // SGPR0-29 >>>, - CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1], CCAssignToReg<[ + CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1, bf16, v2bf16], CCAssignToReg<[ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>, - CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>> + CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1, bf16, v2bf16], CCAssignToStack<4, 4>> ]>; // Calling convention for leaf functions def RetCC_AMDGPU_Func : CallingConv<[ CCIfType<[i1], CCPromoteToType>, CCIfType<[i1, i16], CCIfExtend>>, - CCIfType<[i32, f32, i16, f16, v2i16, v2f16], CCAssignToReg<[ + CCIfType<[i32, f32, i16, f16, v2i16, v2f16, bf16, v2bf16], CCAssignToReg<[ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, @@ -223,11 +223,11 @@ def CC_AMDGPU : CallingConv<[ ]>; def CC_AMDGPU_CS_CHAIN : CallingConv<[ - CCIfInReg("SGPR"#i)) >>>, - CCIfNotInReg("VGPR"#i)) >>> ]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 48ee0d942867e..18f434be3cd3f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -303,6 +303,7 @@ void AMDGPUDAGToDAGISel::PreprocessISelDAG() { switch (N->getOpcode()) { case ISD::BUILD_VECTOR: + // TODO: Match load d16 from shl (extload:i16), 16 MadeChange |= matchLoadD16FromBuildVector(N); break; default: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 8fbc90a6db9fd..b70d33d58b748 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3281,7 +3281,15 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext); } - assert(SrcVT == MVT::i64 && "operation should be legal"); + if (DestVT == MVT::bf16) { + SDLoc SL(Op); + SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, 
SL, MVT::f32, Src); + SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true); + return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag); + } + + if (SrcVT != MVT::i64) + return Op; if (Subtarget->has16BitInsts() && DestVT == MVT::f16) { SDLoc DL(Op); @@ -3319,7 +3327,15 @@ SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext); } - assert(SrcVT == MVT::i64 && "operation should be legal"); + if (DestVT == MVT::bf16) { + SDLoc SL(Op); + SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src); + SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true); + return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag); + } + + if (SrcVT != MVT::i64) + return Op; // TODO: Factor out code common with LowerUINT_TO_FP. @@ -3517,7 +3533,7 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) con return DAG.getZExtOrTrunc(V, DL, Op.getValueType()); } -SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op, +SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op, SelectionDAG &DAG) const { SDValue Src = Op.getOperand(0); unsigned OpOpcode = Op.getOpcode(); @@ -3528,6 +3544,12 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op, if (SrcVT == MVT::f16 && DestVT == MVT::i16) return Op; + if (SrcVT == MVT::bf16) { + SDLoc DL(Op); + SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src); + return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc); + } + // Promote i16 to i32 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) { SDLoc DL(Op); @@ -3536,6 +3558,9 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op, return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32); } + if (DestVT != MVT::i64) + return Op; + if (SrcVT == MVT::f16 || (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) { SDLoc DL(Op); @@ -3546,7 +3571,7 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op, return DAG.getNode(Ext, DL, MVT::i64, FpToInt32); } - if (DestVT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) + if (SrcVT == MVT::f32 || SrcVT == MVT::f64) return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT); return SDValue(); @@ -4947,7 +4972,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y)) if (DestVT.isVector()) { SDValue Src = N->getOperand(0); - if (Src.getOpcode() == ISD::BUILD_VECTOR) { + if (Src.getOpcode() == ISD::BUILD_VECTOR && + (DCI.getDAGCombineLevel() < AfterLegalizeDAG || + isOperationLegal(ISD::BUILD_VECTOR, DestVT))) { EVT SrcVT = Src.getValueType(); unsigned NElts = DestVT.getVectorNumElements(); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index b481ae43e8215..a89ef658734b2 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -151,14 +151,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->useRealTrue16Insts()) { addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass); addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass); + addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass); } else { addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass); addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass); + addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass); } // Unless there are also VOP3P operations, not operations are really legal. 
addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass); addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass); + addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass); addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass); @@ -196,6 +199,41 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, MVT::i1, MVT::v32i32}, Custom); + if (isTypeLegal(MVT::bf16)) { + for (unsigned Opc : + {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV, + ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM, + ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT, + ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI, + ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2, + ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10, + ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT, + ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE, + ISD::SETCC}) { + // FIXME: The promoted to type shouldn't need to be explicit + setOperationAction(Opc, MVT::bf16, Promote); + AddPromotedToType(Opc, MVT::bf16, MVT::f32); + } + + setOperationAction(ISD::FP_ROUND, MVT::bf16, Expand); + + setOperationAction(ISD::SELECT, MVT::bf16, Promote); + AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16); + + // TODO: Could make these legal + setOperationAction(ISD::FABS, MVT::bf16, Expand); + setOperationAction(ISD::FNEG, MVT::bf16, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand); + + // We only need to custom lower because we can't specify an action for bf16 + // sources. + setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); + + setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Promote); + AddPromotedToType(ISD::BUILD_VECTOR, MVT::v2bf16, MVT::v2i16); + } + setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand); setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); @@ -388,8 +426,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // Avoid stack access for these. // TODO: Generalize to more vector types. setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT}, - {MVT::v2i16, MVT::v2f16, MVT::v2i8, MVT::v4i8, MVT::v8i8, - MVT::v4i16, MVT::v4f16}, + {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8, + MVT::v8i8, MVT::v4i16, MVT::v4f16}, Custom); // Deal with vec3 vector operations when widened to vec4. @@ -498,6 +536,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand); setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand); + // Custom lower these because we can't specify a rule based on an illegal + // source bf16. 
+ setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom); + setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom); + if (Subtarget->has16BitInsts()) { setOperationAction({ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT}, @@ -524,9 +567,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32); setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::i16, Custom); + setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom); + setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom); + + setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i32, Custom); // F16 - Constant Actions. setOperationAction(ISD::ConstantFP, MVT::f16, Legal); + setOperationAction(ISD::ConstantFP, MVT::bf16, Legal); // F16 - Load/Store Actions. setOperationAction(ISD::LOAD, MVT::f16, Promote); @@ -534,16 +582,23 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::STORE, MVT::f16, Promote); AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16); + // BF16 - Load/Store Actions. + setOperationAction(ISD::LOAD, MVT::bf16, Promote); + AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16); + setOperationAction(ISD::STORE, MVT::bf16, Promote); + AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16); + // F16 - VOP1 Actions. setOperationAction({ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS, ISD::FSIN, ISD::FROUND, ISD::FPTRUNC_ROUND}, MVT::f16, Custom); - setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom); setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote); + setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::bf16, Promote); // F16 - VOP2 Actions. - setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, MVT::f16, Expand); + setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16}, + Expand); setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom); setOperationAction(ISD::FFREXP, MVT::f16, Custom); setOperationAction(ISD::FDIV, MVT::f16, Custom); @@ -554,8 +609,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMAD, MVT::f16, Legal); for (MVT VT : - {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16, - MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v32i16, MVT::v32f16}) { + {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16, + MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, + MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -587,7 +643,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // XXX - Do these do anything? Vector constants turn into build_vector. 
setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal); - setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16}, Legal); + setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, + Legal); setOperationAction(ISD::STORE, MVT::v2i16, Promote); AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32); @@ -699,7 +756,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE}, MVT::v2f16, Legal); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16}, + setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, @@ -3902,6 +3959,26 @@ SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const { return Op; } +// Work around DAG legality rules only based on the result type. +SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { + bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND; + SDValue Src = Op.getOperand(IsStrict ? 1 : 0); + EVT SrcVT = Src.getValueType(); + + if (SrcVT.getScalarType() != MVT::bf16) + return Op; + + SDLoc SL(Op); + SDValue BitCast = + DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src); + + EVT DstVT = Op.getValueType(); + if (IsStrict) + llvm_unreachable("Need STRICT_BF16_TO_FP"); + + return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast); +} + Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const { Register Reg = StringSwitch(RegName) @@ -5452,6 +5529,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return lowerGET_ROUNDING(Op, DAG); case ISD::PREFETCH: return lowerPREFETCH(Op, DAG); + case ISD::FP_EXTEND: + case ISD::STRICT_FP_EXTEND: + return lowerFP_EXTEND(Op, DAG); } return SDValue(); } @@ -6639,7 +6719,7 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec); SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx); - if (ResultVT == MVT::f16) { + if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) { SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt); return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 5bc091d6e84de..00f9ddf11ea7a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -417,6 +417,7 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const; SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; Register getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const override; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 0f127276cfd1b..c299206079670 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1122,7 +1122,7 @@ multiclass f16_fp_Pats; def : GCNPat < - (f64 (fpextend f16:$src)), + (f64 (any_fpextend f16:$src)), (V_CVT_F64_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src)) >; @@ -1515,6 +1515,23 @@ def : BitConvert ; def : BitConvert ; def : BitConvert ; def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : 
BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; + // 64-bit bitcast def : BitConvert ; @@ -1958,19 +1975,21 @@ def : GCNPat < let SubtargetPredicate = HasPackedFP32Ops; } +foreach fp16vt = [f16, bf16] in { + def : GCNPat < - (fcopysign f16:$src0, f16:$src1), + (fcopysign fp16vt:$src0, fp16vt:$src1), (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1) >; def : GCNPat < - (fcopysign f32:$src0, f16:$src1), + (fcopysign f32:$src0, fp16vt:$src1), (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0, (V_LSHLREV_B32_e64 (i32 16), $src1)) >; def : GCNPat < - (fcopysign f64:$src0, f16:$src1), + (fcopysign f64:$src0, fp16vt:$src1), (REG_SEQUENCE SReg_64, (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)), @@ -1978,16 +1997,17 @@ def : GCNPat < >; def : GCNPat < - (fcopysign f16:$src0, f32:$src1), + (fcopysign fp16vt:$src0, f32:$src1), (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, (V_LSHRREV_B32_e64 (i32 16), $src1)) >; def : GCNPat < - (fcopysign f16:$src0, f64:$src1), + (fcopysign fp16vt:$src0, f64:$src1), (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1))) >; +} // End foreach fp16vt = [f16, bf16] /********** ================== **********/ /********** Immediate Patterns **********/ @@ -2026,6 +2046,11 @@ def : GCNPat < (V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm))) >; +def : GCNPat < + (VGPRImm<(bf16 fpimm)>:$imm), + (V_MOV_B32_e32 (bf16 (bitcast_fpimm_to_i32 $imm))) +>; + // V_MOV_B64_PSEUDO and S_MOV_B64_IMM_PSEUDO can be used with any 64-bit // immediate and wil be expanded as needed, but we will only use these patterns // for values which can be encoded. 
@@ -2059,6 +2084,11 @@ def : GCNPat < (S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm))) >; +def : GCNPat < + (bf16 fpimm:$imm), + (S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm))) +>; + def : GCNPat < (p5 frameindex:$fi), (V_MOV_B32_e32 (p5 (frameindex_to_targetframeindex $fi))) diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index adc23860e8965..5243262117e72 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -164,18 +164,13 @@ define <3 x bfloat> @v_load_global_v3bf16(ptr addrspace(1) %ptr) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_load_global_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_load_global_v3bf16: @@ -183,9 +178,6 @@ define <3 x bfloat> @v_load_global_v3bf16(ptr addrspace(1) %ptr) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_load_global_v3bf16: @@ -193,10 +185,6 @@ define <3 x bfloat> @v_load_global_v3bf16(ptr addrspace(1) %ptr) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <3 x bfloat>, ptr addrspace(1) %ptr ret <3 x bfloat> %load @@ -2083,29 +2071,29 @@ define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1) ; GFX9-LABEL: test_load_store_bf16_to_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: global_load_short_d16_hi v4, v[0:1], off +; GFX9-NEXT: global_load_ushort v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[2:3], v4, off +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_load_store_bf16_to_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: global_load_short_d16_hi v4, v[0:1], off +; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v[2:3], v4, off +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: test_load_store_bf16_to_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: global_load_d16_hi_b16 v4, v[0:1], off +; GFX11-NEXT: global_load_u16 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_store_b32 v[2:3], 
v4, off +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: global_store_b32 v[2:3], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load bfloat, ptr addrspace(1) %in %val.f32 = fpext bfloat %val to float @@ -2158,10 +2146,10 @@ define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1) ; GFX9-LABEL: test_load_store_bf16_to_f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: global_load_short_d16_hi v4, v[0:1], off +; GFX9-NEXT: global_load_ushort v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2169,20 +2157,21 @@ define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1) ; GFX10-LABEL: test_load_store_bf16_to_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: global_load_short_d16_hi v4, v[0:1], off +; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: test_load_store_bf16_to_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: global_load_d16_hi_b16 v4, v[0:1], off +; GFX11-NEXT: global_load_u16 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off ; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load bfloat, ptr addrspace(1) %in @@ -2503,7 +2492,6 @@ define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) { ; GFX8-LABEL: test_arg_store: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: flat_store_short v[1:2], v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2511,20 +2499,20 @@ define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) { ; GFX9-LABEL: test_arg_store: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_store_short_d16_hi v[1:2], v0, off +; GFX9-NEXT: global_store_short v[1:2], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_arg_store: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_store_short_d16_hi v[1:2], v0, off +; GFX10-NEXT: global_store_short v[1:2], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: test_arg_store: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_store_d16_hi_b16 v[1:2], v0, off +; GFX11-NEXT: global_store_b16 v[1:2], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] store bfloat %in, ptr addrspace(1) %out ret void @@ -2905,8 +2893,7 @@ define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1) ; GFX8-LABEL: test_inreg_arg_store: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s34, s4, 16 -; GFX8-NEXT: 
v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2915,7 +2902,7 @@ define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1) ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: global_store_short_d16_hi v[0:1], v2, off +; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2923,14 +2910,14 @@ define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1) ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: global_store_short_d16_hi v[0:1], v2, off +; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: test_inreg_arg_store: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-NEXT: global_store_d16_hi_b16 v[0:1], v2, off +; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-NEXT: s_setpc_b64 s[30:31] store bfloat %in, ptr addrspace(1) %out ret void @@ -2956,28 +2943,27 @@ define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) { ; GFX8-LABEL: test_byval: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX8-NEXT: buffer_store_short v1, off, s[0:3], s32 +; GFX8-NEXT: buffer_store_short v0, off, s[0:3], s32 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_byval: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], s32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_byval: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 +; GFX10-NEXT: buffer_store_short v0, off, s[0:3], s32 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: test_byval: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: scratch_store_d16_hi_b16 off, v0, s32 +; GFX11-NEXT: scratch_store_b16 off, v0, s32 ; GFX11-NEXT: s_setpc_b64 s[30:31] store bfloat %val, ptr addrspace(5) %bv %retval = load bfloat, ptr addrspace(5) %bv @@ -3004,7 +2990,6 @@ define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) { ; GFX8-LABEL: test_sret: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -3012,20 +2997,20 @@ define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) { ; GFX9-LABEL: test_sret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen +; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_sret: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: test_sret: ; GFX11: ; %bb.0: ; GFX11-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: scratch_store_d16_hi_b16 v0, v1, off +; GFX11-NEXT: scratch_store_b16 v0, v1, off ; GFX11-NEXT: s_setpc_b64 s[30:31] store bfloat %val, ptr addrspace(5) %sret ret void @@ -3245,25 +3230,21 @@ define <3 x bfloat> @test_ret_v3bf16(<3 x bfloat> %in) { ; GFX8-LABEL: test_ret_v3bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_ret_v3bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_ret_v3bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: test_ret_v3bf16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: ret <3 x bfloat> %in @@ -3450,7 +3431,6 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX8-NEXT: v_writelane_b32 v2, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v2, 1 @@ -3480,7 +3460,7 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX9-NEXT: v_writelane_b32 v2, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: buffer_store_short_d16_hi v0, v1, s[0:3], 0 offen +; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v2, 1 ; GFX9-NEXT: v_readlane_b32 s30, v2, 0 @@ -3510,7 +3490,7 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX10-NEXT: v_writelane_b32 v2, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: buffer_store_short_d16_hi v0, v1, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v2, 1 ; GFX10-NEXT: v_readlane_b32 s30, v2, 0 @@ -3540,7 +3520,7 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX11-NEXT: v_writelane_b32 v2, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: scratch_store_d16_hi_b16 v1, v0, off dlc +; GFX11-NEXT: scratch_store_b16 v1, v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_readlane_b32 s31, v2, 1 ; GFX11-NEXT: v_readlane_b32 s30, v2, 0 @@ -3836,7 +3816,6 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX8-NEXT: v_writelane_b32 v3, s30, 0 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: v_writelane_b32 v3, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3869,7 +3848,6 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v3, s30, 0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_writelane_b32 v3, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3902,7 +3880,6 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX10-NEXT: v_writelane_b32 v3, s30, 0 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_writelane_b32 v3, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3935,7 +3912,6 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX11-NEXT: v_writelane_b32 v3, s30, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-NEXT: v_writelane_b32 v3, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -4061,18 +4037,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: v_writelane_b32 v3, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX8-NEXT: buffer_store_short v1, v6, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v2 -; GFX8-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v2 +; GFX8-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v2 -; GFX8-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v3, 1 ; GFX8-NEXT: v_readlane_b32 s30, v3, 0 @@ -4101,13 +4069,9 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: v_writelane_b32 v3, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: buffer_store_short_d16_hi v1, v2, s[0:3], 0 offen offset:6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short_d16_hi v0, v2, s[0:3], 0 offen offset:2 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen +; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-NEXT: v_readlane_b32 s30, v3, 0 @@ -4137,13 +4101,9 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: v_writelane_b32 v3, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: buffer_store_short_d16_hi v1, v2, s[0:3], 0 offen offset:6 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 +; GFX10-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short_d16_hi v0, v2, s[0:3], 0 offen offset:2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v3, 1 ; GFX10-NEXT: 
v_readlane_b32 s30, v3, 0 @@ -4173,18 +4133,12 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: v_writelane_b32 v3, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: v_add_nc_u32_e32 v4, 6, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_readlane_b32 s31, v3, 1 -; GFX11-NEXT: v_readlane_b32 s30, v3, 0 -; GFX11-NEXT: scratch_store_d16_hi_b16 v4, v1, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b16 v2, v1, off offset:4 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_d16_hi_b16 v2, v0, off offset:2 dlc +; GFX11-NEXT: scratch_store_b32 v2, v1, off offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b16 v2, v0, off dlc +; GFX11-NEXT: scratch_store_b32 v2, v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_readlane_b32 s31, v3, 1 +; GFX11-NEXT: v_readlane_b32 s30, v3, 0 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 @@ -4333,32 +4287,16 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: v_writelane_b32 v5, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 12, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; GFX8-NEXT: buffer_store_short v3, v10, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 12, v4 +; GFX8-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 8, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX8-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 14, v4 -; GFX8-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 10, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX8-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v4 -; GFX8-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v4 -; GFX8-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v5, 1 ; GFX8-NEXT: v_readlane_b32 s30, v5, 0 @@ -4387,21 +4325,13 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: v_writelane_b32 v5, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: buffer_store_short_d16_hi v3, v4, s[0:3], 0 offen offset:14 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen offset:12 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short_d16_hi v2, v4, s[0:3], 0 offen offset:10 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v2, v4, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12 ; 
GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short_d16_hi v1, v4, s[0:3], 0 offen offset:6 +; GFX9-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v1, v4, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short_d16_hi v0, v4, s[0:3], 0 offen offset:2 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen +; GFX9-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v5, 1 ; GFX9-NEXT: v_readlane_b32 s30, v5, 0 @@ -4431,21 +4361,13 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: v_writelane_b32 v5, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: buffer_store_short_d16_hi v3, v4, s[0:3], 0 offen offset:14 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen offset:12 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short_d16_hi v2, v4, s[0:3], 0 offen offset:10 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short v2, v4, s[0:3], 0 offen offset:8 +; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short_d16_hi v1, v4, s[0:3], 0 offen offset:6 +; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short_d16_hi v0, v4, s[0:3], 0 offen offset:2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v5, 1 ; GFX10-NEXT: v_readlane_b32 s30, v5, 0 @@ -4475,28 +4397,18 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: v_writelane_b32 v5, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: v_add_nc_u32_e32 v6, 14, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 12, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 10, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v9, 6, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 12, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_readlane_b32 s31, v5, 1 -; GFX11-NEXT: scratch_store_d16_hi_b16 v6, v3, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b16 v7, v3, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_d16_hi_b16 v8, v2, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b16 v4, v2, off offset:8 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_d16_hi_b16 v9, v1, off dlc +; GFX11-NEXT: v_readlane_b32 s30, v5, 0 +; GFX11-NEXT: scratch_store_b32 v6, v3, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b16 v4, v1, off offset:4 dlc +; GFX11-NEXT: scratch_store_b32 v4, v2, off offset:8 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_d16_hi_b16 v4, v0, off offset:2 dlc +; GFX11-NEXT: scratch_store_b32 v4, v1, off offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: 
scratch_store_b16 v4, v0, off dlc +; GFX11-NEXT: scratch_store_b32 v4, v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_readlane_b32 s30, v5, 0 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v5, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 @@ -4709,60 +4621,28 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: v_writelane_b32 v9, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX8-NEXT: v_add_u32_e32 v18, vcc, 28, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; GFX8-NEXT: buffer_store_short v7, v18, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 28, v8 +; GFX8-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 24, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; GFX8-NEXT: buffer_store_short v6, v7, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 20, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v5 -; GFX8-NEXT: buffer_store_short v5, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v4 -; GFX8-NEXT: buffer_store_short v4, v5, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 12, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; GFX8-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 8, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; GFX8-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; GFX8-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 30, v8 -; GFX8-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 26, v8 -; GFX8-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 22, v8 -; GFX8-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 18, v8 -; GFX8-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 14, v8 -; GFX8-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 10, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GFX8-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v8 -; GFX8-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v8 -; GFX8-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v9, 1 ; GFX8-NEXT: v_readlane_b32 s30, v9, 0 @@ 
-4791,37 +4671,21 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: v_writelane_b32 v9, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: buffer_store_short_d16_hi v7, v8, s[0:3], 0 offen offset:30 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v7, v8, s[0:3], 0 offen offset:28 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short_d16_hi v6, v8, s[0:3], 0 offen offset:26 +; GFX9-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v6, v8, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short_d16_hi v5, v8, s[0:3], 0 offen offset:22 +; GFX9-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen offset:20 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v5, v8, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short_d16_hi v4, v8, s[0:3], 0 offen offset:18 +; GFX9-NEXT: buffer_store_dword v3, v8, s[0:3], 0 offen offset:12 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v4, v8, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_store_dword v2, v8, s[0:3], 0 offen offset:8 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short_d16_hi v3, v8, s[0:3], 0 offen offset:14 +; GFX9-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v3, v8, s[0:3], 0 offen offset:12 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short_d16_hi v2, v8, s[0:3], 0 offen offset:10 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v2, v8, s[0:3], 0 offen offset:8 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short_d16_hi v1, v8, s[0:3], 0 offen offset:6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v1, v8, s[0:3], 0 offen offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short_d16_hi v0, v8, s[0:3], 0 offen offset:2 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen +; GFX9-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v9, 1 ; GFX9-NEXT: v_readlane_b32 s30, v9, 0 @@ -4851,37 +4715,21 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: v_writelane_b32 v9, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: buffer_store_short_d16_hi v7, v8, s[0:3], 0 offen offset:30 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short v7, v8, s[0:3], 0 offen offset:28 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short_d16_hi v6, v8, s[0:3], 0 offen offset:26 +; GFX10-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short v6, v8, s[0:3], 0 offen offset:24 +; GFX10-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short_d16_hi v5, v8, s[0:3], 0 offen offset:22 +; GFX10-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen offset:20 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short v5, v8, s[0:3], 0 offen offset:20 +; GFX10-NEXT: buffer_store_dword v4, v8, s[0:3], 0 
offen offset:16 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short_d16_hi v4, v8, s[0:3], 0 offen offset:18 +; GFX10-NEXT: buffer_store_dword v3, v8, s[0:3], 0 offen offset:12 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short v4, v8, s[0:3], 0 offen offset:16 +; GFX10-NEXT: buffer_store_dword v2, v8, s[0:3], 0 offen offset:8 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short_d16_hi v3, v8, s[0:3], 0 offen offset:14 +; GFX10-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short v3, v8, s[0:3], 0 offen offset:12 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short_d16_hi v2, v8, s[0:3], 0 offen offset:10 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short v2, v8, s[0:3], 0 offen offset:8 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short_d16_hi v1, v8, s[0:3], 0 offen offset:6 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short v1, v8, s[0:3], 0 offen offset:4 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short_d16_hi v0, v8, s[0:3], 0 offen offset:2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v9, 1 ; GFX10-NEXT: v_readlane_b32 s30, v9, 0 @@ -4911,51 +4759,28 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: v_writelane_b32 v9, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: v_add_nc_u32_e32 v10, 30, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 28, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 28, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 24, v8 ; GFX11-NEXT: v_add_nc_u32_e32 v12, 20, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 18, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 12, v8 ; GFX11-NEXT: v_readlane_b32 s31, v9, 1 -; GFX11-NEXT: scratch_store_d16_hi_b16 v10, v7, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b16 v11, v7, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 26, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 24, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 22, v8 -; GFX11-NEXT: v_readlane_b32 s30, v9, 0 -; GFX11-NEXT: scratch_store_d16_hi_b16 v7, v6, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b16 v10, v6, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_d16_hi_b16 v11, v5, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b16 v12, v5, off dlc +; GFX11-NEXT: scratch_store_b32 v10, v7, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_d16_hi_b16 v13, v4, off dlc +; GFX11-NEXT: scratch_store_b32 v11, v6, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 14, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 12, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 10, v8 -; GFX11-NEXT: scratch_store_b16 v8, v4, off offset:16 dlc +; GFX11-NEXT: scratch_store_b32 v12, v5, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 6, v8 -; GFX11-NEXT: scratch_store_d16_hi_b16 v5, v3, off dlc +; GFX11-NEXT: scratch_store_b32 v8, v4, off offset:16 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b16 v6, v3, off dlc +; GFX11-NEXT: 
scratch_store_b32 v13, v3, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_d16_hi_b16 v7, v2, off dlc +; GFX11-NEXT: scratch_store_b32 v8, v2, off offset:8 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b16 v8, v2, off offset:8 dlc +; GFX11-NEXT: scratch_store_b32 v8, v1, off offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_d16_hi_b16 v4, v1, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b16 v8, v1, off offset:4 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_d16_hi_b16 v8, v0, off offset:2 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b16 v8, v0, off dlc +; GFX11-NEXT: scratch_store_b32 v8, v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_readlane_b32 s30, v9, 0 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v9, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 @@ -4995,44 +4820,37 @@ define bfloat @test_alloca_load_store_ret(bfloat %in) { ; GFX8-LABEL: test_alloca_load_store_ret: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: buffer_store_short v0, off, s[0:3], s32 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_alloca_load_store_ret: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], s32 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 glc +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_alloca_load_store_ret: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 +; GFX10-NEXT: buffer_store_short v0, off, s[0:3], s32 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], s32 glc dlc +; GFX10-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: test_alloca_load_store_ret: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: scratch_store_d16_hi_b16 off, v0, s32 dlc +; GFX11-NEXT: scratch_store_b16 off, v0, s32 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_d16_hi_b16 v1, off, s32 glc dlc +; GFX11-NEXT: scratch_load_u16 v0, off, s32 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %in.addr = alloca bfloat, align 2, addrspace(5) @@ -5219,7 +5037,6 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; GFX8-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; GFX8-NEXT: v_add_u32_e32 v31, vcc, 0x7c, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen ; GFX8-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:4 @@ -5334,7 +5151,7 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:120 ; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen offset:128 +; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5380,7 +5197,7 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:120 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:116 -; GFX10-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen offset:128 +; GFX10-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: test_overflow_stack: @@ -5396,7 +5213,7 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; GFX11-NEXT: scratch_store_b128 off, v[10:13], s0 offset:32 ; GFX11-NEXT: scratch_store_b128 off, v[6:9], s0 offset:16 ; GFX11-NEXT: scratch_store_b128 off, v[2:5], s0 -; GFX11-NEXT: scratch_store_d16_hi_b16 off, v1, s0 offset:128 +; GFX11-NEXT: scratch_store_b16 off, v1, s0 offset:128 ; GFX11-NEXT: s_add_i32 s1, s0, 0x70 ; GFX11-NEXT: s_add_i32 s2, s0, 0x60 ; GFX11-NEXT: s_add_i32 s3, s0, 0x50 @@ -5521,11 +5338,11 @@ define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) { ; GFX9-LABEL: global_extload_v3bf16_to_v3f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_extload_v3bf16_to_v3f32: @@ -5597,12 +5414,12 @@ define <4 x float> @global_extload_v4bf16_to_v4f32(ptr addrspace(1) %ptr) { ; GFX9-LABEL: global_extload_v4bf16_to_v4f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_extload_v4bf16_to_v4f32: @@ -5687,43 +5504,45 @@ define <5 x float> @global_extload_v5bf16_to_v5f32(ptr addrspace(1) %ptr) { ; GFX9-LABEL: global_extload_v5bf16_to_v5f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[5:6], v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: global_load_short_d16_hi v4, v[0:1], off offset:8 +; GFX9-NEXT: global_load_ushort v4, v[0:1], off offset:8 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: 
v_and_b32_e32 v1, 0xffff0000, v5 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_extload_v5bf16_to_v5f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: global_load_short_d16_hi v4, v[0:1], off offset:8 +; GFX10-NEXT: global_load_ushort v4, v[0:1], off offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_extload_v5bf16_to_v5f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b64 v[2:3], v[0:1], off -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: global_load_d16_hi_b16 v4, v[0:1], off offset:8 +; GFX11-NEXT: global_load_u16 v4, v[0:1], off offset:8 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <5 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <5 x bfloat> %load to <5 x float> @@ -5781,40 +5600,40 @@ define <6 x float> @global_extload_v6bf16_to_v6f32(ptr addrspace(1) %ptr) { ; GFX9-LABEL: global_extload_v6bf16_to_v6f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx3 v[6:8], v[0:1], off +; GFX9-NEXT: global_load_dwordx3 v[3:5], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_extload_v6bf16_to_v6f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx3 v[4:6], v[0:1], off +; GFX10-NEXT: global_load_dwordx3 v[3:5], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX10-NEXT: 
v_lshlrev_b32_e32 v2, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_extload_v6bf16_to_v6f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b96 v[4:6], v[0:1], off +; GFX11-NEXT: global_load_b96 v[3:5], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <6 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <6 x bfloat> %load to <6 x float> @@ -5878,46 +5697,46 @@ define <8 x float> @global_extload_v8bf16_to_v8f32(ptr addrspace(1) %ptr) { ; GFX9-LABEL: global_extload_v8bf16_to_v8f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_extload_v8bf16_to_v8f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[7:10], v[0:1], off +; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_extload_v8bf16_to_v8f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[7:10], v[0:1], off +; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off ; GFX11-NEXT: s_waitcnt 
vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <8 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <8 x bfloat> %load to <8 x float> @@ -6013,78 +5832,78 @@ define <16 x float> @global_extload_v16bf16_to_v16f32(ptr addrspace(1) %ptr) { ; GFX9-LABEL: global_extload_v16bf16_to_v16f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[16:19], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v20 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v22 -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_extload_v16bf16_to_v16f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx4 v[16:19], v[0:1], off -; GFX10-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:16 +; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off +; GFX10-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:16 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 +; 
GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 -; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 -; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v19 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v20 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v22 -; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 +; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_extload_v16bf16_to_v16f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[16:19], v[0:1], off -; GFX11-NEXT: global_load_b128 v[20:23], v[0:1], off offset:16 +; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off +; GFX11-NEXT: global_load_b128 v[12:15], v[0:1], off offset:16 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 -; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 -; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v19 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v20 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v22 -; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <16 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <16 x bfloat> %load to <16 x float> @@ -6244,138 +6063,138 @@ define <32 x float> @global_extload_v32bf16_to_v32f32(ptr 
addrspace(1) %ptr) { ; GFX9-LABEL: global_extload_v32bf16_to_v32f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[16:19], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:16 -; GFX9-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:32 -; GFX9-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:48 +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:32 +; GFX9-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:48 ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v20 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v22 -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v24 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v25 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v26 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v24 -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v26 -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v22 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v32 -; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v33 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v34 -; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v35 -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v32 -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v33 -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v34 -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v35 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v28 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v29 +; GFX9-NEXT: v_and_b32_e32 v27, 
0xffff0000, v29 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v30 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v31 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_extload_v32bf16_to_v32f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: global_load_dwordx4 v[32:35], v[0:1], off -; GFX10-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:16 -; GFX10-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:32 -; GFX10-NEXT: global_load_dwordx4 v[52:55], v[0:1], off offset:48 +; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off +; GFX10-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:16 +; GFX10-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:32 +; GFX10-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:48 ; GFX10-NEXT: s_waitcnt vmcnt(3) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v34 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v35 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 -; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v37 -; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 -; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v39 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 +; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v48 -; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 -; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 -; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v51 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v20 +; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v21 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v22 +; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 +; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v52 -; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v53 -; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 -; GFX10-NEXT: v_and_b32_e32 v31, 0xffff0000, v55 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v36 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v38 -; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v48 -; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v49 -; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v50 -; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v51 -; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, 
v52 -; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v53 -; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v54 -; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v55 +; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v28 +; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v29 +; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v30 +; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v31 +; GFX10-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_extload_v32bf16_to_v32f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: global_load_b128 v[32:35], v[0:1], off -; GFX11-NEXT: global_load_b128 v[36:39], v[0:1], off offset:16 -; GFX11-NEXT: global_load_b128 v[48:51], v[0:1], off offset:32 -; GFX11-NEXT: global_load_b128 v[52:55], v[0:1], off offset:48 +; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off +; GFX11-NEXT: global_load_b128 v[12:15], v[0:1], off offset:16 +; GFX11-NEXT: global_load_b128 v[20:23], v[0:1], off offset:32 +; GFX11-NEXT: global_load_b128 v[28:31], v[0:1], off offset:48 ; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v34 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v35 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 -; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v37 -; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v39 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v48 -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 -; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 -; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v51 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v20 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v52 -; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v53 -; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v55 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v36 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; GFX11-NEXT: 
v_lshlrev_b32_e32 v12, 16, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v48 -; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v49 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v50 -; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v51 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v52 -; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v53 -; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v54 -; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v55 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v28 +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v29 +; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v30 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v31 +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <32 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <32 x bfloat> %load to <32 x float> @@ -6616,12 +6435,12 @@ define <4 x double> @global_extload_v4bf16_to_v4f64(ptr addrspace(1) %ptr) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_extload_v4bf16_to_v4f64: @@ -6731,62 +6550,60 @@ define <5 x double> @global_extload_v5bf16_to_v5f64(ptr addrspace(1) %ptr) { ; GFX9-LABEL: global_extload_v5bf16_to_v5f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:8 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_ushort v4, v[0:1], off offset:8 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v0 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_extload_v5bf16_to_v5f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX10-NEXT: global_load_short_d16_hi v4, v[0:1], off offset:8 +; GFX10-NEXT: global_load_ushort v4, v[0:1], off offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: 
v_lshlrev_b32_e32 v0, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v4 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_extload_v5bf16_to_v5f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b64 v[2:3], v[0:1], off -; GFX11-NEXT: global_load_d16_hi_b16 v4, v[0:1], off offset:8 +; GFX11-NEXT: global_load_u16 v4, v[0:1], off offset:8 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v4 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 ; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <5 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <5 x bfloat> %load to <5 x double> @@ -6864,18 +6681,18 @@ define <6 x double> @global_extload_v6bf16_to_v6f64(ptr addrspace(1) %ptr) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v5 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v8 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v2 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v4 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_extload_v6bf16_to_v6f64: @@ -7003,22 +6820,22 @@ define <8 x double> @global_extload_v8bf16_to_v8f64(ptr addrspace(1) %ptr) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: 
v_and_b32_e32 v4, 0xffff0000, v0 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v3 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v4 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v5 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v8 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v9 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v12 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v13 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v3 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v5 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_extload_v8bf16_to_v8f64: @@ -7211,39 +7028,39 @@ define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) { ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off ; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v7 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v1 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v12 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v13 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[22:23], v16 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[26:27], v17 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[30:31], v20 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v21 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v24 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v25 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v28 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v29 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[20:21], v32 -; GFX9-NEXT: v_cvt_f64_f32_e32 
v[24:25], v33 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[28:29], v34 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v8 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v9 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v9 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v11 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v12 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v13 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v14 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v15 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v18 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[22:23], v22 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[24:25], v24 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[26:27], v26 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[28:29], v28 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[30:31], v30 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_extload_v16bf16_to_v16f64: @@ -7253,39 +7070,39 @@ define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) { ; GFX10-NEXT: global_load_dwordx4 v[2:5], v[0:1], off ; GFX10-NEXT: global_load_dwordx4 v[9:12], v[0:1], off offset:16 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v4 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 -; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v10 -; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v11 -; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v9 +; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v10 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v11 +; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v12 +; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v12 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v6 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v8 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v15 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[10:11], v13 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[12:13], v18 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[18:19], v16 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[22:23], v17 -; GFX10-NEXT: v_cvt_f64_f32_e32 
v[16:17], v24 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[26:27], v20 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[30:31], v21 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[20:21], v25 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[24:25], v28 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[28:29], v29 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[12:13], v14 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[14:15], v15 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[18:19], v18 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[22:23], v22 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[24:25], v24 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[26:27], v26 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[28:29], v28 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[30:31], v30 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_extload_v16bf16_to_v16f64: @@ -8717,38 +8534,38 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) { ; GFX8-LABEL: v_fadd_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fadd_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fadd_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fadd bfloat %a, %b ret bfloat %op @@ -8784,53 +8601,53 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-LABEL: v_fadd_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX8-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX8-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fadd_v2bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add_f32_e32 v2, v3, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_add_f32_e32 v2, v3, v2 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 +; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fadd_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_add_f32_e32 v2, v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 +; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fadd <2 x bfloat> %a, %b ret <2 x bfloat> %op @@ -8874,69 +8691,82 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-LABEL: v_fadd_v3bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 +; GFX8-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v4, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, 
v1 -; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fadd_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v2, v4, v2 +; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_add_f32_e32 v4, v5, v4 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_add_f32_e32 v2, v7, v6 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fadd_v3bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v1, v1, v3 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_dual_add_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v4, v5, v4 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_add_f32_e32 v2, v7, v6 +; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 +; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fadd <3 x bfloat> %a, %b ret <3 x bfloat> %op @@ -8988,85 +8818,82 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-LABEL: v_fadd_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX8-NEXT: v_perm_b32 v1, v1, v4, s4 +; GFX8-NEXT: v_add_f32_e32 v2, v4, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fadd_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v3, v5, v3 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_add_f32_e32 v2, v4, v2 +; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: 
v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX10-NEXT: v_add_f32_e32 v4, v7, v6 +; GFX10-NEXT: v_add_f32_e32 v4, v5, v4 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 -; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v2, v7, v6 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fadd_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_add_f32 v3, v7, v6 :: v_dual_add_f32 v4, v5, v4 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x3020706 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX11-NEXT: v_add_f32_e32 v2, v7, v6 +; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fadd <4 x bfloat> %a, %b ret <4 x bfloat> %op @@ -9150,143 +8977,138 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-LABEL: v_fadd_v8bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX8-NEXT: 
v_add_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_add_f32_e32 v7, v9, v7 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_add_f32_e32 v6, v9, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_add_f32_e32 v5, v9, v5 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v5, s4 -; GFX8-NEXT: v_perm_b32 v1, v1, v6, s4 -; GFX8-NEXT: v_perm_b32 v2, v2, v7, s4 -; GFX8-NEXT: v_perm_b32 v3, v3, v8, s4 +; GFX8-NEXT: v_add_f32_e32 v4, v8, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX8-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v4, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fadd_v8bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v1 -; GFX9-NEXT: v_add_f32_e32 v9, v10, v9 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v2 -; GFX9-NEXT: v_add_f32_e32 v10, v11, v10 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v4 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v9 -; GFX9-NEXT: v_add_f32_e32 v11, v12, v11 -; GFX9-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, 
v8, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v4, v8, v4 +; GFX9-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v8 -; GFX9-NEXT: v_perm_b32 v0, v0, v11, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v10, s4 -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v2, v2, v4, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-NEXT: v_perm_b32 v3, v3, v4, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_v8bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v6 -; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v4 -; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1 ; GFX10-NEXT: v_add_f32_e32 v8, v9, v8 -; GFX10-NEXT: v_add_f32_e32 v9, v11, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_add_f32_e32 v10, v13, v12 -; GFX10-NEXT: v_add_f32_e32 v11, v15, v14 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX10-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v9 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_add_f32_e32 v4, v11, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX10-NEXT: v_add_f32_e32 v5, v10, v9 ; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 -; GFX10-NEXT: v_perm_b32 v0, v0, v11, 0x3020706 -; GFX10-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x3020706 -; GFX10-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v6, v12, v11 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302 +; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fadd_v8bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 -; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v1 -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v8, v9, v8 :: v_dual_and_b32 v9, 0xffff0000, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_and_b32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX11-NEXT: v_add_f32_e32 v4, v11, v10 +; GFX11-NEXT: v_dual_add_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_add_f32_e32 v5, v10, v9 +; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX11-NEXT: v_add_f32_e32 v6, v12, v11 ; GFX11-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_f32_e32 v9, v10, v9 -; GFX11-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_and_b32 v12, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v9 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX11-NEXT: v_dual_add_f32 v10, v12, v11 :: v_dual_add_f32 v11, v14, v13 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 +; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x3020706 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v0, v0, v11, 0x3020706 -; GFX11-NEXT: v_or_b32_e32 v3, v3, v5 +; 
GFX11-NEXT: v_perm_b32 v2, v2, v5, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v3, v6, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fadd <8 x bfloat> %a, %b ret <8 x bfloat> %op @@ -9438,244 +9260,252 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-LABEL: v_fadd_v16bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 -; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v8 ; GFX8-NEXT: v_add_f32_e32 v16, v17, v16 -; GFX8-NEXT: v_add_f32_e32 v7, v7, v15 -; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_add_f32_e32 v15, v17, v15 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v14 -; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_add_f32_e32 v14, v17, v14 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v13 -; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_add_f32_e32 v13, v17, v13 -; GFX8-NEXT: v_add_f32_e32 v4, v4, v12 -; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_add_f32_e32 v12, v17, v12 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v11 -; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_add_f32_e32 v11, v17, v11 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v10 -; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_add_f32_e32 v10, v17, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v9 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_add_f32_e32 v9, v17, v9 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v8 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v9, s4 -; GFX8-NEXT: v_perm_b32 v1, v1, v10, s4 -; GFX8-NEXT: v_perm_b32 v2, v2, v11, s4 -; GFX8-NEXT: v_perm_b32 v3, v3, v12, s4 -; GFX8-NEXT: v_perm_b32 v4, v4, v13, s4 -; GFX8-NEXT: v_perm_b32 v5, v5, v14, s4 -; GFX8-NEXT: v_perm_b32 v6, v6, v15, s4 -; GFX8-NEXT: v_perm_b32 v7, v7, v16, s4 +; GFX8-NEXT: v_add_f32_e32 v8, v16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; GFX8-NEXT: 
v_lshlrev_b32_e32 v9, 16, v2 +; GFX8-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX8-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v11 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v4 +; GFX8-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v4, v4, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_alignbit_b32 v4, v4, v8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX8-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v13 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX8-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v14 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v6, v6, v8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v15 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; GFX8-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v15 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: v_add_f32_e32 v7, v7, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_alignbit_b32 v7, v7, v8, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fadd_v16bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v14 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 -; GFX9-NEXT: v_add_f32_e32 v17, v18, v17 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v13 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v5 -; GFX9-NEXT: v_add_f32_e32 v18, v19, v18 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v12 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v4 -; GFX9-NEXT: v_add_f32_e32 v19, v20, v19 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v11 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v3 -; GFX9-NEXT: v_add_f32_e32 v20, v21, v20 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v2 -; GFX9-NEXT: v_add_f32_e32 v21, v22, v21 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v9 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v1 -; GFX9-NEXT: v_add_f32_e32 v22, v23, v22 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v8 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; 
GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v23, v24, v23 -; GFX9-NEXT: v_add_f32_e32 v7, v7, v15 -; GFX9-NEXT: v_add_f32_e32 v6, v6, v14 -; GFX9-NEXT: v_add_f32_e32 v5, v5, v13 -; GFX9-NEXT: v_add_f32_e32 v4, v4, v12 -; GFX9-NEXT: v_add_f32_e32 v3, v3, v11 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v10 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v9 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v8 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_perm_b32 v0, v0, v23, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v22, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v21, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v19, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v18, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v17, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v8, v16, v8 +; GFX9-NEXT: v_add_f32_e32 v1, v1, v9 +; GFX9-NEXT: v_perm_b32 v1, v1, v8, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v2, v2, v9 +; GFX9-NEXT: v_perm_b32 v2, v2, v8, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v11 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v3, v3, v9 +; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_add_f32_e32 v4, v4, v9 +; GFX9-NEXT: v_perm_b32 v4, v4, v8, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v13 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX9-NEXT: v_perm_b32 v5, v5, v8, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v14 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_add_f32_e32 v6, v6, v9 +; GFX9-NEXT: v_perm_b32 v6, v6, v8, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v15 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_add_f32_e32 v7, v7, v9 +; GFX9-NEXT: v_perm_b32 v7, v7, v8, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_v16bf16: ; GFX10: ; %bb.0: ; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 -; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 -; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v13 -; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v5 -; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_add_f32_e32 v16, v17, v16 -; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v14 -; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v4 -; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 -; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v3 -; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v10 -; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v2 -; GFX10-NEXT: v_add_f32_e32 v17, v18, v17 -; GFX10-NEXT: v_add_f32_e32 v18, v20, v19 -; GFX10-NEXT: v_add_f32_e32 v19, v22, v21 -; GFX10-NEXT: v_add_f32_e32 v20, v24, v23 -; GFX10-NEXT: v_add_f32_e32 v21, v26, v25 -; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v9 -; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v8 -; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_add_f32_e32 v22, v23, v22 -; GFX10-NEXT: v_add_f32_e32 v23, v25, v24 -; GFX10-NEXT: v_add_f32_e32 v7, v7, v15 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v14 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v9 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX10-NEXT: v_add_f32_e32 v8, v18, v17 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v9 +; GFX10-NEXT: v_add_f32_e32 v9, v20, v19 ; GFX10-NEXT: v_add_f32_e32 v2, v2, v10 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v11 +; GFX10-NEXT: v_perm_b32 v0, v0, v16, 0x7060302 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v11 +; GFX10-NEXT: v_perm_b32 v1, v1, v8, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; GFX10-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v5 +; GFX10-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v10 +; GFX10-NEXT: v_add_f32_e32 v9, 
v16, v11 ; GFX10-NEXT: v_add_f32_e32 v4, v4, v12 -; GFX10-NEXT: v_perm_b32 v0, v0, v23, 0x3020706 -; GFX10-NEXT: v_perm_b32 v1, v1, v22, 0x3020706 -; GFX10-NEXT: v_perm_b32 v2, v2, v21, 0x3020706 -; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x3020706 -; GFX10-NEXT: v_perm_b32 v4, v4, v19, 0x3020706 -; GFX10-NEXT: v_perm_b32 v5, v5, v18, 0x3020706 -; GFX10-NEXT: v_perm_b32 v6, v6, v17, 0x3020706 -; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x3020706 +; GFX10-NEXT: v_add_f32_e32 v10, v18, v17 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v11 +; GFX10-NEXT: v_add_f32_e32 v11, v13, v12 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v14 +; GFX10-NEXT: v_add_f32_e32 v12, v17, v16 +; GFX10-NEXT: v_add_f32_e32 v7, v7, v15 +; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 +; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x7060302 +; GFX10-NEXT: v_perm_b32 v5, v5, v10, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v6, v11, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v7, v12, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fadd_v16bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 -; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v5 -; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 -; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-NEXT: v_and_b32_e32 v26, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v3 -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v13 -; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 -; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 -; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_dual_add_f32 v16, v17, v16 :: v_dual_and_b32 v17, 0xffff0000, v14 -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v4, v4, v12 :: v_dual_add_f32 v5, v5, v13 -; GFX11-NEXT: v_dual_add_f32 v17, v18, v17 :: v_dual_add_f32 v18, v20, v19 -; GFX11-NEXT: v_add_f32_e32 v19, v22, v21 -; GFX11-NEXT: v_add_f32_e32 v7, v7, v15 -; GFX11-NEXT: v_add_f32_e32 v21, v26, v25 -; GFX11-NEXT: v_dual_add_f32 v6, v6, v14 :: v_dual_and_b32 v25, 0xffff0000, v0 -; GFX11-NEXT: v_add_f32_e32 v20, v24, v23 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-NEXT: 
v_dual_add_f32 v2, v2, v10 :: v_dual_add_f32 v3, v3, v11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v1, v1, v9 :: v_dual_add_f32 v22, v23, v22 -; GFX11-NEXT: v_add_f32_e32 v23, v25, v24 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v10 +; GFX11-NEXT: v_dual_add_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v9 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v1, v1, v9 :: v_dual_and_b32 v10, 0xffff0000, v11 +; GFX11-NEXT: v_add_f32_e32 v9, v20, v19 +; GFX11-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v3, v3, v10 :: v_dual_and_b32 v0, 0xffff0000, v0 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v8 -; GFX11-NEXT: v_perm_b32 v2, v2, v21, 0x3020706 -; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x3020706 -; GFX11-NEXT: v_perm_b32 v1, v1, v22, 0x3020706 -; GFX11-NEXT: v_perm_b32 v4, v4, v19, 0x3020706 -; GFX11-NEXT: v_perm_b32 v0, v0, v23, 0x3020706 -; GFX11-NEXT: v_perm_b32 v5, v5, v18, 0x3020706 -; GFX11-NEXT: v_perm_b32 v6, v6, v17, 0x3020706 -; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x3020706 +; GFX11-NEXT: v_add_f32_e32 v8, v18, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_perm_b32 v0, v0, v16, 0x7060302 +; GFX11-NEXT: v_perm_b32 v1, v1, v8, 0x7060302 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_dual_add_f32 v10, v18, v17 :: v_dual_lshlrev_b32 v17, 16, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v4, v4, v12 :: v_dual_and_b32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_dual_add_f32 v8, v9, v8 :: v_dual_add_f32 v9, v16, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v11 +; GFX11-NEXT: v_add_f32_e32 v11, v13, v12 +; GFX11-NEXT: v_add_f32_e32 v12, v17, v16 +; GFX11-NEXT: v_dual_add_f32 v6, v6, v14 :: v_dual_add_f32 v7, v7, v15 +; GFX11-NEXT: v_perm_b32 v4, v4, v9, 0x7060302 +; GFX11-NEXT: v_perm_b32 v5, v5, v10, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v6, v6, 
v11, 0x7060302 +; GFX11-NEXT: v_perm_b32 v7, v7, v12, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fadd <16 x bfloat> %a, %b ret <16 x bfloat> %op @@ -10083,493 +9913,484 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-LABEL: v_fadd_v32bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 -; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 -; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v16 ; GFX8-NEXT: v_add_f32_e32 v31, v32, v31 -; GFX8-NEXT: v_add_f32_e32 v14, v14, v30 -; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 -; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v13 -; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX8-NEXT: v_add_f32_e32 v30, v32, v30 -; GFX8-NEXT: v_add_f32_e32 v13, v13, v29 -; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 -; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v12 -; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_add_f32_e32 v29, v32, v29 -; GFX8-NEXT: v_add_f32_e32 v12, v12, v28 -; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 -; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v11 -; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: v_add_f32_e32 v28, v32, v28 -; GFX8-NEXT: v_add_f32_e32 v11, v11, v27 -; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 -; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v10 -; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: v_add_f32_e32 v27, v32, v27 -; GFX8-NEXT: v_add_f32_e32 v10, v10, v26 -; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 -; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v9 -; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX8-NEXT: v_add_f32_e32 v26, v32, v26 -; GFX8-NEXT: v_add_f32_e32 v9, v9, v25 -; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 -; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_add_f32_e32 v8, v8, v24 -; GFX8-NEXT: buffer_load_dword v24, off, s[0:3], s32 -; GFX8-NEXT: v_add_f32_e32 v25, v32, v25 -; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 -; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v8, v8, v25, s4 -; GFX8-NEXT: v_perm_b32 v9, v9, v26, s4 -; GFX8-NEXT: v_perm_b32 v10, v10, v27, s4 -; GFX8-NEXT: v_perm_b32 v11, v11, v28, s4 -; GFX8-NEXT: v_perm_b32 v12, v12, v29, s4 -; GFX8-NEXT: v_perm_b32 v13, v13, v30, s4 -; GFX8-NEXT: v_perm_b32 v14, v14, v31, s4 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v24 -; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX8-NEXT: v_add_f32_e32 v32, v32, v33 -; GFX8-NEXT: v_add_f32_e32 v15, v15, v24 -; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 -; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_add_f32_e32 v24, v33, v24 -; GFX8-NEXT: v_add_f32_e32 v7, v7, v23 -; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v6 -; GFX8-NEXT: 
v_lshlrev_b32_e32 v22, 16, v22 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_add_f32_e32 v23, v33, v23 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v22 -; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_add_f32_e32 v22, v33, v22 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v21 -; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_add_f32_e32 v21, v33, v21 -; GFX8-NEXT: v_add_f32_e32 v4, v4, v20 -; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_add_f32_e32 v20, v33, v20 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v19 -; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_add_f32_e32 v19, v33, v19 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v18 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_add_f32_e32 v18, v33, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v31, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v17 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_add_f32_e32 v17, v33, v17 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v16 -; GFX8-NEXT: v_perm_b32 v0, v0, v17, s4 -; GFX8-NEXT: v_perm_b32 v1, v1, v18, s4 -; GFX8-NEXT: v_perm_b32 v2, v2, v19, s4 -; GFX8-NEXT: v_perm_b32 v3, v3, v20, s4 -; GFX8-NEXT: v_perm_b32 v4, v4, v21, s4 -; GFX8-NEXT: v_perm_b32 v5, v5, v22, s4 -; GFX8-NEXT: v_perm_b32 v6, v6, v23, s4 -; GFX8-NEXT: v_perm_b32 v7, v7, v24, s4 -; GFX8-NEXT: v_perm_b32 v15, v15, v32, s4 +; GFX8-NEXT: v_add_f32_e32 v16, v31, v16 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v18 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX8-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX8-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v19 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v20 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX8-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v4, v4, v17 +; GFX8-NEXT: buffer_load_dword v17, off, 
s[0:3], s32 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_alignbit_b32 v4, v4, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v5 +; GFX8-NEXT: v_add_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v21 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v22 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v6 +; GFX8-NEXT: v_add_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v22 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v6, v6, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v7 +; GFX8-NEXT: v_add_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v23 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: v_add_f32_e32 v7, v7, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; GFX8-NEXT: v_add_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v24 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX8-NEXT: v_add_f32_e32 v8, v8, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX8-NEXT: v_alignbit_b32 v8, v8, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v25 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; GFX8-NEXT: v_add_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v25 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX8-NEXT: v_add_f32_e32 v9, v9, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX8-NEXT: v_alignbit_b32 v9, v9, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v26 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; GFX8-NEXT: v_add_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v26 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX8-NEXT: v_add_f32_e32 v10, v10, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX8-NEXT: v_alignbit_b32 v10, v10, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v27 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v11 +; GFX8-NEXT: v_add_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v27 +; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX8-NEXT: v_add_f32_e32 v11, v11, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX8-NEXT: v_alignbit_b32 v11, v11, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v28 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; GFX8-NEXT: v_add_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v28 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX8-NEXT: v_add_f32_e32 v12, v12, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX8-NEXT: v_alignbit_b32 v12, v12, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v29 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v13 +; GFX8-NEXT: v_add_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v29 +; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX8-NEXT: v_add_f32_e32 v13, v13, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX8-NEXT: v_alignbit_b32 v13, v13, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v30 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v14 +; GFX8-NEXT: v_add_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v30 +; GFX8-NEXT: v_and_b32_e32 v14, 
0xffff0000, v14 +; GFX8-NEXT: v_add_f32_e32 v14, v14, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX8-NEXT: v_alignbit_b32 v14, v14, v16, 16 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX8-NEXT: v_add_f32_e32 v15, v15, v17 +; GFX8-NEXT: v_add_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX8-NEXT: v_alignbit_b32 v15, v15, v16, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fadd_v32bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: v_and_b32_e32 v38, 0xffff0000, v27 -; GFX9-NEXT: v_and_b32_e32 v39, 0xffff0000, v11 -; GFX9-NEXT: v_and_b32_e32 v48, 0xffff0000, v26 -; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v10 -; GFX9-NEXT: v_and_b32_e32 v50, 0xffff0000, v25 -; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v9 -; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v22 -; GFX9-NEXT: v_and_b32_e32 v41, 0xffff0000, v6 -; GFX9-NEXT: v_and_b32_e32 v58, 0xffff0000, v17 -; GFX9-NEXT: v_and_b32_e32 v59, 0xffff0000, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v24 -; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v8 -; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v23 -; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v7 -; GFX9-NEXT: v_and_b32_e32 v42, 0xffff0000, v21 -; GFX9-NEXT: v_and_b32_e32 v43, 0xffff0000, v5 -; GFX9-NEXT: v_and_b32_e32 v44, 0xffff0000, v20 -; GFX9-NEXT: v_and_b32_e32 v45, 0xffff0000, v4 -; GFX9-NEXT: v_and_b32_e32 v46, 0xffff0000, v19 -; GFX9-NEXT: v_and_b32_e32 v47, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v56, 0xffff0000, v18 -; GFX9-NEXT: v_and_b32_e32 v57, 0xffff0000, v2 -; GFX9-NEXT: v_add_f32_e32 v38, v39, v38 -; GFX9-NEXT: v_add_f32_e32 v39, v49, v48 -; GFX9-NEXT: v_add_f32_e32 v48, v51, v50 -; GFX9-NEXT: v_add_f32_e32 v51, v41, v40 -; GFX9-NEXT: v_add_f32_e32 v40, v59, v58 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v17 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_add_f32_e32 v49, v53, v52 -; GFX9-NEXT: v_add_f32_e32 v50, v55, v54 -; GFX9-NEXT: v_add_f32_e32 v52, v43, v42 -; GFX9-NEXT: v_add_f32_e32 v53, v45, v44 -; GFX9-NEXT: v_add_f32_e32 v54, v47, v46 -; GFX9-NEXT: v_add_f32_e32 v55, v57, v56 -; GFX9-NEXT: v_perm_b32 v1, v1, v40, s4 -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; 
GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 -; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 -; GFX9-NEXT: v_and_b32_e32 v34, 0xffff0000, v29 -; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v13 -; GFX9-NEXT: v_and_b32_e32 v36, 0xffff0000, v28 -; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v12 -; GFX9-NEXT: v_add_f32_e32 v32, v33, v32 -; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v16 -; GFX9-NEXT: v_add_f32_e32 v34, v35, v34 -; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v0 -; GFX9-NEXT: v_add_f32_e32 v36, v37, v36 -; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX9-NEXT: v_add_f32_e32 v33, v35, v33 -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v14, v14, v30 -; GFX9-NEXT: v_add_f32_e32 v13, v13, v29 -; GFX9-NEXT: v_add_f32_e32 v12, v12, v28 -; GFX9-NEXT: v_add_f32_e32 v11, v11, v27 -; GFX9-NEXT: v_add_f32_e32 v10, v10, v26 -; GFX9-NEXT: v_add_f32_e32 v9, v9, v25 -; GFX9-NEXT: v_add_f32_e32 v8, v8, v24 -; GFX9-NEXT: v_add_f32_e32 v7, v7, v23 -; GFX9-NEXT: v_add_f32_e32 v6, v6, v22 -; GFX9-NEXT: v_add_f32_e32 v5, v5, v21 -; GFX9-NEXT: v_add_f32_e32 v4, v4, v20 -; GFX9-NEXT: v_add_f32_e32 v3, v3, v19 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v16, 
0xffff0000, v16 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v31, v32, v31 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v16 -; GFX9-NEXT: v_perm_b32 v0, v0, v33, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v55, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v54, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v53, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v52, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v51, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v50, s4 -; GFX9-NEXT: v_perm_b32 v8, v8, v49, s4 -; GFX9-NEXT: v_perm_b32 v9, v9, v48, s4 -; GFX9-NEXT: v_perm_b32 v10, v10, v39, s4 -; GFX9-NEXT: v_perm_b32 v11, v11, v38, s4 -; GFX9-NEXT: v_perm_b32 v12, v12, v36, s4 -; GFX9-NEXT: v_perm_b32 v13, v13, v34, s4 -; GFX9-NEXT: v_perm_b32 v14, v14, v32, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v31, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v16, v31, v16 +; GFX9-NEXT: v_add_f32_e32 v1, v1, v17 +; GFX9-NEXT: v_perm_b32 v1, v1, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v2, v2, v17 +; GFX9-NEXT: v_perm_b32 v2, v2, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v19 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v3, v3, v17 +; GFX9-NEXT: v_perm_b32 v3, v3, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_add_f32_e32 v4, v4, v17 +; GFX9-NEXT: v_perm_b32 v4, v4, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v21 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_add_f32_e32 v5, v5, v17 +; GFX9-NEXT: v_perm_b32 v5, v5, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v22 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_add_f32_e32 v6, v6, v17 +; GFX9-NEXT: v_perm_b32 v6, v6, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v23 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_add_f32_e32 v7, v7, v17 +; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v8 +; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v24 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_add_f32_e32 v8, v8, v17 +; GFX9-NEXT: v_perm_b32 v8, v8, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v25 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; 
GFX9-NEXT: v_add_f32_e32 v9, v9, v17 +; GFX9-NEXT: v_perm_b32 v9, v9, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v26 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v10 +; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v26 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_add_f32_e32 v10, v10, v17 +; GFX9-NEXT: v_perm_b32 v10, v10, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v11 +; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v27 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_add_f32_e32 v11, v11, v17 +; GFX9-NEXT: v_perm_b32 v11, v11, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v12 +; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v28 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_add_f32_e32 v12, v12, v17 +; GFX9-NEXT: v_perm_b32 v12, v12, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v29 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v29 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX9-NEXT: v_add_f32_e32 v13, v13, v17 +; GFX9-NEXT: v_perm_b32 v13, v13, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v14 +; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v30 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: v_add_f32_e32 v14, v14, v17 +; GFX9-NEXT: v_perm_b32 v14, v14, v16, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v31 -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GFX9-NEXT: v_add_f32_e32 v35, v37, v35 -; GFX9-NEXT: v_add_f32_e32 v15, v15, v31 -; GFX9-NEXT: v_perm_b32 v15, v15, v35, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v15 +; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_add_f32_e32 v15, v15, v17 +; GFX9-NEXT: v_perm_b32 v15, v15, v16, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_v32bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX10-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 -; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 -; GFX10-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 -; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v7 -; GFX10-NEXT: v_and_b32_e32 v65, 0xffff0000, v22 -; GFX10-NEXT: v_and_b32_e32 v66, 0xffff0000, v6 -; GFX10-NEXT: v_and_b32_e32 v67, 0xffff0000, v21 -; GFX10-NEXT: v_and_b32_e32 v68, 0xffff0000, v5 -; GFX10-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 -; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 -; GFX10-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 -; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v13 -; GFX10-NEXT: v_and_b32_e32 v37, 0xffff0000, v28 -; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 -; GFX10-NEXT: v_and_b32_e32 v39, 0xffff0000, v27 -; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 -; GFX10-NEXT: v_and_b32_e32 v49, 0xffff0000, v26 -; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 -; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 -; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 -; GFX10-NEXT: v_add_f32_e32 v53, v54, v53 -; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v17 -; GFX10-NEXT: v_add_f32_e32 v55, v64, v55 -; 
GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v1 -; GFX10-NEXT: v_add_f32_e32 v65, v66, v65 -; GFX10-NEXT: v_and_b32_e32 v66, 0xffff0000, v16 -; GFX10-NEXT: v_add_f32_e32 v67, v68, v67 -; GFX10-NEXT: v_and_b32_e32 v68, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 -; GFX10-NEXT: v_add_f32_e32 v33, v34, v33 -; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v20 -; GFX10-NEXT: v_add_f32_e32 v35, v36, v35 -; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v37, v38, v37 -; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v19 -; GFX10-NEXT: v_add_f32_e32 v39, v48, v39 -; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v49, v50, v49 -; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v18 -; GFX10-NEXT: v_add_f32_e32 v51, v52, v51 -; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v21 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v22 +; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v18 +; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v19 +; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX10-NEXT: v_and_b32_e32 v3, 
0xffff0000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v20 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v21 +; GFX10-NEXT: v_add_f32_e32 v21, v53, v52 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v22 +; GFX10-NEXT: v_add_f32_e32 v22, v55, v54 +; GFX10-NEXT: v_add_f32_e32 v7, v7, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v24 +; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v8 +; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v9 +; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v26 +; GFX10-NEXT: v_add_f32_e32 v32, v33, v32 +; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v10 +; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v16 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v27 +; GFX10-NEXT: v_add_f32_e32 v34, v35, v34 +; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v11 +; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v17 -; GFX10-NEXT: v_add_f32_e32 v34, v36, v34 -; GFX10-NEXT: v_add_f32_e32 v36, v48, v38 -; GFX10-NEXT: v_add_f32_e32 v38, v52, v50 -; GFX10-NEXT: v_add_f32_e32 v48, v64, v54 -; GFX10-NEXT: v_add_f32_e32 v50, v68, v66 -; GFX10-NEXT: v_add_f32_e32 v14, v14, v30 -; GFX10-NEXT: v_add_f32_e32 v13, v13, v29 -; GFX10-NEXT: v_add_f32_e32 v12, v12, v28 -; GFX10-NEXT: v_add_f32_e32 v11, v11, v27 -; GFX10-NEXT: v_add_f32_e32 v10, v10, v26 -; GFX10-NEXT: v_add_f32_e32 v9, v9, v25 -; GFX10-NEXT: v_add_f32_e32 v8, v8, v24 -; GFX10-NEXT: v_add_f32_e32 v7, v7, v23 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v22 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v21 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; GFX10-NEXT: v_add_f32_e32 v36, v37, v36 +; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v12 +; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX10-NEXT: v_add_f32_e32 v2, v2, v18 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; GFX10-NEXT: v_add_f32_e32 v38, v39, v38 +; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v13 +; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX10-NEXT: v_add_f32_e32 v3, v3, v19 +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v30 +; GFX10-NEXT: v_add_f32_e32 v48, v49, v48 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v14 +; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX10-NEXT: v_add_f32_e32 v4, v4, v20 -; GFX10-NEXT: v_perm_b32 v0, v0, v50, 0x3020706 -; GFX10-NEXT: v_perm_b32 v1, v1, v48, 0x3020706 -; GFX10-NEXT: v_perm_b32 v2, v2, v38, 0x3020706 -; GFX10-NEXT: v_perm_b32 v3, v3, v36, 0x3020706 -; GFX10-NEXT: v_perm_b32 v4, v4, v34, 0x3020706 -; GFX10-NEXT: v_perm_b32 v5, v5, v67, 0x3020706 -; GFX10-NEXT: v_perm_b32 v6, v6, v65, 0x3020706 -; GFX10-NEXT: v_perm_b32 v7, v7, v55, 0x3020706 -; GFX10-NEXT: v_perm_b32 v8, v8, v53, 0x3020706 -; GFX10-NEXT: v_perm_b32 v9, v9, v51, 0x3020706 -; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x3020706 -; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x3020706 -; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x3020706 -; GFX10-NEXT: v_perm_b32 v13, v13, v35, 0x3020706 -; GFX10-NEXT: v_perm_b32 v14, v14, v33, 0x3020706 +; 
GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v15 +; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX10-NEXT: v_perm_b32 v6, v6, v21, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v7, v22, 0x7060302 +; GFX10-NEXT: v_add_f32_e32 v50, v51, v50 +; GFX10-NEXT: v_add_f32_e32 v23, v65, v64 +; GFX10-NEXT: v_add_f32_e32 v8, v8, v24 +; GFX10-NEXT: v_add_f32_e32 v24, v67, v66 +; GFX10-NEXT: v_add_f32_e32 v9, v9, v25 +; GFX10-NEXT: v_add_f32_e32 v25, v33, v68 +; GFX10-NEXT: v_add_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_add_f32_e32 v16, v35, v16 +; GFX10-NEXT: v_add_f32_e32 v11, v11, v27 +; GFX10-NEXT: v_add_f32_e32 v17, v37, v17 +; GFX10-NEXT: v_add_f32_e32 v12, v12, v28 +; GFX10-NEXT: v_add_f32_e32 v18, v39, v18 +; GFX10-NEXT: v_add_f32_e32 v13, v13, v29 +; GFX10-NEXT: v_add_f32_e32 v19, v49, v19 +; GFX10-NEXT: v_add_f32_e32 v14, v14, v30 +; GFX10-NEXT: v_perm_b32 v0, v0, v32, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v34, 0x7060302 +; GFX10-NEXT: v_perm_b32 v2, v2, v36, 0x7060302 +; GFX10-NEXT: v_perm_b32 v3, v3, v38, 0x7060302 +; GFX10-NEXT: v_perm_b32 v4, v4, v48, 0x7060302 +; GFX10-NEXT: v_perm_b32 v5, v5, v50, 0x7060302 +; GFX10-NEXT: v_perm_b32 v8, v8, v23, 0x7060302 +; GFX10-NEXT: v_perm_b32 v9, v9, v24, 0x7060302 +; GFX10-NEXT: v_perm_b32 v10, v10, v25, 0x7060302 +; GFX10-NEXT: v_perm_b32 v11, v11, v16, 0x7060302 +; GFX10-NEXT: v_perm_b32 v12, v12, v17, 0x7060302 +; GFX10-NEXT: v_perm_b32 v13, v13, v18, 0x7060302 +; GFX10-NEXT: v_perm_b32 v14, v14, v19, 0x7060302 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v31 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v31 -; GFX10-NEXT: v_add_f32_e32 v16, v32, v16 -; GFX10-NEXT: v_add_f32_e32 v15, v15, v17 -; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x3020706 +; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v31 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v31 +; GFX10-NEXT: v_add_f32_e32 v20, v20, v21 +; GFX10-NEXT: v_add_f32_e32 v15, v15, v22 +; GFX10-NEXT: v_perm_b32 v15, v15, v20, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fadd_v32bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: v_and_b32_e32 v82, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v84, 0xffff0000, v1 -; GFX11-NEXT: v_and_b32_e32 v85, 0xffff0000, v16 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v86, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v81, 0xffff0000, v18 -; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-NEXT: v_and_b32_e32 v83, 0xffff0000, v17 -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 -; GFX11-NEXT: v_and_b32_e32 v70, 0xffff0000, v4 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v18 -; GFX11-NEXT: v_and_b32_e32 v65, 0xffff0000, v22 -; GFX11-NEXT: v_dual_add_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v22, 16, v22 -; GFX11-NEXT: v_and_b32_e32 v66, 0xffff0000, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 -; GFX11-NEXT: v_and_b32_e32 v69, 0xffff0000, v20 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v27 +; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 
v50, 16, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v5 +; GFX11-NEXT: v_dual_add_f32 v10, v10, v26 :: v_dual_and_b32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v9 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v20 +; GFX11-NEXT: v_dual_add_f32 v11, v11, v27 :: v_dual_and_b32 v20, 0xffff0000, v20 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_dual_add_f32 v26, v71, v70 :: v_dual_lshlrev_b32 v49, 16, v4 +; GFX11-NEXT: v_dual_add_f32 v13, v13, v29 :: v_dual_and_b32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v19 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v6, v6, v22 :: v_dual_lshlrev_b32 v23, 16, v23 -; GFX11-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-NEXT: v_and_b32_e32 v71, 0xffff0000, v19 -; GFX11-NEXT: v_dual_add_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v19, 16, v19 -; GFX11-NEXT: v_and_b32_e32 v64, 0xffff0000, v7 -; GFX11-NEXT: v_and_b32_e32 v49, 0xffff0000, v26 -; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-NEXT: v_and_b32_e32 v80, 0xffff0000, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_dual_add_f32 v10, v10, v26 :: v_dual_and_b32 v67, 0xffff0000, v21 -; GFX11-NEXT: v_and_b32_e32 v68, 0xffff0000, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_add_f32 v3, v3, v19 :: v_dual_and_b32 v38, 0xffff0000, v12 -; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 -; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-NEXT: v_add_f32_e32 v7, v7, v23 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v28 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 -; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-NEXT: v_and_b32_e32 v39, 0xffff0000, v27 -; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-NEXT: v_add_f32_e32 v9, v9, v25 -; GFX11-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 -; GFX11-NEXT: 
v_and_b32_e32 v48, 0xffff0000, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 -; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_add_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v4, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 -; GFX11-NEXT: v_dual_add_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v15, 16, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_dual_add_f32 v33, v34, v33 :: v_dual_add_f32 v34, v36, v35 -; GFX11-NEXT: v_dual_add_f32 v35, v38, v37 :: v_dual_add_f32 v12, v12, v28 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_add_f32 v8, v8, v24 :: v_dual_add_f32 v5, v5, v21 -; GFX11-NEXT: v_perm_b32 v12, v12, v35, 0x3020706 +; GFX11-NEXT: v_add_f32_e32 v4, v4, v20 +; GFX11-NEXT: v_dual_add_f32 v8, v8, v24 :: v_dual_add_f32 v9, v9, v25 +; GFX11-NEXT: v_add_f32_e32 v25, v69, v68 +; GFX11-NEXT: v_dual_add_f32 v20, v51, v50 :: v_dual_lshlrev_b32 v39, 16, v3 +; GFX11-NEXT: v_add_f32_e32 v27, v81, v80 +; GFX11-NEXT: v_add_f32_e32 v12, v12, v28 +; GFX11-NEXT: v_dual_add_f32 v28, v83, v82 :: v_dual_add_f32 v29, v85, v84 +; GFX11-NEXT: v_dual_add_f32 v6, v6, v22 :: v_dual_and_b32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_add_f32_e32 v22, v55, v54 +; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-NEXT: v_add_f32_e32 v14, v14, v30 +; GFX11-NEXT: v_dual_add_f32 v7, v7, v23 :: v_dual_and_b32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_add_f32_e32 v23, v65, v64 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11-NEXT: v_dual_add_f32 v24, v67, v66 :: v_dual_and_b32 v21, 0xffff0000, v21 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v18 +; GFX11-NEXT: v_dual_add_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v32, 16, v16 +; GFX11-NEXT: v_add_f32_e32 v18, v39, v38 +; GFX11-NEXT: v_dual_add_f32 v3, v3, v19 :: v_dual_and_b32 v16, 0xffff0000, v16 +; GFX11-NEXT: v_add_f32_e32 v19, v49, v48 +; GFX11-NEXT: v_add_f32_e32 v17, v37, v36 +; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX11-NEXT: v_dual_add_f32 v5, v5, v21 :: v_dual_and_b32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_add_f32_e32 v21, v53, v52 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v2, v2, v17, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v3, v18, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v16 +; GFX11-NEXT: v_add_f32_e32 v16, v35, v34 +; GFX11-NEXT: v_add_f32_e32 v32, v33, v32 +; GFX11-NEXT: v_perm_b32 v4, v4, v19, 0x7060302 +; GFX11-NEXT: v_perm_b32 v5, v5, v20, 0x7060302 +; GFX11-NEXT: v_perm_b32 v6, v6, v21, 0x7060302 +; GFX11-NEXT: v_perm_b32 v1, v1, v16, 0x7060302 +; GFX11-NEXT: v_perm_b32 v0, v0, v32, 0x7060302 +; GFX11-NEXT: v_perm_b32 v7, v7, v22, 0x7060302 +; GFX11-NEXT: v_perm_b32 v8, v8, v23, 0x7060302 +; GFX11-NEXT: v_perm_b32 v9, v9, v24, 0x7060302 +; GFX11-NEXT: v_perm_b32 v10, v10, v25, 0x7060302 +; GFX11-NEXT: v_perm_b32 v11, v11, v26, 
0x7060302 +; GFX11-NEXT: v_perm_b32 v12, v12, v27, 0x7060302 +; GFX11-NEXT: v_perm_b32 v13, v13, v28, 0x7060302 +; GFX11-NEXT: v_perm_b32 v14, v14, v29, 0x7060302 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v31 -; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v31 -; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_add_f32 v16, v32, v16 :: v_dual_add_f32 v13, v13, v29 -; GFX11-NEXT: v_dual_add_f32 v15, v15, v17 :: v_dual_add_f32 v14, v14, v30 -; GFX11-NEXT: v_add_f32_e32 v36, v48, v39 -; GFX11-NEXT: v_dual_add_f32 v48, v64, v55 :: v_dual_add_f32 v37, v50, v49 -; GFX11-NEXT: v_add_f32_e32 v50, v68, v67 -; GFX11-NEXT: v_dual_add_f32 v38, v52, v51 :: v_dual_add_f32 v51, v70, v69 -; GFX11-NEXT: v_dual_add_f32 v52, v80, v71 :: v_dual_add_f32 v39, v54, v53 -; GFX11-NEXT: v_dual_add_f32 v53, v82, v81 :: v_dual_add_f32 v54, v84, v83 -; GFX11-NEXT: v_add_f32_e32 v55, v86, v85 -; GFX11-NEXT: v_add_f32_e32 v49, v66, v65 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v3, v3, v52, 0x3020706 -; GFX11-NEXT: v_perm_b32 v2, v2, v53, 0x3020706 -; GFX11-NEXT: v_perm_b32 v1, v1, v54, 0x3020706 -; GFX11-NEXT: v_perm_b32 v0, v0, v55, 0x3020706 -; GFX11-NEXT: v_perm_b32 v4, v4, v51, 0x3020706 -; GFX11-NEXT: v_perm_b32 v5, v5, v50, 0x3020706 -; GFX11-NEXT: v_perm_b32 v6, v6, v49, 0x3020706 -; GFX11-NEXT: v_perm_b32 v7, v7, v48, 0x3020706 -; GFX11-NEXT: v_perm_b32 v8, v8, v39, 0x3020706 -; GFX11-NEXT: v_perm_b32 v9, v9, v38, 0x3020706 -; GFX11-NEXT: v_perm_b32 v10, v10, v37, 0x3020706 -; GFX11-NEXT: v_perm_b32 v11, v11, v36, 0x3020706 -; GFX11-NEXT: v_perm_b32 v13, v13, v34, 0x3020706 -; GFX11-NEXT: v_perm_b32 v14, v14, v33, 0x3020706 -; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x3020706 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v16, v86, v16 :: v_dual_and_b32 v17, 0xffff0000, v31 +; GFX11-NEXT: v_add_f32_e32 v15, v15, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fadd <32 x bfloat> %a, %b ret <32 x bfloat> %op @@ -10595,34 +10416,34 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) { ; GFX8-LABEL: v_fadd_bf16_fpimm_0: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fadd_bf16_fpimm_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_bf16_fpimm_0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
v_fadd_bf16_fpimm_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %add = fadd bfloat %arg0, 1.0 ret bfloat %add @@ -10648,34 +10469,34 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) { ; GFX8-LABEL: v_fadd_bf16_fpimm_1: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_add_f32_e32 v0, 0x42280000, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fadd_bf16_fpimm_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v0, 0x42280000, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_bf16_fpimm_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_add_f32_e32 v0, 0x42280000, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fadd_bf16_fpimm_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v0, 0x42280000, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %add = fadd bfloat %arg0, 42.0 ret bfloat %add @@ -10703,38 +10524,38 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) { ; GFX8-LABEL: v_fsub_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fsub_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fsub_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; 
GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fsub_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fsub bfloat %a, %b ret bfloat %op @@ -10770,53 +10591,53 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-LABEL: v_fsub_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_sub_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX8-NEXT: v_sub_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fsub_v2bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_sub_f32_e32 v2, v3, v2 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fsub_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_sub_f32_e32 v2, v3, v2 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 +; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fsub_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_sub_f32_e32 v2, v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 
v0, v0, v2, 0x3020706 +; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fsub <2 x bfloat> %a, %b ret <2 x bfloat> %op @@ -10860,69 +10681,82 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-LABEL: v_fsub_v3bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 +; GFX8-NEXT: v_sub_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_sub_f32_e32 v2, v4, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fsub_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_sub_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_sub_f32_e32 v2, v4, v2 +; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fsub_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_sub_f32_e32 v4, v5, v4 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_sub_f32_e32 v2, v7, v6 ; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3 -; 
GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fsub_v3bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_sub_f32 v1, v1, v3 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_dual_sub_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0 ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_sub_f32_e32 v4, v5, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_sub_f32_e32 v2, v7, v6 +; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 +; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fsub <3 x bfloat> %a, %b ret <3 x bfloat> %op @@ -10974,85 +10808,82 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-LABEL: v_fsub_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_sub_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX8-NEXT: v_perm_b32 v1, v1, v4, s4 +; GFX8-NEXT: v_sub_f32_e32 v2, v4, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fsub_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 
v5, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_sub_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_sub_f32_e32 v3, v5, v3 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_sub_f32_e32 v2, v4, v2 +; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fsub_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_sub_f32_e32 v3, v5, v4 -; GFX10-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX10-NEXT: v_sub_f32_e32 v4, v5, v4 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 -; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_sub_f32_e32 v2, v7, v6 +; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fsub_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_sub_f32 v3, v7, v6 :: v_dual_sub_f32 v4, v5, v4 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x3020706 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: 
v_or_b32_e32 v1, v4, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_sub_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f32_e32 v4, v5, v4 +; GFX11-NEXT: v_sub_f32_e32 v2, v7, v6 +; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fsub <4 x bfloat> %a, %b ret <4 x bfloat> %op @@ -11080,38 +10911,38 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) { ; GFX8-LABEL: v_fmul_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fmul_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fmul_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fmul bfloat %a, %b ret bfloat %op @@ -11147,53 +10978,53 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-LABEL: v_fmul_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: 
v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fmul_v2bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 +; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fmul_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 +; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fmul <2 x bfloat> %a, %b ret <2 x bfloat> %op @@ -11237,69 +11068,82 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-LABEL: v_fmul_v3bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 +; GFX8-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_mul_f32_e32 v2, v4, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_perm_b32 v0, v0, v3, 
s4 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fmul_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_mul_f32_e32 v4, v5, v4 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_mul_f32_e32 v2, v7, v6 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fmul_v3bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mul_f32 v1, v1, v3 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_dual_mul_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0 ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; 
GFX11-NEXT: v_mul_f32_e32 v2, v7, v6 +; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 +; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fmul <3 x bfloat> %a, %b ret <3 x bfloat> %op @@ -11351,85 +11195,82 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-LABEL: v_fmul_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX8-NEXT: v_perm_b32 v1, v1, v4, s4 +; GFX8-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fmul_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; 
GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_mul_f32_e32 v3, v5, v4 -; GFX10-NEXT: v_mul_f32_e32 v4, v7, v6 +; GFX10-NEXT: v_mul_f32_e32 v4, v5, v4 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 -; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_mul_f32_e32 v2, v7, v6 +; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fmul_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_mul_f32 v3, v7, v6 :: v_dual_mul_f32 v4, v5, v4 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x3020706 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mul_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX11-NEXT: v_mul_f32_e32 v2, v7, v6 +; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fmul <4 x bfloat> %a, %b ret <4 x bfloat> %op @@ -11513,143 +11354,138 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-LABEL: v_fmul_v8bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX8-NEXT: v_mul_f32_e32 v8, v9, v8 -; GFX8-NEXT: 
v_mul_f32_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_mul_f32_e32 v7, v9, v7 -; GFX8-NEXT: v_mul_f32_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_mul_f32_e32 v6, v9, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v5 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_mul_f32_e32 v5, v9, v5 -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v4 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v5, s4 -; GFX8-NEXT: v_perm_b32 v1, v1, v6, s4 -; GFX8-NEXT: v_perm_b32 v2, v2, v7, s4 -; GFX8-NEXT: v_perm_b32 v3, v3, v8, s4 +; GFX8-NEXT: v_mul_f32_e32 v4, v8, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, v2, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX8-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v4, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fmul_v8bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v1 -; GFX9-NEXT: v_mul_f32_e32 v9, v10, v9 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v2 -; GFX9-NEXT: v_mul_f32_e32 v10, v11, v10 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v4 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v4 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v9 -; GFX9-NEXT: v_mul_f32_e32 v11, v12, v11 -; GFX9-NEXT: v_mul_f32_e32 v3, v3, v7 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v8, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 
v4, 16, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_mul_f32_e32 v4, v8, v4 +; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_mul_f32_e32 v2, v2, v5 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v8 -; GFX9-NEXT: v_perm_b32 v0, v0, v11, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v10, s4 -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v2, v2, v4, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX9-NEXT: v_perm_b32 v3, v3, v4, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v8bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v6 -; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v4 -; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1 ; GFX10-NEXT: v_mul_f32_e32 v8, v9, v8 -; GFX10-NEXT: v_mul_f32_e32 v9, v11, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_mul_f32_e32 v10, v13, v12 -; GFX10-NEXT: v_mul_f32_e32 v11, v15, v14 -; GFX10-NEXT: v_mul_f32_e32 v3, v3, v7 -; GFX10-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v9 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_mul_f32_e32 v4, v11, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX10-NEXT: v_mul_f32_e32 v5, v10, v9 ; GFX10-NEXT: v_mul_f32_e32 v2, v2, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 -; GFX10-NEXT: v_perm_b32 v0, v0, v11, 0x3020706 -; GFX10-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD -; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x3020706 -; GFX10-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_mul_f32_e32 v6, v12, v11 +; GFX10-NEXT: v_mul_f32_e32 v3, v3, v7 +; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302 +; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fmul_v8bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 -; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v1 -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-NEXT: v_dual_mul_f32 v0, v0, v4 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_mul_f32 v8, v9, v8 :: v_dual_and_b32 v9, 0xffff0000, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mul_f32 v1, v1, v5 :: v_dual_and_b32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX11-NEXT: v_mul_f32_e32 v4, v11, v10 +; GFX11-NEXT: v_dual_mul_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_mul_f32_e32 v5, v10, v9 +; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 +; GFX11-NEXT: v_mul_f32_e32 v2, v2, v6 +; GFX11-NEXT: v_mul_f32_e32 v6, v12, v11 ; GFX11-NEXT: v_mul_f32_e32 v3, v3, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mul_f32_e32 v9, v10, v9 -; GFX11-NEXT: v_dual_mul_f32 v1, v1, v5 :: v_dual_and_b32 v12, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v9 -; GFX11-NEXT: v_mul_f32_e32 v2, v2, v5 -; GFX11-NEXT: v_dual_mul_f32 v10, v12, v11 :: v_dual_mul_f32 v11, v14, v13 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 +; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x3020706 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v0, v0, v11, 0x3020706 -; GFX11-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX11-NEXT: v_perm_b32 v2, v2, v5, 0x7060302 +; 
GFX11-NEXT: v_perm_b32 v3, v3, v6, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fmul <8 x bfloat> %a, %b ret <8 x bfloat> %op @@ -11801,244 +11637,252 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-LABEL: v_fmul_v16bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 -; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v8 ; GFX8-NEXT: v_mul_f32_e32 v16, v17, v16 -; GFX8-NEXT: v_mul_f32_e32 v7, v7, v15 -; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_mul_f32_e32 v15, v17, v15 -; GFX8-NEXT: v_mul_f32_e32 v6, v6, v14 -; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_mul_f32_e32 v14, v17, v14 -; GFX8-NEXT: v_mul_f32_e32 v5, v5, v13 -; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_mul_f32_e32 v13, v17, v13 -; GFX8-NEXT: v_mul_f32_e32 v4, v4, v12 -; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_mul_f32_e32 v12, v17, v12 -; GFX8-NEXT: v_mul_f32_e32 v3, v3, v11 -; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_mul_f32_e32 v11, v17, v11 -; GFX8-NEXT: v_mul_f32_e32 v2, v2, v10 -; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_mul_f32_e32 v10, v17, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v9 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_mul_f32_e32 v9, v17, v9 -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v8 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v9, s4 -; GFX8-NEXT: v_perm_b32 v1, v1, v10, s4 -; GFX8-NEXT: v_perm_b32 v2, v2, v11, s4 -; GFX8-NEXT: v_perm_b32 v3, v3, v12, s4 -; GFX8-NEXT: v_perm_b32 v4, v4, v13, s4 -; GFX8-NEXT: v_perm_b32 v5, v5, v14, s4 -; GFX8-NEXT: v_perm_b32 v6, v6, v15, s4 -; GFX8-NEXT: v_perm_b32 v7, v7, v16, s4 +; GFX8-NEXT: v_mul_f32_e32 v8, v16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX8-NEXT: v_mul_f32_e32 
v8, v9, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, v2, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX8-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v11 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_mul_f32_e32 v3, v3, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v4 +; GFX8-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_mul_f32_e32 v4, v4, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_alignbit_b32 v4, v4, v8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX8-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v13 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: v_mul_f32_e32 v5, v5, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX8-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v14 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_mul_f32_e32 v6, v6, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v6, v6, v8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v15 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; GFX8-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v15 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: v_mul_f32_e32 v7, v7, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_alignbit_b32 v7, v7, v8, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fmul_v16bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v14 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 -; GFX9-NEXT: v_mul_f32_e32 v17, v18, v17 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v13 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v5 -; GFX9-NEXT: v_mul_f32_e32 v18, v19, v18 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v12 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v4 -; GFX9-NEXT: v_mul_f32_e32 v19, v20, v19 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v11 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v3 -; GFX9-NEXT: v_mul_f32_e32 v20, v21, v20 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v2 -; GFX9-NEXT: v_mul_f32_e32 v21, v22, v21 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v9 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v1 -; GFX9-NEXT: v_mul_f32_e32 v22, v23, v22 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v8 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: 
v_lshlrev_b32_e32 v13, 16, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v23, v24, v23 -; GFX9-NEXT: v_mul_f32_e32 v7, v7, v15 -; GFX9-NEXT: v_mul_f32_e32 v6, v6, v14 -; GFX9-NEXT: v_mul_f32_e32 v5, v5, v13 -; GFX9-NEXT: v_mul_f32_e32 v4, v4, v12 -; GFX9-NEXT: v_mul_f32_e32 v3, v3, v11 -; GFX9-NEXT: v_mul_f32_e32 v2, v2, v10 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v9 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v8 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_perm_b32 v0, v0, v23, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v22, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v21, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v19, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v18, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v17, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_mul_f32_e32 v8, v16, v8 +; GFX9-NEXT: v_mul_f32_e32 v1, v1, v9 +; GFX9-NEXT: v_perm_b32 v1, v1, v8, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_mul_f32_e32 v2, v2, v9 +; GFX9-NEXT: v_perm_b32 v2, v2, v8, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v11 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_mul_f32_e32 v3, v3, v9 +; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v4 +; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_mul_f32_e32 v4, v4, v9 +; GFX9-NEXT: v_perm_b32 v4, v4, v8, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v13 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_mul_f32_e32 v5, v5, v9 +; GFX9-NEXT: v_perm_b32 v5, v5, v8, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v14 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_mul_f32_e32 v6, v6, v9 +; GFX9-NEXT: v_perm_b32 v6, v6, v8, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v15 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_mul_f32_e32 v7, v7, v9 +; GFX9-NEXT: v_perm_b32 v7, v7, v8, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v16bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
-; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 -; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 -; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v13 -; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v5 -; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_mul_f32_e32 v16, v17, v16 -; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v14 -; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v4 -; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 -; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v3 -; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v10 -; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v2 -; GFX10-NEXT: v_mul_f32_e32 v17, v18, v17 -; GFX10-NEXT: v_mul_f32_e32 v18, v20, v19 -; GFX10-NEXT: v_mul_f32_e32 v19, v22, v21 -; GFX10-NEXT: v_mul_f32_e32 v20, v24, v23 -; GFX10-NEXT: v_mul_f32_e32 v21, v26, v25 -; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v9 -; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v8 -; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_mul_f32_e32 v22, v23, v22 -; GFX10-NEXT: v_mul_f32_e32 v23, v25, v24 -; GFX10-NEXT: v_mul_f32_e32 v7, v7, v15 -; GFX10-NEXT: v_mul_f32_e32 v6, v6, v14 -; GFX10-NEXT: v_mul_f32_e32 v5, v5, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v9 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v8 +; GFX10-NEXT: v_mul_f32_e32 v8, v18, v17 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v9 +; GFX10-NEXT: v_mul_f32_e32 v9, v20, v19 ; GFX10-NEXT: v_mul_f32_e32 v2, v2, v10 -; GFX10-NEXT: v_mul_f32_e32 v3, v3, v11 +; GFX10-NEXT: v_perm_b32 v0, v0, v16, 0x7060302 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v11 +; GFX10-NEXT: v_perm_b32 v1, v1, v8, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; GFX10-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v5 +; GFX10-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX10-NEXT: v_mul_f32_e32 v3, v3, v10 +; GFX10-NEXT: v_mul_f32_e32 v9, v16, v11 ; GFX10-NEXT: v_mul_f32_e32 v4, v4, v12 -; 
GFX10-NEXT: v_perm_b32 v0, v0, v23, 0x3020706 -; GFX10-NEXT: v_perm_b32 v1, v1, v22, 0x3020706 -; GFX10-NEXT: v_perm_b32 v2, v2, v21, 0x3020706 -; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x3020706 -; GFX10-NEXT: v_perm_b32 v4, v4, v19, 0x3020706 -; GFX10-NEXT: v_perm_b32 v5, v5, v18, 0x3020706 -; GFX10-NEXT: v_perm_b32 v6, v6, v17, 0x3020706 -; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x3020706 +; GFX10-NEXT: v_mul_f32_e32 v10, v18, v17 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_mul_f32_e32 v5, v5, v11 +; GFX10-NEXT: v_mul_f32_e32 v11, v13, v12 +; GFX10-NEXT: v_mul_f32_e32 v6, v6, v14 +; GFX10-NEXT: v_mul_f32_e32 v12, v17, v16 +; GFX10-NEXT: v_mul_f32_e32 v7, v7, v15 +; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 +; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x7060302 +; GFX10-NEXT: v_perm_b32 v5, v5, v10, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v6, v11, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v7, v12, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fmul_v16bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 -; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v5 -; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 -; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-NEXT: v_and_b32_e32 v26, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v3 -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v13 -; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 -; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 -; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_dual_mul_f32 v16, v17, v16 :: v_dual_and_b32 v17, 0xffff0000, v14 -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_mul_f32 v4, v4, v12 :: v_dual_mul_f32 v5, v5, v13 -; GFX11-NEXT: v_dual_mul_f32 v17, v18, v17 :: v_dual_mul_f32 v18, v20, v19 -; GFX11-NEXT: v_mul_f32_e32 v19, v22, v21 -; GFX11-NEXT: v_mul_f32_e32 v7, v7, v15 -; GFX11-NEXT: v_mul_f32_e32 v21, v26, v25 -; GFX11-NEXT: v_dual_mul_f32 v6, v6, v14 :: v_dual_and_b32 v25, 0xffff0000, v0 -; GFX11-NEXT: v_mul_f32_e32 v20, v24, v23 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-NEXT: v_dual_mul_f32 v2, v2, v10 :: v_dual_mul_f32 v3, v3, v11 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_mul_f32 v1, v1, v9 :: v_dual_mul_f32 v22, v23, v22 -; GFX11-NEXT: v_mul_f32_e32 v23, v25, v24 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mul_f32_e32 v2, v2, v10 +; GFX11-NEXT: v_dual_mul_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v9 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mul_f32 v1, v1, v9 :: v_dual_and_b32 v10, 0xffff0000, v11 +; GFX11-NEXT: v_mul_f32_e32 v9, v20, v19 +; GFX11-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mul_f32 v3, v3, v10 :: v_dual_and_b32 v0, 0xffff0000, v0 ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v8 -; GFX11-NEXT: v_perm_b32 v2, v2, v21, 0x3020706 -; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x3020706 -; GFX11-NEXT: v_perm_b32 v1, v1, v22, 0x3020706 -; GFX11-NEXT: v_perm_b32 v4, v4, v19, 0x3020706 -; GFX11-NEXT: v_perm_b32 v0, v0, v23, 0x3020706 -; GFX11-NEXT: v_perm_b32 v5, v5, v18, 0x3020706 -; GFX11-NEXT: v_perm_b32 v6, v6, v17, 0x3020706 -; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x3020706 +; GFX11-NEXT: v_mul_f32_e32 v8, v18, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_perm_b32 v0, v0, v16, 0x7060302 +; GFX11-NEXT: v_perm_b32 v1, v1, v8, 0x7060302 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_dual_mul_f32 v10, v18, v17 :: v_dual_lshlrev_b32 v17, 16, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mul_f32 v4, v4, v12 :: v_dual_and_b32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_dual_mul_f32 v8, v9, v8 :: v_dual_mul_f32 v9, v16, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 +; GFX11-NEXT: v_mul_f32_e32 v5, v5, v11 +; GFX11-NEXT: v_mul_f32_e32 v11, v13, v12 +; GFX11-NEXT: v_mul_f32_e32 v12, v17, v16 +; GFX11-NEXT: v_dual_mul_f32 v6, v6, v14 :: v_dual_mul_f32 v7, v7, v15 +; GFX11-NEXT: v_perm_b32 v4, v4, v9, 0x7060302 +; GFX11-NEXT: v_perm_b32 v5, v5, v10, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v6, v6, v11, 0x7060302 +; GFX11-NEXT: v_perm_b32 v7, v7, v12, 0x7060302 
; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fmul <16 x bfloat> %a, %b ret <16 x bfloat> %op @@ -12446,493 +12290,484 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-LABEL: v_fmul_v32bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 -; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 -; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v16 ; GFX8-NEXT: v_mul_f32_e32 v31, v32, v31 -; GFX8-NEXT: v_mul_f32_e32 v14, v14, v30 -; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 -; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v13 -; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX8-NEXT: v_mul_f32_e32 v30, v32, v30 -; GFX8-NEXT: v_mul_f32_e32 v13, v13, v29 -; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 -; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v12 -; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_mul_f32_e32 v29, v32, v29 -; GFX8-NEXT: v_mul_f32_e32 v12, v12, v28 -; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 -; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v11 -; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: v_mul_f32_e32 v28, v32, v28 -; GFX8-NEXT: v_mul_f32_e32 v11, v11, v27 -; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 -; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v10 -; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: v_mul_f32_e32 v27, v32, v27 -; GFX8-NEXT: v_mul_f32_e32 v10, v10, v26 -; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 -; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v9 -; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX8-NEXT: v_mul_f32_e32 v26, v32, v26 -; GFX8-NEXT: v_mul_f32_e32 v9, v9, v25 -; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 -; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_mul_f32_e32 v8, v8, v24 -; GFX8-NEXT: buffer_load_dword v24, off, s[0:3], s32 -; GFX8-NEXT: v_mul_f32_e32 v25, v32, v25 -; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 -; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v8, v8, v25, s4 -; GFX8-NEXT: v_perm_b32 v9, v9, v26, s4 -; GFX8-NEXT: v_perm_b32 v10, v10, v27, s4 -; GFX8-NEXT: v_perm_b32 v11, v11, v28, s4 -; GFX8-NEXT: v_perm_b32 v12, v12, v29, s4 -; GFX8-NEXT: v_perm_b32 v13, v13, v30, s4 -; GFX8-NEXT: v_perm_b32 v14, v14, v31, s4 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v24 -; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX8-NEXT: v_mul_f32_e32 v32, v32, v33 -; GFX8-NEXT: v_mul_f32_e32 v15, v15, v24 -; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 -; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_mul_f32_e32 v24, v33, v24 -; GFX8-NEXT: v_mul_f32_e32 v7, v7, v23 -; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, 
v6 -; GFX8-NEXT: v_mul_f32_e32 v23, v33, v23 -; GFX8-NEXT: v_mul_f32_e32 v6, v6, v22 -; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_mul_f32_e32 v22, v33, v22 -; GFX8-NEXT: v_mul_f32_e32 v5, v5, v21 -; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_mul_f32_e32 v21, v33, v21 -; GFX8-NEXT: v_mul_f32_e32 v4, v4, v20 -; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_mul_f32_e32 v20, v33, v20 -; GFX8-NEXT: v_mul_f32_e32 v3, v3, v19 -; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_mul_f32_e32 v19, v33, v19 -; GFX8-NEXT: v_mul_f32_e32 v2, v2, v18 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_mul_f32_e32 v18, v33, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v31, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v17 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_mul_f32_e32 v17, v33, v17 -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v16 -; GFX8-NEXT: v_perm_b32 v0, v0, v17, s4 -; GFX8-NEXT: v_perm_b32 v1, v1, v18, s4 -; GFX8-NEXT: v_perm_b32 v2, v2, v19, s4 -; GFX8-NEXT: v_perm_b32 v3, v3, v20, s4 -; GFX8-NEXT: v_perm_b32 v4, v4, v21, s4 -; GFX8-NEXT: v_perm_b32 v5, v5, v22, s4 -; GFX8-NEXT: v_perm_b32 v6, v6, v23, s4 -; GFX8-NEXT: v_perm_b32 v7, v7, v24, s4 -; GFX8-NEXT: v_perm_b32 v15, v15, v32, s4 +; GFX8-NEXT: v_mul_f32_e32 v16, v31, v16 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v18 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX8-NEXT: v_mul_f32_e32 v16, v17, v16 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, v2, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX8-NEXT: v_mul_f32_e32 v16, v17, v16 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v19 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_mul_f32_e32 v3, v3, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v20 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX8-NEXT: v_mul_f32_e32 v16, v17, v16 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_mul_f32_e32 v4, v4, v17 +; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: 
v_alignbit_b32 v4, v4, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v5 +; GFX8-NEXT: v_mul_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v21 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: v_mul_f32_e32 v5, v5, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v22 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v6 +; GFX8-NEXT: v_mul_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v22 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_mul_f32_e32 v6, v6, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v6, v6, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v7 +; GFX8-NEXT: v_mul_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v23 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: v_mul_f32_e32 v7, v7, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; GFX8-NEXT: v_mul_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v24 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX8-NEXT: v_mul_f32_e32 v8, v8, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX8-NEXT: v_alignbit_b32 v8, v8, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v25 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; GFX8-NEXT: v_mul_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v25 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX8-NEXT: v_mul_f32_e32 v9, v9, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX8-NEXT: v_alignbit_b32 v9, v9, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v26 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; GFX8-NEXT: v_mul_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v26 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX8-NEXT: v_mul_f32_e32 v10, v10, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX8-NEXT: v_alignbit_b32 v10, v10, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v27 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v11 +; GFX8-NEXT: v_mul_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v27 +; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX8-NEXT: v_mul_f32_e32 v11, v11, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX8-NEXT: v_alignbit_b32 v11, v11, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v28 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; GFX8-NEXT: v_mul_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v28 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX8-NEXT: v_mul_f32_e32 v12, v12, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX8-NEXT: v_alignbit_b32 v12, v12, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v29 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v13 +; GFX8-NEXT: v_mul_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v29 +; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX8-NEXT: v_mul_f32_e32 v13, v13, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX8-NEXT: v_alignbit_b32 v13, v13, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v30 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v14 +; GFX8-NEXT: v_mul_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v30 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX8-NEXT: v_mul_f32_e32 v14, v14, v18 +; GFX8-NEXT: 
v_lshrrev_b32_e32 v14, 16, v14 +; GFX8-NEXT: v_alignbit_b32 v14, v14, v16, 16 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX8-NEXT: v_mul_f32_e32 v15, v15, v17 +; GFX8-NEXT: v_mul_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX8-NEXT: v_alignbit_b32 v15, v15, v16, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fmul_v32bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: v_and_b32_e32 v38, 0xffff0000, v27 -; GFX9-NEXT: v_and_b32_e32 v39, 0xffff0000, v11 -; GFX9-NEXT: v_and_b32_e32 v48, 0xffff0000, v26 -; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v10 -; GFX9-NEXT: v_and_b32_e32 v50, 0xffff0000, v25 -; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v9 -; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v22 -; GFX9-NEXT: v_and_b32_e32 v41, 0xffff0000, v6 -; GFX9-NEXT: v_and_b32_e32 v58, 0xffff0000, v17 -; GFX9-NEXT: v_and_b32_e32 v59, 0xffff0000, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v24 -; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v8 -; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v23 -; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v7 -; GFX9-NEXT: v_and_b32_e32 v42, 0xffff0000, v21 -; GFX9-NEXT: v_and_b32_e32 v43, 0xffff0000, v5 -; GFX9-NEXT: v_and_b32_e32 v44, 0xffff0000, v20 -; GFX9-NEXT: v_and_b32_e32 v45, 0xffff0000, v4 -; GFX9-NEXT: v_and_b32_e32 v46, 0xffff0000, v19 -; GFX9-NEXT: v_and_b32_e32 v47, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v56, 0xffff0000, v18 -; GFX9-NEXT: v_and_b32_e32 v57, 0xffff0000, v2 -; GFX9-NEXT: v_mul_f32_e32 v38, v39, v38 -; GFX9-NEXT: v_mul_f32_e32 v39, v49, v48 -; GFX9-NEXT: v_mul_f32_e32 v48, v51, v50 -; GFX9-NEXT: v_mul_f32_e32 v51, v41, v40 -; GFX9-NEXT: v_mul_f32_e32 v40, v59, v58 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v17 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_mul_f32_e32 v49, v53, v52 -; GFX9-NEXT: v_mul_f32_e32 v50, v55, v54 -; GFX9-NEXT: v_mul_f32_e32 v52, v43, v42 -; GFX9-NEXT: v_mul_f32_e32 v53, v45, v44 -; GFX9-NEXT: v_mul_f32_e32 v54, v47, v46 -; GFX9-NEXT: v_mul_f32_e32 v55, v57, v56 -; GFX9-NEXT: v_perm_b32 v1, v1, v40, s4 -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte 
Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 -; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 -; GFX9-NEXT: v_and_b32_e32 v34, 0xffff0000, v29 -; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v13 -; GFX9-NEXT: v_and_b32_e32 v36, 0xffff0000, v28 -; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v12 -; GFX9-NEXT: v_mul_f32_e32 v32, v33, v32 -; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v16 -; GFX9-NEXT: v_mul_f32_e32 v34, v35, v34 -; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v0 -; GFX9-NEXT: v_mul_f32_e32 v36, v37, v36 -; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX9-NEXT: v_mul_f32_e32 v33, v35, v33 -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v14, v14, v30 -; GFX9-NEXT: v_mul_f32_e32 v13, v13, v29 -; GFX9-NEXT: v_mul_f32_e32 v12, v12, v28 -; GFX9-NEXT: v_mul_f32_e32 v11, v11, v27 -; GFX9-NEXT: v_mul_f32_e32 v10, v10, v26 -; GFX9-NEXT: v_mul_f32_e32 v9, v9, v25 -; GFX9-NEXT: v_mul_f32_e32 v8, v8, v24 -; GFX9-NEXT: v_mul_f32_e32 v7, v7, v23 -; GFX9-NEXT: v_mul_f32_e32 v6, v6, v22 -; GFX9-NEXT: v_mul_f32_e32 v5, v5, v21 -; GFX9-NEXT: v_mul_f32_e32 v4, v4, v20 -; GFX9-NEXT: v_mul_f32_e32 v3, v3, v19 -; GFX9-NEXT: v_mul_f32_e32 v2, v2, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; 
GFX9-NEXT: v_mul_f32_e32 v31, v32, v31 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v16 -; GFX9-NEXT: v_perm_b32 v0, v0, v33, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v55, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v54, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v53, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v52, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v51, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v50, s4 -; GFX9-NEXT: v_perm_b32 v8, v8, v49, s4 -; GFX9-NEXT: v_perm_b32 v9, v9, v48, s4 -; GFX9-NEXT: v_perm_b32 v10, v10, v39, s4 -; GFX9-NEXT: v_perm_b32 v11, v11, v38, s4 -; GFX9-NEXT: v_perm_b32 v12, v12, v36, s4 -; GFX9-NEXT: v_perm_b32 v13, v13, v34, s4 -; GFX9-NEXT: v_perm_b32 v14, v14, v32, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v31, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_mul_f32_e32 v16, v31, v16 +; GFX9-NEXT: v_mul_f32_e32 v1, v1, v17 +; GFX9-NEXT: v_perm_b32 v1, v1, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_mul_f32_e32 v2, v2, v17 +; GFX9-NEXT: v_perm_b32 v2, v2, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v19 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_mul_f32_e32 v3, v3, v17 +; GFX9-NEXT: v_perm_b32 v3, v3, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_mul_f32_e32 v4, v4, v17 +; GFX9-NEXT: v_perm_b32 v4, v4, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v21 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_mul_f32_e32 v5, v5, v17 +; GFX9-NEXT: v_perm_b32 v5, v5, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v22 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_mul_f32_e32 v6, v6, v17 +; GFX9-NEXT: v_perm_b32 v6, v6, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v23 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_mul_f32_e32 v7, v7, v17 +; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v8 +; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v24 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_mul_f32_e32 v8, v8, v17 +; GFX9-NEXT: v_perm_b32 v8, v8, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v9 +; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v25 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_mul_f32_e32 v9, v9, v17 +; GFX9-NEXT: v_perm_b32 v9, 
v9, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v26 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v10 +; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v26 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_mul_f32_e32 v10, v10, v17 +; GFX9-NEXT: v_perm_b32 v10, v10, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v11 +; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v27 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_mul_f32_e32 v11, v11, v17 +; GFX9-NEXT: v_perm_b32 v11, v11, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v12 +; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v28 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_mul_f32_e32 v12, v12, v17 +; GFX9-NEXT: v_perm_b32 v12, v12, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v29 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v13 +; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v29 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX9-NEXT: v_mul_f32_e32 v13, v13, v17 +; GFX9-NEXT: v_perm_b32 v13, v13, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v14 +; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v30 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: v_mul_f32_e32 v14, v14, v17 +; GFX9-NEXT: v_perm_b32 v14, v14, v16, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v31 -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GFX9-NEXT: v_mul_f32_e32 v35, v37, v35 -; GFX9-NEXT: v_mul_f32_e32 v15, v15, v31 -; GFX9-NEXT: v_perm_b32 v15, v15, v35, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v15 +; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_mul_f32_e32 v15, v15, v17 +; GFX9-NEXT: v_perm_b32 v15, v15, v16, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v32bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX10-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 -; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 -; GFX10-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 -; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v7 -; GFX10-NEXT: v_and_b32_e32 v65, 0xffff0000, v22 -; GFX10-NEXT: v_and_b32_e32 v66, 0xffff0000, v6 -; GFX10-NEXT: v_and_b32_e32 v67, 0xffff0000, v21 -; GFX10-NEXT: v_and_b32_e32 v68, 0xffff0000, v5 -; GFX10-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 -; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 -; GFX10-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 -; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v13 -; GFX10-NEXT: v_and_b32_e32 v37, 0xffff0000, v28 -; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 -; GFX10-NEXT: v_and_b32_e32 v39, 0xffff0000, v27 -; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 -; GFX10-NEXT: v_and_b32_e32 v49, 0xffff0000, v26 -; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 -; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 -; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 -; GFX10-NEXT: v_mul_f32_e32 v53, v54, v53 -; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v17 -; GFX10-NEXT: v_mul_f32_e32 v55, v64, v55 -; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v1 -; GFX10-NEXT: 
v_mul_f32_e32 v65, v66, v65 -; GFX10-NEXT: v_and_b32_e32 v66, 0xffff0000, v16 -; GFX10-NEXT: v_mul_f32_e32 v67, v68, v67 -; GFX10-NEXT: v_and_b32_e32 v68, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 -; GFX10-NEXT: v_mul_f32_e32 v33, v34, v33 -; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v20 -; GFX10-NEXT: v_mul_f32_e32 v35, v36, v35 -; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v4 -; GFX10-NEXT: v_mul_f32_e32 v37, v38, v37 -; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v19 -; GFX10-NEXT: v_mul_f32_e32 v39, v48, v39 -; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v3 -; GFX10-NEXT: v_mul_f32_e32 v49, v50, v49 -; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v18 -; GFX10-NEXT: v_mul_f32_e32 v51, v52, v51 -; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v21 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v22 +; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v18 +; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v19 +; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v20 +; 
GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_mul_f32_e32 v5, v5, v21 +; GFX10-NEXT: v_mul_f32_e32 v21, v53, v52 +; GFX10-NEXT: v_mul_f32_e32 v6, v6, v22 +; GFX10-NEXT: v_mul_f32_e32 v22, v55, v54 +; GFX10-NEXT: v_mul_f32_e32 v7, v7, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v24 +; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v8 +; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v9 +; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v26 +; GFX10-NEXT: v_mul_f32_e32 v32, v33, v32 +; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v10 +; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v16 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v27 +; GFX10-NEXT: v_mul_f32_e32 v34, v35, v34 +; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v11 +; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v17 -; GFX10-NEXT: v_mul_f32_e32 v34, v36, v34 -; GFX10-NEXT: v_mul_f32_e32 v36, v48, v38 -; GFX10-NEXT: v_mul_f32_e32 v38, v52, v50 -; GFX10-NEXT: v_mul_f32_e32 v48, v64, v54 -; GFX10-NEXT: v_mul_f32_e32 v50, v68, v66 -; GFX10-NEXT: v_mul_f32_e32 v14, v14, v30 -; GFX10-NEXT: v_mul_f32_e32 v13, v13, v29 -; GFX10-NEXT: v_mul_f32_e32 v12, v12, v28 -; GFX10-NEXT: v_mul_f32_e32 v11, v11, v27 -; GFX10-NEXT: v_mul_f32_e32 v10, v10, v26 -; GFX10-NEXT: v_mul_f32_e32 v9, v9, v25 -; GFX10-NEXT: v_mul_f32_e32 v8, v8, v24 -; GFX10-NEXT: v_mul_f32_e32 v7, v7, v23 -; GFX10-NEXT: v_mul_f32_e32 v6, v6, v22 -; GFX10-NEXT: v_mul_f32_e32 v5, v5, v21 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; GFX10-NEXT: v_mul_f32_e32 v36, v37, v36 +; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v12 +; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX10-NEXT: v_mul_f32_e32 v2, v2, v18 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; GFX10-NEXT: v_mul_f32_e32 v38, v39, v38 +; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v13 +; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX10-NEXT: v_mul_f32_e32 v3, v3, v19 +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v30 +; GFX10-NEXT: v_mul_f32_e32 v48, v49, v48 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v14 +; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX10-NEXT: v_mul_f32_e32 v4, v4, v20 -; GFX10-NEXT: v_perm_b32 v0, v0, v50, 0x3020706 -; GFX10-NEXT: v_perm_b32 v1, v1, v48, 0x3020706 -; GFX10-NEXT: v_perm_b32 v2, v2, v38, 0x3020706 -; GFX10-NEXT: v_perm_b32 v3, v3, v36, 0x3020706 -; GFX10-NEXT: v_perm_b32 v4, v4, v34, 0x3020706 -; GFX10-NEXT: v_perm_b32 v5, v5, v67, 0x3020706 -; GFX10-NEXT: v_perm_b32 v6, v6, v65, 0x3020706 -; GFX10-NEXT: v_perm_b32 v7, v7, v55, 0x3020706 -; GFX10-NEXT: v_perm_b32 v8, v8, v53, 0x3020706 -; GFX10-NEXT: v_perm_b32 v9, v9, v51, 0x3020706 -; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x3020706 -; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x3020706 -; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x3020706 -; GFX10-NEXT: v_perm_b32 v13, v13, v35, 0x3020706 -; GFX10-NEXT: v_perm_b32 v14, v14, v33, 0x3020706 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v15 +; GFX10-NEXT: 
v_and_b32_e32 v15, 0xffff0000, v15 +; GFX10-NEXT: v_perm_b32 v6, v6, v21, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v7, v22, 0x7060302 +; GFX10-NEXT: v_mul_f32_e32 v50, v51, v50 +; GFX10-NEXT: v_mul_f32_e32 v23, v65, v64 +; GFX10-NEXT: v_mul_f32_e32 v8, v8, v24 +; GFX10-NEXT: v_mul_f32_e32 v24, v67, v66 +; GFX10-NEXT: v_mul_f32_e32 v9, v9, v25 +; GFX10-NEXT: v_mul_f32_e32 v25, v33, v68 +; GFX10-NEXT: v_mul_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_mul_f32_e32 v16, v35, v16 +; GFX10-NEXT: v_mul_f32_e32 v11, v11, v27 +; GFX10-NEXT: v_mul_f32_e32 v17, v37, v17 +; GFX10-NEXT: v_mul_f32_e32 v12, v12, v28 +; GFX10-NEXT: v_mul_f32_e32 v18, v39, v18 +; GFX10-NEXT: v_mul_f32_e32 v13, v13, v29 +; GFX10-NEXT: v_mul_f32_e32 v19, v49, v19 +; GFX10-NEXT: v_mul_f32_e32 v14, v14, v30 +; GFX10-NEXT: v_perm_b32 v0, v0, v32, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v34, 0x7060302 +; GFX10-NEXT: v_perm_b32 v2, v2, v36, 0x7060302 +; GFX10-NEXT: v_perm_b32 v3, v3, v38, 0x7060302 +; GFX10-NEXT: v_perm_b32 v4, v4, v48, 0x7060302 +; GFX10-NEXT: v_perm_b32 v5, v5, v50, 0x7060302 +; GFX10-NEXT: v_perm_b32 v8, v8, v23, 0x7060302 +; GFX10-NEXT: v_perm_b32 v9, v9, v24, 0x7060302 +; GFX10-NEXT: v_perm_b32 v10, v10, v25, 0x7060302 +; GFX10-NEXT: v_perm_b32 v11, v11, v16, 0x7060302 +; GFX10-NEXT: v_perm_b32 v12, v12, v17, 0x7060302 +; GFX10-NEXT: v_perm_b32 v13, v13, v18, 0x7060302 +; GFX10-NEXT: v_perm_b32 v14, v14, v19, 0x7060302 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v31 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v31 -; GFX10-NEXT: v_mul_f32_e32 v16, v32, v16 -; GFX10-NEXT: v_mul_f32_e32 v15, v15, v17 -; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x3020706 +; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v31 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v31 +; GFX10-NEXT: v_mul_f32_e32 v20, v20, v21 +; GFX10-NEXT: v_mul_f32_e32 v15, v15, v22 +; GFX10-NEXT: v_perm_b32 v15, v15, v20, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fmul_v32bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: v_and_b32_e32 v82, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v84, 0xffff0000, v1 -; GFX11-NEXT: v_and_b32_e32 v85, 0xffff0000, v16 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v86, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v81, 0xffff0000, v18 -; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-NEXT: v_and_b32_e32 v83, 0xffff0000, v17 -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 -; GFX11-NEXT: v_and_b32_e32 v70, 0xffff0000, v4 -; GFX11-NEXT: v_mul_f32_e32 v2, v2, v18 -; GFX11-NEXT: v_and_b32_e32 v65, 0xffff0000, v22 -; GFX11-NEXT: v_dual_mul_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v22, 16, v22 -; GFX11-NEXT: v_and_b32_e32 v66, 0xffff0000, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 -; GFX11-NEXT: v_and_b32_e32 v69, 0xffff0000, v20 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v27 +; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v23 
+; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v5 +; GFX11-NEXT: v_dual_mul_f32 v10, v10, v26 :: v_dual_and_b32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v9 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v20 +; GFX11-NEXT: v_dual_mul_f32 v11, v11, v27 :: v_dual_and_b32 v20, 0xffff0000, v20 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_dual_mul_f32 v26, v71, v70 :: v_dual_lshlrev_b32 v49, 16, v4 +; GFX11-NEXT: v_dual_mul_f32 v13, v13, v29 :: v_dual_and_b32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v19 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_dual_mul_f32 v6, v6, v22 :: v_dual_lshlrev_b32 v23, 16, v23 -; GFX11-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-NEXT: v_and_b32_e32 v71, 0xffff0000, v19 -; GFX11-NEXT: v_dual_mul_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v19, 16, v19 -; GFX11-NEXT: v_and_b32_e32 v64, 0xffff0000, v7 -; GFX11-NEXT: v_and_b32_e32 v49, 0xffff0000, v26 -; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-NEXT: v_and_b32_e32 v80, 0xffff0000, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_dual_mul_f32 v10, v10, v26 :: v_dual_and_b32 v67, 0xffff0000, v21 -; GFX11-NEXT: v_and_b32_e32 v68, 0xffff0000, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_mul_f32 v3, v3, v19 :: v_dual_and_b32 v38, 0xffff0000, v12 -; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 -; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-NEXT: v_mul_f32_e32 v7, v7, v23 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v28 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 -; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-NEXT: v_and_b32_e32 v39, 0xffff0000, v27 -; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-NEXT: v_mul_f32_e32 v9, v9, v25 -; GFX11-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 -; GFX11-NEXT: 
v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 -; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mul_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v4, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 -; GFX11-NEXT: v_dual_mul_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v15, 16, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_dual_mul_f32 v33, v34, v33 :: v_dual_mul_f32 v34, v36, v35 -; GFX11-NEXT: v_dual_mul_f32 v35, v38, v37 :: v_dual_mul_f32 v12, v12, v28 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_mul_f32 v8, v8, v24 :: v_dual_mul_f32 v5, v5, v21 -; GFX11-NEXT: v_perm_b32 v12, v12, v35, 0x3020706 +; GFX11-NEXT: v_mul_f32_e32 v4, v4, v20 +; GFX11-NEXT: v_dual_mul_f32 v8, v8, v24 :: v_dual_mul_f32 v9, v9, v25 +; GFX11-NEXT: v_mul_f32_e32 v25, v69, v68 +; GFX11-NEXT: v_dual_mul_f32 v20, v51, v50 :: v_dual_lshlrev_b32 v39, 16, v3 +; GFX11-NEXT: v_mul_f32_e32 v27, v81, v80 +; GFX11-NEXT: v_mul_f32_e32 v12, v12, v28 +; GFX11-NEXT: v_dual_mul_f32 v28, v83, v82 :: v_dual_mul_f32 v29, v85, v84 +; GFX11-NEXT: v_dual_mul_f32 v6, v6, v22 :: v_dual_and_b32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_mul_f32_e32 v22, v55, v54 +; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-NEXT: v_mul_f32_e32 v14, v14, v30 +; GFX11-NEXT: v_dual_mul_f32 v7, v7, v23 :: v_dual_and_b32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_mul_f32_e32 v23, v65, v64 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11-NEXT: v_dual_mul_f32 v24, v67, v66 :: v_dual_and_b32 v21, 0xffff0000, v21 +; GFX11-NEXT: v_mul_f32_e32 v2, v2, v18 +; GFX11-NEXT: v_dual_mul_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v32, 16, v16 +; GFX11-NEXT: v_mul_f32_e32 v18, v39, v38 +; GFX11-NEXT: v_dual_mul_f32 v3, v3, v19 :: v_dual_and_b32 v16, 0xffff0000, v16 +; GFX11-NEXT: v_mul_f32_e32 v19, v49, v48 +; GFX11-NEXT: v_mul_f32_e32 v17, v37, v36 +; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX11-NEXT: v_dual_mul_f32 v5, v5, v21 :: v_dual_and_b32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_mul_f32_e32 v21, v53, v52 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v2, v2, v17, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v3, v18, 0x7060302 +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v16 +; GFX11-NEXT: v_mul_f32_e32 v16, v35, v34 +; GFX11-NEXT: v_mul_f32_e32 v32, v33, v32 +; GFX11-NEXT: v_perm_b32 v4, v4, v19, 0x7060302 +; GFX11-NEXT: v_perm_b32 v5, v5, v20, 0x7060302 +; GFX11-NEXT: v_perm_b32 v6, v6, v21, 0x7060302 +; GFX11-NEXT: v_perm_b32 v1, v1, v16, 0x7060302 +; GFX11-NEXT: v_perm_b32 v0, v0, v32, 0x7060302 +; GFX11-NEXT: v_perm_b32 v7, v7, v22, 0x7060302 +; GFX11-NEXT: v_perm_b32 v8, v8, v23, 0x7060302 +; GFX11-NEXT: v_perm_b32 v9, v9, v24, 0x7060302 +; GFX11-NEXT: v_perm_b32 v10, v10, v25, 0x7060302 +; GFX11-NEXT: v_perm_b32 v11, v11, v26, 0x7060302 +; GFX11-NEXT: v_perm_b32 v12, v12, v27, 
0x7060302 +; GFX11-NEXT: v_perm_b32 v13, v13, v28, 0x7060302 +; GFX11-NEXT: v_perm_b32 v14, v14, v29, 0x7060302 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v31 -; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v31 -; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_mul_f32 v16, v32, v16 :: v_dual_mul_f32 v13, v13, v29 -; GFX11-NEXT: v_dual_mul_f32 v15, v15, v17 :: v_dual_mul_f32 v14, v14, v30 -; GFX11-NEXT: v_mul_f32_e32 v36, v48, v39 -; GFX11-NEXT: v_dual_mul_f32 v48, v64, v55 :: v_dual_mul_f32 v37, v50, v49 -; GFX11-NEXT: v_mul_f32_e32 v50, v68, v67 -; GFX11-NEXT: v_dual_mul_f32 v38, v52, v51 :: v_dual_mul_f32 v51, v70, v69 -; GFX11-NEXT: v_dual_mul_f32 v52, v80, v71 :: v_dual_mul_f32 v39, v54, v53 -; GFX11-NEXT: v_dual_mul_f32 v53, v82, v81 :: v_dual_mul_f32 v54, v84, v83 -; GFX11-NEXT: v_mul_f32_e32 v55, v86, v85 -; GFX11-NEXT: v_mul_f32_e32 v49, v66, v65 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v3, v3, v52, 0x3020706 -; GFX11-NEXT: v_perm_b32 v2, v2, v53, 0x3020706 -; GFX11-NEXT: v_perm_b32 v1, v1, v54, 0x3020706 -; GFX11-NEXT: v_perm_b32 v0, v0, v55, 0x3020706 -; GFX11-NEXT: v_perm_b32 v4, v4, v51, 0x3020706 -; GFX11-NEXT: v_perm_b32 v5, v5, v50, 0x3020706 -; GFX11-NEXT: v_perm_b32 v6, v6, v49, 0x3020706 -; GFX11-NEXT: v_perm_b32 v7, v7, v48, 0x3020706 -; GFX11-NEXT: v_perm_b32 v8, v8, v39, 0x3020706 -; GFX11-NEXT: v_perm_b32 v9, v9, v38, 0x3020706 -; GFX11-NEXT: v_perm_b32 v10, v10, v37, 0x3020706 -; GFX11-NEXT: v_perm_b32 v11, v11, v36, 0x3020706 -; GFX11-NEXT: v_perm_b32 v13, v13, v34, 0x3020706 -; GFX11-NEXT: v_perm_b32 v14, v14, v33, 0x3020706 -; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x3020706 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mul_f32 v16, v86, v16 :: v_dual_and_b32 v17, 0xffff0000, v31 +; GFX11-NEXT: v_mul_f32_e32 v15, v15, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fmul <32 x bfloat> %a, %b ret <32 x bfloat> %op @@ -12980,8 +12815,8 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GFX8-LABEL: v_fdiv_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX8-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 ; GFX8-NEXT: v_rcp_f32_e32 v4, v2 @@ -12993,14 +12828,14 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 ; GFX8-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX8-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fdiv_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX9-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 ; GFX9-NEXT: v_rcp_f32_e32 v4, v2 @@ 
-13012,14 +12847,14 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GFX9-NEXT: v_fma_f32 v2, -v2, v5, v3 ; GFX9-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX9-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 ; GFX10-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0 @@ -13031,14 +12866,14 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 ; GFX11-NEXT: v_rcp_f32_e32 v3, v2 @@ -13056,7 +12891,7 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fdiv bfloat %a, %b ret bfloat %op @@ -13084,34 +12919,25 @@ define bfloat @v_fabs_bf16(bfloat %a) { ; GFX8-LABEL: v_fabs_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fabs_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fabs_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fabs_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.fabs.bf16(bfloat %a) ret bfloat %op @@ -13130,22 +12956,33 @@ define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) { ; ; GFX8-LABEL: s_fabs_bf16: ; GFX8: ; 
%bb.0: -; GFX8-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; GFX8-NEXT: v_mov_b32_e32 v0, 0x7fff +; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fabs_bf16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fabs_bf16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; GFX10-NEXT: v_and_b32_e64 v0, 0x7fff, s0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fabs_bf16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; GFX11-NEXT: v_and_b32_e64 v0, 0x7fff, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %op = call bfloat @llvm.fabs.bf16(bfloat %a) %cast = bitcast bfloat %op to i16 @@ -13170,25 +13007,25 @@ define bfloat @v_fneg_bf16(bfloat %a) { ; GFX8-LABEL: v_fneg_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fneg_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX9-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fneg_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX10-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fneg_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX11-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fneg bfloat %a ret bfloat %op @@ -13212,7 +13049,6 @@ define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) { ; ; GFX8-LABEL: s_fneg_bf16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s0, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -13221,7 +13057,6 @@ define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) { ; ; GFX9-LABEL: s_fneg_bf16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -13230,7 +13065,6 @@ define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) { ; ; GFX10-LABEL: s_fneg_bf16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10-NEXT: v_xor_b32_e64 v0, 0xffff8000, s0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 @@ -13238,11 +13072,9 @@ define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) { ; ; GFX11-LABEL: s_fneg_bf16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_xor_b32_e64 v0, 0xffff8000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %op = fneg bfloat %a @@ -13276,43 +13108,25 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) { ; GFX8-LABEL: v_fneg_fabs_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fneg_fabs_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fneg_fabs_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fneg_fabs_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fabs = call bfloat @llvm.fabs.bf16(bfloat %a) %op = fneg bfloat %fabs @@ -13335,7 +13149,6 @@ define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) { ; ; GFX8-LABEL: s_fneg_fabs_bf16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s0, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -13344,7 +13157,6 @@ define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) { ; ; GFX9-LABEL: s_fneg_fabs_bf16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -13353,7 +13165,6 @@ define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) { ; ; GFX10-LABEL: s_fneg_fabs_bf16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10-NEXT: v_or_b32_e64 v0, 0xffff8000, s0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 @@ -13361,11 +13172,9 @@ define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) { ; ; GFX11-LABEL: s_fneg_fabs_bf16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e64 v0, 0xffff8000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; 
GFX11-NEXT: ; return to shader part epilog %fabs = call bfloat @llvm.fabs.bf16(bfloat %a) @@ -13410,46 +13219,38 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { ; GFX8-LABEL: v_minnum_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minnum_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minnum_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_and_b32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.minnum.bf16(bfloat %a, bfloat %b) ret bfloat %op @@ -13493,68 +13294,53 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-LABEL: v_minnum_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX8-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX8-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX9-LABEL: v_minnum_v2bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_min_f32_e32 v2, v3, v2 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_min_f32_e32 v2, v3, v2 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 +; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minnum_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2 -; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_min_f32_e32 v2, v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 +; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) ret <2 x bfloat> %op @@ -13610,91 +13396,82 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-LABEL: v_minnum_v3bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX8-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: 
v_mul_f32_e32 v3, 1.0, v3 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX8-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 +; GFX8-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_min_f32_e32 v2, v4, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minnum_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX9-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_min_f32_e32 v4, v5, v4 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_min_f32_e32 v2, v4, v2 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_min_f32_e32 v4, v5, v4 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_min_f32_e32 v2, v7, 
v6 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minnum_v3bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_and_b32 v5, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v0, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_min_f32 v1, v1, v3 :: v_dual_min_f32 v0, v0, v2 -; GFX11-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_min_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_min_f32_e32 v4, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 +; GFX11-NEXT: v_min_f32_e32 v2, v7, v6 +; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) ret <3 x bfloat> %op @@ -13762,113 +13539,82 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-LABEL: v_minnum_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 
0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX8-NEXT: v_perm_b32 v1, v1, v4, s4 +; GFX8-NEXT: v_min_f32_e32 v2, v4, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minnum_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX9-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_min_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX9-NEXT: v_min_f32_e32 v3, v5, v3 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_min_f32_e32 v2, v4, v2 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f32_e32 v6, v6, v6 -; GFX10-NEXT: v_max_f32_e32 v7, v7, v7 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX10-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_min_f32_e32 v3, v5, v4 -; GFX10-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX10-NEXT: v_min_f32_e32 v4, v5, v4 ; 
GFX10-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 -; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_min_f32_e32 v2, v7, v6 +; GFX10-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minnum_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v5, v5, v5 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_and_b32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_min_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_min_f32_e32 v4, v5, v4 -; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_max_f32 v5, v6, v6 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v6, v7, v7 :: v_dual_max_f32 v1, v1, v1 -; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_min_f32 v1, v1, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_min_f32_e32 v4, v6, v5 -; GFX11-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 -; GFX11-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-NEXT: v_min_f32_e32 v2, v7, v6 +; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) ret <4 x bfloat> %op @@ -13984,206 +13730,138 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-LABEL: v_minnum_v8bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX8-NEXT: 
v_lshlrev_b32_e32 v9, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX8-NEXT: v_min_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_min_f32_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: v_min_f32_e32 v7, v9, v7 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX8-NEXT: v_min_f32_e32 v6, v9, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v5 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX8-NEXT: v_min_f32_e32 v5, v9, v5 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v5, s4 -; GFX8-NEXT: v_perm_b32 v1, v1, v6, s4 -; GFX8-NEXT: v_perm_b32 v2, v2, v7, s4 -; GFX8-NEXT: v_perm_b32 v3, v3, v8, s4 +; GFX8-NEXT: v_min_f32_e32 v4, v8, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX8-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v4, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minnum_v8bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 -; GFX9-NEXT: v_max_f32_e32 v8, v8, v8 -; GFX9-NEXT: v_max_f32_e32 v9, v9, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_min_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v1 -; GFX9-NEXT: v_max_f32_e32 v9, v9, v9 -; GFX9-NEXT: v_max_f32_e32 v10, v10, v10 -; GFX9-NEXT: v_min_f32_e32 v9, v10, v9 -; GFX9-NEXT: v_and_b32_e32 v10, 
0xffff0000, v6 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v2 -; GFX9-NEXT: v_max_f32_e32 v10, v10, v10 -; GFX9-NEXT: v_max_f32_e32 v11, v11, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_min_f32_e32 v10, v11, v10 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v4 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v0 -; GFX9-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX9-NEXT: v_max_f32_e32 v11, v11, v11 -; GFX9-NEXT: v_max_f32_e32 v12, v12, v12 -; GFX9-NEXT: v_max_f32_e32 v7, v7, v7 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v9 -; GFX9-NEXT: v_min_f32_e32 v11, v12, v11 -; GFX9-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v8, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_min_f32_e32 v4, v8, v4 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX9-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_min_f32_e32 v2, v2, v5 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v8 -; GFX9-NEXT: v_perm_b32 v0, v0, v11, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v10, s4 -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v2, v2, v4, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX9-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX9-NEXT: v_perm_b32 v3, v3, v4, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_v8bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v2 -; GFX10-NEXT: v_max_f32_e32 v8, v8, v8 -; GFX10-NEXT: v_max_f32_e32 v9, v9, v9 -; GFX10-NEXT: v_max_f32_e32 v10, v10, v10 -; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 -; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1 ; GFX10-NEXT: v_min_f32_e32 v8, v9, v8 -; 
GFX10-NEXT: v_max_f32_e32 v9, v11, v11 -; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_min_f32_e32 v9, v9, v10 -; GFX10-NEXT: v_max_f32_e32 v10, v11, v11 -; GFX10-NEXT: v_max_f32_e32 v11, v12, v12 -; GFX10-NEXT: v_max_f32_e32 v12, v13, v13 -; GFX10-NEXT: v_max_f32_e32 v13, v14, v14 -; GFX10-NEXT: v_max_f32_e32 v7, v7, v7 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX10-NEXT: v_max_f32_e32 v6, v6, v6 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-NEXT: v_min_f32_e32 v10, v11, v10 -; GFX10-NEXT: v_min_f32_e32 v11, v13, v12 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v7 -; GFX10-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v9 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 -; GFX10-NEXT: v_perm_b32 v0, v0, v11, 0x3020706 -; GFX10-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x3020706 -; GFX10-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: s_setpc_b64 s[30:31] -; +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_min_f32_e32 v4, v11, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX10-NEXT: v_min_f32_e32 v5, v10, v9 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_min_f32_e32 v6, v12, v11 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302 +; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; ; GFX11-LABEL: v_minnum_v8bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 -; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_max_f32 v9, v9, v9 :: v_dual_max_f32 v8, v8, v8 -; GFX11-NEXT: v_dual_max_f32 v11, v11, v11 :: v_dual_max_f32 v10, v10, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_min_f32 v8, v9, v8 -; GFX11-NEXT: v_min_f32_e32 v9, v11, v10 -; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_max_f32 v11, v11, v11 :: v_dual_lshlrev_b32 v6, 16, v6 -; GFX11-NEXT: v_max_f32_e32 v10, v12, v12 -; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_lshlrev_b32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_min_f32_e32 v10, v11, v10 -; GFX11-NEXT: v_dual_max_f32 v13, v13, v13 :: v_dual_and_b32 v12, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v12, v12, v12 -; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v3, v3, v3 -; GFX11-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_min_f32 v1, v1, v5 :: v_dual_and_b32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX11-NEXT: v_min_f32_e32 v4, v11, v10 +; GFX11-NEXT: v_dual_min_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_min_f32_e32 v5, v10, v9 +; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 +; GFX11-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX11-NEXT: v_min_f32_e32 v6, v12, v11 +; GFX11-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_min_f32_e32 v11, v13, v12 -; GFX11-NEXT: v_dual_min_f32 v0, v0, v4 :: v_dual_min_f32 v3, v3, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_min_f32_e32 v1, v1, v5 -; GFX11-NEXT: v_dual_max_f32 v5, v6, v6 :: v_dual_and_b32 v4, 0xffff0000, v9 -; GFX11-NEXT: v_perm_b32 v0, v0, v11, 0x3020706 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_min_f32 v2, v2, v5 :: v_dual_and_b32 v5, 0xffff0000, v8 -; GFX11-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x3020706 -; GFX11-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX11-NEXT: v_perm_b32 v2, v2, v5, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v3, v6, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) ret <8 x bfloat> %op @@ -14399,356 +14077,252 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-LABEL: v_minnum_v16bf16: 
; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 -; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX8-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v8 ; GFX8-NEXT: v_min_f32_e32 v16, v17, v16 -; GFX8-NEXT: v_min_f32_e32 v7, v7, v15 -; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX8-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX8-NEXT: v_min_f32_e32 v15, v17, v15 -; GFX8-NEXT: v_min_f32_e32 v6, v6, v14 -; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX8-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX8-NEXT: v_min_f32_e32 v14, v17, v14 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v13 -; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX8-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX8-NEXT: v_min_f32_e32 v13, v17, v13 -; GFX8-NEXT: v_min_f32_e32 v4, v4, v12 -; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX8-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX8-NEXT: v_min_f32_e32 v12, v17, v12 -; GFX8-NEXT: v_min_f32_e32 v3, v3, v11 -; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX8-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: v_min_f32_e32 v11, v17, v11 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v10 -; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX8-NEXT: v_min_f32_e32 v10, v17, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; 
GFX8-NEXT: v_min_f32_e32 v1, v1, v9 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX8-NEXT: v_min_f32_e32 v9, v17, v9 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v8 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v9, s4 -; GFX8-NEXT: v_perm_b32 v1, v1, v10, s4 -; GFX8-NEXT: v_perm_b32 v2, v2, v11, s4 -; GFX8-NEXT: v_perm_b32 v3, v3, v12, s4 -; GFX8-NEXT: v_perm_b32 v4, v4, v13, s4 -; GFX8-NEXT: v_perm_b32 v5, v5, v14, s4 -; GFX8-NEXT: v_perm_b32 v6, v6, v15, s4 -; GFX8-NEXT: v_perm_b32 v7, v7, v16, s4 +; GFX8-NEXT: v_min_f32_e32 v8, v16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX8-NEXT: v_min_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX8-NEXT: v_min_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v11 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v4 +; GFX8-NEXT: v_min_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v4, v4, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_alignbit_b32 v4, v4, v8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX8-NEXT: v_min_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v13 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX8-NEXT: v_min_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v14 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_min_f32_e32 v6, v6, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v6, v6, v8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v15 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; GFX8-NEXT: v_min_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v15 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: v_min_f32_e32 v7, v7, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_alignbit_b32 v7, v7, v8, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minnum_v16bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 -; GFX9-NEXT: v_max_f32_e32 v16, v16, v16 -; GFX9-NEXT: v_max_f32_e32 v17, v17, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: 
v_min_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v14 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 -; GFX9-NEXT: v_max_f32_e32 v17, v17, v17 -; GFX9-NEXT: v_max_f32_e32 v18, v18, v18 -; GFX9-NEXT: v_min_f32_e32 v17, v18, v17 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v13 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v5 -; GFX9-NEXT: v_max_f32_e32 v18, v18, v18 -; GFX9-NEXT: v_max_f32_e32 v19, v19, v19 -; GFX9-NEXT: v_min_f32_e32 v18, v19, v18 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v12 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v4 -; GFX9-NEXT: v_max_f32_e32 v19, v19, v19 -; GFX9-NEXT: v_max_f32_e32 v20, v20, v20 -; GFX9-NEXT: v_min_f32_e32 v19, v20, v19 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v11 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v3 -; GFX9-NEXT: v_max_f32_e32 v20, v20, v20 -; GFX9-NEXT: v_max_f32_e32 v21, v21, v21 -; GFX9-NEXT: v_min_f32_e32 v20, v21, v20 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v2 -; GFX9-NEXT: v_max_f32_e32 v21, v21, v21 -; GFX9-NEXT: v_max_f32_e32 v22, v22, v22 -; GFX9-NEXT: v_min_f32_e32 v21, v22, v21 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v9 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v1 -; GFX9-NEXT: v_max_f32_e32 v22, v22, v22 -; GFX9-NEXT: v_max_f32_e32 v23, v23, v23 -; GFX9-NEXT: v_min_f32_e32 v22, v23, v22 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v8 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_max_f32_e32 v23, v23, v23 -; GFX9-NEXT: v_max_f32_e32 v24, v24, v24 -; GFX9-NEXT: v_max_f32_e32 v15, v15, v15 -; GFX9-NEXT: v_max_f32_e32 v7, v7, v7 -; GFX9-NEXT: v_max_f32_e32 v14, v14, v14 -; GFX9-NEXT: v_max_f32_e32 v6, v6, v6 -; GFX9-NEXT: v_max_f32_e32 v13, v13, v13 -; GFX9-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-NEXT: v_max_f32_e32 v12, v12, v12 -; GFX9-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX9-NEXT: v_max_f32_e32 v11, v11, v11 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-NEXT: v_max_f32_e32 v10, v10, v10 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX9-NEXT: v_max_f32_e32 v9, v9, v9 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v8, v8, v8 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX9-NEXT: v_min_f32_e32 v23, v24, v23 -; GFX9-NEXT: v_min_f32_e32 v7, v7, v15 -; GFX9-NEXT: v_min_f32_e32 v6, v6, v14 -; GFX9-NEXT: v_min_f32_e32 v5, v5, v13 -; GFX9-NEXT: v_min_f32_e32 v4, v4, v12 -; GFX9-NEXT: v_min_f32_e32 v3, v3, v11 -; GFX9-NEXT: v_min_f32_e32 v2, v2, v10 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v9 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v8 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_perm_b32 v0, v0, v23, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v22, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v21, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v19, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v18, s4 -; GFX9-NEXT: v_perm_b32 
v6, v6, v17, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_min_f32_e32 v8, v16, v8 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v9 +; GFX9-NEXT: v_perm_b32 v1, v1, v8, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX9-NEXT: v_min_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v9 +; GFX9-NEXT: v_perm_b32 v2, v2, v8, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX9-NEXT: v_min_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v11 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_min_f32_e32 v3, v3, v9 +; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v4 +; GFX9-NEXT: v_min_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_min_f32_e32 v4, v4, v9 +; GFX9-NEXT: v_perm_b32 v4, v4, v8, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX9-NEXT: v_min_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v13 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_min_f32_e32 v5, v5, v9 +; GFX9-NEXT: v_perm_b32 v5, v5, v8, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX9-NEXT: v_min_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v14 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_min_f32_e32 v6, v6, v9 +; GFX9-NEXT: v_perm_b32 v6, v6, v8, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; GFX9-NEXT: v_min_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v15 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_min_f32_e32 v7, v7, v9 +; GFX9-NEXT: v_perm_b32 v7, v7, v8, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_v16bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 -; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v14 -; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v6 -; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v5 -; GFX10-NEXT: v_max_f32_e32 v16, v16, v16 -; GFX10-NEXT: v_max_f32_e32 v17, v17, v17 -; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 -; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v4 -; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v3 -; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_min_f32_e32 v16, v17, v16 -; GFX10-NEXT: v_max_f32_e32 v17, v18, v18 -; GFX10-NEXT: v_max_f32_e32 v18, v19, v19 -; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v13 -; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v8 -; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v0 -; GFX10-NEXT: v_min_f32_e32 v17, v18, v17 
-; GFX10-NEXT: v_max_f32_e32 v18, v19, v19 -; GFX10-NEXT: v_max_f32_e32 v19, v20, v20 -; GFX10-NEXT: v_max_f32_e32 v20, v21, v21 -; GFX10-NEXT: v_max_f32_e32 v21, v22, v22 -; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX10-NEXT: v_min_f32_e32 v18, v19, v18 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX10-NEXT: v_min_f32_e32 v19, v21, v20 -; GFX10-NEXT: v_max_f32_e32 v20, v22, v22 -; GFX10-NEXT: v_max_f32_e32 v21, v23, v23 -; GFX10-NEXT: v_max_f32_e32 v22, v24, v24 -; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_max_f32_e32 v23, v23, v23 -; GFX10-NEXT: v_max_f32_e32 v24, v24, v24 -; GFX10-NEXT: v_max_f32_e32 v25, v25, v25 -; GFX10-NEXT: v_max_f32_e32 v26, v26, v26 -; GFX10-NEXT: v_max_f32_e32 v27, v27, v27 -; GFX10-NEXT: v_max_f32_e32 v15, v15, v15 -; GFX10-NEXT: v_max_f32_e32 v7, v7, v7 -; GFX10-NEXT: v_max_f32_e32 v14, v14, v14 -; GFX10-NEXT: v_max_f32_e32 v6, v6, v6 -; GFX10-NEXT: v_max_f32_e32 v13, v13, v13 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f32_e32 v12, v12, v12 -; GFX10-NEXT: v_max_f32_e32 v11, v11, v11 -; GFX10-NEXT: v_max_f32_e32 v10, v10, v10 -; GFX10-NEXT: v_max_f32_e32 v9, v9, v9 -; GFX10-NEXT: v_max_f32_e32 v8, v8, v8 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX10-NEXT: v_min_f32_e32 v20, v21, v20 -; GFX10-NEXT: v_min_f32_e32 v21, v23, v22 -; GFX10-NEXT: v_min_f32_e32 v22, v25, v24 -; GFX10-NEXT: v_min_f32_e32 v23, v27, v26 -; GFX10-NEXT: v_min_f32_e32 v7, v7, v15 -; GFX10-NEXT: v_min_f32_e32 v6, v6, v14 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v9 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v8 +; GFX10-NEXT: v_min_f32_e32 v8, v18, v17 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v9 +; GFX10-NEXT: v_min_f32_e32 v9, v20, v19 ; GFX10-NEXT: v_min_f32_e32 v2, v2, v10 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v11 +; GFX10-NEXT: v_perm_b32 v0, v0, v16, 0x7060302 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v11 +; GFX10-NEXT: v_perm_b32 v1, v1, v8, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; GFX10-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, 
v5 +; GFX10-NEXT: v_min_f32_e32 v8, v9, v8 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v10 +; GFX10-NEXT: v_min_f32_e32 v9, v16, v11 ; GFX10-NEXT: v_min_f32_e32 v4, v4, v12 -; GFX10-NEXT: v_perm_b32 v0, v0, v23, 0x3020706 -; GFX10-NEXT: v_perm_b32 v1, v1, v22, 0x3020706 -; GFX10-NEXT: v_perm_b32 v2, v2, v21, 0x3020706 -; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x3020706 -; GFX10-NEXT: v_perm_b32 v4, v4, v19, 0x3020706 -; GFX10-NEXT: v_perm_b32 v5, v5, v18, 0x3020706 -; GFX10-NEXT: v_perm_b32 v6, v6, v17, 0x3020706 -; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x3020706 +; GFX10-NEXT: v_min_f32_e32 v10, v18, v17 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v11 +; GFX10-NEXT: v_min_f32_e32 v11, v13, v12 +; GFX10-NEXT: v_min_f32_e32 v6, v6, v14 +; GFX10-NEXT: v_min_f32_e32 v12, v17, v16 +; GFX10-NEXT: v_min_f32_e32 v7, v7, v15 +; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 +; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x7060302 +; GFX10-NEXT: v_perm_b32 v5, v5, v10, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v6, v11, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v7, v12, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minnum_v16bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 -; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v4 -; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_max_f32 v17, v17, v17 :: v_dual_and_b32 v18, 0xffff0000, v14 -; GFX11-NEXT: v_dual_max_f32 v16, v16, v16 :: v_dual_and_b32 v19, 0xffff0000, v6 -; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v3 -; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_min_f32 v16, v17, v16 :: v_dual_and_b32 v25, 0xffff0000, v1 -; GFX11-NEXT: v_dual_max_f32 v17, v18, v18 :: v_dual_max_f32 v18, v19, v19 -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v13 -; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_max_f32 v25, v25, v25 :: v_dual_and_b32 v26, 0xffff0000, v8 -; GFX11-NEXT: v_dual_min_f32 v17, v18, v17 :: v_dual_max_f32 v18, v19, v19 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_max_f32 v19, v20, v20 :: v_dual_max_f32 v20, v21, v21 -; GFX11-NEXT: v_dual_max_f32 v21, v22, v22 :: v_dual_and_b32 v22, 0xffff0000, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-NEXT: v_dual_min_f32 v18, v19, v18 :: v_dual_lshlrev_b32 v7, 16, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_min_f32 v19, v21, v20 :: v_dual_max_f32 v20, v22, v22 -; GFX11-NEXT: v_dual_max_f32 v14, v14, v14 :: v_dual_max_f32 v21, v23, v23 -; GFX11-NEXT: v_dual_max_f32 v22, v24, v24 :: v_dual_lshlrev_b32 v15, 16, v15 -; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v2 -; 
GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_min_f32 v20, v21, v20 :: v_dual_max_f32 v15, v15, v15 -; GFX11-NEXT: v_dual_max_f32 v26, v26, v26 :: v_dual_and_b32 v27, 0xffff0000, v0 -; GFX11-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_lshlrev_b32 v6, 16, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-NEXT: v_dual_max_f32 v23, v23, v23 :: v_dual_max_f32 v24, v24, v24 -; GFX11-NEXT: v_dual_max_f32 v27, v27, v27 :: v_dual_max_f32 v6, v6, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_max_f32 v13, v13, v13 :: v_dual_lshlrev_b32 v10, 16, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-NEXT: v_dual_min_f32 v21, v23, v22 :: v_dual_min_f32 v22, v25, v24 -; GFX11-NEXT: v_dual_min_f32 v23, v27, v26 :: v_dual_lshlrev_b32 v12, 16, v12 -; GFX11-NEXT: v_dual_min_f32 v6, v6, v14 :: v_dual_max_f32 v5, v5, v5 -; GFX11-NEXT: v_dual_max_f32 v10, v10, v10 :: v_dual_max_f32 v11, v11, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-NEXT: v_dual_min_f32 v7, v7, v15 :: v_dual_lshlrev_b32 v4, 16, v4 -; GFX11-NEXT: v_dual_max_f32 v12, v12, v12 :: v_dual_min_f32 v5, v5, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v3, 16, v3 -; GFX11-NEXT: v_dual_max_f32 v9, v9, v9 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_max_f32 v8, v8, v8 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v1, v1, v1 -; GFX11-NEXT: v_dual_min_f32 v0, v0, v8 :: v_dual_min_f32 v3, v3, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_min_f32_e32 v2, v2, v10 -; GFX11-NEXT: v_dual_min_f32 v4, v4, v12 :: v_dual_min_f32 v1, v1, v9 +; GFX11-NEXT: v_dual_min_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v9 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_min_f32 v1, v1, v9 :: v_dual_and_b32 v10, 0xffff0000, v11 +; GFX11-NEXT: v_min_f32_e32 v9, v20, v19 +; GFX11-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_min_f32 v3, v3, v10 :: v_dual_and_b32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_min_f32_e32 v0, v0, v8 +; GFX11-NEXT: v_min_f32_e32 v8, v18, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_perm_b32 v0, v0, v16, 0x7060302 +; GFX11-NEXT: 
v_perm_b32 v1, v1, v8, 0x7060302 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_dual_min_f32 v10, v18, v17 :: v_dual_lshlrev_b32 v17, 16, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_min_f32 v4, v4, v12 :: v_dual_and_b32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_dual_min_f32 v8, v9, v8 :: v_dual_min_f32 v9, v16, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 +; GFX11-NEXT: v_min_f32_e32 v5, v5, v11 +; GFX11-NEXT: v_min_f32_e32 v11, v13, v12 +; GFX11-NEXT: v_min_f32_e32 v12, v17, v16 +; GFX11-NEXT: v_dual_min_f32 v6, v6, v14 :: v_dual_min_f32 v7, v7, v15 +; GFX11-NEXT: v_perm_b32 v4, v4, v9, 0x7060302 +; GFX11-NEXT: v_perm_b32 v5, v5, v10, 0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v0, v0, v23, 0x3020706 -; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x3020706 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v2, v2, v21, 0x3020706 -; GFX11-NEXT: v_perm_b32 v4, v4, v19, 0x3020706 -; GFX11-NEXT: v_perm_b32 v1, v1, v22, 0x3020706 -; GFX11-NEXT: v_perm_b32 v5, v5, v18, 0x3020706 -; GFX11-NEXT: v_perm_b32 v6, v6, v17, 0x3020706 -; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x3020706 +; GFX11-NEXT: v_perm_b32 v6, v6, v11, 0x7060302 +; GFX11-NEXT: v_perm_b32 v7, v7, v12, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) ret <16 x bfloat> %op @@ -15284,703 +14858,484 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-LABEL: v_minnum_v32bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 -; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 -; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX8-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX8-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX8-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v16 ; GFX8-NEXT: v_min_f32_e32 v31, v32, v31 -; GFX8-NEXT: v_min_f32_e32 v14, v14, v30 -; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 -; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v13 -; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX8-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX8-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX8-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX8-NEXT: v_min_f32_e32 v30, v32, v30 -; GFX8-NEXT: v_min_f32_e32 v13, v13, v29 -; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 -; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v12 -; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_mul_f32_e32 v29, 
1.0, v29 -; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX8-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX8-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX8-NEXT: v_min_f32_e32 v29, v32, v29 -; GFX8-NEXT: v_min_f32_e32 v12, v12, v28 -; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 -; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v11 -; GFX8-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX8-NEXT: v_min_f32_e32 v28, v32, v28 -; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX8-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX8-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v15 -; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX8-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX8-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v11, v11, v28, s4 -; GFX8-NEXT: v_perm_b32 v12, v12, v29, s4 -; GFX8-NEXT: v_perm_b32 v13, v13, v30, s4 -; GFX8-NEXT: v_perm_b32 v14, v14, v31, s4 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX8-NEXT: v_min_f32_e32 v27, v27, v33 -; GFX8-NEXT: v_min_f32_e32 v15, v15, v32 -; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v26 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v10 -; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX8-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX8-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX8-NEXT: v_min_f32_e32 v32, v33, v32 -; GFX8-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v9 -; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX8-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX8-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX8-NEXT: v_min_f32_e32 v26, v33, v26 -; GFX8-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX8-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX8-NEXT: v_min_f32_e32 v25, v33, v25 -; GFX8-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 -; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX8-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX8-NEXT: v_min_f32_e32 v24, v33, v24 -; GFX8-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX8-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX8-NEXT: v_min_f32_e32 v23, v33, v23 -; GFX8-NEXT: 
v_min_f32_e32 v6, v6, v22 -; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX8-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX8-NEXT: v_min_f32_e32 v22, v33, v22 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v21 -; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX8-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX8-NEXT: v_min_f32_e32 v21, v33, v21 -; GFX8-NEXT: v_min_f32_e32 v4, v4, v20 -; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX8-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX8-NEXT: v_min_f32_e32 v20, v33, v20 -; GFX8-NEXT: v_min_f32_e32 v3, v3, v19 -; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX8-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: v_min_f32_e32 v19, v33, v19 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX8-NEXT: v_min_f32_e32 v18, v33, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v31, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v17 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX8-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX8-NEXT: v_min_f32_e32 v17, v33, v17 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX8-NEXT: v_perm_b32 v0, v0, v17, s4 -; GFX8-NEXT: v_perm_b32 v1, v1, v18, s4 -; GFX8-NEXT: v_perm_b32 v2, v2, v19, s4 -; GFX8-NEXT: v_perm_b32 v3, v3, v20, s4 -; GFX8-NEXT: v_perm_b32 v4, v4, v21, s4 -; GFX8-NEXT: v_perm_b32 v5, v5, v22, s4 -; GFX8-NEXT: v_perm_b32 v6, v6, v23, s4 -; GFX8-NEXT: v_perm_b32 v7, v7, v24, s4 -; GFX8-NEXT: v_perm_b32 v8, v8, v25, s4 -; GFX8-NEXT: v_perm_b32 v9, v9, v26, s4 -; GFX8-NEXT: v_perm_b32 v10, v10, v32, s4 -; GFX8-NEXT: v_perm_b32 v15, v15, v27, s4 +; GFX8-NEXT: v_min_f32_e32 v16, v31, v16 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 
v16, 16, v18 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX8-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX8-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v19 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v20 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX8-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v4, v4, v17 +; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_alignbit_b32 v4, v4, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v5 +; GFX8-NEXT: v_min_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v21 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v22 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v6 +; GFX8-NEXT: v_min_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v22 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_min_f32_e32 v6, v6, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v6, v6, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v7 +; GFX8-NEXT: v_min_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v23 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: v_min_f32_e32 v7, v7, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; GFX8-NEXT: v_min_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v24 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX8-NEXT: v_min_f32_e32 v8, v8, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX8-NEXT: v_alignbit_b32 v8, v8, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v25 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; GFX8-NEXT: v_min_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v25 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX8-NEXT: v_min_f32_e32 v9, v9, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX8-NEXT: v_alignbit_b32 v9, v9, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v26 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; GFX8-NEXT: v_min_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v26 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX8-NEXT: v_min_f32_e32 v10, v10, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX8-NEXT: v_alignbit_b32 v10, v10, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v27 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v11 +; GFX8-NEXT: v_min_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v27 +; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX8-NEXT: v_min_f32_e32 v11, v11, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; 
GFX8-NEXT: v_alignbit_b32 v11, v11, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v28 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; GFX8-NEXT: v_min_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v28 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX8-NEXT: v_min_f32_e32 v12, v12, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX8-NEXT: v_alignbit_b32 v12, v12, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v29 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v13 +; GFX8-NEXT: v_min_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v29 +; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX8-NEXT: v_min_f32_e32 v13, v13, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX8-NEXT: v_alignbit_b32 v13, v13, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v30 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v14 +; GFX8-NEXT: v_min_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v30 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX8-NEXT: v_min_f32_e32 v14, v14, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX8-NEXT: v_alignbit_b32 v14, v14, v16, 16 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX8-NEXT: v_min_f32_e32 v15, v15, v17 +; GFX8-NEXT: v_min_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX8-NEXT: v_alignbit_b32 v15, v15, v16, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minnum_v32bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 -; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 -; GFX9-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 -; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v29 -; GFX9-NEXT: v_and_b32_e32 v34, 0xffff0000, v13 -; GFX9-NEXT: v_and_b32_e32 v36, 0xffff0000, v28 -; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v12 -; GFX9-NEXT: v_and_b32_e32 v50, 0xffff0000, v25 -; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v9 -; GFX9-NEXT: v_max_f32_e32 v31, v31, v31 -; GFX9-NEXT: v_max_f32_e32 v32, v32, v32 -; GFX9-NEXT: v_max_f32_e32 v33, v33, v33 -; GFX9-NEXT: v_max_f32_e32 v34, v34, v34 -; GFX9-NEXT: v_max_f32_e32 v36, v36, v36 -; GFX9-NEXT: v_max_f32_e32 v37, v37, v37 -; GFX9-NEXT: v_max_f32_e32 v50, v50, v50 -; GFX9-NEXT: v_max_f32_e32 v51, v51, v51 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: v_and_b32_e32 v38, 0xffff0000, v27 -; GFX9-NEXT: v_and_b32_e32 v39, 0xffff0000, v11 -; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v24 -; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v8 -; GFX9-NEXT: v_and_b32_e32 v43, 0xffff0000, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_min_f32_e32 v31, v32, v31 -; GFX9-NEXT: v_min_f32_e32 v32, v34, v33 -; GFX9-NEXT: v_min_f32_e32 v33, v37, v36 -; GFX9-NEXT: v_min_f32_e32 v37, v51, v50 -; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v5 -; GFX9-NEXT: v_max_f32_e32 v38, v38, v38 -; GFX9-NEXT: v_max_f32_e32 v39, v39, v39 -; GFX9-NEXT: v_max_f32_e32 
v52, v52, v52 -; GFX9-NEXT: v_max_f32_e32 v53, v53, v53 -; GFX9-NEXT: v_max_f32_e32 v50, v43, v43 -; GFX9-NEXT: v_max_f32_e32 v51, v51, v51 -; GFX9-NEXT: v_min_f32_e32 v34, v39, v38 -; GFX9-NEXT: v_min_f32_e32 v38, v53, v52 -; GFX9-NEXT: v_min_f32_e32 v50, v51, v50 -; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v20 -; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v4 -; GFX9-NEXT: v_max_f32_e32 v51, v51, v51 -; GFX9-NEXT: v_max_f32_e32 v52, v52, v52 -; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v23 -; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v7 -; GFX9-NEXT: v_min_f32_e32 v51, v52, v51 -; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v19 -; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v3 -; GFX9-NEXT: v_max_f32_e32 v54, v54, v54 -; GFX9-NEXT: v_max_f32_e32 v55, v55, v55 -; GFX9-NEXT: v_max_f32_e32 v52, v52, v52 -; GFX9-NEXT: v_max_f32_e32 v53, v53, v53 -; GFX9-NEXT: v_min_f32_e32 v39, v55, v54 -; GFX9-NEXT: v_min_f32_e32 v52, v53, v52 -; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v18 -; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v2 -; GFX9-NEXT: v_max_f32_e32 v53, v53, v53 -; GFX9-NEXT: v_max_f32_e32 v54, v54, v54 -; GFX9-NEXT: v_and_b32_e32 v48, 0xffff0000, v26 -; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v10 -; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v22 -; GFX9-NEXT: v_and_b32_e32 v41, 0xffff0000, v6 -; GFX9-NEXT: v_min_f32_e32 v53, v54, v53 -; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v17 -; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v1 -; GFX9-NEXT: v_max_f32_e32 v48, v48, v48 -; GFX9-NEXT: v_max_f32_e32 v49, v49, v49 -; GFX9-NEXT: v_max_f32_e32 v40, v40, v40 -; GFX9-NEXT: v_max_f32_e32 v41, v41, v41 -; GFX9-NEXT: v_max_f32_e32 v54, v54, v54 -; GFX9-NEXT: v_max_f32_e32 v55, v55, v55 -; GFX9-NEXT: v_and_b32_e32 v42, 0xffff0000, v15 -; GFX9-NEXT: v_min_f32_e32 v36, v49, v48 -; GFX9-NEXT: v_min_f32_e32 v48, v41, v40 -; GFX9-NEXT: v_min_f32_e32 v54, v55, v54 -; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v16 -; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v0 -; GFX9-NEXT: v_max_f32_e32 v42, v42, v42 -; GFX9-NEXT: v_max_f32_e32 v55, v55, v55 -; GFX9-NEXT: v_max_f32_e32 v40, v40, v40 -; GFX9-NEXT: v_min_f32_e32 v55, v40, v55 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v35 -; GFX9-NEXT: v_max_f32_e32 v49, v49, v49 -; GFX9-NEXT: v_min_f32_e32 v49, v42, v49 -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; 
GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_max_f32_e32 v35, v35, v35 -; GFX9-NEXT: v_max_f32_e32 v15, v15, v15 -; GFX9-NEXT: v_max_f32_e32 v30, v30, v30 -; GFX9-NEXT: v_max_f32_e32 v14, v14, v14 -; GFX9-NEXT: v_max_f32_e32 v29, v29, v29 -; GFX9-NEXT: v_max_f32_e32 v13, v13, v13 -; GFX9-NEXT: v_max_f32_e32 v28, v28, v28 -; GFX9-NEXT: v_max_f32_e32 v12, v12, v12 -; GFX9-NEXT: v_max_f32_e32 v27, v27, v27 -; GFX9-NEXT: v_max_f32_e32 v11, v11, v11 -; GFX9-NEXT: v_max_f32_e32 v26, v26, v26 -; GFX9-NEXT: v_max_f32_e32 v10, v10, v10 -; GFX9-NEXT: v_max_f32_e32 v25, v25, v25 -; GFX9-NEXT: v_max_f32_e32 v9, v9, v9 -; GFX9-NEXT: v_max_f32_e32 v24, v24, v24 -; GFX9-NEXT: v_max_f32_e32 v8, v8, v8 -; GFX9-NEXT: v_max_f32_e32 v23, v23, v23 -; GFX9-NEXT: v_max_f32_e32 v7, v7, v7 -; GFX9-NEXT: v_max_f32_e32 v22, v22, v22 -; GFX9-NEXT: v_max_f32_e32 v6, v6, v6 -; GFX9-NEXT: v_max_f32_e32 v21, v21, v21 -; GFX9-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-NEXT: v_max_f32_e32 v20, v20, v20 -; GFX9-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX9-NEXT: v_max_f32_e32 v19, v19, v19 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-NEXT: v_max_f32_e32 v18, v18, v18 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX9-NEXT: v_max_f32_e32 v17, v17, v17 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v16, v16, v16 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX9-NEXT: v_min_f32_e32 v15, v15, v35 -; GFX9-NEXT: v_min_f32_e32 v14, v14, v30 -; GFX9-NEXT: v_min_f32_e32 v13, v13, v29 -; GFX9-NEXT: v_min_f32_e32 v12, v12, v28 -; GFX9-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX9-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX9-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX9-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX9-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX9-NEXT: v_min_f32_e32 v6, v6, v22 -; GFX9-NEXT: v_min_f32_e32 v5, v5, v21 -; GFX9-NEXT: v_min_f32_e32 v4, v4, v20 -; GFX9-NEXT: v_min_f32_e32 v3, v3, v19 -; GFX9-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v17 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_perm_b32 v0, v0, v55, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v54, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v53, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v52, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v51, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v50, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v48, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v39, s4 -; GFX9-NEXT: v_perm_b32 v8, v8, v38, s4 -; GFX9-NEXT: v_perm_b32 v9, v9, v37, s4 -; GFX9-NEXT: v_perm_b32 v10, v10, v36, s4 -; GFX9-NEXT: v_perm_b32 v11, v11, v34, s4 -; GFX9-NEXT: v_perm_b32 v12, v12, v33, s4 -; GFX9-NEXT: v_perm_b32 v13, v13, v32, s4 -; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4 -; GFX9-NEXT: v_perm_b32 v15, v15, v49, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v31, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_min_f32_e32 v16, v31, v16 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v17 +; 
GFX9-NEXT: v_perm_b32 v1, v1, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v17 +; GFX9-NEXT: v_perm_b32 v2, v2, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v19 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_min_f32_e32 v3, v3, v17 +; GFX9-NEXT: v_perm_b32 v3, v3, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_min_f32_e32 v4, v4, v17 +; GFX9-NEXT: v_perm_b32 v4, v4, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v21 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_min_f32_e32 v5, v5, v17 +; GFX9-NEXT: v_perm_b32 v5, v5, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v22 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_min_f32_e32 v6, v6, v17 +; GFX9-NEXT: v_perm_b32 v6, v6, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v23 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_min_f32_e32 v7, v7, v17 +; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v8 +; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v24 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_min_f32_e32 v8, v8, v17 +; GFX9-NEXT: v_perm_b32 v8, v8, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v9 +; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v25 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_min_f32_e32 v9, v9, v17 +; GFX9-NEXT: v_perm_b32 v9, v9, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v26 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v10 +; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v26 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_min_f32_e32 v10, v10, v17 +; GFX9-NEXT: v_perm_b32 v10, v10, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v11 +; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v27 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_min_f32_e32 v11, v11, v17 +; GFX9-NEXT: v_perm_b32 v11, v11, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v12 +; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v28 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_min_f32_e32 v12, v12, v17 +; GFX9-NEXT: v_perm_b32 v12, v12, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v29 +; 
GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v13 +; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v29 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX9-NEXT: v_min_f32_e32 v13, v13, v17 +; GFX9-NEXT: v_perm_b32 v13, v13, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v14 +; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v30 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: v_min_f32_e32 v14, v14, v17 +; GFX9-NEXT: v_perm_b32 v14, v14, v16, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v15 +; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_min_f32_e32 v15, v15, v17 +; GFX9-NEXT: v_perm_b32 v15, v15, v16, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_v32bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX10-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 -; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 -; GFX10-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 -; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v7 -; GFX10-NEXT: v_and_b32_e32 v65, 0xffff0000, v22 -; GFX10-NEXT: v_and_b32_e32 v66, 0xffff0000, v6 -; GFX10-NEXT: v_and_b32_e32 v67, 0xffff0000, v21 -; GFX10-NEXT: v_and_b32_e32 v68, 0xffff0000, v5 -; GFX10-NEXT: v_max_f32_e32 v53, v53, v53 -; GFX10-NEXT: v_max_f32_e32 v54, v54, v54 -; GFX10-NEXT: v_max_f32_e32 v55, v55, v55 -; GFX10-NEXT: v_max_f32_e32 v64, v64, v64 -; GFX10-NEXT: v_max_f32_e32 v65, v65, v65 -; GFX10-NEXT: v_max_f32_e32 v66, v66, v66 -; GFX10-NEXT: v_max_f32_e32 v67, v67, v67 -; GFX10-NEXT: v_max_f32_e32 v68, v68, v68 -; GFX10-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 -; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 -; GFX10-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 -; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v13 -; GFX10-NEXT: v_and_b32_e32 v37, 0xffff0000, v28 -; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 -; GFX10-NEXT: v_and_b32_e32 v39, 0xffff0000, v27 -; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 -; GFX10-NEXT: v_and_b32_e32 v49, 0xffff0000, v26 -; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 -; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 -; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 -; GFX10-NEXT: v_min_f32_e32 v53, v54, v53 -; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v17 -; GFX10-NEXT: v_min_f32_e32 v55, v64, v55 -; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v1 -; GFX10-NEXT: v_min_f32_e32 v65, v66, v65 -; GFX10-NEXT: v_and_b32_e32 v66, 0xffff0000, v16 -; GFX10-NEXT: v_min_f32_e32 v67, v68, v67 -; GFX10-NEXT: v_and_b32_e32 v68, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_max_f32_e32 v32, v32, v32 -; GFX10-NEXT: v_max_f32_e32 v34, v34, v34 -; GFX10-NEXT: v_max_f32_e32 v35, v35, v35 -; GFX10-NEXT: v_max_f32_e32 v36, v36, v36 -; GFX10-NEXT: v_max_f32_e32 v37, v37, v37 -; GFX10-NEXT: v_max_f32_e32 v38, v38, v38 -; GFX10-NEXT: v_max_f32_e32 v39, v39, v39 -; GFX10-NEXT: v_max_f32_e32 v48, v48, v48 -; GFX10-NEXT: v_max_f32_e32 v49, v49, v49 -; GFX10-NEXT: v_max_f32_e32 v50, v50, v50 -; GFX10-NEXT: v_max_f32_e32 v51, v51, v51 -; GFX10-NEXT: v_max_f32_e32 v52, v52, v52 -; GFX10-NEXT: 
v_max_f32_e32 v17, v17, v17 -; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-NEXT: v_max_f32_e32 v16, v16, v16 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX10-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; GFX10-NEXT: v_min_f32_e32 v32, v34, v32 -; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v20 -; GFX10-NEXT: v_min_f32_e32 v35, v36, v35 -; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v37, v38, v37 -; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v19 -; GFX10-NEXT: v_min_f32_e32 v39, v48, v39 -; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v49, v50, v49 -; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v18 -; GFX10-NEXT: v_min_f32_e32 v51, v52, v51 -; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v21 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v22 +; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v18 +; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v19 +; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v20 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v21 +; GFX10-NEXT: 
v_min_f32_e32 v21, v53, v52 +; GFX10-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX10-NEXT: v_min_f32_e32 v22, v55, v54 +; GFX10-NEXT: v_min_f32_e32 v7, v7, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v24 +; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v8 +; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v9 +; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v26 +; GFX10-NEXT: v_min_f32_e32 v32, v33, v32 +; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v10 +; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v16 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v27 +; GFX10-NEXT: v_min_f32_e32 v34, v35, v34 +; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v11 +; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v17 -; GFX10-NEXT: v_max_f32_e32 v33, v33, v33 -; GFX10-NEXT: v_max_f32_e32 v34, v34, v34 -; GFX10-NEXT: v_max_f32_e32 v36, v36, v36 -; GFX10-NEXT: v_max_f32_e32 v38, v38, v38 -; GFX10-NEXT: v_max_f32_e32 v48, v48, v48 -; GFX10-NEXT: v_max_f32_e32 v50, v50, v50 -; GFX10-NEXT: v_max_f32_e32 v52, v52, v52 -; GFX10-NEXT: v_max_f32_e32 v54, v54, v54 -; GFX10-NEXT: v_max_f32_e32 v64, v64, v64 -; GFX10-NEXT: v_max_f32_e32 v66, v66, v66 -; GFX10-NEXT: v_max_f32_e32 v68, v68, v68 -; GFX10-NEXT: v_max_f32_e32 v15, v15, v15 -; GFX10-NEXT: v_max_f32_e32 v30, v30, v30 -; GFX10-NEXT: v_max_f32_e32 v14, v14, v14 -; GFX10-NEXT: v_max_f32_e32 v29, v29, v29 -; GFX10-NEXT: v_max_f32_e32 v13, v13, v13 -; GFX10-NEXT: v_max_f32_e32 v28, v28, v28 -; GFX10-NEXT: v_max_f32_e32 v12, v12, v12 -; GFX10-NEXT: v_max_f32_e32 v27, v27, v27 -; GFX10-NEXT: v_max_f32_e32 v11, v11, v11 -; GFX10-NEXT: v_max_f32_e32 v26, v26, v26 -; GFX10-NEXT: v_max_f32_e32 v10, v10, v10 -; GFX10-NEXT: v_max_f32_e32 v25, v25, v25 -; GFX10-NEXT: v_max_f32_e32 v9, v9, v9 -; GFX10-NEXT: v_max_f32_e32 v24, v24, v24 -; GFX10-NEXT: v_max_f32_e32 v8, v8, v8 -; GFX10-NEXT: v_max_f32_e32 v23, v23, v23 -; GFX10-NEXT: v_max_f32_e32 v7, v7, v7 -; GFX10-NEXT: v_max_f32_e32 v22, v22, v22 -; GFX10-NEXT: v_max_f32_e32 v6, v6, v6 -; GFX10-NEXT: v_max_f32_e32 v21, v21, v21 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f32_e32 v20, v20, v20 -; GFX10-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX10-NEXT: v_max_f32_e32 v19, v19, v19 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f32_e32 v18, v18, v18 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-NEXT: v_min_f32_e32 v34, v36, v34 -; GFX10-NEXT: v_min_f32_e32 v36, v48, v38 -; GFX10-NEXT: v_min_f32_e32 v38, v52, v50 -; GFX10-NEXT: v_min_f32_e32 v48, v64, v54 -; GFX10-NEXT: v_min_f32_e32 v50, v68, v66 -; GFX10-NEXT: v_min_f32_e32 v14, v14, v30 -; GFX10-NEXT: v_min_f32_e32 v13, v13, v29 -; GFX10-NEXT: v_min_f32_e32 v12, v12, v28 -; GFX10-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX10-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX10-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX10-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX10-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX10-NEXT: v_min_f32_e32 v6, v6, v22 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v21 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; GFX10-NEXT: v_min_f32_e32 v36, v37, v36 +; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v12 +; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX10-NEXT: v_and_b32_e32 v12, 
0xffff0000, v12 ; GFX10-NEXT: v_min_f32_e32 v2, v2, v18 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; GFX10-NEXT: v_min_f32_e32 v38, v39, v38 +; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v13 +; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX10-NEXT: v_min_f32_e32 v3, v3, v19 +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v30 +; GFX10-NEXT: v_min_f32_e32 v48, v49, v48 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v14 +; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX10-NEXT: v_min_f32_e32 v4, v4, v20 -; GFX10-NEXT: v_perm_b32 v0, v0, v50, 0x3020706 -; GFX10-NEXT: v_perm_b32 v1, v1, v48, 0x3020706 -; GFX10-NEXT: v_perm_b32 v2, v2, v38, 0x3020706 -; GFX10-NEXT: v_perm_b32 v3, v3, v36, 0x3020706 -; GFX10-NEXT: v_perm_b32 v4, v4, v34, 0x3020706 -; GFX10-NEXT: v_perm_b32 v5, v5, v67, 0x3020706 -; GFX10-NEXT: v_perm_b32 v6, v6, v65, 0x3020706 -; GFX10-NEXT: v_perm_b32 v7, v7, v55, 0x3020706 -; GFX10-NEXT: v_perm_b32 v8, v8, v53, 0x3020706 -; GFX10-NEXT: v_perm_b32 v9, v9, v51, 0x3020706 -; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x3020706 -; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x3020706 -; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x3020706 -; GFX10-NEXT: v_perm_b32 v13, v13, v35, 0x3020706 -; GFX10-NEXT: v_perm_b32 v14, v14, v32, 0x3020706 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v15 +; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX10-NEXT: v_perm_b32 v6, v6, v21, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v7, v22, 0x7060302 +; GFX10-NEXT: v_min_f32_e32 v50, v51, v50 +; GFX10-NEXT: v_min_f32_e32 v23, v65, v64 +; GFX10-NEXT: v_min_f32_e32 v8, v8, v24 +; GFX10-NEXT: v_min_f32_e32 v24, v67, v66 +; GFX10-NEXT: v_min_f32_e32 v9, v9, v25 +; GFX10-NEXT: v_min_f32_e32 v25, v33, v68 +; GFX10-NEXT: v_min_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_min_f32_e32 v16, v35, v16 +; GFX10-NEXT: v_min_f32_e32 v11, v11, v27 +; GFX10-NEXT: v_min_f32_e32 v17, v37, v17 +; GFX10-NEXT: v_min_f32_e32 v12, v12, v28 +; GFX10-NEXT: v_min_f32_e32 v18, v39, v18 +; GFX10-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX10-NEXT: v_min_f32_e32 v19, v49, v19 +; GFX10-NEXT: v_min_f32_e32 v14, v14, v30 +; GFX10-NEXT: v_perm_b32 v0, v0, v32, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v34, 0x7060302 +; GFX10-NEXT: v_perm_b32 v2, v2, v36, 0x7060302 +; GFX10-NEXT: v_perm_b32 v3, v3, v38, 0x7060302 +; GFX10-NEXT: v_perm_b32 v4, v4, v48, 0x7060302 +; GFX10-NEXT: v_perm_b32 v5, v5, v50, 0x7060302 +; GFX10-NEXT: v_perm_b32 v8, v8, v23, 0x7060302 +; GFX10-NEXT: v_perm_b32 v9, v9, v24, 0x7060302 +; GFX10-NEXT: v_perm_b32 v10, v10, v25, 0x7060302 +; GFX10-NEXT: v_perm_b32 v11, v11, v16, 0x7060302 +; GFX10-NEXT: v_perm_b32 v12, v12, v17, 0x7060302 +; GFX10-NEXT: v_perm_b32 v13, v13, v18, 0x7060302 +; GFX10-NEXT: v_perm_b32 v14, v14, v19, 0x7060302 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v31 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v31 -; GFX10-NEXT: v_max_f32_e32 v16, v16, v16 -; GFX10-NEXT: v_max_f32_e32 v17, v17, v17 -; GFX10-NEXT: v_min_f32_e32 v16, v33, v16 -; GFX10-NEXT: v_min_f32_e32 v15, v15, v17 -; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x3020706 +; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v31 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v31 +; GFX10-NEXT: v_min_f32_e32 v20, v20, v21 +; GFX10-NEXT: v_min_f32_e32 v15, v15, v22 +; GFX10-NEXT: v_perm_b32 v15, v15, v20, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minnum_v32bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 -; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v28 -; GFX11-NEXT: v_and_b32_e32 v39, 0xffff0000, v27 -; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v13 -; GFX11-NEXT: v_dual_max_f32 v33, v33, v33 :: v_dual_and_b32 v32, 0xffff0000, v15 -; GFX11-NEXT: v_and_b32_e32 v49, 0xffff0000, v26 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 -; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 -; GFX11-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 -; GFX11-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 -; GFX11-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 -; GFX11-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 -; GFX11-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 -; GFX11-NEXT: v_and_b32_e32 v65, 0xffff0000, v22 -; GFX11-NEXT: v_and_b32_e32 v67, 0xffff0000, v21 -; GFX11-NEXT: v_and_b32_e32 v66, 0xffff0000, v6 -; GFX11-NEXT: v_and_b32_e32 v71, 0xffff0000, v19 -; GFX11-NEXT: v_and_b32_e32 v68, 0xffff0000, v5 -; GFX11-NEXT: v_and_b32_e32 v83, 0xffff0000, v17 -; GFX11-NEXT: v_and_b32_e32 v86, 0xffff0000, v0 -; GFX11-NEXT: v_and_b32_e32 v85, 0xffff0000, v16 -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-NEXT: v_and_b32_e32 v84, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_dual_max_f32 v35, v35, v35 :: v_dual_max_f32 v34, v34, v34 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-NEXT: v_dual_max_f32 v38, v38, v38 :: v_dual_max_f32 v37, v37, v37 -; GFX11-NEXT: v_dual_max_f32 v39, v39, v39 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-NEXT: v_max_f32_e32 v36, v36, v36 -; GFX11-NEXT: v_dual_max_f32 v65, v65, v65 :: v_dual_and_b32 v64, 0xffff0000, v7 -; GFX11-NEXT: v_and_b32_e32 v70, 0xffff0000, v4 -; GFX11-NEXT: v_and_b32_e32 v69, 0xffff0000, v20 -; GFX11-NEXT: v_and_b32_e32 v81, 0xffff0000, v18 -; GFX11-NEXT: v_dual_max_f32 v83, v83, v83 :: v_dual_and_b32 v82, 0xffff0000, v2 -; GFX11-NEXT: v_dual_max_f32 v17, v17, v17 :: v_dual_lshlrev_b32 v18, 16, v18 -; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-NEXT: v_dual_max_f32 v49, v49, v49 :: v_dual_max_f32 v48, v48, v48 -; GFX11-NEXT: v_dual_max_f32 v51, v51, v51 :: v_dual_max_f32 v50, v50, v50 -; GFX11-NEXT: v_dual_max_f32 v54, v54, v54 :: v_dual_max_f32 v53, v53, v53 -; GFX11-NEXT: v_dual_max_f32 v67, v67, v67 :: v_dual_max_f32 v66, v66, v66 -; GFX11-NEXT: v_dual_max_f32 v25, v25, v25 :: v_dual_max_f32 v26, v26, v26 -; GFX11-NEXT: v_dual_max_f32 v9, v9, v9 :: v_dual_max_f32 v10, v10, v10 -; GFX11-NEXT: v_dual_max_f32 v21, v21, v21 :: v_dual_max_f32 v22, v22, v22 -; GFX11-NEXT: v_dual_max_f32 v5, v5, v5 :: v_dual_max_f32 v6, v6, v6 -; GFX11-NEXT: v_dual_min_f32 v33, v34, v33 :: v_dual_max_f32 v16, v16, v16 -; GFX11-NEXT: v_dual_min_f32 v34, v36, v35 :: v_dual_min_f32 v35, v38, v37 -; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX11-NEXT: v_dual_max_f32 v81, v81, v81 :: v_dual_and_b32 v80, 0xffff0000, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-NEXT: 
v_lshlrev_b32_e32 v30, 16, v30 -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_dual_max_f32 v70, v70, v70 :: v_dual_max_f32 v69, v69, v69 -; GFX11-NEXT: v_dual_min_f32 v36, v48, v39 :: v_dual_min_f32 v37, v50, v49 -; GFX11-NEXT: v_min_f32_e32 v39, v54, v53 -; GFX11-NEXT: v_dual_min_f32 v10, v10, v26 :: v_dual_min_f32 v1, v1, v17 -; GFX11-NEXT: v_min_f32_e32 v6, v6, v22 -; GFX11-NEXT: v_dual_max_f32 v32, v32, v32 :: v_dual_max_f32 v55, v55, v55 -; GFX11-NEXT: v_max_f32_e32 v52, v52, v52 -; GFX11-NEXT: v_dual_max_f32 v64, v64, v64 :: v_dual_max_f32 v71, v71, v71 -; GFX11-NEXT: v_max_f32_e32 v68, v68, v68 -; GFX11-NEXT: v_max_f32_e32 v80, v80, v80 -; GFX11-NEXT: v_max_f32_e32 v82, v82, v82 -; GFX11-NEXT: v_dual_max_f32 v86, v86, v86 :: v_dual_max_f32 v85, v85, v85 -; GFX11-NEXT: v_dual_max_f32 v15, v15, v15 :: v_dual_max_f32 v84, v84, v84 -; GFX11-NEXT: v_dual_max_f32 v29, v29, v29 :: v_dual_max_f32 v30, v30, v30 -; GFX11-NEXT: v_dual_max_f32 v13, v13, v13 :: v_dual_max_f32 v14, v14, v14 -; GFX11-NEXT: v_dual_max_f32 v27, v27, v27 :: v_dual_max_f32 v28, v28, v28 -; GFX11-NEXT: v_dual_max_f32 v11, v11, v11 :: v_dual_max_f32 v12, v12, v12 -; GFX11-NEXT: v_dual_max_f32 v23, v23, v23 :: v_dual_max_f32 v24, v24, v24 -; GFX11-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_max_f32 v8, v8, v8 -; GFX11-NEXT: v_dual_max_f32 v19, v19, v19 :: v_dual_max_f32 v20, v20, v20 -; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v4, v4, v4 -; GFX11-NEXT: v_max_f32_e32 v18, v18, v18 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX11-NEXT: v_dual_min_f32 v38, v52, v51 :: v_dual_min_f32 v53, v82, v81 -; GFX11-NEXT: v_dual_min_f32 v48, v64, v55 :: v_dual_min_f32 v55, v86, v85 -; GFX11-NEXT: v_dual_min_f32 v49, v66, v65 :: v_dual_min_f32 v50, v68, v67 -; GFX11-NEXT: v_min_f32_e32 v13, v13, v29 -; GFX11-NEXT: v_dual_min_f32 v51, v70, v69 :: v_dual_min_f32 v52, v80, v71 -; GFX11-NEXT: v_dual_min_f32 v9, v9, v25 :: v_dual_min_f32 v54, v84, v83 -; GFX11-NEXT: v_dual_min_f32 v5, v5, v21 :: v_dual_min_f32 v14, v14, v30 -; GFX11-NEXT: v_dual_min_f32 v11, v11, v27 :: v_dual_min_f32 v12, v12, v28 -; GFX11-NEXT: v_dual_min_f32 v7, v7, v23 :: v_dual_min_f32 v8, v8, v24 -; GFX11-NEXT: v_dual_min_f32 v3, v3, v19 :: v_dual_min_f32 v4, v4, v20 -; GFX11-NEXT: v_perm_b32 v1, v1, v54, 0x3020706 -; GFX11-NEXT: v_perm_b32 v5, v5, v50, 0x3020706 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v7, v7, v48, 0x3020706 -; GFX11-NEXT: v_perm_b32 v3, v3, v52, 0x3020706 -; GFX11-NEXT: v_perm_b32 v4, v4, v51, 0x3020706 -; GFX11-NEXT: v_perm_b32 v8, v8, v39, 0x3020706 -; GFX11-NEXT: v_perm_b32 v9, v9, v38, 0x3020706 -; GFX11-NEXT: v_perm_b32 v10, v10, v37, 0x3020706 -; GFX11-NEXT: v_perm_b32 v11, v11, v36, 0x3020706 -; GFX11-NEXT: v_perm_b32 v12, v12, v35, 0x3020706 -; GFX11-NEXT: v_perm_b32 v13, v13, v34, 0x3020706 -; GFX11-NEXT: v_perm_b32 v14, v14, v33, 0x3020706 -; GFX11-NEXT: 
s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_min_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v17, 16, v31 -; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v31 -; GFX11-NEXT: v_perm_b32 v6, v6, v49, 0x3020706 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_max_f32 v17, v17, v17 :: v_dual_min_f32 v2, v2, v18 -; GFX11-NEXT: v_max_f32_e32 v16, v16, v16 -; GFX11-NEXT: v_perm_b32 v0, v0, v55, 0x3020706 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v27 +; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v5 +; GFX11-NEXT: v_dual_min_f32 v10, v10, v26 :: v_dual_and_b32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v9 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v20 +; GFX11-NEXT: v_dual_min_f32 v11, v11, v27 :: v_dual_and_b32 v20, 0xffff0000, v20 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_dual_min_f32 v26, v71, v70 :: v_dual_lshlrev_b32 v49, 16, v4 +; GFX11-NEXT: v_dual_min_f32 v13, v13, v29 :: v_dual_and_b32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v19 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_min_f32_e32 v4, v4, v20 +; GFX11-NEXT: v_dual_min_f32 v8, v8, v24 :: v_dual_min_f32 v9, v9, v25 +; GFX11-NEXT: v_min_f32_e32 v25, v69, v68 +; GFX11-NEXT: v_dual_min_f32 v20, v51, v50 :: v_dual_lshlrev_b32 v39, 16, v3 +; GFX11-NEXT: v_min_f32_e32 v27, v81, v80 +; GFX11-NEXT: v_min_f32_e32 v12, v12, v28 +; GFX11-NEXT: v_dual_min_f32 v28, v83, v82 :: v_dual_min_f32 v29, v85, v84 +; GFX11-NEXT: v_dual_min_f32 v6, v6, v22 :: v_dual_and_b32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_min_f32_e32 v22, v55, v54 +; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v18 +; 
GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-NEXT: v_min_f32_e32 v14, v14, v30 +; GFX11-NEXT: v_dual_min_f32 v7, v7, v23 :: v_dual_and_b32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_min_f32_e32 v23, v65, v64 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11-NEXT: v_dual_min_f32 v24, v67, v66 :: v_dual_and_b32 v21, 0xffff0000, v21 +; GFX11-NEXT: v_min_f32_e32 v2, v2, v18 +; GFX11-NEXT: v_dual_min_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v32, 16, v16 +; GFX11-NEXT: v_min_f32_e32 v18, v39, v38 +; GFX11-NEXT: v_dual_min_f32 v3, v3, v19 :: v_dual_and_b32 v16, 0xffff0000, v16 +; GFX11-NEXT: v_min_f32_e32 v19, v49, v48 +; GFX11-NEXT: v_min_f32_e32 v17, v37, v36 +; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX11-NEXT: v_dual_min_f32 v5, v5, v21 :: v_dual_and_b32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_min_f32_e32 v21, v53, v52 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v2, v2, v17, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v3, v18, 0x7060302 +; GFX11-NEXT: v_min_f32_e32 v0, v0, v16 +; GFX11-NEXT: v_min_f32_e32 v16, v35, v34 +; GFX11-NEXT: v_min_f32_e32 v32, v33, v32 +; GFX11-NEXT: v_perm_b32 v4, v4, v19, 0x7060302 +; GFX11-NEXT: v_perm_b32 v5, v5, v20, 0x7060302 +; GFX11-NEXT: v_perm_b32 v6, v6, v21, 0x7060302 +; GFX11-NEXT: v_perm_b32 v1, v1, v16, 0x7060302 +; GFX11-NEXT: v_perm_b32 v0, v0, v32, 0x7060302 +; GFX11-NEXT: v_perm_b32 v7, v7, v22, 0x7060302 +; GFX11-NEXT: v_perm_b32 v8, v8, v23, 0x7060302 +; GFX11-NEXT: v_perm_b32 v9, v9, v24, 0x7060302 +; GFX11-NEXT: v_perm_b32 v10, v10, v25, 0x7060302 +; GFX11-NEXT: v_perm_b32 v11, v11, v26, 0x7060302 +; GFX11-NEXT: v_perm_b32 v12, v12, v27, 0x7060302 +; GFX11-NEXT: v_perm_b32 v13, v13, v28, 0x7060302 +; GFX11-NEXT: v_perm_b32 v14, v14, v29, 0x7060302 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_min_f32 v16, v86, v16 :: v_dual_and_b32 v17, 0xffff0000, v31 ; GFX11-NEXT: v_min_f32_e32 v15, v15, v17 -; GFX11-NEXT: v_perm_b32 v2, v2, v53, 0x3020706 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v16, v32, v16 -; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x3020706 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) ret <32 x bfloat> %op @@ -16021,46 +15376,38 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { ; GFX8-LABEL: v_maxnum_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maxnum_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: 
v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maxnum_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maxnum_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_and_b32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b) ret bfloat %op @@ -16104,68 +15451,53 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-LABEL: v_maxnum_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX8-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX8-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maxnum_v2bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_max_f32_e32 v2, v3, v2 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maxnum_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_max_f32_e32 v2, v3, v2 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 +; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maxnum_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2 -; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_max_f32_e32 v2, v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 +; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) ret <2 x bfloat> %op @@ -16221,91 +15553,82 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-LABEL: v_maxnum_v3bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX8-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX8-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 +; GFX8-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_max_f32_e32 v2, v4, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX8-NEXT: v_alignbit_b32 v1, 
v1, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maxnum_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX9-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_max_f32_e32 v4, v5, v4 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_max_f32_e32 v2, v4, v2 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maxnum_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_max_f32_e32 v4, v5, v4 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_max_f32_e32 v2, v7, v6 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maxnum_v3bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_and_b32 v5, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-NEXT: v_dual_max_f32 v3, 
v3, v3 :: v_dual_max_f32 v0, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_max_f32 v1, v1, v3 :: v_dual_max_f32 v0, v0, v2 -; GFX11-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_max_f32_e32 v4, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 +; GFX11-NEXT: v_max_f32_e32 v2, v7, v6 +; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) ret <3 x bfloat> %op @@ -16373,113 +15696,82 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-LABEL: v_maxnum_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX8-NEXT: v_perm_b32 v1, v1, v4, s4 +; GFX8-NEXT: v_max_f32_e32 v2, v4, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maxnum_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_max_f32_e32 v4, v4, v4 -; 
GFX9-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_max_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX9-NEXT: v_max_f32_e32 v3, v5, v3 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_max_f32_e32 v2, v4, v2 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maxnum_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f32_e32 v6, v6, v6 -; GFX10-NEXT: v_max_f32_e32 v7, v7, v7 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX10-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX10-NEXT: v_max_f32_e32 v4, v5, v4 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 -; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_max_f32_e32 v2, v7, v6 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maxnum_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v5, v5, v5 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_and_b32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: 
v_lshlrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_max_f32_e32 v4, v5, v4 -; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_max_f32 v5, v6, v6 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v6, v7, v7 :: v_dual_max_f32 v1, v1, v1 -; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_max_f32_e32 v4, v6, v5 -; GFX11-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 -; GFX11-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-NEXT: v_max_f32_e32 v2, v7, v6 +; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) ret <4 x bfloat> %op @@ -16595,206 +15887,138 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-LABEL: v_maxnum_v8bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX8-NEXT: v_max_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_max_f32_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: v_max_f32_e32 v7, v9, v7 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX8-NEXT: v_mul_f32_e32 
v5, 1.0, v5 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX8-NEXT: v_max_f32_e32 v6, v9, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v5 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX8-NEXT: v_max_f32_e32 v5, v9, v5 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v5, s4 -; GFX8-NEXT: v_perm_b32 v1, v1, v6, s4 -; GFX8-NEXT: v_perm_b32 v2, v2, v7, s4 -; GFX8-NEXT: v_perm_b32 v3, v3, v8, s4 +; GFX8-NEXT: v_max_f32_e32 v4, v8, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX8-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v4, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maxnum_v8bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 -; GFX9-NEXT: v_max_f32_e32 v8, v8, v8 -; GFX9-NEXT: v_max_f32_e32 v9, v9, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_max_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v1 -; GFX9-NEXT: v_max_f32_e32 v9, v9, v9 -; GFX9-NEXT: v_max_f32_e32 v10, v10, v10 -; GFX9-NEXT: v_max_f32_e32 v9, v10, v9 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v2 -; GFX9-NEXT: v_max_f32_e32 v10, v10, v10 -; GFX9-NEXT: v_max_f32_e32 v11, v11, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_max_f32_e32 v10, v11, v10 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v4 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v0 -; GFX9-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX9-NEXT: v_max_f32_e32 v11, v11, v11 -; GFX9-NEXT: v_max_f32_e32 v12, v12, v12 -; GFX9-NEXT: v_max_f32_e32 v7, v7, v7 -; 
GFX9-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v9 -; GFX9-NEXT: v_max_f32_e32 v11, v12, v11 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v8, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_max_f32_e32 v4, v8, v4 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX9-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_max_f32_e32 v2, v2, v5 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v8 -; GFX9-NEXT: v_perm_b32 v0, v0, v11, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v10, s4 -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v2, v2, v4, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX9-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX9-NEXT: v_perm_b32 v3, v3, v4, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maxnum_v8bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v2 -; GFX10-NEXT: v_max_f32_e32 v8, v8, v8 -; GFX10-NEXT: v_max_f32_e32 v9, v9, v9 -; GFX10-NEXT: v_max_f32_e32 v10, v10, v10 -; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 -; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1 ; GFX10-NEXT: v_max_f32_e32 v8, v9, v8 -; GFX10-NEXT: v_max_f32_e32 v9, v11, v11 -; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_max_f32_e32 v9, v9, v10 -; GFX10-NEXT: v_max_f32_e32 v10, v11, v11 -; GFX10-NEXT: v_max_f32_e32 v11, v12, v12 -; GFX10-NEXT: v_max_f32_e32 v12, v13, v13 -; GFX10-NEXT: v_max_f32_e32 v13, v14, v14 -; GFX10-NEXT: v_max_f32_e32 v7, v7, v7 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX10-NEXT: v_max_f32_e32 v6, v6, v6 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f32_e32 
v10, v11, v10 -; GFX10-NEXT: v_max_f32_e32 v11, v13, v12 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v7 -; GFX10-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v9 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_max_f32_e32 v4, v11, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX10-NEXT: v_max_f32_e32 v5, v10, v9 ; GFX10-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 -; GFX10-NEXT: v_perm_b32 v0, v0, v11, 0x3020706 -; GFX10-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x3020706 -; GFX10-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_max_f32_e32 v6, v12, v11 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302 +; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maxnum_v8bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 -; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_max_f32 v9, v9, v9 :: v_dual_max_f32 v8, v8, v8 -; GFX11-NEXT: v_dual_max_f32 v11, v11, v11 :: v_dual_max_f32 v10, v10, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_max_f32 v8, v9, v8 -; GFX11-NEXT: v_max_f32_e32 v9, v11, v10 -; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_max_f32 v11, v11, v11 :: v_dual_lshlrev_b32 v6, 16, v6 -; GFX11-NEXT: v_max_f32_e32 v10, v12, v12 -; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_lshlrev_b32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_max_f32_e32 v10, v11, v10 -; GFX11-NEXT: v_dual_max_f32 v13, v13, v13 :: v_dual_and_b32 v12, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v12, v12, v12 -; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v3, v3, v3 -; GFX11-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1 
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v5 :: v_dual_and_b32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX11-NEXT: v_max_f32_e32 v4, v11, v10 +; GFX11-NEXT: v_dual_max_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_max_f32_e32 v5, v10, v9 +; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 +; GFX11-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX11-NEXT: v_max_f32_e32 v6, v12, v11 +; GFX11-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_max_f32_e32 v11, v13, v12 -; GFX11-NEXT: v_dual_max_f32 v0, v0, v4 :: v_dual_max_f32 v3, v3, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_max_f32_e32 v1, v1, v5 -; GFX11-NEXT: v_dual_max_f32 v5, v6, v6 :: v_dual_and_b32 v4, 0xffff0000, v9 -; GFX11-NEXT: v_perm_b32 v0, v0, v11, 0x3020706 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_max_f32 v2, v2, v5 :: v_dual_and_b32 v5, 0xffff0000, v8 -; GFX11-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x3020706 -; GFX11-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX11-NEXT: v_perm_b32 v2, v2, v5, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v3, v6, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) ret <8 x bfloat> %op @@ -17010,356 +16234,252 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-LABEL: v_maxnum_v16bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 -; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX8-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v8 ; GFX8-NEXT: v_max_f32_e32 v16, v17, v16 -; GFX8-NEXT: v_max_f32_e32 v7, v7, v15 -; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX8-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; 
GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX8-NEXT: v_max_f32_e32 v15, v17, v15 -; GFX8-NEXT: v_max_f32_e32 v6, v6, v14 -; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX8-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX8-NEXT: v_max_f32_e32 v14, v17, v14 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v13 -; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX8-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX8-NEXT: v_max_f32_e32 v13, v17, v13 -; GFX8-NEXT: v_max_f32_e32 v4, v4, v12 -; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX8-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX8-NEXT: v_max_f32_e32 v12, v17, v12 -; GFX8-NEXT: v_max_f32_e32 v3, v3, v11 -; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX8-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: v_max_f32_e32 v11, v17, v11 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v10 -; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX8-NEXT: v_max_f32_e32 v10, v17, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v9 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX8-NEXT: v_max_f32_e32 v9, v17, v9 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v8 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v9, s4 -; GFX8-NEXT: v_perm_b32 v1, v1, v10, s4 -; GFX8-NEXT: v_perm_b32 v2, v2, v11, s4 -; GFX8-NEXT: v_perm_b32 v3, v3, v12, s4 -; GFX8-NEXT: v_perm_b32 v4, v4, v13, s4 -; GFX8-NEXT: v_perm_b32 v5, v5, v14, s4 -; GFX8-NEXT: v_perm_b32 v6, v6, v15, s4 -; GFX8-NEXT: v_perm_b32 v7, v7, v16, s4 +; GFX8-NEXT: v_max_f32_e32 v8, v16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; 
GFX8-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX8-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v11 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_max_f32_e32 v3, v3, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v4 +; GFX8-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v4, v4, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_alignbit_b32 v4, v4, v8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX8-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v13 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX8-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v14 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_max_f32_e32 v6, v6, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v6, v6, v8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v15 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; GFX8-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v15 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: v_max_f32_e32 v7, v7, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_alignbit_b32 v7, v7, v8, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maxnum_v16bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 -; GFX9-NEXT: v_max_f32_e32 v16, v16, v16 -; GFX9-NEXT: v_max_f32_e32 v17, v17, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v14 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 -; GFX9-NEXT: v_max_f32_e32 v17, v17, v17 -; GFX9-NEXT: v_max_f32_e32 v18, v18, v18 -; GFX9-NEXT: v_max_f32_e32 v17, v18, v17 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v13 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v5 -; GFX9-NEXT: v_max_f32_e32 v18, v18, v18 -; GFX9-NEXT: v_max_f32_e32 v19, v19, v19 -; GFX9-NEXT: v_max_f32_e32 v18, v19, v18 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v12 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v4 -; GFX9-NEXT: v_max_f32_e32 v19, v19, v19 -; GFX9-NEXT: v_max_f32_e32 v20, v20, v20 -; GFX9-NEXT: v_max_f32_e32 v19, v20, v19 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v11 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v3 -; GFX9-NEXT: v_max_f32_e32 v20, v20, v20 -; GFX9-NEXT: v_max_f32_e32 v21, v21, v21 -; GFX9-NEXT: v_max_f32_e32 v20, v21, v20 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v2 -; GFX9-NEXT: v_max_f32_e32 
v21, v21, v21 -; GFX9-NEXT: v_max_f32_e32 v22, v22, v22 -; GFX9-NEXT: v_max_f32_e32 v21, v22, v21 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v9 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v1 -; GFX9-NEXT: v_max_f32_e32 v22, v22, v22 -; GFX9-NEXT: v_max_f32_e32 v23, v23, v23 -; GFX9-NEXT: v_max_f32_e32 v22, v23, v22 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v8 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_max_f32_e32 v23, v23, v23 -; GFX9-NEXT: v_max_f32_e32 v24, v24, v24 -; GFX9-NEXT: v_max_f32_e32 v15, v15, v15 -; GFX9-NEXT: v_max_f32_e32 v7, v7, v7 -; GFX9-NEXT: v_max_f32_e32 v14, v14, v14 -; GFX9-NEXT: v_max_f32_e32 v6, v6, v6 -; GFX9-NEXT: v_max_f32_e32 v13, v13, v13 -; GFX9-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-NEXT: v_max_f32_e32 v12, v12, v12 -; GFX9-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX9-NEXT: v_max_f32_e32 v11, v11, v11 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-NEXT: v_max_f32_e32 v10, v10, v10 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX9-NEXT: v_max_f32_e32 v9, v9, v9 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v8, v8, v8 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX9-NEXT: v_max_f32_e32 v23, v24, v23 -; GFX9-NEXT: v_max_f32_e32 v7, v7, v15 -; GFX9-NEXT: v_max_f32_e32 v6, v6, v14 -; GFX9-NEXT: v_max_f32_e32 v5, v5, v13 -; GFX9-NEXT: v_max_f32_e32 v4, v4, v12 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v11 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v10 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v9 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v8 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_perm_b32 v0, v0, v23, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v22, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v21, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v19, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v18, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v17, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_max_f32_e32 v8, v16, v8 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v9 +; GFX9-NEXT: v_perm_b32 v1, v1, v8, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX9-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v9 +; GFX9-NEXT: v_perm_b32 v2, v2, v8, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX9-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v11 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v9 +; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4 +; 
GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v4 +; GFX9-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_max_f32_e32 v4, v4, v9 +; GFX9-NEXT: v_perm_b32 v4, v4, v8, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX9-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v13 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_max_f32_e32 v5, v5, v9 +; GFX9-NEXT: v_perm_b32 v5, v5, v8, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX9-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v14 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_max_f32_e32 v6, v6, v9 +; GFX9-NEXT: v_perm_b32 v6, v6, v8, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; GFX9-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v15 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_max_f32_e32 v7, v7, v9 +; GFX9-NEXT: v_perm_b32 v7, v7, v8, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maxnum_v16bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 -; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v14 -; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v6 -; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v5 -; GFX10-NEXT: v_max_f32_e32 v16, v16, v16 -; GFX10-NEXT: v_max_f32_e32 v17, v17, v17 -; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 -; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v4 -; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v3 -; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_max_f32_e32 v16, v17, v16 -; GFX10-NEXT: v_max_f32_e32 v17, v18, v18 -; GFX10-NEXT: v_max_f32_e32 v18, v19, v19 -; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v13 -; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v8 -; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v0 -; GFX10-NEXT: v_max_f32_e32 v17, v18, v17 -; GFX10-NEXT: v_max_f32_e32 v18, v19, v19 -; GFX10-NEXT: v_max_f32_e32 v19, v20, v20 -; GFX10-NEXT: v_max_f32_e32 v20, v21, v21 -; GFX10-NEXT: v_max_f32_e32 v21, v22, v22 -; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX10-NEXT: v_max_f32_e32 v18, v19, v18 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX10-NEXT: v_max_f32_e32 v19, v21, v20 -; GFX10-NEXT: v_max_f32_e32 v20, v22, v22 -; GFX10-NEXT: v_max_f32_e32 v21, v23, v23 -; GFX10-NEXT: v_max_f32_e32 v22, v24, v24 -; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; 
GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_max_f32_e32 v23, v23, v23 -; GFX10-NEXT: v_max_f32_e32 v24, v24, v24 -; GFX10-NEXT: v_max_f32_e32 v25, v25, v25 -; GFX10-NEXT: v_max_f32_e32 v26, v26, v26 -; GFX10-NEXT: v_max_f32_e32 v27, v27, v27 -; GFX10-NEXT: v_max_f32_e32 v15, v15, v15 -; GFX10-NEXT: v_max_f32_e32 v7, v7, v7 -; GFX10-NEXT: v_max_f32_e32 v14, v14, v14 -; GFX10-NEXT: v_max_f32_e32 v6, v6, v6 -; GFX10-NEXT: v_max_f32_e32 v13, v13, v13 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f32_e32 v12, v12, v12 -; GFX10-NEXT: v_max_f32_e32 v11, v11, v11 -; GFX10-NEXT: v_max_f32_e32 v10, v10, v10 -; GFX10-NEXT: v_max_f32_e32 v9, v9, v9 -; GFX10-NEXT: v_max_f32_e32 v8, v8, v8 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX10-NEXT: v_max_f32_e32 v20, v21, v20 -; GFX10-NEXT: v_max_f32_e32 v21, v23, v22 -; GFX10-NEXT: v_max_f32_e32 v22, v25, v24 -; GFX10-NEXT: v_max_f32_e32 v23, v27, v26 -; GFX10-NEXT: v_max_f32_e32 v7, v7, v15 -; GFX10-NEXT: v_max_f32_e32 v6, v6, v14 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v9 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v8 +; GFX10-NEXT: v_max_f32_e32 v8, v18, v17 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v9 +; GFX10-NEXT: v_max_f32_e32 v9, v20, v19 ; GFX10-NEXT: v_max_f32_e32 v2, v2, v10 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v11 +; GFX10-NEXT: v_perm_b32 v0, v0, v16, 0x7060302 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v11 +; GFX10-NEXT: v_perm_b32 v1, v1, v8, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; GFX10-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v5 +; GFX10-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v10 +; GFX10-NEXT: v_max_f32_e32 v9, v16, v11 ; GFX10-NEXT: v_max_f32_e32 v4, v4, v12 -; GFX10-NEXT: v_perm_b32 v0, v0, v23, 0x3020706 -; GFX10-NEXT: v_perm_b32 v1, v1, v22, 0x3020706 -; GFX10-NEXT: v_perm_b32 v2, v2, v21, 0x3020706 -; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x3020706 -; GFX10-NEXT: v_perm_b32 v4, v4, v19, 0x3020706 -; GFX10-NEXT: v_perm_b32 v5, v5, v18, 0x3020706 -; GFX10-NEXT: v_perm_b32 v6, v6, v17, 0x3020706 -; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x3020706 +; GFX10-NEXT: v_max_f32_e32 v10, v18, v17 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; 
GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v11 +; GFX10-NEXT: v_max_f32_e32 v11, v13, v12 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v14 +; GFX10-NEXT: v_max_f32_e32 v12, v17, v16 +; GFX10-NEXT: v_max_f32_e32 v7, v7, v15 +; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 +; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x7060302 +; GFX10-NEXT: v_perm_b32 v5, v5, v10, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v6, v11, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v7, v12, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maxnum_v16bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 -; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v4 -; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_max_f32 v17, v17, v17 :: v_dual_and_b32 v18, 0xffff0000, v14 -; GFX11-NEXT: v_dual_max_f32 v16, v16, v16 :: v_dual_and_b32 v19, 0xffff0000, v6 -; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v3 -; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_max_f32 v16, v17, v16 :: v_dual_and_b32 v25, 0xffff0000, v1 -; GFX11-NEXT: v_dual_max_f32 v17, v18, v18 :: v_dual_max_f32 v18, v19, v19 -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v13 -; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_max_f32 v25, v25, v25 :: v_dual_and_b32 v26, 0xffff0000, v8 -; GFX11-NEXT: v_dual_max_f32 v17, v18, v17 :: v_dual_max_f32 v18, v19, v19 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_max_f32 v19, v20, v20 :: v_dual_max_f32 v20, v21, v21 -; GFX11-NEXT: v_dual_max_f32 v21, v22, v22 :: v_dual_and_b32 v22, 0xffff0000, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-NEXT: v_dual_max_f32 v18, v19, v18 :: v_dual_lshlrev_b32 v7, 16, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_max_f32 v19, v21, v20 :: v_dual_max_f32 v20, v22, v22 -; GFX11-NEXT: v_dual_max_f32 v14, v14, v14 :: v_dual_max_f32 v21, v23, v23 -; GFX11-NEXT: v_dual_max_f32 v22, v24, v24 :: v_dual_lshlrev_b32 v15, 16, v15 -; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_max_f32 v20, v21, v20 :: v_dual_max_f32 v15, v15, v15 -; GFX11-NEXT: v_dual_max_f32 v26, v26, v26 :: v_dual_and_b32 v27, 0xffff0000, v0 -; GFX11-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_lshlrev_b32 v6, 16, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-NEXT: v_dual_max_f32 v23, v23, v23 :: v_dual_max_f32 v24, v24, v24 -; GFX11-NEXT: v_dual_max_f32 v27, v27, v27 :: v_dual_max_f32 v6, v6, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_max_f32 v13, v13, v13 :: v_dual_lshlrev_b32 v10, 16, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-NEXT: v_dual_max_f32 v21, v23, v22 :: v_dual_max_f32 v22, v25, v24 -; GFX11-NEXT: v_dual_max_f32 v23, v27, v26 :: v_dual_lshlrev_b32 v12, 16, v12 -; GFX11-NEXT: v_dual_max_f32 v6, v6, v14 :: v_dual_max_f32 v5, v5, 
v5 -; GFX11-NEXT: v_dual_max_f32 v10, v10, v10 :: v_dual_max_f32 v11, v11, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-NEXT: v_dual_max_f32 v7, v7, v15 :: v_dual_lshlrev_b32 v4, 16, v4 -; GFX11-NEXT: v_dual_max_f32 v12, v12, v12 :: v_dual_max_f32 v5, v5, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v3, 16, v3 -; GFX11-NEXT: v_dual_max_f32 v9, v9, v9 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_max_f32 v8, v8, v8 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v1, v1, v1 -; GFX11-NEXT: v_dual_max_f32 v0, v0, v8 :: v_dual_max_f32 v3, v3, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_max_f32_e32 v2, v2, v10 -; GFX11-NEXT: v_dual_max_f32 v4, v4, v12 :: v_dual_max_f32 v1, v1, v9 +; GFX11-NEXT: v_dual_max_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v9 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v9 :: v_dual_and_b32 v10, 0xffff0000, v11 +; GFX11-NEXT: v_max_f32_e32 v9, v20, v19 +; GFX11-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_max_f32 v3, v3, v10 :: v_dual_and_b32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_max_f32_e32 v0, v0, v8 +; GFX11-NEXT: v_max_f32_e32 v8, v18, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_perm_b32 v0, v0, v16, 0x7060302 +; GFX11-NEXT: v_perm_b32 v1, v1, v8, 0x7060302 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_dual_max_f32 v10, v18, v17 :: v_dual_lshlrev_b32 v17, 16, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_max_f32 v4, v4, v12 :: v_dual_and_b32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_dual_max_f32 v8, v9, v8 :: v_dual_max_f32 v9, v16, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 +; GFX11-NEXT: v_max_f32_e32 v5, v5, v11 +; GFX11-NEXT: v_max_f32_e32 v11, v13, v12 +; GFX11-NEXT: v_max_f32_e32 v12, 
v17, v16 +; GFX11-NEXT: v_dual_max_f32 v6, v6, v14 :: v_dual_max_f32 v7, v7, v15 +; GFX11-NEXT: v_perm_b32 v4, v4, v9, 0x7060302 +; GFX11-NEXT: v_perm_b32 v5, v5, v10, 0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v0, v0, v23, 0x3020706 -; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x3020706 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v2, v2, v21, 0x3020706 -; GFX11-NEXT: v_perm_b32 v4, v4, v19, 0x3020706 -; GFX11-NEXT: v_perm_b32 v1, v1, v22, 0x3020706 -; GFX11-NEXT: v_perm_b32 v5, v5, v18, 0x3020706 -; GFX11-NEXT: v_perm_b32 v6, v6, v17, 0x3020706 -; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x3020706 +; GFX11-NEXT: v_perm_b32 v6, v6, v11, 0x7060302 +; GFX11-NEXT: v_perm_b32 v7, v7, v12, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) ret <16 x bfloat> %op @@ -17895,703 +17015,484 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-LABEL: v_maxnum_v32bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 -; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 -; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX8-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX8-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX8-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v16 ; GFX8-NEXT: v_max_f32_e32 v31, v32, v31 -; GFX8-NEXT: v_max_f32_e32 v14, v14, v30 -; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 -; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v13 -; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX8-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX8-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX8-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX8-NEXT: v_max_f32_e32 v30, v32, v30 -; GFX8-NEXT: v_max_f32_e32 v13, v13, v29 -; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 -; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v12 -; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX8-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX8-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX8-NEXT: v_max_f32_e32 v29, v32, v29 -; GFX8-NEXT: v_max_f32_e32 v12, v12, v28 -; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 -; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v11 -; GFX8-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX8-NEXT: v_max_f32_e32 v28, v32, v28 -; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX8-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX8-NEXT: v_max_f32_e32 v11, v11, v27 -; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v15 -; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX8-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX8-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v11, v11, v28, s4 -; GFX8-NEXT: v_perm_b32 v12, v12, v29, s4 -; GFX8-NEXT: v_perm_b32 v13, 
v13, v30, s4 -; GFX8-NEXT: v_perm_b32 v14, v14, v31, s4 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX8-NEXT: v_max_f32_e32 v27, v27, v33 -; GFX8-NEXT: v_max_f32_e32 v15, v15, v32 -; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v26 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v10 -; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX8-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX8-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX8-NEXT: v_max_f32_e32 v32, v33, v32 -; GFX8-NEXT: v_max_f32_e32 v10, v10, v26 -; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v9 -; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX8-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX8-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX8-NEXT: v_max_f32_e32 v26, v33, v26 -; GFX8-NEXT: v_max_f32_e32 v9, v9, v25 -; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX8-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX8-NEXT: v_max_f32_e32 v25, v33, v25 -; GFX8-NEXT: v_max_f32_e32 v8, v8, v24 -; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 -; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX8-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX8-NEXT: v_max_f32_e32 v24, v33, v24 -; GFX8-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX8-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX8-NEXT: v_max_f32_e32 v23, v33, v23 -; GFX8-NEXT: v_max_f32_e32 v6, v6, v22 -; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX8-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX8-NEXT: v_max_f32_e32 v22, v33, v22 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v21 -; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX8-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX8-NEXT: v_max_f32_e32 v21, v33, v21 -; GFX8-NEXT: v_max_f32_e32 v4, v4, v20 -; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX8-NEXT: 
v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX8-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX8-NEXT: v_max_f32_e32 v20, v33, v20 -; GFX8-NEXT: v_max_f32_e32 v3, v3, v19 -; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX8-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: v_max_f32_e32 v19, v33, v19 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v18 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX8-NEXT: v_max_f32_e32 v18, v33, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v31, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v17 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX8-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX8-NEXT: v_max_f32_e32 v17, v33, v17 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX8-NEXT: v_perm_b32 v0, v0, v17, s4 -; GFX8-NEXT: v_perm_b32 v1, v1, v18, s4 -; GFX8-NEXT: v_perm_b32 v2, v2, v19, s4 -; GFX8-NEXT: v_perm_b32 v3, v3, v20, s4 -; GFX8-NEXT: v_perm_b32 v4, v4, v21, s4 -; GFX8-NEXT: v_perm_b32 v5, v5, v22, s4 -; GFX8-NEXT: v_perm_b32 v6, v6, v23, s4 -; GFX8-NEXT: v_perm_b32 v7, v7, v24, s4 -; GFX8-NEXT: v_perm_b32 v8, v8, v25, s4 -; GFX8-NEXT: v_perm_b32 v9, v9, v26, s4 -; GFX8-NEXT: v_perm_b32 v10, v10, v32, s4 -; GFX8-NEXT: v_perm_b32 v15, v15, v27, s4 +; GFX8-NEXT: v_max_f32_e32 v16, v31, v16 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v18 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX8-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX8-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v19 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_max_f32_e32 v3, v3, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v20 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX8-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v4, v4, v17 +; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; 
GFX8-NEXT: v_alignbit_b32 v4, v4, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v5 +; GFX8-NEXT: v_max_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v21 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v22 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v6 +; GFX8-NEXT: v_max_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v22 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_max_f32_e32 v6, v6, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v6, v6, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v7 +; GFX8-NEXT: v_max_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v23 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: v_max_f32_e32 v7, v7, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; GFX8-NEXT: v_max_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v24 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX8-NEXT: v_max_f32_e32 v8, v8, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX8-NEXT: v_alignbit_b32 v8, v8, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v25 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; GFX8-NEXT: v_max_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v25 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX8-NEXT: v_max_f32_e32 v9, v9, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX8-NEXT: v_alignbit_b32 v9, v9, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v26 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; GFX8-NEXT: v_max_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v26 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX8-NEXT: v_max_f32_e32 v10, v10, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX8-NEXT: v_alignbit_b32 v10, v10, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v27 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v11 +; GFX8-NEXT: v_max_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v27 +; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX8-NEXT: v_max_f32_e32 v11, v11, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX8-NEXT: v_alignbit_b32 v11, v11, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v28 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; GFX8-NEXT: v_max_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v28 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX8-NEXT: v_max_f32_e32 v12, v12, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX8-NEXT: v_alignbit_b32 v12, v12, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v29 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v13 +; GFX8-NEXT: v_max_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v29 +; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX8-NEXT: v_max_f32_e32 v13, v13, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX8-NEXT: v_alignbit_b32 v13, v13, v16, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v30 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v14 +; GFX8-NEXT: v_max_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v30 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX8-NEXT: v_max_f32_e32 v14, v14, v18 +; 
GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX8-NEXT: v_alignbit_b32 v14, v14, v16, 16 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX8-NEXT: v_max_f32_e32 v15, v15, v17 +; GFX8-NEXT: v_max_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX8-NEXT: v_alignbit_b32 v15, v15, v16, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maxnum_v32bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 -; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 -; GFX9-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 -; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v29 -; GFX9-NEXT: v_and_b32_e32 v34, 0xffff0000, v13 -; GFX9-NEXT: v_and_b32_e32 v36, 0xffff0000, v28 -; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v12 -; GFX9-NEXT: v_and_b32_e32 v50, 0xffff0000, v25 -; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v9 -; GFX9-NEXT: v_max_f32_e32 v31, v31, v31 -; GFX9-NEXT: v_max_f32_e32 v32, v32, v32 -; GFX9-NEXT: v_max_f32_e32 v33, v33, v33 -; GFX9-NEXT: v_max_f32_e32 v34, v34, v34 -; GFX9-NEXT: v_max_f32_e32 v36, v36, v36 -; GFX9-NEXT: v_max_f32_e32 v37, v37, v37 -; GFX9-NEXT: v_max_f32_e32 v50, v50, v50 -; GFX9-NEXT: v_max_f32_e32 v51, v51, v51 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: v_and_b32_e32 v38, 0xffff0000, v27 -; GFX9-NEXT: v_and_b32_e32 v39, 0xffff0000, v11 -; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v24 -; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v8 -; GFX9-NEXT: v_and_b32_e32 v43, 0xffff0000, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_max_f32_e32 v31, v32, v31 -; GFX9-NEXT: v_max_f32_e32 v32, v34, v33 -; GFX9-NEXT: v_max_f32_e32 v33, v37, v36 -; GFX9-NEXT: v_max_f32_e32 v37, v51, v50 -; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v5 -; GFX9-NEXT: v_max_f32_e32 v38, v38, v38 -; GFX9-NEXT: v_max_f32_e32 v39, v39, v39 -; GFX9-NEXT: v_max_f32_e32 v52, v52, v52 -; GFX9-NEXT: v_max_f32_e32 v53, v53, v53 -; GFX9-NEXT: v_max_f32_e32 v50, v43, v43 -; GFX9-NEXT: v_max_f32_e32 v51, v51, v51 -; GFX9-NEXT: v_max_f32_e32 v34, v39, v38 -; GFX9-NEXT: v_max_f32_e32 v38, v53, v52 -; GFX9-NEXT: v_max_f32_e32 v50, v51, v50 -; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v20 -; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v4 -; GFX9-NEXT: v_max_f32_e32 v51, v51, v51 -; GFX9-NEXT: v_max_f32_e32 v52, v52, v52 -; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v23 -; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v7 -; GFX9-NEXT: v_max_f32_e32 v51, v52, v51 -; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v19 -; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v3 -; GFX9-NEXT: v_max_f32_e32 v54, v54, v54 -; GFX9-NEXT: v_max_f32_e32 v55, v55, v55 -; GFX9-NEXT: v_max_f32_e32 v52, v52, v52 -; GFX9-NEXT: v_max_f32_e32 v53, v53, v53 -; GFX9-NEXT: v_max_f32_e32 v39, v55, v54 -; GFX9-NEXT: v_max_f32_e32 v52, v53, v52 -; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v18 -; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v2 -; GFX9-NEXT: 
v_max_f32_e32 v53, v53, v53 -; GFX9-NEXT: v_max_f32_e32 v54, v54, v54 -; GFX9-NEXT: v_and_b32_e32 v48, 0xffff0000, v26 -; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v10 -; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v22 -; GFX9-NEXT: v_and_b32_e32 v41, 0xffff0000, v6 -; GFX9-NEXT: v_max_f32_e32 v53, v54, v53 -; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v17 -; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v1 -; GFX9-NEXT: v_max_f32_e32 v48, v48, v48 -; GFX9-NEXT: v_max_f32_e32 v49, v49, v49 -; GFX9-NEXT: v_max_f32_e32 v40, v40, v40 -; GFX9-NEXT: v_max_f32_e32 v41, v41, v41 -; GFX9-NEXT: v_max_f32_e32 v54, v54, v54 -; GFX9-NEXT: v_max_f32_e32 v55, v55, v55 -; GFX9-NEXT: v_and_b32_e32 v42, 0xffff0000, v15 -; GFX9-NEXT: v_max_f32_e32 v36, v49, v48 -; GFX9-NEXT: v_max_f32_e32 v48, v41, v40 -; GFX9-NEXT: v_max_f32_e32 v54, v55, v54 -; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v16 -; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v0 -; GFX9-NEXT: v_max_f32_e32 v42, v42, v42 -; GFX9-NEXT: v_max_f32_e32 v55, v55, v55 -; GFX9-NEXT: v_max_f32_e32 v40, v40, v40 -; GFX9-NEXT: v_max_f32_e32 v55, v40, v55 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v35 -; GFX9-NEXT: v_max_f32_e32 v49, v49, v49 -; GFX9-NEXT: v_max_f32_e32 v49, v42, v49 -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_max_f32_e32 v35, v35, v35 -; GFX9-NEXT: v_max_f32_e32 v15, v15, v15 -; GFX9-NEXT: v_max_f32_e32 v30, v30, v30 -; GFX9-NEXT: v_max_f32_e32 v14, v14, v14 -; GFX9-NEXT: v_max_f32_e32 v29, v29, v29 -; GFX9-NEXT: v_max_f32_e32 v13, v13, v13 -; GFX9-NEXT: v_max_f32_e32 v28, v28, v28 -; GFX9-NEXT: v_max_f32_e32 v12, v12, v12 -; GFX9-NEXT: v_max_f32_e32 v27, v27, v27 -; GFX9-NEXT: v_max_f32_e32 v11, v11, v11 -; GFX9-NEXT: v_max_f32_e32 v26, v26, v26 -; GFX9-NEXT: v_max_f32_e32 v10, v10, v10 -; GFX9-NEXT: 
v_max_f32_e32 v25, v25, v25 -; GFX9-NEXT: v_max_f32_e32 v9, v9, v9 -; GFX9-NEXT: v_max_f32_e32 v24, v24, v24 -; GFX9-NEXT: v_max_f32_e32 v8, v8, v8 -; GFX9-NEXT: v_max_f32_e32 v23, v23, v23 -; GFX9-NEXT: v_max_f32_e32 v7, v7, v7 -; GFX9-NEXT: v_max_f32_e32 v22, v22, v22 -; GFX9-NEXT: v_max_f32_e32 v6, v6, v6 -; GFX9-NEXT: v_max_f32_e32 v21, v21, v21 -; GFX9-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-NEXT: v_max_f32_e32 v20, v20, v20 -; GFX9-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX9-NEXT: v_max_f32_e32 v19, v19, v19 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-NEXT: v_max_f32_e32 v18, v18, v18 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX9-NEXT: v_max_f32_e32 v17, v17, v17 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v16, v16, v16 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX9-NEXT: v_max_f32_e32 v15, v15, v35 -; GFX9-NEXT: v_max_f32_e32 v14, v14, v30 -; GFX9-NEXT: v_max_f32_e32 v13, v13, v29 -; GFX9-NEXT: v_max_f32_e32 v12, v12, v28 -; GFX9-NEXT: v_max_f32_e32 v11, v11, v27 -; GFX9-NEXT: v_max_f32_e32 v10, v10, v26 -; GFX9-NEXT: v_max_f32_e32 v9, v9, v25 -; GFX9-NEXT: v_max_f32_e32 v8, v8, v24 -; GFX9-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX9-NEXT: v_max_f32_e32 v6, v6, v22 -; GFX9-NEXT: v_max_f32_e32 v5, v5, v21 -; GFX9-NEXT: v_max_f32_e32 v4, v4, v20 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v19 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v18 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v17 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_perm_b32 v0, v0, v55, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v54, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v53, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v52, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v51, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v50, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v48, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v39, s4 -; GFX9-NEXT: v_perm_b32 v8, v8, v38, s4 -; GFX9-NEXT: v_perm_b32 v9, v9, v37, s4 -; GFX9-NEXT: v_perm_b32 v10, v10, v36, s4 -; GFX9-NEXT: v_perm_b32 v11, v11, v34, s4 -; GFX9-NEXT: v_perm_b32 v12, v12, v33, s4 -; GFX9-NEXT: v_perm_b32 v13, v13, v32, s4 -; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4 -; GFX9-NEXT: v_perm_b32 v15, v15, v49, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v31, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_max_f32_e32 v16, v31, v16 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v17 +; GFX9-NEXT: v_perm_b32 v1, v1, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v17 +; GFX9-NEXT: v_perm_b32 v2, v2, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v19 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v17 +; GFX9-NEXT: v_perm_b32 v3, v3, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_max_f32_e32 v4, v4, v17 +; GFX9-NEXT: v_perm_b32 v4, v4, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 
16, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v21 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_max_f32_e32 v5, v5, v17 +; GFX9-NEXT: v_perm_b32 v5, v5, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v22 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_max_f32_e32 v6, v6, v17 +; GFX9-NEXT: v_perm_b32 v6, v6, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v23 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_max_f32_e32 v7, v7, v17 +; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v8 +; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v24 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_max_f32_e32 v8, v8, v17 +; GFX9-NEXT: v_perm_b32 v8, v8, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v9 +; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v25 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_max_f32_e32 v9, v9, v17 +; GFX9-NEXT: v_perm_b32 v9, v9, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v26 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v10 +; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v26 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_max_f32_e32 v10, v10, v17 +; GFX9-NEXT: v_perm_b32 v10, v10, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v11 +; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v27 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_max_f32_e32 v11, v11, v17 +; GFX9-NEXT: v_perm_b32 v11, v11, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v12 +; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v28 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_max_f32_e32 v12, v12, v17 +; GFX9-NEXT: v_perm_b32 v12, v12, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v29 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v13 +; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v29 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX9-NEXT: v_max_f32_e32 v13, v13, v17 +; GFX9-NEXT: v_perm_b32 v13, v13, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v14 +; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v30 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: v_max_f32_e32 v14, v14, v17 +; GFX9-NEXT: v_perm_b32 v14, v14, v16, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v15 +; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_max_f32_e32 v15, v15, v17 +; GFX9-NEXT: v_perm_b32 v15, v15, v16, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maxnum_v32bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX10-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 -; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 -; GFX10-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 -; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v7 -; GFX10-NEXT: v_and_b32_e32 v65, 0xffff0000, v22 -; GFX10-NEXT: v_and_b32_e32 v66, 0xffff0000, v6 -; GFX10-NEXT: v_and_b32_e32 v67, 0xffff0000, v21 -; GFX10-NEXT: v_and_b32_e32 v68, 0xffff0000, v5 -; GFX10-NEXT: v_max_f32_e32 v53, v53, v53 -; GFX10-NEXT: v_max_f32_e32 v54, v54, v54 -; GFX10-NEXT: v_max_f32_e32 v55, v55, v55 -; GFX10-NEXT: v_max_f32_e32 v64, v64, v64 -; GFX10-NEXT: v_max_f32_e32 v65, v65, v65 -; GFX10-NEXT: v_max_f32_e32 v66, v66, v66 -; GFX10-NEXT: v_max_f32_e32 v67, v67, v67 -; GFX10-NEXT: v_max_f32_e32 v68, v68, v68 -; GFX10-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 -; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 -; GFX10-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 -; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v13 -; GFX10-NEXT: v_and_b32_e32 v37, 0xffff0000, v28 -; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 -; GFX10-NEXT: v_and_b32_e32 v39, 0xffff0000, v27 -; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 -; GFX10-NEXT: v_and_b32_e32 v49, 0xffff0000, v26 -; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 -; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 -; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 -; GFX10-NEXT: v_max_f32_e32 v53, v54, v53 -; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v17 -; GFX10-NEXT: v_max_f32_e32 v55, v64, v55 -; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v1 -; GFX10-NEXT: v_max_f32_e32 v65, v66, v65 -; GFX10-NEXT: v_and_b32_e32 v66, 0xffff0000, v16 -; GFX10-NEXT: v_max_f32_e32 v67, v68, v67 -; GFX10-NEXT: v_and_b32_e32 v68, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_max_f32_e32 v32, v32, v32 -; GFX10-NEXT: v_max_f32_e32 v34, v34, v34 -; GFX10-NEXT: v_max_f32_e32 v35, v35, v35 -; GFX10-NEXT: v_max_f32_e32 v36, v36, v36 -; GFX10-NEXT: v_max_f32_e32 v37, v37, v37 -; GFX10-NEXT: v_max_f32_e32 v38, v38, v38 -; GFX10-NEXT: v_max_f32_e32 v39, v39, v39 -; GFX10-NEXT: v_max_f32_e32 v48, v48, v48 -; GFX10-NEXT: v_max_f32_e32 v49, v49, v49 -; GFX10-NEXT: v_max_f32_e32 v50, v50, v50 -; GFX10-NEXT: v_max_f32_e32 v51, v51, v51 -; GFX10-NEXT: v_max_f32_e32 v52, v52, v52 -; GFX10-NEXT: v_max_f32_e32 v17, v17, v17 -; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-NEXT: v_max_f32_e32 v16, v16, v16 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX10-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; GFX10-NEXT: v_max_f32_e32 v32, v34, v32 -; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v20 -; GFX10-NEXT: v_max_f32_e32 v35, v36, v35 -; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v37, v38, v37 -; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v19 -; GFX10-NEXT: v_max_f32_e32 v39, v48, v39 -; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v3 -; GFX10-NEXT: v_max_f32_e32 v49, v50, v49 -; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v18 -; GFX10-NEXT: v_max_f32_e32 v51, v52, v51 -; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 
16, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v21 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v22 +; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v18 +; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v19 +; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v20 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v21 +; GFX10-NEXT: v_max_f32_e32 v21, v53, v52 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX10-NEXT: v_max_f32_e32 v22, v55, v54 +; GFX10-NEXT: v_max_f32_e32 v7, v7, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v24 +; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v8 +; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v9 +; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v26 +; GFX10-NEXT: v_max_f32_e32 v32, v33, v32 +; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v10 +; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v16 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v27 +; GFX10-NEXT: v_max_f32_e32 v34, v35, v34 +; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v11 +; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX10-NEXT: v_max_f32_e32 v1, 
v1, v17 -; GFX10-NEXT: v_max_f32_e32 v33, v33, v33 -; GFX10-NEXT: v_max_f32_e32 v34, v34, v34 -; GFX10-NEXT: v_max_f32_e32 v36, v36, v36 -; GFX10-NEXT: v_max_f32_e32 v38, v38, v38 -; GFX10-NEXT: v_max_f32_e32 v48, v48, v48 -; GFX10-NEXT: v_max_f32_e32 v50, v50, v50 -; GFX10-NEXT: v_max_f32_e32 v52, v52, v52 -; GFX10-NEXT: v_max_f32_e32 v54, v54, v54 -; GFX10-NEXT: v_max_f32_e32 v64, v64, v64 -; GFX10-NEXT: v_max_f32_e32 v66, v66, v66 -; GFX10-NEXT: v_max_f32_e32 v68, v68, v68 -; GFX10-NEXT: v_max_f32_e32 v15, v15, v15 -; GFX10-NEXT: v_max_f32_e32 v30, v30, v30 -; GFX10-NEXT: v_max_f32_e32 v14, v14, v14 -; GFX10-NEXT: v_max_f32_e32 v29, v29, v29 -; GFX10-NEXT: v_max_f32_e32 v13, v13, v13 -; GFX10-NEXT: v_max_f32_e32 v28, v28, v28 -; GFX10-NEXT: v_max_f32_e32 v12, v12, v12 -; GFX10-NEXT: v_max_f32_e32 v27, v27, v27 -; GFX10-NEXT: v_max_f32_e32 v11, v11, v11 -; GFX10-NEXT: v_max_f32_e32 v26, v26, v26 -; GFX10-NEXT: v_max_f32_e32 v10, v10, v10 -; GFX10-NEXT: v_max_f32_e32 v25, v25, v25 -; GFX10-NEXT: v_max_f32_e32 v9, v9, v9 -; GFX10-NEXT: v_max_f32_e32 v24, v24, v24 -; GFX10-NEXT: v_max_f32_e32 v8, v8, v8 -; GFX10-NEXT: v_max_f32_e32 v23, v23, v23 -; GFX10-NEXT: v_max_f32_e32 v7, v7, v7 -; GFX10-NEXT: v_max_f32_e32 v22, v22, v22 -; GFX10-NEXT: v_max_f32_e32 v6, v6, v6 -; GFX10-NEXT: v_max_f32_e32 v21, v21, v21 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f32_e32 v20, v20, v20 -; GFX10-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX10-NEXT: v_max_f32_e32 v19, v19, v19 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f32_e32 v18, v18, v18 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f32_e32 v34, v36, v34 -; GFX10-NEXT: v_max_f32_e32 v36, v48, v38 -; GFX10-NEXT: v_max_f32_e32 v38, v52, v50 -; GFX10-NEXT: v_max_f32_e32 v48, v64, v54 -; GFX10-NEXT: v_max_f32_e32 v50, v68, v66 -; GFX10-NEXT: v_max_f32_e32 v14, v14, v30 -; GFX10-NEXT: v_max_f32_e32 v13, v13, v29 -; GFX10-NEXT: v_max_f32_e32 v12, v12, v28 -; GFX10-NEXT: v_max_f32_e32 v11, v11, v27 -; GFX10-NEXT: v_max_f32_e32 v10, v10, v26 -; GFX10-NEXT: v_max_f32_e32 v9, v9, v25 -; GFX10-NEXT: v_max_f32_e32 v8, v8, v24 -; GFX10-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX10-NEXT: v_max_f32_e32 v6, v6, v22 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v21 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; GFX10-NEXT: v_max_f32_e32 v36, v37, v36 +; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v12 +; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX10-NEXT: v_max_f32_e32 v2, v2, v18 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; GFX10-NEXT: v_max_f32_e32 v38, v39, v38 +; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v13 +; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX10-NEXT: v_max_f32_e32 v3, v3, v19 +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v30 +; GFX10-NEXT: v_max_f32_e32 v48, v49, v48 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v14 +; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX10-NEXT: v_max_f32_e32 v4, v4, v20 -; GFX10-NEXT: v_perm_b32 v0, v0, v50, 0x3020706 -; GFX10-NEXT: v_perm_b32 v1, v1, v48, 0x3020706 -; GFX10-NEXT: v_perm_b32 v2, v2, v38, 0x3020706 -; GFX10-NEXT: v_perm_b32 v3, v3, v36, 0x3020706 -; GFX10-NEXT: v_perm_b32 v4, v4, v34, 0x3020706 -; GFX10-NEXT: v_perm_b32 v5, v5, v67, 0x3020706 -; GFX10-NEXT: v_perm_b32 v6, v6, v65, 0x3020706 -; GFX10-NEXT: v_perm_b32 v7, v7, v55, 0x3020706 -; GFX10-NEXT: v_perm_b32 v8, v8, v53, 0x3020706 -; GFX10-NEXT: v_perm_b32 v9, v9, 
v51, 0x3020706 -; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x3020706 -; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x3020706 -; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x3020706 -; GFX10-NEXT: v_perm_b32 v13, v13, v35, 0x3020706 -; GFX10-NEXT: v_perm_b32 v14, v14, v32, 0x3020706 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v15 +; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX10-NEXT: v_perm_b32 v6, v6, v21, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v7, v22, 0x7060302 +; GFX10-NEXT: v_max_f32_e32 v50, v51, v50 +; GFX10-NEXT: v_max_f32_e32 v23, v65, v64 +; GFX10-NEXT: v_max_f32_e32 v8, v8, v24 +; GFX10-NEXT: v_max_f32_e32 v24, v67, v66 +; GFX10-NEXT: v_max_f32_e32 v9, v9, v25 +; GFX10-NEXT: v_max_f32_e32 v25, v33, v68 +; GFX10-NEXT: v_max_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_max_f32_e32 v16, v35, v16 +; GFX10-NEXT: v_max_f32_e32 v11, v11, v27 +; GFX10-NEXT: v_max_f32_e32 v17, v37, v17 +; GFX10-NEXT: v_max_f32_e32 v12, v12, v28 +; GFX10-NEXT: v_max_f32_e32 v18, v39, v18 +; GFX10-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX10-NEXT: v_max_f32_e32 v19, v49, v19 +; GFX10-NEXT: v_max_f32_e32 v14, v14, v30 +; GFX10-NEXT: v_perm_b32 v0, v0, v32, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v34, 0x7060302 +; GFX10-NEXT: v_perm_b32 v2, v2, v36, 0x7060302 +; GFX10-NEXT: v_perm_b32 v3, v3, v38, 0x7060302 +; GFX10-NEXT: v_perm_b32 v4, v4, v48, 0x7060302 +; GFX10-NEXT: v_perm_b32 v5, v5, v50, 0x7060302 +; GFX10-NEXT: v_perm_b32 v8, v8, v23, 0x7060302 +; GFX10-NEXT: v_perm_b32 v9, v9, v24, 0x7060302 +; GFX10-NEXT: v_perm_b32 v10, v10, v25, 0x7060302 +; GFX10-NEXT: v_perm_b32 v11, v11, v16, 0x7060302 +; GFX10-NEXT: v_perm_b32 v12, v12, v17, 0x7060302 +; GFX10-NEXT: v_perm_b32 v13, v13, v18, 0x7060302 +; GFX10-NEXT: v_perm_b32 v14, v14, v19, 0x7060302 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v31 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v31 -; GFX10-NEXT: v_max_f32_e32 v16, v16, v16 -; GFX10-NEXT: v_max_f32_e32 v17, v17, v17 -; GFX10-NEXT: v_max_f32_e32 v16, v33, v16 -; GFX10-NEXT: v_max_f32_e32 v15, v15, v17 -; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x3020706 +; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v31 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v31 +; GFX10-NEXT: v_max_f32_e32 v20, v20, v21 +; GFX10-NEXT: v_max_f32_e32 v15, v15, v22 +; GFX10-NEXT: v_perm_b32 v15, v15, v20, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maxnum_v32bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 -; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v28 -; GFX11-NEXT: v_and_b32_e32 v39, 0xffff0000, v27 -; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v13 -; GFX11-NEXT: v_dual_max_f32 v33, v33, v33 :: v_dual_and_b32 v32, 0xffff0000, v15 -; GFX11-NEXT: v_and_b32_e32 v49, 0xffff0000, v26 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 -; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 -; GFX11-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 -; GFX11-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 -; GFX11-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 -; GFX11-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 -; GFX11-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 -; GFX11-NEXT: v_and_b32_e32 v65, 0xffff0000, v22 -; GFX11-NEXT: v_and_b32_e32 v67, 0xffff0000, v21 -; GFX11-NEXT: v_and_b32_e32 v66, 0xffff0000, v6 -; GFX11-NEXT: v_and_b32_e32 v71, 0xffff0000, v19 
-; GFX11-NEXT: v_and_b32_e32 v68, 0xffff0000, v5 -; GFX11-NEXT: v_and_b32_e32 v83, 0xffff0000, v17 -; GFX11-NEXT: v_and_b32_e32 v86, 0xffff0000, v0 -; GFX11-NEXT: v_and_b32_e32 v85, 0xffff0000, v16 -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-NEXT: v_and_b32_e32 v84, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_dual_max_f32 v35, v35, v35 :: v_dual_max_f32 v34, v34, v34 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-NEXT: v_dual_max_f32 v38, v38, v38 :: v_dual_max_f32 v37, v37, v37 -; GFX11-NEXT: v_dual_max_f32 v39, v39, v39 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-NEXT: v_max_f32_e32 v36, v36, v36 -; GFX11-NEXT: v_dual_max_f32 v65, v65, v65 :: v_dual_and_b32 v64, 0xffff0000, v7 -; GFX11-NEXT: v_and_b32_e32 v70, 0xffff0000, v4 -; GFX11-NEXT: v_and_b32_e32 v69, 0xffff0000, v20 -; GFX11-NEXT: v_and_b32_e32 v81, 0xffff0000, v18 -; GFX11-NEXT: v_dual_max_f32 v83, v83, v83 :: v_dual_and_b32 v82, 0xffff0000, v2 -; GFX11-NEXT: v_dual_max_f32 v17, v17, v17 :: v_dual_lshlrev_b32 v18, 16, v18 -; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-NEXT: v_dual_max_f32 v49, v49, v49 :: v_dual_max_f32 v48, v48, v48 -; GFX11-NEXT: v_dual_max_f32 v51, v51, v51 :: v_dual_max_f32 v50, v50, v50 -; GFX11-NEXT: v_dual_max_f32 v54, v54, v54 :: v_dual_max_f32 v53, v53, v53 -; GFX11-NEXT: v_dual_max_f32 v67, v67, v67 :: v_dual_max_f32 v66, v66, v66 -; GFX11-NEXT: v_dual_max_f32 v25, v25, v25 :: v_dual_max_f32 v26, v26, v26 -; GFX11-NEXT: v_dual_max_f32 v9, v9, v9 :: v_dual_max_f32 v10, v10, v10 -; GFX11-NEXT: v_dual_max_f32 v21, v21, v21 :: v_dual_max_f32 v22, v22, v22 -; GFX11-NEXT: v_dual_max_f32 v5, v5, v5 :: v_dual_max_f32 v6, v6, v6 -; GFX11-NEXT: v_dual_max_f32 v33, v34, v33 :: v_dual_max_f32 v16, v16, v16 -; GFX11-NEXT: v_dual_max_f32 v34, v36, v35 :: v_dual_max_f32 v35, v38, v37 -; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX11-NEXT: v_dual_max_f32 v81, v81, v81 :: v_dual_and_b32 v80, 0xffff0000, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_dual_max_f32 v70, v70, v70 :: v_dual_max_f32 v69, v69, v69 -; GFX11-NEXT: v_dual_max_f32 v36, v48, v39 :: v_dual_max_f32 v37, v50, v49 -; GFX11-NEXT: v_max_f32_e32 v39, v54, v53 -; GFX11-NEXT: v_dual_max_f32 v10, v10, v26 :: v_dual_max_f32 v1, v1, v17 -; GFX11-NEXT: v_max_f32_e32 v6, v6, v22 -; GFX11-NEXT: v_dual_max_f32 v32, v32, v32 :: v_dual_max_f32 v55, v55, v55 -; GFX11-NEXT: v_max_f32_e32 v52, v52, 
v52 -; GFX11-NEXT: v_dual_max_f32 v64, v64, v64 :: v_dual_max_f32 v71, v71, v71 -; GFX11-NEXT: v_max_f32_e32 v68, v68, v68 -; GFX11-NEXT: v_max_f32_e32 v80, v80, v80 -; GFX11-NEXT: v_max_f32_e32 v82, v82, v82 -; GFX11-NEXT: v_dual_max_f32 v86, v86, v86 :: v_dual_max_f32 v85, v85, v85 -; GFX11-NEXT: v_dual_max_f32 v15, v15, v15 :: v_dual_max_f32 v84, v84, v84 -; GFX11-NEXT: v_dual_max_f32 v29, v29, v29 :: v_dual_max_f32 v30, v30, v30 -; GFX11-NEXT: v_dual_max_f32 v13, v13, v13 :: v_dual_max_f32 v14, v14, v14 -; GFX11-NEXT: v_dual_max_f32 v27, v27, v27 :: v_dual_max_f32 v28, v28, v28 -; GFX11-NEXT: v_dual_max_f32 v11, v11, v11 :: v_dual_max_f32 v12, v12, v12 -; GFX11-NEXT: v_dual_max_f32 v23, v23, v23 :: v_dual_max_f32 v24, v24, v24 -; GFX11-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_max_f32 v8, v8, v8 -; GFX11-NEXT: v_dual_max_f32 v19, v19, v19 :: v_dual_max_f32 v20, v20, v20 -; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v4, v4, v4 -; GFX11-NEXT: v_max_f32_e32 v18, v18, v18 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX11-NEXT: v_dual_max_f32 v38, v52, v51 :: v_dual_max_f32 v53, v82, v81 -; GFX11-NEXT: v_dual_max_f32 v48, v64, v55 :: v_dual_max_f32 v55, v86, v85 -; GFX11-NEXT: v_dual_max_f32 v49, v66, v65 :: v_dual_max_f32 v50, v68, v67 -; GFX11-NEXT: v_max_f32_e32 v13, v13, v29 -; GFX11-NEXT: v_dual_max_f32 v51, v70, v69 :: v_dual_max_f32 v52, v80, v71 -; GFX11-NEXT: v_dual_max_f32 v9, v9, v25 :: v_dual_max_f32 v54, v84, v83 -; GFX11-NEXT: v_dual_max_f32 v5, v5, v21 :: v_dual_max_f32 v14, v14, v30 -; GFX11-NEXT: v_dual_max_f32 v11, v11, v27 :: v_dual_max_f32 v12, v12, v28 -; GFX11-NEXT: v_dual_max_f32 v7, v7, v23 :: v_dual_max_f32 v8, v8, v24 -; GFX11-NEXT: v_dual_max_f32 v3, v3, v19 :: v_dual_max_f32 v4, v4, v20 -; GFX11-NEXT: v_perm_b32 v1, v1, v54, 0x3020706 -; GFX11-NEXT: v_perm_b32 v5, v5, v50, 0x3020706 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v7, v7, v48, 0x3020706 -; GFX11-NEXT: v_perm_b32 v3, v3, v52, 0x3020706 -; GFX11-NEXT: v_perm_b32 v4, v4, v51, 0x3020706 -; GFX11-NEXT: v_perm_b32 v8, v8, v39, 0x3020706 -; GFX11-NEXT: v_perm_b32 v9, v9, v38, 0x3020706 -; GFX11-NEXT: v_perm_b32 v10, v10, v37, 0x3020706 -; GFX11-NEXT: v_perm_b32 v11, v11, v36, 0x3020706 -; GFX11-NEXT: v_perm_b32 v12, v12, v35, 0x3020706 -; GFX11-NEXT: v_perm_b32 v13, v13, v34, 0x3020706 -; GFX11-NEXT: v_perm_b32 v14, v14, v33, 0x3020706 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_max_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v17, 16, v31 -; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v31 -; GFX11-NEXT: v_perm_b32 v6, v6, v49, 0x3020706 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_max_f32 v17, v17, v17 :: v_dual_max_f32 v2, v2, v18 -; GFX11-NEXT: v_max_f32_e32 v16, v16, v16 -; GFX11-NEXT: v_perm_b32 v0, v0, v55, 0x3020706 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v27 +; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; 
GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v5 +; GFX11-NEXT: v_dual_max_f32 v10, v10, v26 :: v_dual_and_b32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v9 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v20 +; GFX11-NEXT: v_dual_max_f32 v11, v11, v27 :: v_dual_and_b32 v20, 0xffff0000, v20 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_dual_max_f32 v26, v71, v70 :: v_dual_lshlrev_b32 v49, 16, v4 +; GFX11-NEXT: v_dual_max_f32 v13, v13, v29 :: v_dual_and_b32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v19 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_max_f32_e32 v4, v4, v20 +; GFX11-NEXT: v_dual_max_f32 v8, v8, v24 :: v_dual_max_f32 v9, v9, v25 +; GFX11-NEXT: v_max_f32_e32 v25, v69, v68 +; GFX11-NEXT: v_dual_max_f32 v20, v51, v50 :: v_dual_lshlrev_b32 v39, 16, v3 +; GFX11-NEXT: v_max_f32_e32 v27, v81, v80 +; GFX11-NEXT: v_max_f32_e32 v12, v12, v28 +; GFX11-NEXT: v_dual_max_f32 v28, v83, v82 :: v_dual_max_f32 v29, v85, v84 +; GFX11-NEXT: v_dual_max_f32 v6, v6, v22 :: v_dual_and_b32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_max_f32_e32 v22, v55, v54 +; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-NEXT: v_max_f32_e32 v14, v14, v30 +; GFX11-NEXT: v_dual_max_f32 v7, v7, v23 :: v_dual_and_b32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_max_f32_e32 v23, v65, v64 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11-NEXT: v_dual_max_f32 v24, v67, v66 :: v_dual_and_b32 v21, 0xffff0000, v21 +; GFX11-NEXT: v_max_f32_e32 v2, v2, v18 +; GFX11-NEXT: v_dual_max_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v32, 16, v16 +; GFX11-NEXT: v_max_f32_e32 v18, v39, v38 +; GFX11-NEXT: v_dual_max_f32 v3, v3, v19 :: v_dual_and_b32 v16, 0xffff0000, v16 +; GFX11-NEXT: v_max_f32_e32 v19, v49, v48 +; GFX11-NEXT: v_max_f32_e32 v17, v37, v36 +; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX11-NEXT: v_dual_max_f32 v5, v5, v21 :: v_dual_and_b32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_max_f32_e32 v21, v53, v52 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 
v2, v2, v17, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v3, v18, 0x7060302 +; GFX11-NEXT: v_max_f32_e32 v0, v0, v16 +; GFX11-NEXT: v_max_f32_e32 v16, v35, v34 +; GFX11-NEXT: v_max_f32_e32 v32, v33, v32 +; GFX11-NEXT: v_perm_b32 v4, v4, v19, 0x7060302 +; GFX11-NEXT: v_perm_b32 v5, v5, v20, 0x7060302 +; GFX11-NEXT: v_perm_b32 v6, v6, v21, 0x7060302 +; GFX11-NEXT: v_perm_b32 v1, v1, v16, 0x7060302 +; GFX11-NEXT: v_perm_b32 v0, v0, v32, 0x7060302 +; GFX11-NEXT: v_perm_b32 v7, v7, v22, 0x7060302 +; GFX11-NEXT: v_perm_b32 v8, v8, v23, 0x7060302 +; GFX11-NEXT: v_perm_b32 v9, v9, v24, 0x7060302 +; GFX11-NEXT: v_perm_b32 v10, v10, v25, 0x7060302 +; GFX11-NEXT: v_perm_b32 v11, v11, v26, 0x7060302 +; GFX11-NEXT: v_perm_b32 v12, v12, v27, 0x7060302 +; GFX11-NEXT: v_perm_b32 v13, v13, v28, 0x7060302 +; GFX11-NEXT: v_perm_b32 v14, v14, v29, 0x7060302 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_max_f32 v16, v86, v16 :: v_dual_and_b32 v17, 0xffff0000, v31 ; GFX11-NEXT: v_max_f32_e32 v15, v15, v17 -; GFX11-NEXT: v_perm_b32 v2, v2, v53, 0x3020706 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v16, v32, v16 -; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x3020706 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) ret <32 x bfloat> %op @@ -18653,7 +17554,7 @@ define bfloat @v_sqrt_bf16(bfloat %a) { ; GFX8-LABEL: v_sqrt_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_mov_b32 s4, 0xf800000 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 ; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 @@ -18672,13 +17573,13 @@ define bfloat @v_sqrt_bf16(bfloat %a) { ; GFX8-NEXT: v_mov_b32_e32 v2, 0x260 ; GFX8-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sqrt_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_mov_b32 s4, 0xf800000 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 @@ -18697,13 +17598,13 @@ define bfloat @v_sqrt_bf16(bfloat %a) { ; GFX9-NEXT: v_mov_b32_e32 v2, 0x260 ; GFX9-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sqrt_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo @@ -18720,13 +17621,13 @@ define bfloat @v_sqrt_bf16(bfloat %a) { ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; 
GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_sqrt_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0 @@ -18751,7 +17652,7 @@ define bfloat @v_sqrt_bf16(bfloat %a) { ; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.sqrt.bf16(bfloat %a) ret bfloat %op @@ -18779,34 +17680,34 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) { ; GFX8-LABEL: v_ldexp_bf16_i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ldexp_bf16_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ldexp_bf16_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_ldexp_bf16_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.ldexp.bf16.i32(bfloat %a, i32 %b) ret bfloat %op @@ -18840,28 +17741,28 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) { ; GFX8-LABEL: v_frexp_bf16_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v1, v0 ; GFX8-NEXT: v_frexp_mant_f32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_frexp_bf16_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v1, v0 -; GFX9-NEXT: v_frexp_mant_f32_e32 v0, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_frexp_mant_f32_e32 v0, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_frexp_bf16_i16: ; GFX10: ; %bb.0: ; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: v_frexp_mant_f32_e32 v0, v1 ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %op = call { bfloat, i16 } @llvm.frexp.bf16.i16(bfloat %a) ret { bfloat, i16 } %op @@ -18929,7 +17830,7 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX8-LABEL: v_log_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_mov_b32 s4, 0x800000 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 @@ -18951,13 +17852,13 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX8-NEXT: v_mov_b32_e32 v1, 0x41b17218 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_log_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_mov_b32 s4, 0x800000 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 @@ -18976,13 +17877,13 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX9-NEXT: v_mov_b32_e32 v1, 0x41b17218 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_log_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -18995,13 +17896,13 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v1, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_log_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo @@ -19020,7 +17921,7 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.log.bf16(bfloat %a) ret bfloat %op @@ -19062,7 +17963,7 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX8-LABEL: v_log2_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_mov_b32 s4, 0x800000 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX8-NEXT: 
v_cmp_gt_f32_e32 vcc, s4, v0 @@ -19072,52 +17973,52 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX8-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_log2_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_mov_b32 s4, 0x800000 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_log2_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_log2_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.log2.bf16(bfloat %a) ret bfloat %op @@ -19180,7 +18081,7 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX8-LABEL: v_log10_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_mov_b32 s4, 0x800000 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 @@ -19202,13 +18103,13 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX8-NEXT: v_mov_b32_e32 v1, 0x411a209b ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; 
GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_log10_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_mov_b32 s4, 0x800000 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 @@ -19227,13 +18128,13 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX9-NEXT: v_mov_b32_e32 v1, 0x411a209b ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_log10_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -19246,13 +18147,13 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v1, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_log10_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo @@ -19271,7 +18172,7 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.log10.bf16(bfloat %a) ret bfloat %op @@ -19337,7 +18238,7 @@ define bfloat @v_exp_bf16(bfloat %a) { ; GFX8-LABEL: v_exp_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_sub_f32_e32 v3, v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v0 ; GFX8-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v3 @@ -19358,23 +18259,23 @@ define bfloat @v_exp_bf16(bfloat %a) { ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; GFX8-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_exp_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x3fb8aa3b +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 -; GFX9-NEXT: v_fma_f32 v2, v0, s4, -v1 +; GFX9-NEXT: s_mov_b32 s4, 0x3fb8aa3b +; GFX9-NEXT: v_rndne_f32_e32 v2, v1 +; GFX9-NEXT: v_sub_f32_e32 v3, v1, v2 +; GFX9-NEXT: v_fma_f32 v1, v0, s4, -v1 ; GFX9-NEXT: s_mov_b32 s4, 0x32a5705f -; GFX9-NEXT: v_rndne_f32_e32 v3, v1 -; GFX9-NEXT: v_fma_f32 v2, v0, s4, v2 -; 
GFX9-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX9-NEXT: v_fma_f32 v1, v0, s4, v1 +; GFX9-NEXT: v_add_f32_e32 v1, v3, v1 ; GFX9-NEXT: v_exp_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v3 +; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX9-NEXT: s_mov_b32 s4, 0xc2ce8ed0 ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0 ; GFX9-NEXT: s_mov_b32 s4, 0x42b17218 @@ -19383,44 +18284,44 @@ define bfloat @v_exp_bf16(bfloat %a) { ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_exp_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 ; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc2ce8ed0, v0 -; GFX10-NEXT: v_fma_f32 v2, 0x3fb8aa3b, v0, -v1 -; GFX10-NEXT: v_rndne_f32_e32 v3, v1 -; GFX10-NEXT: v_fmamk_f32 v2, v0, 0x32a5705f, v2 -; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v3 +; GFX10-NEXT: v_rndne_f32_e32 v2, v1 +; GFX10-NEXT: v_fma_f32 v3, 0x3fb8aa3b, v0, -v1 +; GFX10-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX10-NEXT: v_fmamk_f32 v3, v0, 0x32a5705f, v3 +; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_exp_f32_e32 v1, v1 ; GFX10-NEXT: v_ldexp_f32 v1, v1, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo ; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_exp_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 -; GFX11-NEXT: v_fma_f32 v2, 0x3fb8aa3b, v0, -v1 -; GFX11-NEXT: v_rndne_f32_e32 v3, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX11-NEXT: v_rndne_f32_e32 v2, v1 +; GFX11-NEXT: v_fma_f32 v3, 0x3fb8aa3b, v0, -v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX11-NEXT: v_fmamk_f32 v3, v0, 0x32a5705f, v3 +; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc2ce8ed0, v0 -; GFX11-NEXT: v_fmamk_f32 v2, v0, 0x32a5705f, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX11-NEXT: v_exp_f32_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_ldexp_f32 v1, v1, v2 @@ -19429,7 +18330,7 @@ define bfloat @v_exp_bf16(bfloat %a) { ; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; 
GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.exp.bf16(bfloat %a) ret bfloat %op @@ -19471,7 +18372,7 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GFX8-LABEL: v_exp2_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_mov_b32 s4, 0xc2fc0000 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x42800000 ; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 @@ -19481,52 +18382,52 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_exp2_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_mov_b32 s4, 0xc2fc0000 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x42800000 ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_exp2_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo -; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_exp2_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo -; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.exp2.bf16(bfloat %a) ret bfloat %op @@ -19588,7 +18489,7 @@ define bfloat @v_exp10_bf16(bfloat %a) { ; GFX8-LABEL: v_exp10_bf16: ; 
GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_sub_f32_e32 v3, v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x40549000, v0 ; GFX8-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v3 @@ -19609,23 +18510,23 @@ define bfloat @v_exp10_bf16(bfloat %a) { ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; GFX8-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_exp10_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x40549a78 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0 -; GFX9-NEXT: v_fma_f32 v2, v0, s4, -v1 +; GFX9-NEXT: s_mov_b32 s4, 0x40549a78 +; GFX9-NEXT: v_rndne_f32_e32 v2, v1 +; GFX9-NEXT: v_sub_f32_e32 v3, v1, v2 +; GFX9-NEXT: v_fma_f32 v1, v0, s4, -v1 ; GFX9-NEXT: s_mov_b32 s4, 0x33979a37 -; GFX9-NEXT: v_rndne_f32_e32 v3, v1 -; GFX9-NEXT: v_fma_f32 v2, v0, s4, v2 -; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX9-NEXT: v_fma_f32 v1, v0, s4, v1 +; GFX9-NEXT: v_add_f32_e32 v1, v3, v1 ; GFX9-NEXT: v_exp_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v3 +; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX9-NEXT: s_mov_b32 s4, 0xc23369f4 ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0 ; GFX9-NEXT: s_mov_b32 s4, 0x421a209b @@ -19634,44 +18535,44 @@ define bfloat @v_exp10_bf16(bfloat %a) { ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_exp10_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0 ; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc23369f4, v0 -; GFX10-NEXT: v_fma_f32 v2, 0x40549a78, v0, -v1 -; GFX10-NEXT: v_rndne_f32_e32 v3, v1 -; GFX10-NEXT: v_fmamk_f32 v2, v0, 0x33979a37, v2 -; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v3 +; GFX10-NEXT: v_rndne_f32_e32 v2, v1 +; GFX10-NEXT: v_fma_f32 v3, 0x40549a78, v0, -v1 +; GFX10-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX10-NEXT: v_fmamk_f32 v3, v0, 0x33979a37, v3 +; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_exp_f32_e32 v1, v1 ; GFX10-NEXT: v_ldexp_f32 v1, v1, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo ; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_exp10_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0 -; GFX11-NEXT: v_fma_f32 v2, 0x40549a78, v0, -v1 -; GFX11-NEXT: v_rndne_f32_e32 v3, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX11-NEXT: v_rndne_f32_e32 v2, v1 +; GFX11-NEXT: v_fma_f32 v3, 0x40549a78, v0, -v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX11-NEXT: v_fmamk_f32 v3, v0, 0x33979a37, v3 +; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc23369f4, v0 -; GFX11-NEXT: v_fmamk_f32 v2, v0, 0x33979a37, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX11-NEXT: v_exp_f32_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_ldexp_f32 v1, v1, v2 @@ -19680,7 +18581,7 @@ define bfloat @v_exp10_bf16(bfloat %a) { ; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.exp10.bf16(bfloat %a) ret bfloat %op @@ -19708,34 +18609,34 @@ define bfloat @v_ceil_bf16(bfloat %a) { ; GFX8-LABEL: v_ceil_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_ceil_f32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ceil_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_ceil_f32_e32 v0, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ceil_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_ceil_f32_e32 v0, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_ceil_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_ceil_f32_e32 v0, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.ceil.bf16(bfloat %a) ret bfloat %op @@ -19763,34 +18664,34 @@ define bfloat @v_trunc_bf16(bfloat %a) { ; GFX8-LABEL: v_trunc_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_trunc_f32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_trunc_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: 
v_trunc_f32_e32 v0, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_trunc_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_trunc_f32_e32 v0, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_trunc_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f32_e32 v0, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.trunc.bf16(bfloat %a) ret bfloat %op @@ -19818,34 +18719,34 @@ define bfloat @v_rint_bf16(bfloat %a) { ; GFX8-LABEL: v_rint_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_rndne_f32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_rint_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_rndne_f32_e32 v0, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_rint_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_rndne_f32_e32 v0, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_rint_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rndne_f32_e32 v0, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.rint.bf16(bfloat %a) ret bfloat %op @@ -19873,34 +18774,34 @@ define bfloat @v_nearbyint_bf16(bfloat %a) { ; GFX8-LABEL: v_nearbyint_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_rndne_f32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_nearbyint_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_rndne_f32_e32 v0, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_nearbyint_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 
0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_rndne_f32_e32 v0, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_nearbyint_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rndne_f32_e32 v0, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.nearbyint.bf16(bfloat %a) ret bfloat %op @@ -19940,7 +18841,7 @@ define bfloat @v_round_bf16(bfloat %a) { ; GFX8-LABEL: v_round_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_trunc_f32_e32 v1, v0 ; GFX8-NEXT: v_sub_f32_e32 v2, v0, v1 ; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5 @@ -19948,13 +18849,13 @@ define bfloat @v_round_bf16(bfloat %a) { ; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_bfi_b32 v0, s4, v2, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_round_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v0 ; GFX9-NEXT: v_sub_f32_e32 v2, v0, v1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5 @@ -19962,26 +18863,26 @@ define bfloat @v_round_bf16(bfloat %a) { ; GFX9-NEXT: s_brev_b32 s4, -2 ; GFX9-NEXT: v_bfi_b32 v0, s4, v2, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_round_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_trunc_f32_e32 v1, v0 ; GFX10-NEXT: v_sub_f32_e32 v2, v0, v1 ; GFX10-NEXT: v_cmp_ge_f32_e64 s4, |v2|, 0.5 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s4 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0 ; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_round_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f32_e32 v1, v0 ; GFX11-NEXT: v_sub_f32_e32 v2, v0, v1 @@ -19992,7 +18893,7 @@ define bfloat @v_round_bf16(bfloat %a) { ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0 ; GFX11-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.round.bf16(bfloat %a) ret bfloat %op @@ -20020,34 +18921,34 @@ define bfloat @v_roundeven_bf16(bfloat %a) { ; GFX8-LABEL: v_roundeven_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, 
v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_rndne_f32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_roundeven_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_rndne_f32_e32 v0, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_roundeven_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_rndne_f32_e32 v0, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_roundeven_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rndne_f32_e32 v0, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.roundeven.bf16(bfloat %a) ret bfloat %op @@ -20075,34 +18976,34 @@ define bfloat @v_floor_bf16(bfloat %a) { ; GFX8-LABEL: v_floor_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_floor_f32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_floor_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_floor_f32_e32 v0, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_floor_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_floor_f32_e32 v0, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_floor_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_floor_f32_e32 v0, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.floor.bf16(bfloat %a) ret bfloat %op @@ -20124,21 +19025,34 @@ define bfloat @v_canonicalize_bf16(bfloat %a) { ; GFX8-LABEL: v_canonicalize_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_canonicalize_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_canonicalize_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_canonicalize_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.canonicalize.bf16(bfloat %a) ret bfloat %op @@ -20214,8 +19128,8 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) { ; GFX8-LABEL: v_fcmp_oeq_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -20223,8 +19137,8 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) { ; GFX9-LABEL: v_fcmp_oeq_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -20232,8 +19146,8 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) { ; GFX10-LABEL: v_fcmp_oeq_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -20241,8 +19155,8 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) { ; GFX11-LABEL: v_fcmp_oeq_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -20273,8 +19187,8 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) { ; GFX8-LABEL: v_fcmp_ogt_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -20282,8 +19196,8 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) { ; GFX9-LABEL: v_fcmp_ogt_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, 
v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -20291,8 +19205,8 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) { ; GFX10-LABEL: v_fcmp_ogt_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -20300,8 +19214,8 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) { ; GFX11-LABEL: v_fcmp_ogt_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -20332,8 +19246,8 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) { ; GFX8-LABEL: v_fcmp_oge_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -20341,8 +19255,8 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) { ; GFX9-LABEL: v_fcmp_oge_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -20350,8 +19264,8 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) { ; GFX10-LABEL: v_fcmp_oge_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -20359,8 +19273,8 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) { ; GFX11-LABEL: v_fcmp_oge_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -20391,8 +19305,8 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) { ; GFX8-LABEL: v_fcmp_olt_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, 
vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -20400,8 +19314,8 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) { ; GFX9-LABEL: v_fcmp_olt_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -20409,8 +19323,8 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) { ; GFX10-LABEL: v_fcmp_olt_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -20418,8 +19332,8 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) { ; GFX11-LABEL: v_fcmp_olt_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -20450,8 +19364,8 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) { ; GFX8-LABEL: v_fcmp_ole_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_cmp_le_f32_e32 vcc, v0, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -20459,8 +19373,8 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) { ; GFX9-LABEL: v_fcmp_ole_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_cmp_le_f32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -20468,8 +19382,8 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) { ; GFX10-LABEL: v_fcmp_ole_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -20477,8 +19391,8 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) { ; GFX11-LABEL: v_fcmp_ole_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -20509,8 +19423,8 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) { ; GFX8-LABEL: 
v_fcmp_one_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -20518,8 +19432,8 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) { ; GFX9-LABEL: v_fcmp_one_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -20527,8 +19441,8 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) { ; GFX10-LABEL: v_fcmp_one_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_cmp_lg_f32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -20536,8 +19450,8 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) { ; GFX11-LABEL: v_fcmp_one_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_lg_f32_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -20568,8 +19482,8 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) { ; GFX8-LABEL: v_fcmp_uno_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -20577,8 +19491,8 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) { ; GFX9-LABEL: v_fcmp_uno_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -20586,8 +19500,8 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) { ; GFX10-LABEL: v_fcmp_uno_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -20595,8 +19509,8 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) { ; GFX11-LABEL: v_fcmp_uno_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: 
v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -20627,8 +19541,8 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) { ; GFX8-LABEL: v_fcmp_ueq_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -20636,8 +19550,8 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) { ; GFX9-LABEL: v_fcmp_ueq_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -20645,8 +19559,8 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) { ; GFX10-LABEL: v_fcmp_ueq_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_cmp_nlg_f32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -20654,8 +19568,8 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) { ; GFX11-LABEL: v_fcmp_ueq_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_nlg_f32_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -20686,8 +19600,8 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) { ; GFX8-LABEL: v_fcmp_ugt_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -20695,8 +19609,8 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) { ; GFX9-LABEL: v_fcmp_ugt_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -20704,8 +19618,8 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) { ; GFX10-LABEL: v_fcmp_ugt_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: 
v_cmp_nle_f32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -20713,8 +19627,8 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) { ; GFX11-LABEL: v_fcmp_ugt_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_nle_f32_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -20745,8 +19659,8 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) { ; GFX8-LABEL: v_fcmp_uge_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -20754,8 +19668,8 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) { ; GFX9-LABEL: v_fcmp_uge_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -20763,8 +19677,8 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) { ; GFX10-LABEL: v_fcmp_uge_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -20772,8 +19686,8 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) { ; GFX11-LABEL: v_fcmp_uge_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -20804,8 +19718,8 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) { ; GFX8-LABEL: v_fcmp_ult_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -20813,8 +19727,8 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) { ; GFX9-LABEL: v_fcmp_ult_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -20822,8 +19736,8 
@@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) { ; GFX10-LABEL: v_fcmp_ult_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -20831,8 +19745,8 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) { ; GFX11-LABEL: v_fcmp_ult_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -20863,8 +19777,8 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) { ; GFX8-LABEL: v_fcmp_ule_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -20872,8 +19786,8 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) { ; GFX9-LABEL: v_fcmp_ule_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -20881,8 +19795,8 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) { ; GFX10-LABEL: v_fcmp_ule_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -20890,8 +19804,8 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) { ; GFX11-LABEL: v_fcmp_ule_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -20922,8 +19836,8 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) { ; GFX8-LABEL: v_fcmp_une_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -20931,8 +19845,8 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) { ; GFX9-LABEL: v_fcmp_une_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -20940,8 +19854,8 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) { ; GFX10-LABEL: v_fcmp_une_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_cmp_neq_f32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -20949,8 +19863,8 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) { ; GFX11-LABEL: v_fcmp_une_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_neq_f32_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -21025,41 +19939,34 @@ define bfloat @v_copysign_bf16_bf16(bfloat %mag, bfloat %sign) { ; GFX8-LABEL: v_copysign_bf16_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fff -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_copysign_bf16_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v1, 0x80000000, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff8000, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_copysign_bf16_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, 0x7fff -; GFX10-NEXT: v_and_b32_e32 v1, 0x80000000, v1 -; GFX10-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff8000, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_copysign_bf16_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0x80000000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff8000, v1 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) ret bfloat %op @@ -21089,45 +19996,36 @@ define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) { ; GFX8-LABEL: v_copysign_bf16_s_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s4, s4, 0x80000000 -; GFX8-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX8-NEXT: s_lshr_b32 s4, s4, 16 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff8000 +; GFX8-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_copysign_bf16_s_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s5, 0x7fff -; GFX9-NEXT: s_and_b32 s4, s4, 0x80000000 -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 +; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_copysign_bf16_s_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX10-NEXT: s_and_b32 s4, s4, 0x80000000 -; GFX10-NEXT: s_lshr_b32 s4, s4, 16 -; GFX10-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_e64 v1, 0xffff8000, s4 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_copysign_bf16_s_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_and_b32 s0, s0, 0x80000000 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-NEXT: v_and_b32_e64 v1, 0xffff8000, s0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) ret bfloat %op @@ -21157,43 +20055,36 @@ define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) { ; GFX8-LABEL: v_copysign_s_bf16_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s4, s4, 16 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x7fff +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 ; GFX8-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX8-NEXT: 
v_and_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_copysign_s_bf16_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s4, s4, 16 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0x80000000, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_copysign_s_bf16_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s4, s4, 16 -; GFX10-NEXT: v_and_b32_e32 v0, 0x80000000, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 ; GFX10-NEXT: v_and_b32_e64 v1, 0x7fff, s4 -; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_copysign_s_bf16_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0x80000000, v0 -; GFX11-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 ; GFX11-NEXT: v_and_b32_e64 v1, 0x7fff, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) ret bfloat %op @@ -21223,41 +20114,35 @@ define bfloat @v_copysign_bf16_f32(bfloat %mag, float %sign.f32) { ; GFX8-LABEL: v_copysign_bf16_f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fff -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8-NEXT: v_and_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_copysign_bf16_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9-NEXT: v_and_b32_e32 v1, 0x80000000, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_copysign_bf16_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0x80000000, v1 -; GFX10-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; 
GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_copysign_bf16_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v1, 0x80000000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %sign = fptrunc float %sign.f32 to bfloat %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) @@ -21288,41 +20173,35 @@ define bfloat @v_copysign_bf16_f64(bfloat %mag, double %sign.f64) { ; GFX8-LABEL: v_copysign_bf16_f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8-NEXT: v_and_b32_e32 v1, 0x80000000, v2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_copysign_bf16_f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9-NEXT: v_and_b32_e32 v1, 0x80000000, v2 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_copysign_bf16_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX10-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0x80000000, v2 -; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_copysign_bf16_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v1, 0x80000000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %sign = 
fptrunc double %sign.f64 to bfloat %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) @@ -21353,40 +20232,34 @@ define bfloat @v_copysign_bf16_f16(bfloat %mag, half %sign.f16) { ; GFX8-LABEL: v_copysign_bf16_f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fff ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_copysign_bf16_f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff8000, v1 -; GFX9-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_copysign_bf16_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, 0x7fff ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff8000, v1 -; GFX10-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_copysign_bf16_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff8000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %sign = bitcast half %sign.f16 to bfloat %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) @@ -21412,50 +20285,43 @@ define amdgpu_ps i32 @s_copysign_bf16_bf16(bfloat inreg %mag, bfloat inreg %sign ; ; GFX8-LABEL: s_copysign_bf16_bf16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s0, s0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, 0x7fff -; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX8-NEXT: s_and_b32 s0, s1, 0x80000000 -; GFX8-NEXT: s_lshr_b32 s0, s0, 16 -; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0xffff8000 +; GFX8-NEXT: v_mov_b32_e32 v1, 0x7fff +; GFX8-NEXT: v_and_b32_e32 v0, s1, v0 +; GFX8-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_copysign_bf16_bf16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s0, s0, 16 -; GFX9-NEXT: v_mov_b32_e32 v0, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX9-NEXT: s_and_b32 s0, s1, 0x80000000 -; GFX9-NEXT: s_lshr_b32 s0, s0, 16 -; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff8000 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v0, s1, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 
s0, v1 +; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_copysign_bf16_bf16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s0, s0, 16 -; GFX10-NEXT: v_and_b32_e64 v0, 0x7fff, s0 -; GFX10-NEXT: s_and_b32 s0, s1, 0x80000000 -; GFX10-NEXT: s_lshr_b32 s0, s0, 16 -; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX10-NEXT: v_and_b32_e64 v0, 0xffff8000, s1 +; GFX10-NEXT: v_and_b32_e64 v1, 0x7fff, s0 +; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_copysign_bf16_bf16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_and_b32_e64 v0, 0x7fff, s0 -; GFX11-NEXT: s_and_b32 s0, s1, 0x80000000 -; GFX11-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s1 +; GFX11-NEXT: v_and_b32_e64 v1, 0x7fff, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) @@ -21484,7 +20350,6 @@ define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f ; ; GFX8-LABEL: s_copysign_bf16_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s0, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x7fff ; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX8-NEXT: s_and_b32 s0, s1, 0x80000000 @@ -21496,7 +20361,6 @@ define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f ; ; GFX9-LABEL: s_copysign_bf16_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7fff ; GFX9-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX9-NEXT: s_and_b32 s0, s1, 0x80000000 @@ -21508,7 +20372,6 @@ define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f ; ; GFX10-LABEL: s_copysign_bf16_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10-NEXT: v_and_b32_e64 v0, 0x7fff, s0 ; GFX10-NEXT: s_and_b32 s0, s1, 0x80000000 ; GFX10-NEXT: s_lshr_b32 s0, s0, 16 @@ -21519,10 +20382,9 @@ define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f ; ; GFX11-LABEL: s_copysign_bf16_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_and_b32_e64 v0, 0x7fff, s0 ; GFX11-NEXT: s_and_b32 s0, s1, 0x80000000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshr_b32 s0, s0, 16 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_or_b32_e32 v0, s0, v0 @@ -21557,7 +20419,6 @@ define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign. ; ; GFX8-LABEL: s_copysign_bf16_f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s0, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x7fff ; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX8-NEXT: s_and_b32 s0, s2, 0x80000000 @@ -21569,7 +20430,6 @@ define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign. 
; ; GFX9-LABEL: s_copysign_bf16_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7fff ; GFX9-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX9-NEXT: s_and_b32 s0, s2, 0x80000000 @@ -21581,7 +20441,6 @@ define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign. ; ; GFX10-LABEL: s_copysign_bf16_f64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10-NEXT: v_and_b32_e64 v0, 0x7fff, s0 ; GFX10-NEXT: s_and_b32 s0, s2, 0x80000000 ; GFX10-NEXT: s_lshr_b32 s0, s0, 16 @@ -21592,10 +20451,9 @@ define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign. ; ; GFX11-LABEL: s_copysign_bf16_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_and_b32_e64 v0, 0x7fff, s0 ; GFX11-NEXT: s_and_b32 s0, s2, 0x80000000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshr_b32 s0, s0, 16 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_or_b32_e32 v0, s0, v0 @@ -21632,7 +20490,6 @@ define amdgpu_ps i32 @s_copysign_bf16_f16(bfloat inreg %mag, half inreg %sign.f1 ; ; GFX8-LABEL: s_copysign_bf16_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s0, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x7fff ; GFX8-NEXT: v_and_b32_e32 v0, s1, v0 @@ -21644,7 +20501,6 @@ define amdgpu_ps i32 @s_copysign_bf16_f16(bfloat inreg %mag, half inreg %sign.f1 ; ; GFX9-LABEL: s_copysign_bf16_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fff ; GFX9-NEXT: v_and_b32_e32 v0, s1, v0 @@ -21656,7 +20512,6 @@ define amdgpu_ps i32 @s_copysign_bf16_f16(bfloat inreg %mag, half inreg %sign.f1 ; ; GFX10-LABEL: s_copysign_bf16_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10-NEXT: v_and_b32_e64 v0, 0xffff8000, s1 ; GFX10-NEXT: v_and_b32_e64 v1, 0x7fff, s0 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 @@ -21666,7 +20521,6 @@ define amdgpu_ps i32 @s_copysign_bf16_f16(bfloat inreg %mag, half inreg %sign.f1 ; ; GFX11-LABEL: s_copysign_bf16_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s0, s0, 16 ; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s1 ; GFX11-NEXT: v_and_b32_e64 v1, 0x7fff, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -21703,6 +20557,7 @@ define float @v_copysign_f32_bf16(float %mag, bfloat %sign.bf16) { ; GFX8-LABEL: v_copysign_f32_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -21710,6 +20565,7 @@ define float @v_copysign_f32_bf16(float %mag, bfloat %sign.bf16) { ; GFX9-LABEL: v_copysign_f32_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: s_brev_b32 s4, -2 ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -21717,12 +20573,15 @@ define float @v_copysign_f32_bf16(float %mag, bfloat %sign.bf16) { ; GFX10-LABEL: v_copysign_f32_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_copysign_f32_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %sign = fpext bfloat %sign.bf16 to float @@ -21751,32 +20610,32 @@ define amdgpu_ps i32 @s_copysign_f32_bf16(float inreg %mag, bfloat inreg %sign.b ; ; GFX8-LABEL: s_copysign_f32_bf16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_brev_b32 s2, -2 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX8-NEXT: v_lshlrev_b32_e64 v0, 16, s1 +; GFX8-NEXT: s_brev_b32 s1, -2 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_bfi_b32 v0, s1, v1, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_copysign_f32_bf16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_brev_b32 s2, -2 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX9-NEXT: v_lshlrev_b32_e64 v0, 16, s1 +; GFX9-NEXT: s_brev_b32 s1, -2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_bfi_b32 v0, s1, v1, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_copysign_f32_bf16: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-NEXT: v_lshlrev_b32_e64 v0, 16, s1 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_copysign_f32_bf16: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 @@ -21816,7 +20675,6 @@ define half @v_copysign_f16_bf16(half %mag, bfloat %sign.bf16) { ; GFX8-LABEL: v_copysign_f16_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -21824,7 +20682,6 @@ define half @v_copysign_f16_bf16(half %mag, bfloat %sign.bf16) { ; GFX9-LABEL: v_copysign_f16_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -21832,15 +20689,12 @@ define half @v_copysign_f16_bf16(half %mag, bfloat %sign.bf16) { ; GFX10-LABEL: v_copysign_f16_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_copysign_f16_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %sign = bitcast bfloat %sign.bf16 to half @@ -21875,7 +20729,6 @@ define amdgpu_ps i32 @s_copysign_f16_bf16(half inreg %mag, bfloat inreg %sign.bf ; ; GFX8-LABEL: s_copysign_f16_bf16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s1, s1, 16 ; GFX8-NEXT: s_movk_i32 s2, 0x7fff ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -21886,7 +20739,6 @@ define amdgpu_ps i32 @s_copysign_f16_bf16(half inreg %mag, bfloat inreg %sign.bf ; ; GFX9-LABEL: s_copysign_f16_bf16: ; GFX9: ; %bb.0: -; GFX9-NEXT: 
s_lshr_b32 s1, s1, 16 ; GFX9-NEXT: s_movk_i32 s2, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -21897,7 +20749,6 @@ define amdgpu_ps i32 @s_copysign_f16_bf16(half inreg %mag, bfloat inreg %sign.bf ; ; GFX10-LABEL: s_copysign_f16_bf16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s1, s1, 16 ; GFX10-NEXT: v_mov_b32_e32 v0, s1 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -21906,12 +20757,11 @@ define amdgpu_ps i32 @s_copysign_f16_bf16(half inreg %mag, bfloat inreg %sign.bf ; ; GFX11-LABEL: s_copysign_f16_bf16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s1, s1, 16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %sign = bitcast bfloat %sign.bf16 to half @@ -21942,6 +20792,7 @@ define double @v_copysign_f64_bf16(double %mag, bfloat %sign.bf16) { ; GFX8-LABEL: v_copysign_f64_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -21949,6 +20800,7 @@ define double @v_copysign_f64_bf16(double %mag, bfloat %sign.bf16) { ; GFX9-LABEL: v_copysign_f64_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: s_brev_b32 s4, -2 ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -21956,12 +20808,15 @@ define double @v_copysign_f64_bf16(double %mag, bfloat %sign.bf16) { ; GFX10-LABEL: v_copysign_f64_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_copysign_f64_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %sign = fpext bfloat %sign.bf16 to double @@ -21990,32 +20845,32 @@ define amdgpu_ps <2 x i32> @s_copysign_f64_bf16(double inreg %mag, bfloat inreg ; ; GFX8-LABEL: s_copysign_f64_bf16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_brev_b32 s3, -2 -; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_bfi_b32 v0, s3, v0, v1 +; GFX8-NEXT: v_lshlrev_b32_e64 v0, 16, s2 +; GFX8-NEXT: s_brev_b32 s2, -2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_bfi_b32 v0, s2, v1, v0 ; GFX8-NEXT: v_readfirstlane_b32 s1, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_copysign_f64_bf16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_brev_b32 s3, -2 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_bfi_b32 v0, s3, v0, v1 +; GFX9-NEXT: v_lshlrev_b32_e64 v0, 16, s2 +; GFX9-NEXT: s_brev_b32 s2, -2 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_bfi_b32 v0, s2, v1, v0 ; GFX9-NEXT: v_readfirstlane_b32 s1, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_copysign_f64_bf16: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v0, s2 
+; GFX10-NEXT: v_lshlrev_b32_e64 v0, 16, s2 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_copysign_f64_bf16: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 ; GFX11-NEXT: v_readfirstlane_b32 s1, v0 @@ -22050,28 +20905,28 @@ define i16 @v_fptosi_bf16_to_i16(bfloat %x) { ; GFX8-LABEL: v_fptosi_bf16_to_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fptosi_bf16_to_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fptosi_bf16_to_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fptosi_bf16_to_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -22188,13 +21043,13 @@ define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) { ; GFX8-LABEL: v_fptosi_v3bf16_to_v3i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX8-NEXT: v_cvt_i32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_cvt_i32_f32_sdwa v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX8-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fptosi_v3bf16_to_v3i16: @@ -22285,65 +21140,65 @@ define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) { ; GFX8-LABEL: v_fptosi_v4bf16_to_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX8-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX8-NEXT: v_cvt_i32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GFX8-NEXT: v_cvt_i32_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_cvt_i32_f32_sdwa v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX8-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX8-NEXT: v_cvt_i32_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX8-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fptosi_v4bf16_to_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fptosi_v4bf16_to_v4i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX10-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 -; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fptosi_v4bf16_to_v4i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX11-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v1, v3, 
0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fptosi <4 x bfloat> %x to <4 x i16> ret <4 x i16> %op @@ -22367,28 +21222,28 @@ define i32 @v_fptosi_bf16_to_i32(bfloat %x) { ; GFX8-LABEL: v_fptosi_bf16_to_i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fptosi_bf16_to_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fptosi_bf16_to_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fptosi_bf16_to_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -22663,7 +21518,7 @@ define i64 @v_fptosi_bf16_to_i64(bfloat %x) { ; GFX8-LABEL: v_fptosi_bf16_to_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_trunc_f32_e32 v0, v0 ; GFX8-NEXT: s_mov_b32 s4, 0x2f800000 ; GFX8-NEXT: v_mul_f32_e64 v1, |v0|, s4 @@ -22682,26 +21537,26 @@ define i64 @v_fptosi_bf16_to_i64(bfloat %x) { ; GFX9-LABEL: v_fptosi_bf16_to_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_trunc_f32_e32 v0, v0 ; GFX9-NEXT: s_mov_b32 s4, 0x2f800000 ; GFX9-NEXT: v_mul_f32_e64 v1, |v0|, s4 ; GFX9-NEXT: v_floor_f32_e32 v1, v1 ; GFX9-NEXT: s_mov_b32 s4, 0xcf800000 -; GFX9-NEXT: v_fma_f32 v2, v1, s4, |v0| -; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v1 +; GFX9-NEXT: v_fma_f32 v1, v1, s4, |v0| ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v0 -; GFX9-NEXT: v_xor_b32_e32 v0, v2, v3 -; GFX9-NEXT: v_xor_b32_e32 v1, v1, v3 +; GFX9-NEXT: v_xor_b32_e32 v2, v2, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, v1, v3 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fptosi_bf16_to_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_trunc_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v0| ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v0 @@ -22718,7 +21573,7 @@ define i64 @v_fptosi_bf16_to_i64(bfloat %x) { ; GFX11-LABEL: v_fptosi_bf16_to_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f32_e32 v0, v0 ; GFX11-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v0| @@ -23557,21 +22412,21 @@ define bfloat 
@v_sitofp_i16_to_bf16(i16 %x) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_i16_to_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_i16_to_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_sitofp_i16_to_bf16: @@ -23580,7 +22435,7 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) { ; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = sitofp i16 %x to bfloat ret bfloat %op @@ -23614,8 +22469,8 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_v2i16_to_v2bf16: @@ -23623,8 +22478,8 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v2i16_to_v2bf16: @@ -23632,7 +22487,7 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x3020706 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_sitofp_v2i16_to_v2bf16: @@ -23644,7 +22499,7 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020706 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = sitofp <2 x i16> %x to <2 x bfloat> ret <2 x bfloat> %op @@ -23682,23 +22537,26 @@ define <3 x bfloat> 
@v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX8-LABEL: v_sitofp_v3i16_to_v3bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX8-NEXT: v_cvt_f32_i32_sdwa v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_v3i16_to_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v3, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v3i16_to_v3bf16: @@ -23706,9 +22564,10 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_sitofp_v3i16_to_v3bf16: @@ -23716,15 +22575,17 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 16, v0 ; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 16, v1 ; GFX11-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 +; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = sitofp <3 x i16> %x to <3 x bfloat> ret <3 x bfloat> %op @@ -23768,59 +22629,55 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX8-LABEL: v_sitofp_v4i16_to_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX8-NEXT: v_cvt_f32_i32_sdwa v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX8-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_v4i16_to_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v3, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v4i16_to_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x3020706 -; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_cvt_f32_i32_sdwa v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_sitofp_v4i16_to_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_bfe_i32 v2, v1, 0, 16 -; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v1 -; GFX11-NEXT: v_ashrrev_i32_e32 v3, 16, v0 +; GFX11-NEXT: v_ashrrev_i32_e32 v2, 16, v0 ; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 16, v1 +; GFX11-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3 +; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x3020706 -; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 +; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = sitofp <4 x i16> %x to <4 x bfloat> ret <4 x bfloat> %op @@ -23845,21 +22702,21 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_i32_to_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_i32_to_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_sitofp_i32_to_bf16: @@ -23867,7 +22724,7 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = sitofp i32 %x to bfloat ret bfloat %op @@ -23897,34 +22754,34 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_v2i32_to_v2bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_perm_b32 v0, v0, 
v1, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v2i32_to_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x3020706 +; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_sitofp_v2i32_to_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020706 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = sitofp <2 x i32> %x to <2 x bfloat> ret <2 x bfloat> %op @@ -23956,45 +22813,34 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX8-LABEL: v_sitofp_v3i32_to_v3bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX8-NEXT: v_cvt_f32_i32_e32 v3, v1 +; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 +; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_v3i32_to_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v1 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_alignbit_b32 v1, s4, v2, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v3i32_to_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x3020706 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX10-NEXT: v_alignbit_b32 v1, s4, v2, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_sitofp_v3i32_to_v3bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020706 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] %op = sitofp <3 x i32> %x to <3 x bfloat> ret <3 x bfloat> %op } @@ -24029,53 +22875,49 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX8-LABEL: v_sitofp_v4i32_to_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_i32_e32 v3, v3 ; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX8-NEXT: 
v_cvt_f32_i32_e32 v3, v3 ; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX8-NEXT: v_perm_b32 v1, v2, v3, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_v4i32_to_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v3 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v3 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v4i32_to_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x3020706 -; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX10-NEXT: v_cvt_f32_i32_e32 v3, v3 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_sitofp_v4i32_to_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020706 -; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = sitofp <4 x i32> %x to <4 x bfloat> ret <4 x bfloat> %op @@ -24133,7 +22975,7 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) { ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v2 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_i64_to_bf16: @@ -24151,7 +22993,7 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) { ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, 32, v2 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_i64_to_bf16: @@ -24169,7 +23011,7 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) { ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v2 ; 
GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_sitofp_i64_to_bf16: @@ -24192,7 +23034,7 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) { ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = sitofp i64 %x to bfloat ret bfloat %op @@ -24264,72 +23106,72 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX8-LABEL: v_sitofp_v2i64_to_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v5, v2, v3 -; GFX8-NEXT: v_ffbh_i32_e32 v4, v3 +; GFX8-NEXT: v_xor_b32_e32 v5, v0, v1 +; GFX8-NEXT: v_ffbh_i32_e32 v4, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v5 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v5 ; GFX8-NEXT: v_min_u32_e32 v4, v4, v5 -; GFX8-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] -; GFX8-NEXT: v_xor_b32_e32 v5, v0, v1 -; GFX8-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX8-NEXT: v_ffbh_i32_e32 v3, v1 -; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v5 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, -1, v3 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v5 -; GFX8-NEXT: v_min_u32_e32 v3, v3, v5 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] -; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] +; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_xor_b32_e32 v1, v2, v3 +; GFX8-NEXT: v_cvt_f32_i32_e32 v5, v0 +; GFX8-NEXT: v_ffbh_i32_e32 v0, v3 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v1 +; GFX8-NEXT: v_min_u32_e32 v6, v0, v1 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3] +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v4 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 32, v4 -; GFX8-NEXT: v_ldexp_f32 v1, v2, v4 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v3 +; GFX8-NEXT: v_ldexp_f32 v1, v5, v2 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_v2i64_to_v2bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v5, v2, v3 -; GFX9-NEXT: v_ffbh_i32_e32 v4, v3 +; GFX9-NEXT: v_xor_b32_e32 v5, v0, v1 +; GFX9-NEXT: v_ffbh_i32_e32 v4, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5 ; GFX9-NEXT: v_add_u32_e32 v4, -1, v4 ; GFX9-NEXT: v_add_u32_e32 v5, 32, v5 ; GFX9-NEXT: v_min_u32_e32 v4, v4, v5 -; GFX9-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] -; GFX9-NEXT: v_xor_b32_e32 v5, v0, v1 -; GFX9-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX9-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX9-NEXT: v_ffbh_i32_e32 v3, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5 -; GFX9-NEXT: v_add_u32_e32 v3, -1, v3 -; GFX9-NEXT: v_add_u32_e32 v5, 32, v5 -; GFX9-NEXT: v_min_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX9-NEXT: v_lshlrev_b64 
v[0:1], v4, v[0:1] +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX9-NEXT: v_or_b32_e32 v5, v1, v0 +; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3 +; GFX9-NEXT: v_ffbh_i32_e32 v0, v3 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX9-NEXT: v_add_u32_e32 v0, -1, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 32, v1 +; GFX9-NEXT: v_min_u32_e32 v6, v0, v1 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3] +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v5 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, 32, v4 ; GFX9-NEXT: v_ldexp_f32 v1, v2, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v3 +; GFX9-NEXT: v_sub_u32_e32 v2, 32, v6 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v2i64_to_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_xor_b32_e32 v4, v2, v3 -; GFX10-NEXT: v_xor_b32_e32 v5, v0, v1 -; GFX10-NEXT: v_ffbh_i32_e32 v6, v3 -; GFX10-NEXT: v_ffbh_i32_e32 v7, v1 +; GFX10-NEXT: v_xor_b32_e32 v4, v0, v1 +; GFX10-NEXT: v_xor_b32_e32 v5, v2, v3 +; GFX10-NEXT: v_ffbh_i32_e32 v6, v1 +; GFX10-NEXT: v_ffbh_i32_e32 v7, v3 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v4 ; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v5 ; GFX10-NEXT: v_add_nc_u32_e32 v6, -1, v6 @@ -24338,28 +23180,28 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX10-NEXT: v_add_nc_u32_e32 v5, 32, v5 ; GFX10-NEXT: v_min_u32_e32 v4, v6, v4 ; GFX10-NEXT: v_min_u32_e32 v5, v7, v5 -; GFX10-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] -; GFX10-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3] ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX10-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX10-NEXT: v_min_u32_e32 v2, 1, v2 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v4 +; GFX10-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, 32, v4 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v5 -; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX10-NEXT: v_ldexp_f32 v1, v2, v1 -; GFX10-NEXT: v_ldexp_f32 v0, v0, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x3020706 +; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX10-NEXT: v_ldexp_f32 v1, v1, v3 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_sitofp_v2i64_to_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v4, v2, v3 -; GFX11-NEXT: v_xor_b32_e32 v5, v0, v1 -; GFX11-NEXT: v_cls_i32_e32 v6, v3 -; GFX11-NEXT: v_cls_i32_e32 v7, v1 +; GFX11-NEXT: v_xor_b32_e32 v4, v0, v1 +; GFX11-NEXT: v_xor_b32_e32 v5, v2, v3 +; GFX11-NEXT: v_cls_i32_e32 v6, v1 +; GFX11-NEXT: v_cls_i32_e32 v7, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v4 ; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v5 @@ -24373,24 +23215,24 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX11-NEXT: v_min_u32_e32 v4, v6, v4 ; GFX11-NEXT: v_min_u32_e32 v5, v7, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 
v4, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v4 +; GFX11-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-NEXT: v_sub_nc_u32_e32 v2, 32, v4 ; GFX11-NEXT: v_sub_nc_u32_e32 v3, 32, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_ldexp_f32 v1, v2, v1 -; GFX11-NEXT: v_ldexp_f32 v0, v0, v3 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX11-NEXT: v_ldexp_f32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020706 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = sitofp <2 x i64> %x to <2 x bfloat> ret <2 x bfloat> %op @@ -24488,188 +23330,133 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX8-LABEL: v_sitofp_v3i64_to_v3bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v7, v4, v5 -; GFX8-NEXT: v_ffbh_i32_e32 v6, v5 +; GFX8-NEXT: v_xor_b32_e32 v7, v0, v1 +; GFX8-NEXT: v_ffbh_i32_e32 v6, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v7 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, -1, v6 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 32, v7 ; GFX8-NEXT: v_min_u32_e32 v6, v6, v7 -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v6 -; GFX8-NEXT: v_xor_b32_e32 v6, v2, v3 -; GFX8-NEXT: v_ldexp_f32 v5, v4, v5 -; GFX8-NEXT: v_ffbh_i32_e32 v4, v3 -; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v6 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 32, v6 -; GFX8-NEXT: v_min_u32_e32 v6, v4, v6 -; GFX8-NEXT: v_lshlrev_b64 v[3:4], v6, v[2:3] -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GFX8-NEXT: v_min_u32_e32 v3, 1, v3 -; GFX8-NEXT: v_xor_b32_e32 v5, v0, v1 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX8-NEXT: v_ffbh_i32_e32 v4, v1 -; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v5 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v5 -; GFX8-NEXT: v_min_u32_e32 v4, v4, v5 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; GFX8-NEXT: v_cvt_f32_i32_e32 v3, v3 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] +; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_xor_b32_e32 v1, v2, v3 +; GFX8-NEXT: v_cvt_f32_i32_e32 v7, v0 +; GFX8-NEXT: v_ffbh_i32_e32 v0, v3 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v1 +; GFX8-NEXT: v_min_u32_e32 v8, v0, v1 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v6 -; GFX8-NEXT: v_ldexp_f32 v1, v3, v5 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v4 -; 
GFX8-NEXT: v_ldexp_f32 v0, v0, v3 -; GFX8-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v8 +; GFX8-NEXT: v_ldexp_f32 v2, v7, v2 +; GFX8-NEXT: v_ldexp_f32 v3, v0, v1 +; GFX8-NEXT: v_xor_b32_e32 v1, v4, v5 +; GFX8-NEXT: v_ffbh_i32_e32 v0, v5 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v1 +; GFX8-NEXT: v_min_u32_e32 v6, v0, v1 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6 +; GFX8-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_v3i64_to_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v7, v4, v5 -; GFX9-NEXT: v_ffbh_i32_e32 v6, v5 +; GFX9-NEXT: v_xor_b32_e32 v7, v0, v1 +; GFX9-NEXT: v_ffbh_i32_e32 v6, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v7 ; GFX9-NEXT: v_add_u32_e32 v6, -1, v6 ; GFX9-NEXT: v_add_u32_e32 v7, 32, v7 ; GFX9-NEXT: v_min_u32_e32 v6, v6, v7 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4 -; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6 -; GFX9-NEXT: v_xor_b32_e32 v6, v2, v3 -; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v6 -; GFX9-NEXT: v_ldexp_f32 v5, v4, v5 -; GFX9-NEXT: v_ffbh_i32_e32 v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, -1, v4 -; GFX9-NEXT: v_add_u32_e32 v6, 32, v6 -; GFX9-NEXT: v_min_u32_e32 v6, v4, v6 -; GFX9-NEXT: v_lshlrev_b64 v[3:4], v6, v[2:3] -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GFX9-NEXT: v_min_u32_e32 v3, 1, v3 -; GFX9-NEXT: v_xor_b32_e32 v5, v0, v1 -; GFX9-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX9-NEXT: v_ffbh_i32_e32 v4, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5 -; GFX9-NEXT: v_add_u32_e32 v4, -1, v4 -; GFX9-NEXT: v_add_u32_e32 v5, 32, v5 -; GFX9-NEXT: v_min_u32_e32 v4, v4, v5 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v3 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] +; GFX9-NEXT: v_sub_u32_e32 v6, 32, v6 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v6 -; GFX9-NEXT: v_ldexp_f32 v1, v3, v1 -; GFX9-NEXT: v_sub_u32_e32 v3, 32, v4 -; GFX9-NEXT: v_ldexp_f32 v0, v0, v3 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3 +; GFX9-NEXT: v_cvt_f32_i32_e32 v7, v0 +; GFX9-NEXT: v_ffbh_i32_e32 v0, v3 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX9-NEXT: v_add_u32_e32 v0, -1, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 32, v1 +; GFX9-NEXT: v_min_u32_e32 v8, v0, v1 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] +; GFX9-NEXT: v_ldexp_f32 v2, v7, v6 +; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX9-NEXT: v_xor_b32_e32 v1, v4, v5 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v0 +; GFX9-NEXT: v_ffbh_i32_e32 v0, v5 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX9-NEXT: v_add_u32_e32 v0, -1, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 32, v1 +; GFX9-NEXT: v_min_u32_e32 v7, v0, v1 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] +; GFX9-NEXT: v_sub_u32_e32 v6, 32, v8 +; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 
+; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v0 +; GFX9-NEXT: v_ldexp_f32 v3, v3, v6 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v3, v2, s4 +; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7 +; GFX9-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v3i64_to_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_xor_b32_e32 v7, v2, v3 -; GFX10-NEXT: v_xor_b32_e32 v8, v4, v5 -; GFX10-NEXT: v_xor_b32_e32 v9, v0, v1 -; GFX10-NEXT: v_ffbh_i32_e32 v6, v5 +; GFX10-NEXT: v_xor_b32_e32 v7, v0, v1 +; GFX10-NEXT: v_xor_b32_e32 v8, v2, v3 +; GFX10-NEXT: v_xor_b32_e32 v9, v4, v5 +; GFX10-NEXT: v_ffbh_i32_e32 v6, v1 ; GFX10-NEXT: v_ffbh_i32_e32 v10, v3 ; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v7 -; GFX10-NEXT: v_ffbh_i32_e32 v11, v1 -; GFX10-NEXT: v_ashrrev_i32_e32 v9, 31, v9 ; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v8 +; GFX10-NEXT: v_ffbh_i32_e32 v11, v5 +; GFX10-NEXT: v_ashrrev_i32_e32 v9, 31, v9 ; GFX10-NEXT: v_add_nc_u32_e32 v6, -1, v6 -; GFX10-NEXT: v_add_nc_u32_e32 v10, -1, v10 ; GFX10-NEXT: v_add_nc_u32_e32 v7, 32, v7 +; GFX10-NEXT: v_add_nc_u32_e32 v10, -1, v10 +; GFX10-NEXT: v_add_nc_u32_e32 v8, 32, v8 ; GFX10-NEXT: v_add_nc_u32_e32 v11, -1, v11 ; GFX10-NEXT: v_add_nc_u32_e32 v9, 32, v9 -; GFX10-NEXT: v_add_nc_u32_e32 v8, 32, v8 -; GFX10-NEXT: v_min_u32_e32 v7, v10, v7 -; GFX10-NEXT: v_min_u32_e32 v9, v11, v9 -; GFX10-NEXT: v_min_u32_e32 v6, v6, v8 -; GFX10-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] +; GFX10-NEXT: v_min_u32_e32 v6, v6, v7 +; GFX10-NEXT: v_min_u32_e32 v7, v10, v8 +; GFX10-NEXT: v_min_u32_e32 v8, v11, v9 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] ; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v6 -; GFX10-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3] +; GFX10-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX10-NEXT: v_min_u32_e32 v2, 1, v2 ; GFX10-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX10-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v7 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX10-NEXT: v_or_b32_e32 v1, v5, v4 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, 32, v9 -; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX10-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v7 +; GFX10-NEXT: v_or_b32_e32 v2, v5, v4 +; GFX10-NEXT: v_sub_nc_u32_e32 v4, 32, v8 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX10-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX10-NEXT: v_ldexp_f32 v0, v0, v4 -; GFX10-NEXT: v_ldexp_f32 v1, v1, v6 -; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v6 +; GFX10-NEXT: v_ldexp_f32 v1, v1, v3 +; GFX10-NEXT: v_ldexp_f32 v2, v2, v4 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX10-NEXT: v_alignbit_b32 v1, s4, v2, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_sitofp_v3i64_to_v3bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v7, v2, v3 -; GFX11-NEXT: v_xor_b32_e32 v8, v4, v5 -; GFX11-NEXT: v_xor_b32_e32 v9, v0, v1 -; GFX11-NEXT: v_cls_i32_e32 v6, v5 -; GFX11-NEXT: v_cls_i32_e32 v10, v3 -; GFX11-NEXT: v_ashrrev_i32_e32 v7, 31, v7 -; GFX11-NEXT: v_cls_i32_e32 v11, v1 -; GFX11-NEXT: v_ashrrev_i32_e32 v9, 31, v9 -; 
GFX11-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v6, -1, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v10, -1, v10 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 32, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v11, -1, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v9, 32, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 32, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_min_u32_e32 v7, v10, v7 -; GFX11-NEXT: v_min_u32_e32 v9, v11, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_min_u32_e32 v6, v6, v8 -; GFX11-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] -; GFX11-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GFX11-NEXT: v_sub_nc_u32_e32 v6, 32, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX11-NEXT: v_sub_nc_u32_e32 v3, 32, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_or_b32_e32 v1, v5, v4 -; GFX11-NEXT: v_sub_nc_u32_e32 v4, 32, v9 -; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX11-NEXT: v_ldexp_f32 v0, v0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_ldexp_f32 v1, v1, v6 -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] %op = sitofp <3 x i64> %x to <3 x bfloat> ret <3 x bfloat> %op } @@ -24792,234 +23579,236 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-LABEL: v_sitofp_v4i64_to_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v9, v6, v7 -; GFX8-NEXT: v_ffbh_i32_e32 v8, v7 -; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v9 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, -1, v8 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 32, v9 -; GFX8-NEXT: v_min_u32_e32 v8, v8, v9 -; GFX8-NEXT: v_lshlrev_b64 v[6:7], v8, v[6:7] -; GFX8-NEXT: v_xor_b32_e32 v9, v4, v5 -; GFX8-NEXT: v_min_u32_e32 v6, 1, v6 -; GFX8-NEXT: v_or_b32_e32 v6, v7, v6 -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, 32, v8 -; GFX8-NEXT: v_ffbh_i32_e32 v8, v5 +; GFX8-NEXT: v_xor_b32_e32 v9, v0, v1 +; GFX8-NEXT: v_ffbh_i32_e32 v8, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v9 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, -1, v8 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 32, v9 ; GFX8-NEXT: v_min_u32_e32 v8, v8, v9 -; GFX8-NEXT: v_cvt_f32_i32_e32 v6, v6 -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v8 -; GFX8-NEXT: v_xor_b32_e32 v8, v2, v3 -; GFX8-NEXT: v_ldexp_f32 v6, v6, v7 -; GFX8-NEXT: v_ffbh_i32_e32 v7, v3 -; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4 -; 
GFX8-NEXT: v_add_u32_e32 v7, vcc, -1, v7 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v8 -; GFX8-NEXT: v_min_u32_e32 v7, v7, v8 -; GFX8-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3] -; GFX8-NEXT: v_ldexp_f32 v4, v4, v5 -; GFX8-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX8-NEXT: v_xor_b32_e32 v5, v0, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX8-NEXT: v_ffbh_i32_e32 v3, v1 -; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v5 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, -1, v3 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v5 -; GFX8-NEXT: v_min_u32_e32 v3, v3, v5 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] -; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] +; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_xor_b32_e32 v1, v2, v3 +; GFX8-NEXT: v_cvt_f32_i32_e32 v9, v0 +; GFX8-NEXT: v_ffbh_i32_e32 v0, v3 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v1 +; GFX8-NEXT: v_min_u32_e32 v10, v0, v1 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v10, v[2:3] +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v8 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v7 -; GFX8-NEXT: v_ldexp_f32 v1, v2, v5 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v3 -; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX8-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX8-NEXT: v_perm_b32 v1, v4, v6, s4 +; GFX8-NEXT: v_ldexp_f32 v3, v9, v2 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v10 +; GFX8-NEXT: v_xor_b32_e32 v2, v4, v5 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX8-NEXT: v_ffbh_i32_e32 v1, v5 +; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, -1, v1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v2 +; GFX8-NEXT: v_min_u32_e32 v8, v1, v2 +; GFX8-NEXT: v_lshlrev_b64 v[1:2], v8, v[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_min_u32_e32 v1, 1, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_xor_b32_e32 v2, v6, v7 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX8-NEXT: v_cvt_f32_i32_e32 v3, v1 +; GFX8-NEXT: v_ffbh_i32_e32 v1, v7 +; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, -1, v1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v2 +; GFX8-NEXT: v_min_u32_e32 v4, v1, v2 +; GFX8-NEXT: v_lshlrev_b64 v[1:2], v4, v[6:7] +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v8 +; GFX8-NEXT: v_min_u32_e32 v1, 1, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX8-NEXT: v_ldexp_f32 v2, v3, v5 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v4 +; GFX8-NEXT: v_ldexp_f32 v1, v1, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_v4i64_to_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v9, v4, v5 -; GFX9-NEXT: v_ffbh_i32_e32 v8, v5 +; GFX9-NEXT: v_xor_b32_e32 v9, v0, v1 +; GFX9-NEXT: v_ffbh_i32_e32 v8, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v9 ; GFX9-NEXT: v_add_u32_e32 v8, -1, v8 ; GFX9-NEXT: v_add_u32_e32 v9, 32, v9 ; GFX9-NEXT: v_min_u32_e32 v8, v8, v9 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] -; GFX9-NEXT: v_sub_u32_e32 v8, 32, v8 -; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_xor_b32_e32 v5, v6, v7 -; GFX9-NEXT: v_cvt_f32_i32_e32 v9, v4 -; GFX9-NEXT: v_ffbh_i32_e32 v4, v7 -; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5 -; GFX9-NEXT: v_add_u32_e32 v4, -1, v4 -; 
GFX9-NEXT: v_add_u32_e32 v5, 32, v5 -; GFX9-NEXT: v_min_u32_e32 v10, v4, v5 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7] -; GFX9-NEXT: v_ldexp_f32 v6, v9, v8 -; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX9-NEXT: v_xor_b32_e32 v8, v2, v3 -; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_ffbh_i32_e32 v7, v3 -; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4 -; GFX9-NEXT: v_add_u32_e32 v7, -1, v7 -; GFX9-NEXT: v_add_u32_e32 v8, 32, v8 -; GFX9-NEXT: v_min_u32_e32 v7, v7, v8 -; GFX9-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3] -; GFX9-NEXT: v_sub_u32_e32 v5, 32, v10 -; GFX9-NEXT: v_ldexp_f32 v4, v4, v5 -; GFX9-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX9-NEXT: v_xor_b32_e32 v5, v0, v1 -; GFX9-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX9-NEXT: v_ffbh_i32_e32 v3, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5 -; GFX9-NEXT: v_add_u32_e32 v3, -1, v3 -; GFX9-NEXT: v_add_u32_e32 v5, 32, v5 -; GFX9-NEXT: v_min_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX9-NEXT: v_or_b32_e32 v9, v1, v0 +; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3 +; GFX9-NEXT: v_ffbh_i32_e32 v0, v3 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX9-NEXT: v_add_u32_e32 v0, -1, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 32, v1 +; GFX9-NEXT: v_min_u32_e32 v10, v0, v1 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v10, v[2:3] +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v9 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7 -; GFX9-NEXT: v_ldexp_f32 v1, v2, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v3 -; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: v_sitofp_v4i64_to_v4bf16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_xor_b32_e32 v8, v4, v5 -; GFX10-NEXT: v_ffbh_i32_e32 v9, v5 -; GFX10-NEXT: v_xor_b32_e32 v11, v6, v7 -; GFX10-NEXT: v_ffbh_i32_e32 v10, v7 -; GFX10-NEXT: v_xor_b32_e32 v13, v2, v3 -; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v9, -1, v9 -; GFX10-NEXT: v_xor_b32_e32 v15, v0, v1 -; GFX10-NEXT: v_ashrrev_i32_e32 v11, 31, v11 -; GFX10-NEXT: v_ffbh_i32_e32 v12, v3 +; GFX9-NEXT: v_sub_u32_e32 v1, 32, v8 +; GFX9-NEXT: v_ldexp_f32 v3, v2, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, 32, v10 +; GFX9-NEXT: v_xor_b32_e32 v2, v4, v5 +; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX9-NEXT: v_ffbh_i32_e32 v1, v5 +; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; GFX9-NEXT: v_add_u32_e32 v1, -1, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 32, v2 +; GFX9-NEXT: v_min_u32_e32 v8, v1, v2 +; GFX9-NEXT: v_lshlrev_b64 v[1:2], v8, v[4:5] +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_min_u32_e32 v1, 1, v1 +; GFX9-NEXT: v_or_b32_e32 v3, v2, v1 +; GFX9-NEXT: v_xor_b32_e32 v2, v6, v7 +; GFX9-NEXT: v_ffbh_i32_e32 v1, v7 +; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; GFX9-NEXT: v_add_u32_e32 v1, -1, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 32, v2 +; GFX9-NEXT: v_min_u32_e32 v4, v1, v2 +; GFX9-NEXT: v_lshlrev_b64 v[1:2], v4, v[6:7] +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v3 +; GFX9-NEXT: v_min_u32_e32 v1, 1, v1 +; GFX9-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX9-NEXT: v_cvt_f32_i32_e32 
v1, v1 +; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8 +; GFX9-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX9-NEXT: v_sub_u32_e32 v3, 32, v4 +; GFX9-NEXT: v_ldexp_f32 v1, v1, v3 +; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_sitofp_v4i64_to_v4bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_xor_b32_e32 v8, v0, v1 +; GFX10-NEXT: v_ffbh_i32_e32 v9, v1 +; GFX10-NEXT: v_ffbh_i32_e32 v10, v3 +; GFX10-NEXT: v_xor_b32_e32 v11, v2, v3 +; GFX10-NEXT: v_xor_b32_e32 v13, v4, v5 +; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v8 +; GFX10-NEXT: v_add_nc_u32_e32 v9, -1, v9 +; GFX10-NEXT: v_xor_b32_e32 v15, v6, v7 +; GFX10-NEXT: v_ffbh_i32_e32 v12, v5 +; GFX10-NEXT: v_ffbh_i32_e32 v14, v7 ; GFX10-NEXT: v_add_nc_u32_e32 v8, 32, v8 -; GFX10-NEXT: v_ffbh_i32_e32 v14, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v11, 32, v11 +; GFX10-NEXT: v_ashrrev_i32_e32 v11, 31, v11 ; GFX10-NEXT: v_add_nc_u32_e32 v12, -1, v12 +; GFX10-NEXT: v_add_nc_u32_e32 v14, -1, v14 ; GFX10-NEXT: v_min_u32_e32 v8, v9, v8 ; GFX10-NEXT: v_add_nc_u32_e32 v9, -1, v10 ; GFX10-NEXT: v_ashrrev_i32_e32 v10, 31, v13 ; GFX10-NEXT: v_ashrrev_i32_e32 v13, 31, v15 -; GFX10-NEXT: v_add_nc_u32_e32 v14, -1, v14 -; GFX10-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] -; GFX10-NEXT: v_min_u32_e32 v9, v9, v11 +; GFX10-NEXT: v_add_nc_u32_e32 v11, 32, v11 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] ; GFX10-NEXT: v_add_nc_u32_e32 v10, 32, v10 ; GFX10-NEXT: v_add_nc_u32_e32 v13, 32, v13 -; GFX10-NEXT: v_lshlrev_b64 v[6:7], v9, v[6:7] +; GFX10-NEXT: v_min_u32_e32 v9, v9, v11 +; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX10-NEXT: v_min_u32_e32 v10, v12, v10 ; GFX10-NEXT: v_min_u32_e32 v11, v14, v13 -; GFX10-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX10-NEXT: v_lshlrev_b64 v[2:3], v10, v[2:3] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v11, v[0:1] -; GFX10-NEXT: v_min_u32_e32 v6, 1, v6 -; GFX10-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v9, v[2:3] +; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_lshlrev_b64 v[4:5], v10, v[4:5] +; GFX10-NEXT: v_lshlrev_b64 v[6:7], v11, v[6:7] ; GFX10-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX10-NEXT: v_or_b32_e32 v5, v7, v6 -; GFX10-NEXT: v_cvt_f32_i32_e32 v4, v4 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v10 +; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX10-NEXT: v_min_u32_e32 v1, 1, v4 +; GFX10-NEXT: v_min_u32_e32 v4, 1, v6 ; GFX10-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v5 -; GFX10-NEXT: v_sub_nc_u32_e32 v5, 32, v9 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v8 +; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v10 +; GFX10-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX10-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v5, 32, v9 ; GFX10-NEXT: v_sub_nc_u32_e32 v7, 32, v11 -; GFX10-NEXT: v_ldexp_f32 v1, v1, v5 -; GFX10-NEXT: v_ldexp_f32 v3, v4, v3 -; GFX10-NEXT: v_ldexp_f32 v2, v2, v6 -; GFX10-NEXT: v_ldexp_f32 v0, v0, v7 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 -; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX10-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v3 +; GFX10-NEXT: v_ldexp_f32 v2, v2, v5 +; GFX10-NEXT: v_ldexp_f32 v1, v1, v6 +; GFX10-NEXT: v_ldexp_f32 v3, v4, v7 +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 
+; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_sitofp_v4i64_to_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v8, v4, v5 -; GFX11-NEXT: v_cls_i32_e32 v9, v5 -; GFX11-NEXT: v_xor_b32_e32 v11, v6, v7 -; GFX11-NEXT: v_cls_i32_e32 v10, v7 -; GFX11-NEXT: v_xor_b32_e32 v13, v2, v3 +; GFX11-NEXT: v_xor_b32_e32 v8, v0, v1 +; GFX11-NEXT: v_cls_i32_e32 v9, v1 +; GFX11-NEXT: v_cls_i32_e32 v10, v3 +; GFX11-NEXT: v_xor_b32_e32 v11, v2, v3 +; GFX11-NEXT: v_xor_b32_e32 v13, v4, v5 ; GFX11-NEXT: v_ashrrev_i32_e32 v8, 31, v8 ; GFX11-NEXT: v_add_nc_u32_e32 v9, -1, v9 -; GFX11-NEXT: v_xor_b32_e32 v14, v0, v1 -; GFX11-NEXT: v_ashrrev_i32_e32 v11, 31, v11 -; GFX11-NEXT: v_cls_i32_e32 v12, v3 +; GFX11-NEXT: v_xor_b32_e32 v15, v6, v7 +; GFX11-NEXT: v_cls_i32_e32 v12, v5 +; GFX11-NEXT: v_cls_i32_e32 v14, v7 ; GFX11-NEXT: v_add_nc_u32_e32 v8, 32, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v10, -1, v10 -; GFX11-NEXT: v_ashrrev_i32_e32 v14, 31, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 32, v11 +; GFX11-NEXT: v_ashrrev_i32_e32 v11, 31, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_add_nc_u32_e32 v12, -1, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v14, -1, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_min_u32_e32 v8, v9, v8 -; GFX11-NEXT: v_ashrrev_i32_e32 v9, 31, v13 -; GFX11-NEXT: v_cls_i32_e32 v13, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 32, v14 -; GFX11-NEXT: v_min_u32_e32 v10, v10, v11 -; GFX11-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] -; GFX11-NEXT: v_add_nc_u32_e32 v9, 32, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v13, -1, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b64 v[6:7], v10, v[6:7] -; GFX11-NEXT: v_min_u32_e32 v9, v12, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_min_u32_e32 v11, v13, v14 -; GFX11-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX11-NEXT: v_lshlrev_b64 v[2:3], v9, v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v11, v[0:1] -; GFX11-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX11-NEXT: v_min_u32_e32 v5, 1, v6 -; GFX11-NEXT: v_sub_nc_u32_e32 v6, 32, v8 -; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v9, -1, v10 +; GFX11-NEXT: v_ashrrev_i32_e32 v10, 31, v13 +; GFX11-NEXT: v_ashrrev_i32_e32 v13, 31, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 32, v11 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v10, 32, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 32, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_min_u32_e32 v9, v9, v11 ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX11-NEXT: v_cvt_f32_i32_e32 v4, v4 -; GFX11-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-NEXT: v_min_u32_e32 v10, v12, v10 +; GFX11-NEXT: v_min_u32_e32 v11, v14, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshlrev_b64 v[2:3], v9, v[2:3] ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_ldexp_f32 v1, v4, v6 -; 
GFX11-NEXT: v_cvt_f32_i32_e32 v3, v5 -; GFX11-NEXT: v_sub_nc_u32_e32 v4, 32, v10 +; GFX11-NEXT: v_lshlrev_b64 v[4:5], v10, v[4:5] +; GFX11-NEXT: v_lshlrev_b64 v[6:7], v11, v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_min_u32_e32 v1, 1, v4 +; GFX11-NEXT: v_min_u32_e32 v4, 1, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-NEXT: v_sub_nc_u32_e32 v3, 32, v8 +; GFX11-NEXT: v_sub_nc_u32_e32 v6, 32, v10 +; GFX11-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v9 -; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX11-NEXT: v_sub_nc_u32_e32 v6, 32, v11 -; GFX11-NEXT: v_ldexp_f32 v3, v3, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_sub_nc_u32_e32 v7, 32, v11 +; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v3 ; GFX11-NEXT: v_ldexp_f32 v2, v2, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_ldexp_f32 v0, v0, v6 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 -; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX11-NEXT: v_ldexp_f32 v1, v1, v6 +; GFX11-NEXT: v_ldexp_f32 v3, v4, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 +; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = sitofp <4 x i64> %x to <4 x bfloat> ret <4 x bfloat> %op @@ -25046,21 +23835,21 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_i16_to_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_i16_to_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_uitofp_i16_to_bf16: @@ -25069,7 +23858,7 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) { ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = uitofp i16 %x to bfloat ret bfloat %op @@ -25103,37 +23892,37 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_v2i16_to_v2bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v2i16_to_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x3020706 +; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_uitofp_v2i16_to_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020706 +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = uitofp <2 x i16> %x to <2 x bfloat> ret <2 x bfloat> %op @@ -25171,49 +23960,55 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX8-LABEL: v_uitofp_v3i16_to_v3bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX8-NEXT: v_cvt_f32_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_v3i16_to_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 -; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f32_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v3i16_to_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_uitofp_v3i16_to_v3bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = uitofp <3 x i16> %x to <3 x bfloat> ret <3 x bfloat> %op @@ -25257,59 +24052,55 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX8-LABEL: v_uitofp_v4i16_to_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX8-NEXT: v_cvt_f32_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX8-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_v4i16_to_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f32_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v4i16_to_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX10-NEXT: v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x3020706 -; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_uitofp_v4i16_to_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x3020706 -; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = uitofp <4 x i16> %x to <4 x bfloat> ret <4 x bfloat> %op @@ -25334,21 +24125,21 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_i32_to_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_i32_to_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_uitofp_i32_to_bf16: @@ -25356,7 +24147,7 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = uitofp i32 %x to bfloat ret bfloat %op @@ -25386,34 +24177,34 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_v2i32_to_v2bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v2i32_to_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x3020706 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_uitofp_v2i32_to_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, 
v0, v1, 0x3020706 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = uitofp <2 x i32> %x to <2 x bfloat> ret <2 x bfloat> %op @@ -25445,45 +24236,34 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX8-LABEL: v_uitofp_v3i32_to_v3bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX8-NEXT: v_cvt_f32_u32_e32 v3, v1 +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_v3i32_to_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_alignbit_b32 v1, s4, v2, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v3i32_to_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x3020706 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX10-NEXT: v_alignbit_b32 v1, s4, v2, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_uitofp_v3i32_to_v3bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020706 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] %op = uitofp <3 x i32> %x to <3 x bfloat> ret <3 x bfloat> %op } @@ -25518,53 +24298,49 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX8-LABEL: v_uitofp_v4i32_to_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX8-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX8-NEXT: v_perm_b32 v1, v2, v3, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_v4i32_to_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX9-NEXT: 
v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v4i32_to_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x3020706 -; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX10-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_uitofp_v4i32_to_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020706 -; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = uitofp <4 x i32> %x to <4 x bfloat> ret <4 x bfloat> %op @@ -25610,7 +24386,7 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) { ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v2 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_i64_to_bf16: @@ -25624,7 +24400,7 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) { ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, 32, v2 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_i64_to_bf16: @@ -25638,7 +24414,7 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) { ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_uitofp_i64_to_bf16: @@ -25656,7 +24432,7 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) { ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = uitofp i64 %x to bfloat ret bfloat %op @@ -25712,45 +24488,45 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX8-LABEL: v_uitofp_v2i64_to_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_ffbh_u32_e32 v4, v3 
+; GFX8-NEXT: v_ffbh_u32_e32 v4, v1 ; GFX8-NEXT: v_min_u32_e32 v4, 32, v4 -; GFX8-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 32, v4 -; GFX8-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX8-NEXT: v_ffbh_u32_e32 v3, v1 -; GFX8-NEXT: v_min_u32_e32 v3, 32, v3 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] -; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] +; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_cvt_f32_u32_e32 v5, v0 +; GFX8-NEXT: v_ffbh_u32_e32 v0, v3 +; GFX8-NEXT: v_min_u32_e32 v6, 32, v0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3] +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v4 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX8-NEXT: v_ldexp_f32 v1, v2, v4 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v3 +; GFX8-NEXT: v_ldexp_f32 v1, v5, v2 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_v2i64_to_v2bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_ffbh_u32_e32 v4, v3 +; GFX9-NEXT: v_ffbh_u32_e32 v4, v1 ; GFX9-NEXT: v_min_u32_e32 v4, 32, v4 -; GFX9-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX9-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX9-NEXT: v_ffbh_u32_e32 v3, v1 -; GFX9-NEXT: v_min_u32_e32 v3, 32, v3 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX9-NEXT: v_or_b32_e32 v5, v1, v0 +; GFX9-NEXT: v_ffbh_u32_e32 v0, v3 +; GFX9-NEXT: v_min_u32_e32 v6, 32, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3] +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v5 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, 32, v4 ; GFX9-NEXT: v_ldexp_f32 v1, v2, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v3 +; GFX9-NEXT: v_sub_u32_e32 v2, 32, v6 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -25758,52 +24534,52 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX10-LABEL: v_uitofp_v2i64_to_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_ffbh_u32_e32 v4, v3 -; GFX10-NEXT: v_ffbh_u32_e32 v5, v1 +; GFX10-NEXT: v_ffbh_u32_e32 v4, v1 +; GFX10-NEXT: v_ffbh_u32_e32 v5, v3 ; GFX10-NEXT: v_min_u32_e32 v4, 32, v4 ; GFX10-NEXT: v_min_u32_e32 v5, 32, v5 -; GFX10-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] -; GFX10-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3] ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX10-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX10-NEXT: v_min_u32_e32 v2, 1, v2 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v4 +; GFX10-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, 32, v4 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v5 -; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX10-NEXT: 
v_ldexp_f32 v1, v2, v1 -; GFX10-NEXT: v_ldexp_f32 v0, v0, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x3020706 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX10-NEXT: v_ldexp_f32 v1, v1, v3 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_uitofp_v2i64_to_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_clz_i32_u32_e32 v4, v3 -; GFX11-NEXT: v_clz_i32_u32_e32 v5, v1 +; GFX11-NEXT: v_clz_i32_u32_e32 v4, v1 +; GFX11-NEXT: v_clz_i32_u32_e32 v5, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_min_u32_e32 v4, 32, v4 ; GFX11-NEXT: v_min_u32_e32 v5, 32, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v4 +; GFX11-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-NEXT: v_sub_nc_u32_e32 v2, 32, v4 ; GFX11-NEXT: v_sub_nc_u32_e32 v3, 32, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_ldexp_f32 v1, v2, v1 -; GFX11-NEXT: v_ldexp_f32 v0, v0, v3 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX11-NEXT: v_ldexp_f32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020706 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = uitofp <2 x i64> %x to <2 x bfloat> ret <2 x bfloat> %op @@ -25877,139 +24653,97 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX8-LABEL: v_uitofp_v3i64_to_v3bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_ffbh_u32_e32 v6, v5 +; GFX8-NEXT: v_ffbh_u32_e32 v6, v1 ; GFX8-NEXT: v_min_u32_e32 v6, 32, v6 -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v4 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v6 -; GFX8-NEXT: v_ldexp_f32 v5, v4, v5 -; GFX8-NEXT: v_ffbh_u32_e32 v4, v3 -; GFX8-NEXT: v_min_u32_e32 v6, 32, v4 -; GFX8-NEXT: v_lshlrev_b64 v[3:4], v6, v[2:3] -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GFX8-NEXT: v_min_u32_e32 v3, 1, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX8-NEXT: v_ffbh_u32_e32 v4, v1 -; GFX8-NEXT: v_min_u32_e32 v4, 32, v4 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; GFX8-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] +; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_cvt_f32_u32_e32 v7, v0 +; GFX8-NEXT: v_ffbh_u32_e32 v0, v3 +; GFX8-NEXT: v_min_u32_e32 v8, 32, v0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6 
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v6 -; GFX8-NEXT: v_ldexp_f32 v1, v3, v5 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v4 -; GFX8-NEXT: v_ldexp_f32 v0, v0, v3 -; GFX8-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v8 +; GFX8-NEXT: v_ldexp_f32 v2, v7, v2 +; GFX8-NEXT: v_ldexp_f32 v3, v0, v1 +; GFX8-NEXT: v_ffbh_u32_e32 v0, v5 +; GFX8-NEXT: v_min_u32_e32 v6, 32, v0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6 +; GFX8-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_v3i64_to_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_ffbh_u32_e32 v6, v5 +; GFX9-NEXT: v_ffbh_u32_e32 v6, v1 ; GFX9-NEXT: v_min_u32_e32 v6, 32, v6 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4 -; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6 -; GFX9-NEXT: v_ldexp_f32 v5, v4, v5 -; GFX9-NEXT: v_ffbh_u32_e32 v4, v3 -; GFX9-NEXT: v_min_u32_e32 v6, 32, v4 -; GFX9-NEXT: v_lshlrev_b64 v[3:4], v6, v[2:3] -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GFX9-NEXT: v_min_u32_e32 v3, 1, v3 -; GFX9-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX9-NEXT: v_ffbh_u32_e32 v4, v1 -; GFX9-NEXT: v_min_u32_e32 v4, 32, v4 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] +; GFX9-NEXT: v_sub_u32_e32 v6, 32, v6 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v6 -; GFX9-NEXT: v_ldexp_f32 v1, v3, v1 -; GFX9-NEXT: v_sub_u32_e32 v3, 32, v4 -; GFX9-NEXT: v_ldexp_f32 v0, v0, v3 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v0 +; GFX9-NEXT: v_ffbh_u32_e32 v0, v3 +; GFX9-NEXT: v_min_u32_e32 v8, 32, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] +; GFX9-NEXT: v_ldexp_f32 v2, v7, v6 +; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v0 +; GFX9-NEXT: v_ffbh_u32_e32 v0, v5 +; GFX9-NEXT: v_min_u32_e32 v7, 32, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] +; GFX9-NEXT: v_sub_u32_e32 v6, 32, v8 +; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v0 +; GFX9-NEXT: v_ldexp_f32 v3, v3, v6 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v3, v2, s4 +; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7 +; GFX9-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v3i64_to_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_ffbh_u32_e32 v6, v3 -; GFX10-NEXT: v_ffbh_u32_e32 v7, v1 +; GFX10-NEXT: v_ffbh_u32_e32 v6, v1 +; GFX10-NEXT: v_ffbh_u32_e32 v7, v3 ; GFX10-NEXT: v_ffbh_u32_e32 v8, v5 ; GFX10-NEXT: v_min_u32_e32 v6, 32, v6 ; GFX10-NEXT: v_min_u32_e32 v7, 32, v7 ; GFX10-NEXT: v_min_u32_e32 v8, 32, v8 -; 
GFX10-NEXT: v_lshlrev_b64 v[2:3], v6, v[2:3] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3] ; GFX10-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] -; GFX10-NEXT: v_sub_nc_u32_e32 v8, 32, v8 -; GFX10-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v6 ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX10-NEXT: v_min_u32_e32 v2, 1, v2 ; GFX10-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX10-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX10-NEXT: v_or_b32_e32 v1, v5, v4 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v6 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, 32, v7 -; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX10-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX10-NEXT: v_or_b32_e32 v2, v5, v4 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v7 +; GFX10-NEXT: v_sub_nc_u32_e32 v4, 32, v8 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX10-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX10-NEXT: v_ldexp_f32 v0, v0, v4 -; GFX10-NEXT: v_ldexp_f32 v1, v1, v8 -; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v6 +; GFX10-NEXT: v_ldexp_f32 v1, v1, v3 +; GFX10-NEXT: v_ldexp_f32 v2, v2, v4 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX10-NEXT: v_alignbit_b32 v1, s4, v2, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_uitofp_v3i64_to_v3bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_clz_i32_u32_e32 v6, v3 -; GFX11-NEXT: v_clz_i32_u32_e32 v7, v1 -; GFX11-NEXT: v_clz_i32_u32_e32 v8, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_min_u32_e32 v6, 32, v6 -; GFX11-NEXT: v_min_u32_e32 v7, 32, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_min_u32_e32 v8, 32, v8 -; GFX11-NEXT: v_lshlrev_b64 v[2:3], v6, v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] -; GFX11-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] -; GFX11-NEXT: v_sub_nc_u32_e32 v8, 32, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_or_b32_e32 v1, v5, v4 -; GFX11-NEXT: v_sub_nc_u32_e32 v3, 32, v6 -; GFX11-NEXT: v_sub_nc_u32_e32 v4, 32, v7 -; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX11-NEXT: v_ldexp_f32 v0, v0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_ldexp_f32 v1, v1, v8 -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] %op = uitofp <3 x i64> %x to <3 x bfloat> ret <3 x bfloat> %op } @@ -26100,131 +24834,130 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x 
i64> %x) { ; GFX8-LABEL: v_uitofp_v4i64_to_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_ffbh_u32_e32 v8, v7 +; GFX8-NEXT: v_ffbh_u32_e32 v8, v1 ; GFX8-NEXT: v_min_u32_e32 v8, 32, v8 -; GFX8-NEXT: v_lshlrev_b64 v[6:7], v8, v[6:7] -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_min_u32_e32 v6, 1, v6 -; GFX8-NEXT: v_or_b32_e32 v6, v7, v6 -; GFX8-NEXT: v_cvt_f32_u32_e32 v6, v6 -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, 32, v8 -; GFX8-NEXT: v_ffbh_u32_e32 v8, v5 -; GFX8-NEXT: v_ldexp_f32 v6, v6, v7 -; GFX8-NEXT: v_ffbh_u32_e32 v7, v3 -; GFX8-NEXT: v_min_u32_e32 v7, 32, v7 -; GFX8-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3] -; GFX8-NEXT: v_min_u32_e32 v8, 32, v8 -; GFX8-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX8-NEXT: v_ffbh_u32_e32 v3, v1 -; GFX8-NEXT: v_min_u32_e32 v3, 32, v3 -; GFX8-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] +; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_cvt_f32_u32_e32 v9, v0 +; GFX8-NEXT: v_ffbh_u32_e32 v0, v3 +; GFX8-NEXT: v_min_u32_e32 v10, 32, v0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v10, v[2:3] +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v8 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v10 +; GFX8-NEXT: v_ldexp_f32 v3, v9, v2 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX8-NEXT: v_ffbh_u32_e32 v1, v5 +; GFX8-NEXT: v_min_u32_e32 v8, 32, v1 +; GFX8-NEXT: v_lshlrev_b64 v[1:2], v8, v[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_min_u32_e32 v1, 1, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX8-NEXT: v_cvt_f32_u32_e32 v3, v1 +; GFX8-NEXT: v_ffbh_u32_e32 v1, v7 +; GFX8-NEXT: v_min_u32_e32 v4, 32, v1 +; GFX8-NEXT: v_lshlrev_b64 v[1:2], v4, v[6:7] ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v8 -; GFX8-NEXT: v_ldexp_f32 v4, v4, v5 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v7 -; GFX8-NEXT: v_ldexp_f32 v1, v2, v5 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v3 -; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX8-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX8-NEXT: v_perm_b32 v1, v4, v6, s4 +; GFX8-NEXT: v_min_u32_e32 v1, 1, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX8-NEXT: v_ldexp_f32 v2, v3, v5 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v4 +; GFX8-NEXT: v_ldexp_f32 v1, v1, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_v4i64_to_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_ffbh_u32_e32 v8, v5 +; GFX9-NEXT: v_ffbh_u32_e32 v8, v1 ; GFX9-NEXT: v_min_u32_e32 v8, 32, v8 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] -; GFX9-NEXT: v_sub_u32_e32 v8, 32, v8 -; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v9, v4 -; GFX9-NEXT: v_ffbh_u32_e32 v4, v7 -; GFX9-NEXT: v_min_u32_e32 v10, 32, v4 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7] -; GFX9-NEXT: v_ffbh_u32_e32 v7, v3 -; GFX9-NEXT: v_min_u32_e32 v7, 32, v7 -; GFX9-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3] -; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX9-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX9-NEXT: 
v_or_b32_e32 v2, v3, v2 -; GFX9-NEXT: v_ffbh_u32_e32 v3, v1 -; GFX9-NEXT: v_min_u32_e32 v3, 32, v3 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] -; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX9-NEXT: v_or_b32_e32 v9, v1, v0 +; GFX9-NEXT: v_ffbh_u32_e32 v0, v3 +; GFX9-NEXT: v_min_u32_e32 v10, 32, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v10, v[2:3] +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v9 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7 -; GFX9-NEXT: v_sub_u32_e32 v5, 32, v10 -; GFX9-NEXT: v_ldexp_f32 v1, v2, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v3 -; GFX9-NEXT: v_ldexp_f32 v4, v4, v5 -; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_ldexp_f32 v6, v9, v8 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_sub_u32_e32 v1, 32, v8 +; GFX9-NEXT: v_ldexp_f32 v3, v2, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, 32, v10 +; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX9-NEXT: v_ffbh_u32_e32 v1, v5 +; GFX9-NEXT: v_min_u32_e32 v8, 32, v1 +; GFX9-NEXT: v_lshlrev_b64 v[1:2], v8, v[4:5] +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_min_u32_e32 v1, 1, v1 +; GFX9-NEXT: v_or_b32_e32 v3, v2, v1 +; GFX9-NEXT: v_ffbh_u32_e32 v1, v7 +; GFX9-NEXT: v_min_u32_e32 v4, 32, v1 +; GFX9-NEXT: v_lshlrev_b64 v[1:2], v4, v[6:7] +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX9-NEXT: v_min_u32_e32 v1, 1, v1 +; GFX9-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8 +; GFX9-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX9-NEXT: v_sub_u32_e32 v3, 32, v4 +; GFX9-NEXT: v_ldexp_f32 v1, v1, v3 +; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v4i64_to_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_ffbh_u32_e32 v8, v5 -; GFX10-NEXT: v_ffbh_u32_e32 v9, v7 -; GFX10-NEXT: v_ffbh_u32_e32 v10, v3 -; GFX10-NEXT: v_ffbh_u32_e32 v11, v1 +; GFX10-NEXT: v_ffbh_u32_e32 v8, v1 +; GFX10-NEXT: v_ffbh_u32_e32 v9, v3 +; GFX10-NEXT: v_ffbh_u32_e32 v10, v5 +; GFX10-NEXT: v_ffbh_u32_e32 v11, v7 ; GFX10-NEXT: v_min_u32_e32 v8, 32, v8 ; GFX10-NEXT: v_min_u32_e32 v9, 32, v9 ; GFX10-NEXT: v_min_u32_e32 v10, 32, v10 ; GFX10-NEXT: v_min_u32_e32 v11, 32, v11 -; GFX10-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] -; GFX10-NEXT: v_lshlrev_b64 v[6:7], v9, v[6:7] -; GFX10-NEXT: v_lshlrev_b64 v[2:3], v10, v[2:3] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v11, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v9, v[2:3] +; GFX10-NEXT: v_lshlrev_b64 v[4:5], v10, v[4:5] +; GFX10-NEXT: v_lshlrev_b64 v[6:7], v11, v[6:7] +; GFX10-NEXT: v_sub_nc_u32_e32 v8, 32, v8 +; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX10-NEXT: v_min_u32_e32 v2, 1, v2 ; GFX10-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX10-NEXT: v_min_u32_e32 v6, 1, v6 -; GFX10-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX10-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX10-NEXT: v_or_b32_e32 v5, v7, v6 -; GFX10-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v8 -; GFX10-NEXT: 
v_cvt_f32_u32_e32 v1, v4 -; GFX10-NEXT: v_cvt_f32_u32_e32 v3, v5 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, 32, v9 +; GFX10-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX10-NEXT: v_or_b32_e32 v2, v5, v4 +; GFX10-NEXT: v_or_b32_e32 v4, v7, v6 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v9 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v5, 32, v10 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v7, 32, v11 -; GFX10-NEXT: v_ldexp_f32 v3, v3, v4 -; GFX10-NEXT: v_ldexp_f32 v1, v1, v6 +; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v11 +; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v8 +; GFX10-NEXT: v_ldexp_f32 v1, v1, v3 ; GFX10-NEXT: v_ldexp_f32 v2, v2, v5 -; GFX10-NEXT: v_ldexp_f32 v0, v0, v7 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 -; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_ldexp_f32 v3, v4, v6 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_uitofp_v4i64_to_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_clz_i32_u32_e32 v8, v5 -; GFX11-NEXT: v_clz_i32_u32_e32 v9, v7 -; GFX11-NEXT: v_clz_i32_u32_e32 v10, v3 -; GFX11-NEXT: v_clz_i32_u32_e32 v11, v1 +; GFX11-NEXT: v_clz_i32_u32_e32 v8, v1 +; GFX11-NEXT: v_clz_i32_u32_e32 v9, v3 +; GFX11-NEXT: v_clz_i32_u32_e32 v10, v5 +; GFX11-NEXT: v_clz_i32_u32_e32 v11, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_min_u32_e32 v8, 32, v8 ; GFX11-NEXT: v_min_u32_e32 v9, 32, v9 @@ -26232,40 +24965,37 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX11-NEXT: v_min_u32_e32 v10, 32, v10 ; GFX11-NEXT: v_min_u32_e32 v11, 32, v11 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] -; GFX11-NEXT: v_lshlrev_b64 v[6:7], v9, v[6:7] +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[2:3], v9, v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[2:3], v10, v[2:3] -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v11, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[4:5], v10, v[4:5] +; GFX11-NEXT: v_lshlrev_b64 v[6:7], v11, v[6:7] ; GFX11-NEXT: v_sub_nc_u32_e32 v8, 32, v8 -; GFX11-NEXT: v_sub_nc_u32_e32 v9, 32, v9 +; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 ; GFX11-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX11-NEXT: v_min_u32_e32 v6, 1, v6 -; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX11-NEXT: v_or_b32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_sub_nc_u32_e32 v6, 32, v10 -; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v4 -; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v5 -; GFX11-NEXT: v_sub_nc_u32_e32 v4, 32, v11 -; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: v_ldexp_f32 v1, v1, v8 -; GFX11-NEXT: v_ldexp_f32 v3, v3, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_ldexp_f32 v2, v2, v6 -; GFX11-NEXT: v_ldexp_f32 v0, v0, v4 +; GFX11-NEXT: v_or_b32_e32 v1, v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 -; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v5, v4 +; GFX11-NEXT: v_or_b32_e32 v4, v7, v6 +; GFX11-NEXT: v_sub_nc_u32_e32 v3, 32, v9 +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v10 +; GFX11-NEXT: v_sub_nc_u32_e32 v6, 32, v11 +; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v8 +; GFX11-NEXT: v_ldexp_f32 v1, v1, v3 +; GFX11-NEXT: v_ldexp_f32 v2, v2, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_ldexp_f32 v3, v4, v6 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = uitofp <4 x i64> %x to <4 x bfloat> ret <4 x bfloat> %op @@ -26294,46 +25024,33 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_select_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_select_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = select i1 %cond, bfloat %a, bfloat %b ret bfloat %op @@ -26364,50 +25081,37 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_select_fneg_lhs_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_fneg_lhs_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_select_fneg_lhs_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg bfloat %a %op = select i1 %cond, bfloat %neg.a, bfloat %b @@ -26439,50 +25143,37 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_select_fneg_rhs_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_fneg_rhs_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, 
vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_select_fneg_rhs_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.b = fneg bfloat %b %op = select i1 %cond, bfloat %a, bfloat %neg.b @@ -26524,11 +25215,11 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -26537,13 +25228,13 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v2bf16: @@ -26555,8 +25246,7 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_select_v2bf16: @@ -26568,11 +25258,8 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v1 :: v_dual_cndmask_b32 v1, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b ret <2 x bfloat> %op @@ -26611,11 +25298,11 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -26626,28 +25313,27 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_vselect_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo -; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_vselect_v2bf16: @@ -26660,12 +25346,9 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v2 :: v_dual_and_b32 v1, 1, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = select <2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b ret <2 x bfloat> %op @@ 
-26694,8 +25377,6 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) { ; ; GFX8-LABEL: s_select_bf16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s0, s0, 16 -; GFX8-NEXT: s_lshr_b32 s1, s1, 16 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -26706,8 +25387,6 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) { ; ; GFX9-LABEL: s_select_bf16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s0, s0, 16 -; GFX9-NEXT: s_lshr_b32 s1, s1, 16 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -26718,25 +25397,21 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) { ; ; GFX10-LABEL: s_select_bf16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s0, s0, 16 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: s_lshr_b32 s0, s1, 16 -; GFX10-NEXT: v_cndmask_b32_e32 v0, s0, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, s1, v1, vcc_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_select_bf16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-NEXT: s_lshr_b32 s0, s1, 16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, s0, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, s1, v1, vcc_lo ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %cond = icmp eq i32 %c, 0 @@ -26810,23 +25485,22 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v0, v1, s0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_select_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: s_lshr_b32 s2, s1, 16 -; GFX10-NEXT: v_cndmask_b32_e32 v0, s2, v1, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, s1, v1, vcc_lo -; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-NEXT: v_cndmask_b32_e32 v0, s3, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, s1, v2, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; @@ -26838,11 +25512,10 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 
x bfloat> inreg %a, <2 x bfloat> inreg ; GFX11-NEXT: s_lshr_b32 s3, s1, 16 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, s3, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v1, s1, v2 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, s1, v2, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %cond = icmp eq i32 %c, 0 @@ -26913,9 +25586,9 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; @@ -26924,13 +25597,12 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; GFX10-NEXT: s_lshr_b32 s2, s0, 16 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: s_lshr_b32 s2, s1, 16 -; GFX10-NEXT: v_cndmask_b32_e32 v1, s2, v2, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: s_lshr_b32 s0, s1, 16 +; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, s1, v2, vcc_lo -; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e32 v0, s1, v3, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; @@ -26943,11 +25615,10 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e32 v1, s0, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v0, s1, v3 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, s1, v3, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %cond = icmp eq <2 x i32> %c, zeroinitializer @@ -27000,30 +25671,37 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; 
GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_select_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v3bf16: @@ -27032,13 +25710,15 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b) ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v0, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v3, v1, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v4, v2, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_select_v3bf16: @@ -27046,17 +25726,17 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b) ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v2, v4, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v3, v6, v5 :: v_dual_cndmask_b32 v2, v4, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v0, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-NEXT: v_perm_b32 v0, v3, v1, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v4, v2, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b ret <3 x bfloat> %op @@ -27113,38 +25793,37 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_select_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v7, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX9-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v4bf16: @@ -27153,17 +25832,15 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX10-NEXT: v_or_b32_sdwa v0, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v0, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v3, v1, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v4, v2, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_select_v4bf16: @@ -27173,20 +25850,15 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v2, v4, v2 -; GFX11-NEXT: v_dual_cndmask_b32 v3, v6, v5 :: v_dual_cndmask_b32 v0, v7, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v3, v6, v5 :: v_dual_cndmask_b32 v2, v4, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v0, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v0, v1, v3 -; GFX11-NEXT: v_or_b32_e32 v1, v2, v4 +; GFX11-NEXT: v_perm_b32 v0, v3, v1, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v4, v2, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b ret <4 x bfloat> %op @@ -27259,105 +25931,92 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v4 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v12, v11, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; 
GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v5 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v9, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_select_v6bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v4 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v11, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v6bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; GFX10-NEXT: 
v_lshrrev_b32_e32 v7, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v6 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v9, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX10-NEXT: v_or_b32_sdwa v0, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v1, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v2, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v4, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v2, v5, v3, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_select_v6bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v1 :: v_dual_cndmask_b32 v3, v6, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v5, v2 :: v_dual_cndmask_b32 v5, v10, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v7 :: v_dual_lshlrev_b32 v5, 16, v5 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_or_b32_e32 v1, v2, v5 -; GFX11-NEXT: v_or_b32_e32 v2, v3, v4 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) 
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v4, v2, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v2, v5, v3, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = select i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b ret <6 x bfloat> %op @@ -27446,62 +26105,59 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v8 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v15, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v12, v11, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, v14, v13, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_select_v8bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v5 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v9, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; 
GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v12, v11, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v14, v13, vcc -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v8, v16, v15, vcc -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GFX9-NEXT: v_perm_b32 v3, v4, v3, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v8bf16: @@ -27510,63 +26166,52 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v8 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v8 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v8, v13, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v10, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v6, v2, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v7, v10, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v9, v15, v14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; GFX10-NEXT: 
v_lshlrev_b32_e32 v8, 16, v8 -; GFX10-NEXT: v_or_b32_sdwa v0, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v1, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v2, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v3, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e32 v7, v12, v11, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, v10, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v2, v5, 0x5040100 +; GFX10-NEXT: v_perm_b32 v2, v6, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v7, v4, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_select_v8bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX11-NEXT: v_dual_cndmask_b32 v13, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX11-NEXT: v_dual_cndmask_b32 v5, v10, v9 :: v_dual_and_b32 v4, 0xffff, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_and_b32 v1, 0xffff, v1 -; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v11 :: v_dual_lshlrev_b32 v5, 16, v5 -; GFX11-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_and_b32 v12, 0xffff, v13 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v5, v1 :: v_dual_cndmask_b32 v5, v6, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v10, v9, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; GFX11-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v7, v12, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v10, v9, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_or_b32_e32 v0, v1, v5 -; GFX11-NEXT: v_or_b32_e32 v1, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_or_b32_e32 v2, v12, v3 -; GFX11-NEXT: v_or_b32_e32 v3, v4, v7 +; GFX11-NEXT: v_perm_b32 v1, v2, v5, 0x5040100 +; GFX11-NEXT: v_perm_b32 v2, v6, v3, 0x5040100 +; GFX11-NEXT: v_perm_b32 v3, v7, v4, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = select i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b ret <8 x 
bfloat> %op @@ -27727,110 +26372,103 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v8 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v20, v0, v20, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v16, v0, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v0, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v13 -; GFX8-NEXT: v_cndmask_b32_e32 v14, v14, v0, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; GFX8-NEXT: v_cndmask_b32_e32 v13, v13, v0, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v11 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v12, v12, v0, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v11, v0, v19, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v18, v17, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v13 -; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v14 -; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v15 -; GFX8-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v16 -; GFX8-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v20 -; GFX8-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v10 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v12 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v13 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v14 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v15 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v7, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_select_v16bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v8 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v0, v20, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v16, v0, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v0, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v0, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v13, v0, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v0, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; 
GFX9-NEXT: v_cndmask_b32_e32 v11, v0, v19, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v17, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v13 -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v14 -; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v15 -; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v16 -; GFX9-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v20 -; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc +; GFX9-NEXT: v_perm_b32 v3, v4, v3, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc +; GFX9-NEXT: v_perm_b32 v4, v5, v4, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc +; GFX9-NEXT: v_perm_b32 v5, v6, v5, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v7, vcc +; GFX9-NEXT: v_perm_b32 v6, v7, v6, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc +; GFX9-NEXT: v_perm_b32 v7, v8, v7, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v16bf16: @@ -27839,53 +26477,45 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 
v20, 16, v10 -; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v11 -; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v10 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v16 -; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v16, v0, v27, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v9, v18, v17, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v10, v19, v0, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; GFX10-NEXT: v_perm_b32 v0, v9, v1, 0x5040100 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GFX10-NEXT: v_perm_b32 v1, v10, v2, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v18, v17, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v12 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v13 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v22, v21, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v13, v18, v17, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v17, v20, v19, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v28, v29, v28, vcc_lo +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v9, v17, v11, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v10, v14, v6, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v14 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v15 +; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v8 +; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v15, v31, v30, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v14, v26, v25, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v13 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v17 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v13, v24, v23, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo -; GFX10-NEXT: v_or_b32_sdwa v0, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v1, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v2, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v13 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v14 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v15 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v28 -; GFX10-NEXT: v_or_b32_sdwa v3, v4, v3 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v4, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v5, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v6, v7, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v7, v8, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e32 v11, v13, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v14, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v4, v9, v5, 0x5040100 +; GFX10-NEXT: v_perm_b32 v5, v6, v10, 0x5040100 +; GFX10-NEXT: v_perm_b32 v6, v11, v7, 0x5040100 +; GFX10-NEXT: v_perm_b32 v7, v12, v8, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_select_v16bf16: @@ -27894,55 +26524,45 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v11 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v16 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v16, v0, v27 :: v_dual_cndmask_b32 v1, v9, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e32 v9, v18, v17, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; GFX11-NEXT: v_dual_cndmask_b32 v6, v14, v6 :: v_dual_cndmask_b32 v3, v11, v3 -; GFX11-NEXT: v_dual_cndmask_b32 v2, v10, v2 :: v_dual_and_b32 v1, 0xffff, v1 -; GFX11-NEXT: v_dual_cndmask_b32 v10, v20, v19 :: v_dual_lshlrev_b32 v9, 16, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v0, v22, v21, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v28, v29, v28 :: v_dual_cndmask_b32 v15, v31, v30 -; GFX11-NEXT: v_dual_cndmask_b32 v14, v26, v25 :: v_dual_and_b32 v3, 0xffff, v3 -; GFX11-NEXT: v_dual_cndmask_b32 v5, v13, v5 :: v_dual_and_b32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-NEXT: v_dual_cndmask_b32 v12, v24, v23 :: v_dual_lshlrev_b32 v11, 16, v0 -; GFX11-NEXT: v_or_b32_e32 v0, v1, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_or_b32_e32 v1, v2, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v10, v19, v0, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; GFX11-NEXT: v_perm_b32 v0, v9, v1, 0x5040100 +; GFX11-NEXT: 
v_lshrrev_b32_e32 v9, 16, v4 +; GFX11-NEXT: v_perm_b32 v1, v10, v2, 0x5040100 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v2, v3, v11 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v14 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v15 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v16 -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v28 -; GFX11-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v4, v5, v9 -; GFX11-NEXT: v_or_b32_e32 v5, v6, v10 -; GFX11-NEXT: v_or_b32_e32 v6, v7, v11 -; GFX11-NEXT: v_or_b32_e32 v7, v8, v12 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v11, v3 :: v_dual_cndmask_b32 v3, v18, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v13 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo +; GFX11-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX11-NEXT: v_dual_cndmask_b32 v3, v12, v4 :: v_dual_cndmask_b32 v4, v10, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v9, v17, v11 :: v_dual_cndmask_b32 v10, v14, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v7, v15, v7 :: v_dual_cndmask_b32 v8, v16, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v13, v12, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v12, v17, v14, vcc_lo +; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 +; GFX11-NEXT: v_perm_b32 v4, v9, v5, 0x5040100 +; GFX11-NEXT: v_perm_b32 v5, v6, v10, 0x5040100 +; GFX11-NEXT: v_perm_b32 v6, v11, v7, 0x5040100 +; GFX11-NEXT: v_perm_b32 v7, v12, v8, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = select i1 %cond, <16 x bfloat> %a, <16 x bfloat> %b ret <16 x bfloat> %op @@ -28360,106 +26980,106 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; GFX8-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX8-NEXT: v_cndmask_b32_e32 v30, v30, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; GFX8-NEXT: v_lshrrev_b32_e32 v29, 16, v28 -; GFX8-NEXT: v_cndmask_b32_e32 v29, v29, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; GFX8-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX8-NEXT: v_cndmask_b32_e32 v28, v28, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GFX8-NEXT: v_cndmask_b32_e32 v27, v27, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; GFX8-NEXT: v_cndmask_b32_e32 v26, v26, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc -; 
GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v24 -; GFX8-NEXT: v_cndmask_b32_e32 v25, v25, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; GFX8-NEXT: v_cndmask_b32_e32 v24, v24, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GFX8-NEXT: v_cndmask_b32_e32 v23, v23, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v22, v22, v0, vcc -; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v18 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v17, v2, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v19 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v20 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v17, v4, vcc +; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v21 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v18, v5, vcc +; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v22 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v19, v6, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v23 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v19, v7, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, 
v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v24 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v19, v8, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v25 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX8-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v26 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v19, v10, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX8-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v27 +; GFX8-NEXT: v_cndmask_b32_e32 v11, v19, v11, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX8-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v28 +; GFX8-NEXT: v_cndmask_b32_e32 v12, v19, v12, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX8-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v29 +; GFX8-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX8-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v30 +; GFX8-NEXT: v_cndmask_b32_e32 v14, v19, v14, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX8-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_cndmask_b32_e32 v16, v0, v16, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v33, v0, v33, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v15, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v15, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX8-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v15, v32, v15, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX8-NEXT: v_cndmask_b32_e32 v32, v32, v0, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v21, v21, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; GFX8-NEXT: v_cndmask_b32_e32 v20, v20, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GFX8-NEXT: v_cndmask_b32_e32 v19, v19, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, 
v18, v2, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v18, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v19 -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v21 -; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v22 -; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v23 -; GFX8-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v24 -; GFX8-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v25 -; GFX8-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v26 -; GFX8-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v27 -; GFX8-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v28 -; GFX8-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v29 -; GFX8-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v30 -; GFX8-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v31 -; GFX8-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v32 -; GFX8-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v33 -; GFX8-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v18 +; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX8-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_select_v32bf16: @@ -28467,106 +27087,91 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v30, v30, v0, 
vcc -; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v28 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v29, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v28, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v27, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v26, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v25, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v24, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v23, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v22, v22, v0, vcc -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v2, vcc +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v17, v4, vcc +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX9-NEXT: v_perm_b32 v3, v4, v3, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v18, v5, vcc +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_perm_b32 v4, v5, v4, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v19, v6, vcc +; GFX9-NEXT: v_perm_b32 v5, v6, v5, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v19, v7, vcc +; GFX9-NEXT: v_perm_b32 v6, v7, v6, s4 
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v19, v8, vcc +; GFX9-NEXT: v_perm_b32 v7, v8, v7, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc +; GFX9-NEXT: v_perm_b32 v8, v9, v8, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v19, v10, vcc +; GFX9-NEXT: v_perm_b32 v9, v10, v9, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v19, v11, vcc +; GFX9-NEXT: v_perm_b32 v10, v11, v10, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v19, v12, vcc +; GFX9-NEXT: v_perm_b32 v11, v12, v11, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc +; GFX9-NEXT: v_perm_b32 v12, v13, v12, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v19, v14, vcc +; GFX9-NEXT: v_perm_b32 v13, v14, v13, s4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v16, v0, v16, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v33, v0, v33, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v15, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v15, vcc +; GFX9-NEXT: v_perm_b32 v14, v15, v14, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v15, v32, v15, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v32, v0, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v21, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v20, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v19, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v19 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v21 -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v22 -; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v23 -; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v24 -; GFX9-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v25 -; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v26 -; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v27 -; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v28 -; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v29 -; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v30 -; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v31 -; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v32 -; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v33 -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc +; GFX9-NEXT: v_perm_b32 v15, v16, v15, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v32bf16: @@ -28576,19 +27181,10 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v67, 16, v13 -; GFX10-NEXT: v_lshrrev_b32_e32 v68, 16, v29 ; GFX10-NEXT: v_lshrrev_b32_e32 v33, 16, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v18 -; GFX10-NEXT: v_lshrrev_b32_e32 v64, 16, v14 -; GFX10-NEXT: v_lshrrev_b32_e32 v65, 16, v30 -; GFX10-NEXT: v_cndmask_b32_e32 v67, v68, v67, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v68, 16, v11 -; GFX10-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v27 ; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v19 ; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v4 @@ -28599,96 +27195,103 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> ; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v22 ; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v7 ; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v23 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; GFX10-NEXT: 
v_lshrrev_b32_e32 v66, 16, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v64, v65, v64, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v65, 16, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GFX10-NEXT: v_cndmask_b32_e32 v29, v29, v68, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v68, 16, v25 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v25, v34, v33, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v33, v36, v35, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v30, v30, v65, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX10-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v26 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v55, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v24, v54, v53, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v23, v52, v51, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v22, v50, v49, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v21, v48, v39, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v20, v38, v37, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v64, 16, v24 +; GFX10-NEXT: v_lshrrev_b32_e32 v65, 16, v9 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX10-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX10-NEXT: v_lshrrev_b32_e32 v67, 16, v26 +; GFX10-NEXT: v_lshrrev_b32_e32 v68, 16, v11 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v25 -; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v33 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v28, v28, v65, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v65, 16, v15 +; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v27 +; GFX10-NEXT: v_cndmask_b32_e32 v33, v34, v33, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v35, v36, v35, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v0 -; GFX10-NEXT: v_or_b32_sdwa v0, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v1, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v39, v48, v39, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v49, v50, v49, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v22, v52, v51, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v23, v54, v53, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v12 +; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v28 +; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v29 +; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v30 +; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX10-NEXT: 
v_lshrrev_b32_e32 v21, 16, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v24, v64, v55, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v25, v0, v65, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v26, v68, v66, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v64 -; GFX10-NEXT: v_or_b32_sdwa v2, v3, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v3, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v4, v5, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v28 -; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v67 -; GFX10-NEXT: v_or_b32_sdwa v5, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v6, v7, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v7, v8, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v8, v9, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v9, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v10, v11, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v11, v12, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v12, v13, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v13, v14, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e32 v26, v67, v66, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v68, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v33, v1, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v35, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v2, v37, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v39, v4, 0x5040100 +; GFX10-NEXT: v_perm_b32 v4, v49, v5, 0x5040100 +; GFX10-NEXT: v_perm_b32 v5, v22, v6, 0x5040100 +; GFX10-NEXT: v_perm_b32 v6, v23, v7, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v18, v18, v34, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v19, v19, v36, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v20, v20, v38, vcc_lo +; GFX10-NEXT: v_perm_b32 v7, v24, v8, 0x5040100 +; GFX10-NEXT: v_perm_b32 v8, v25, v9, 0x5040100 +; GFX10-NEXT: v_perm_b32 v9, v26, v10, 0x5040100 +; GFX10-NEXT: v_perm_b32 v10, v17, v11, 0x5040100 +; GFX10-NEXT: v_perm_b32 v11, v18, v12, 0x5040100 +; GFX10-NEXT: v_perm_b32 v12, v19, v13, 0x5040100 +; GFX10-NEXT: v_perm_b32 v13, v20, v14, 0x5040100 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v31 +; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v31 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; GFX10-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v32 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v31, v15, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v65, vcc_lo -; GFX10-NEXT: 
v_cndmask_b32_e32 v18, v18, v27, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX10-NEXT: v_or_b32_sdwa v14, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v15, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v17, v22, v48, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v21, v23, v21, vcc_lo +; GFX10-NEXT: v_perm_b32 v14, v17, v15, 0x5040100 +; GFX10-NEXT: v_perm_b32 v15, v21, v16, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_select_v32bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17 ; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v27 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v28 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v30 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v34, v33, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v18, v36, v35, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v19 ; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4 ; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v20 @@ -28704,92 +27307,46 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> ; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v25 ; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v10 ; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v16 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 16, v16 +; GFX11-NEXT: v_dual_cndmask_b32 v10, v26, v10 :: v_dual_cndmask_b32 v11, v27, v11 +; GFX11-NEXT: v_dual_cndmask_b32 v27, v70, v69 :: v_dual_cndmask_b32 v12, v28, v12 +; GFX11-NEXT: v_dual_cndmask_b32 v28, v80, v71 :: v_dual_cndmask_b32 v13, v29, v13 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v82, v81, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v30, v82, v81 :: v_dual_cndmask_b32 v11, v27, v11 -; GFX11-NEXT: v_dual_cndmask_b32 v12, v28, v12 :: v_dual_cndmask_b32 v7, v23, v7 -; GFX11-NEXT: v_dual_cndmask_b32 v28, v70, v69 :: v_dual_cndmask_b32 v27, v68, v67 -; GFX11-NEXT: v_cndmask_b32_e32 v10, 
v26, v10, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v26, v66, v65 :: v_dual_cndmask_b32 v23, v52, v51 -; GFX11-NEXT: v_dual_cndmask_b32 v8, v24, v8 :: v_dual_cndmask_b32 v5, v21, v5 -; GFX11-NEXT: v_dual_cndmask_b32 v24, v54, v53 :: v_dual_cndmask_b32 v21, v48, v39 -; GFX11-NEXT: v_dual_cndmask_b32 v6, v22, v6 :: v_dual_cndmask_b32 v1, v17, v1 -; GFX11-NEXT: v_dual_cndmask_b32 v22, v50, v49 :: v_dual_cndmask_b32 v3, v19, v3 -; GFX11-NEXT: v_dual_cndmask_b32 v4, v20, v4 :: v_dual_cndmask_b32 v17, v34, v33 -; GFX11-NEXT: v_cndmask_b32_e32 v20, v38, v37, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_cndmask_b32 v2, v18, v2 :: v_dual_and_b32 v1, 0xffff, v1 -; GFX11-NEXT: v_dual_cndmask_b32 v18, v36, v35 :: v_dual_lshlrev_b32 v17, 16, v17 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-NEXT: v_dual_cndmask_b32 v13, v29, v13 :: v_dual_and_b32 v12, 0xffff, v12 -; GFX11-NEXT: v_dual_cndmask_b32 v29, v80, v71 :: v_dual_and_b32 v14, 0xffff, v14 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v25, v64, v55 :: v_dual_and_b32 v10, 0xffff, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; GFX11-NEXT: v_cndmask_b32_e32 v30, v0, v83, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v17, v1, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v18, v2, 0x5040100 +; GFX11-NEXT: v_dual_cndmask_b32 v19, v38, v37 :: v_dual_cndmask_b32 v4, v20, v4 +; GFX11-NEXT: v_dual_cndmask_b32 v20, v48, v39 :: v_dual_cndmask_b32 v5, v21, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v21, v50, v49, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v6, v22, v6 :: v_dual_cndmask_b32 v7, v23, v7 +; GFX11-NEXT: v_dual_cndmask_b32 v22, v52, v51 :: v_dual_cndmask_b32 v23, v54, v53 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v24, v64, v55 :: v_dual_cndmask_b32 v9, v25, v9 +; GFX11-NEXT: v_dual_cndmask_b32 v25, v66, v65 :: v_dual_cndmask_b32 v26, v68, v67 +; GFX11-NEXT: v_perm_b32 v2, v19, v3, 0x5040100 +; GFX11-NEXT: v_perm_b32 v3, v20, v4, 0x5040100 +; GFX11-NEXT: v_perm_b32 v4, v21, v5, 0x5040100 +; GFX11-NEXT: v_perm_b32 v5, v22, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v6, v23, v7, 0x5040100 +; GFX11-NEXT: v_perm_b32 v7, v24, v8, 0x5040100 +; GFX11-NEXT: v_perm_b32 v8, v25, v9, 0x5040100 +; GFX11-NEXT: v_perm_b32 v9, v26, v10, 0x5040100 +; GFX11-NEXT: v_perm_b32 v10, v27, v11, 0x5040100 +; GFX11-NEXT: v_perm_b32 
v11, v28, v12, 0x5040100 +; GFX11-NEXT: v_perm_b32 v12, v29, v13, 0x5040100 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v85, 16, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v31 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; GFX11-NEXT: v_dual_cndmask_b32 v16, v31, v16 :: v_dual_cndmask_b32 v15, v32, v15 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_cndmask_b32 v31, v87, v86 :: v_dual_cndmask_b32 v84, v85, v84 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v83, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v31 -; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v84 -; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v0 -; GFX11-NEXT: v_or_b32_e32 v0, v1, v17 -; GFX11-NEXT: v_or_b32_e32 v1, v2, v18 -; GFX11-NEXT: v_or_b32_e32 v2, v3, v19 -; GFX11-NEXT: v_or_b32_e32 v3, v4, v20 -; GFX11-NEXT: v_or_b32_e32 v4, v5, v21 -; GFX11-NEXT: v_or_b32_e32 v5, v6, v22 -; GFX11-NEXT: v_or_b32_e32 v6, v7, v23 -; GFX11-NEXT: v_or_b32_e32 v7, v8, v24 -; GFX11-NEXT: v_or_b32_e32 v8, v9, v25 -; GFX11-NEXT: v_or_b32_e32 v9, v10, v26 -; GFX11-NEXT: v_or_b32_e32 v10, v11, v27 -; GFX11-NEXT: v_or_b32_e32 v11, v12, v28 -; GFX11-NEXT: v_or_b32_e32 v12, v13, v29 -; GFX11-NEXT: v_or_b32_e32 v13, v14, v30 -; GFX11-NEXT: v_or_b32_e32 v14, v15, v31 -; GFX11-NEXT: v_or_b32_e32 v15, v16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; GFX11-NEXT: v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v16, v32, v16 +; GFX11-NEXT: v_perm_b32 v13, v30, v14, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v17, v17, v84 :: v_dual_cndmask_b32 v18, v18, v85 +; GFX11-NEXT: v_perm_b32 v14, v17, v15, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v15, v18, v16, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = select i1 %cond, <32 x bfloat> %a, <32 x bfloat> %b ret <32 x bfloat> %op @@ -28875,9 +27432,9 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v0, v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -28889,19 +27446,18 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> ; GFX10-LABEL: s_select_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: s_lshr_b32 s4, s2, 16 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, s4, v1, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-NEXT: s_lshr_b32 s5, s2, 16 ; GFX10-NEXT: v_cndmask_b32_e32 v2, s2, v2, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, s3, v1, 
vcc_lo -; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, s5, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, s3, v0, vcc_lo +; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x5040100 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-NEXT: v_readfirstlane_b32 s1, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_select_v3bf16: @@ -28909,20 +27465,19 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> ; GFX11-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-NEXT: s_lshr_b32 s5, s2, 16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, s5, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v1, s2, v2 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_and_b32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v2, s3, v2, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, s5, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v2, s2, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, s3, v0, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11-NEXT: v_readfirstlane_b32 s1, v0 ; GFX11-NEXT: ; return to shader part epilog %cond = icmp eq i32 %c, 0 %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b @@ -29002,21 +27557,21 @@ define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; ; GFX8-LABEL: s_select_v4bf16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s6, s1, 16 -; GFX8-NEXT: s_lshr_b32 s7, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: s_lshr_b32 s4, s1, 16 +; GFX8-NEXT: s_lshr_b32 s5, s3, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_lshr_b32 s5, s2, 16 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: s_lshr_b32 s1, s0, 16 +; GFX8-NEXT: s_lshr_b32 s3, s2, 16 ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: 
v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s0 @@ -29029,76 +27584,68 @@ define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; ; GFX9-LABEL: s_select_v4bf16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s6, s1, 16 -; GFX9-NEXT: s_lshr_b32 s7, s3, 16 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_lshr_b32 s4, s1, 16 +; GFX9-NEXT: s_lshr_b32 s5, s3, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: s_lshr_b32 s5, s2, 16 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b32 s1, 0x5040100 +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s2, 16 +; GFX9-NEXT: v_perm_b32 v0, v0, v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v1, v1, v2, s1 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: v_readfirstlane_b32 s1, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_select_v4bf16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10-NEXT: s_lshr_b32 s5, s1, 16 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: s_lshr_b32 s4, s1, 16 +; GFX10-NEXT: s_lshr_b32 s6, s0, 16 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_lshr_b32 s6, s3, 16 -; GFX10-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-NEXT: v_mov_b32_e32 v3, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, s6, v1, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v2, s4, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, s2, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_readfirstlane_b32 s1, v0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: s_lshr_b32 s5, s3, 16 +; GFX10-NEXT: s_lshr_b32 s0, s2, 16 +; GFX10-NEXT: v_cndmask_b32_e32 v1, s5, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, s0, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, s3, v3, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x5040100 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_select_v4bf16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11-NEXT: s_lshr_b32 s5, s1, 16 -; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s6 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s1 -; GFX11-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-NEXT: s_lshr_b32 s5, s3, 16 ; GFX11-NEXT: s_lshr_b32 s0, s2, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v1, s5, v1, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e32 v2, s0, v2, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v1, s6, v1, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_cndmask_b32 v3, s3, v3 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v3, s3, v3, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX11-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x5040100 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: v_readfirstlane_b32 s1, v1 @@ -29174,22 +27721,22 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; ; GFX8-LABEL: s_vselect_v4bf16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s6, s1, 16 -; GFX8-NEXT: s_lshr_b32 s7, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: s_lshr_b32 s4, s1, 16 +; GFX8-NEXT: s_lshr_b32 s5, s3, 16 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX8-NEXT: v_mov_b32_e32 v4, s3 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_lshr_b32 s5, s2, 16 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX8-NEXT: s_lshr_b32 s1, s0, 16 +; GFX8-NEXT: s_lshr_b32 s3, s2, 16 ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_mov_b32_e32 v3, s2 @@ -29204,30 +27751,29 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; ; GFX9-LABEL: s_vselect_v4bf16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s6, s1, 16 -; GFX9-NEXT: s_lshr_b32 s7, s3, 16 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: s_lshr_b32 s4, s1, 16 +; GFX9-NEXT: s_lshr_b32 s5, s3, 16 +; GFX9-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NEXT: v_mov_b32_e32 v5, s4 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: v_mov_b32_e32 v5, s1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: s_lshr_b32 s5, s2, 16 -; 
GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b32 s1, 0x5040100 +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s2, 16 +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s1 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s1, v2 ; GFX9-NEXT: ; return to shader part epilog @@ -29237,55 +27783,50 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; GFX10-NEXT: s_lshr_b32 s4, s1, 16 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_lshr_b32 s4, s3, 16 ; GFX10-NEXT: s_lshr_b32 s5, s0, 16 -; GFX10-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 -; GFX10-NEXT: s_lshr_b32 s5, s3, 16 -; GFX10-NEXT: v_cndmask_b32_e32 v3, s5, v4, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v6, s0 +; GFX10-NEXT: s_lshr_b32 s0, s2, 16 +; GFX10-NEXT: v_cndmask_b32_e32 v3, s4, v4, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v4, s5 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, s0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v1, s4, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, s1 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, s2, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, s2, v6, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v2, s3, v5, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_vselect_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_lshr_b32 s4, s1, 16 -; GFX11-NEXT: s_lshr_b32 s5, s0, 16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX11-NEXT: s_lshr_b32 s6, s3, 16 -; GFX11-NEXT: s_lshr_b32 s4, s2, 16 -; GFX11-NEXT: v_cndmask_b32_e32 v3, s6, v4, vcc_lo +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s1 +; GFX11-NEXT: s_lshr_b32 s4, s3, 16 +; GFX11-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v3, s4, v4, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v4, s5 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; 
GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_lshlrev_b32 v3, 16, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v1, s4, v5, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v6, s0 +; GFX11-NEXT: s_lshr_b32 s0, s2, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, s0, v4, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, s2, v4, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, s2, v6, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v2, s3, v6, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v2, s3, v5, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readfirstlane_b32 s1, v1 ; GFX11-NEXT: ; return to shader part epilog %cond = icmp eq <4 x i32> %c, zeroinitializer @@ -29346,107 +27887,98 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo ; GFX8-LABEL: v_vselect_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v7 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v10, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v6 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc +; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_vselect_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX9-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v7 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v10, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_vselect_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v10, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo -; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo +; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_vselect_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; GFX11-NEXT: 
v_and_b32_e32 v2, 1, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v7 -; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX11-NEXT: v_dual_cndmask_b32 v2, v7, v5 :: v_dual_and_b32 v3, 1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v0, v6, v4 :: v_dual_and_b32 v3, 1, v3 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v7 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v11, v10, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b ret <4 x bfloat> %op @@ -29530,187 +28062,173 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo ; GFX8-LABEL: v_vselect_v8bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX8-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v11 -; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v15 -; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX8-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v22, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v10 -; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v20, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v13 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v18, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v12 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; 
GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v12, v8, vcc +; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v1, v13, v9, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v13 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; GFX8-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v2, v14, v10, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX8-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v3, v15, v11, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v11 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v15 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_vselect_v8bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX9-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX9-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v22, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v20, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v13 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v18, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 
16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v8, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v13, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v13 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX9-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v14, v10, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX9-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v15, v11, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v15 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GFX9-NEXT: v_perm_b32 v3, v4, v3, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_vselect_v8bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v11 -; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v15 -; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v10 -; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 -; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v23, v22, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 ; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v12 -; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v21, v20, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; 
GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v18, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX10-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v13 +; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc_lo -; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v2, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v3, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v14 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v12, v8, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v10 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v8, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX10-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v12, v10, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_vselect_v8bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v8 ; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v12 -; GFX11-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; GFX11-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX11-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v11 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v15 -; GFX11-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v12, v8 :: v_dual_and_b32 v7, 1, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v13 ; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 -; GFX11-NEXT: v_and_b32_e32 
v0, 1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_cndmask_b32 v7, v23, v22 :: v_dual_and_b32 v4, 0xffff, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v17, v16 :: v_dual_and_b32 v6, 1, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX11-NEXT: v_dual_cndmask_b32 v2, v13, v9 :: v_dual_lshlrev_b32 v7, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v18, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX11-NEXT: v_dual_cndmask_b32 v0, v12, v8 :: v_dual_lshlrev_b32 v3, 16, v3 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v14 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v12, v8, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX11-NEXT: v_dual_cndmask_b32 v4, v14, v10 :: v_dual_and_b32 v5, 1, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v21, v20, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX11-NEXT: v_or_b32_e32 v3, v6, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v9, v8, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX11-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v12, v10, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = select <8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b ret <8 x bfloat> %op @@ -29951,358 +28469,326 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX8-LABEL: v_vselect_v16bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v13, 1, v13 -; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 -; GFX8-NEXT: v_cndmask_b32_e32 v13, v32, v31, vcc -; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX8-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX8-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 -; 
GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v21 -; GFX8-NEXT: v_cndmask_b32_e32 v22, v30, v22, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX8-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11 -; GFX8-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX8-NEXT: v_cndmask_b32_e32 v12, v30, v12, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v20 -; GFX8-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v1 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v16 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v24 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v16, v1, s[4:5] +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v25 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v25, v17, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v2, 1, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v5 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v26 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v2, v26, v18, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v3, 1, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 1, v7 +; GFX8-NEXT: buffer_load_dword v7, off, s[0:3], s32 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v19 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v27 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[4:5] ; GFX8-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v21, v29, v21, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 -; GFX8-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v11, v30, v11, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v27, v19, vcc +; GFX8-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v19 -; GFX8-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v20, v28, v20, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX8-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v29, v10, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc +; GFX8-NEXT: v_and_b32_e32 v5, 1, v11 +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v4, v28, v20, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v20 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v28 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; 
GFX8-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; GFX8-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v8, v29, v21, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v21 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v29 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX8-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v6, 1, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; GFX8-NEXT: v_or_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v8, v30, v22, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v22 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v30 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX8-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v30, v9, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v9, vcc ; GFX8-NEXT: v_and_b32_e32 v15, 1, v15 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v23 -; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v24 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v23 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v9, v31, v23, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v31 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v7, v23, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v25 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v10, v28, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v14, v10, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v11 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v12 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v13 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; GFX8-NEXT: v_or_b32_sdwa v4, v20, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v5, v21, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v6, v22, v6 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v7, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_vselect_v16bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v13, 1, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v32, v31, vcc -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX9-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v30, v22, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11 -; GFX9-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v30, v12, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v1 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v24 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v16, v1, s[4:5] +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v25, v17, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[4:5] +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s6 +; GFX9-NEXT: v_and_b32_e32 v2, 1, v4 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 1, v5 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v26, v18, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[4:5] +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6 +; GFX9-NEXT: v_and_b32_e32 v3, 1, v6 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 1, v7 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v27 ; GFX9-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v29, v21, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 -; GFX9-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v30, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v27, v19, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[4:5] +; GFX9-NEXT: v_and_b32_e32 v9, 1, v9 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v28, v20, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX9-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 
v7, v29, v10, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc +; GFX9-NEXT: v_and_b32_e32 v5, 1, v11 +; GFX9-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v28, v20, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v28 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; GFX9-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX9-NEXT: v_perm_b32 v4, v8, v4, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v29, v21, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v29 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX9-NEXT: v_and_b32_e32 v6, 1, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v10, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 ; GFX9-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v30, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX9-NEXT: v_perm_b32 v5, v5, v8, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v30, v22, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v30 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX9-NEXT: v_and_b32_e32 v15, 1, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v9, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v23 -; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v24 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_perm_b32 v6, v6, v8, s6 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v23 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v9, v31, v23, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v23, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v25 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v28, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v14, v10, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; GFX9-NEXT: v_or_b32_sdwa v4, v20, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v21, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v22, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; GFX9-NEXT: v_perm_b32 v7, v7, v8, s6 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_vselect_v16bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX10-NEXT: v_and_b32_e32 v13, 1, v13 -; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v22 -; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v30 -; GFX10-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX10-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 -; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v29 -; GFX10-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX10-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v13, v53, v52, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 -; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v28 -; GFX10-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v12, v30, v22, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 -; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v19 -; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v27 -; GFX10-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v51, v50, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 -; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v18 -; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v26 -; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 -; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v49, v48, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 ; GFX10-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v15, 1, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 -; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v23 -; GFX10-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v39, v38, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v37, v36, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v35, v34, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX10-NEXT: v_lshrrev_b32_e32 
v35, 16, v25 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX10-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v26 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v33, v32, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX10-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v19 +; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v27 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 -; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v2, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v3, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v4, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v5, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v6, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v31 -; GFX10-NEXT: v_cndmask_b32_e32 v15, v16, v54, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX10-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX10-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v28 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v35, v34, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX10-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX10-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v29 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 +; GFX10-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX10-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v22 +; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v30 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v37, v36, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX10-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v23 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 +; GFX10-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v39, v38, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v49, v48, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 +; GFX10-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 +; GFX10-NEXT: v_cndmask_b32_e32 v11, v51, v50, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 +; GFX10-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 +; GFX10-NEXT: 
v_cndmask_b32_e32 v12, v30, v22, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 +; GFX10-NEXT: v_cndmask_b32_e32 v13, v53, v52, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 -; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v31 ; GFX10-NEXT: v_cndmask_b32_e32 v14, v31, v23, vcc_lo -; GFX10-NEXT: v_or_b32_sdwa v7, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v15, v3, v54, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 +; GFX10-NEXT: v_perm_b32 v6, v13, v12, 0x5040100 +; GFX10-NEXT: v_perm_b32 v7, v15, v14, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_vselect_v16bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v22 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v15, 1, v15 ; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v30 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v23 -; GFX11-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 ; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v24, v16 :: v_dual_and_b32 v3, 1, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v33, v32, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 ; GFX11-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v29 +; GFX11-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX11-NEXT: v_and_b32_e32 v12, 1, v12 ; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v18 ; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v26 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v53, v52, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v29 -; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX11-NEXT: v_cndmask_b32_e32 v12, v30, v22, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 -; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v51, v50, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 ; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v35, v34, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX11-NEXT: v_and_b32_e32 v13, 1, v13 ; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v28 -; GFX11-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX11-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 ; GFX11-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX11-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX11-NEXT: v_dual_cndmask_b32 v4, v26, v18 :: v_dual_and_b32 v7, 1, v7 +; GFX11-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 1, v5 +; GFX11-NEXT: v_and_b32_e32 v10, 1, v10 ; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v19 ; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v27 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v49, v48, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 -; GFX11-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX11-NEXT: v_dual_cndmask_b32 v8, v28, v20 :: v_dual_and_b32 v15, 1, v15 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 -; GFX11-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-NEXT: v_dual_cndmask_b32 v7, v39, v38 :: v_dual_and_b32 v8, 0xffff, v8 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v37, v36, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 ; GFX11-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_cndmask_b32 v5, v37, v36 :: v_dual_and_b32 v6, 0xffff, v6 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v35, v34, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v33, v32, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 +; GFX11-NEXT: v_dual_cndmask_b32 v7, v39, v38 :: v_dual_and_b32 v8, 1, v8 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v49, v48, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v51, v50, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 +; GFX11-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v30, v22, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v53, v52, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 
v3, 16, v31 ; GFX11-NEXT: v_cndmask_b32_e32 v14, v31, v23, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v31 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v15, v16, v54, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v3, v6, v7 -; GFX11-NEXT: v_or_b32_e32 v4, v8, v9 -; GFX11-NEXT: v_or_b32_e32 v5, v10, v11 -; GFX11-NEXT: v_or_b32_e32 v6, v12, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v7, v14, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v15, v3, v54, vcc_lo +; GFX11-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v6, v13, v12, 0x5040100 +; GFX11-NEXT: v_perm_b32 v7, v15, v14, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = select <16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b ret <16 x bfloat> %op @@ -30760,463 +29246,571 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8-LABEL: v_vselect_v32bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v22, 1, v22 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v22 -; GFX8-NEXT: v_and_b32_e32 v22, 1, v23 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v22 -; GFX8-NEXT: buffer_load_ushort v22, off, s[0:3], s32 -; GFX8-NEXT: v_and_b32_e32 v28, 1, v28 -; GFX8-NEXT: v_and_b32_e32 v24, 1, v24 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v28 -; GFX8-NEXT: v_and_b32_e32 v28, 1, v29 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v24 -; GFX8-NEXT: v_and_b32_e32 v24, 1, v25 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v28 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v24 -; GFX8-NEXT: v_and_b32_e32 v26, 1, v26 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v26 -; GFX8-NEXT: v_and_b32_e32 v26, 1, v27 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v26 -; GFX8-NEXT: v_and_b32_e32 v30, 1, v30 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v30 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: v_writelane_b32 v31, s30, 0 +; GFX8-NEXT: v_writelane_b32 v31, s31, 1 +; GFX8-NEXT: v_writelane_b32 v31, s34, 2 +; GFX8-NEXT: v_writelane_b32 v31, s35, 3 +; GFX8-NEXT: v_writelane_b32 v31, s36, 4 +; GFX8-NEXT: v_writelane_b32 v31, s37, 5 ; GFX8-NEXT: v_and_b32_e32 v21, 1, v21 -; GFX8-NEXT: v_and_b32_e32 v20, 1, v20 -; GFX8-NEXT: v_and_b32_e32 v19, 1, v19 ; GFX8-NEXT: v_and_b32_e32 v18, 1, v18 +; GFX8-NEXT: v_writelane_b32 v31, s38, 6 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v21 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v18 +; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 +; GFX8-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; GFX8-NEXT: v_and_b32_e32 v17, 1, v17 ; GFX8-NEXT: v_and_b32_e32 v16, 1, v16 +; GFX8-NEXT: v_writelane_b32 v31, s39, 7 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v17 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v16 +; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 +; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 ; GFX8-NEXT: v_and_b32_e32 v15, 1, v15 ; GFX8-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX8-NEXT: v_writelane_b32 v31, s40, 8 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[36:37], 1, v15 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[38:39], 1, v14 +; GFX8-NEXT: 
buffer_load_dword v14, off, s[0:3], s32 offset:76 +; GFX8-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:12 +; GFX8-NEXT: v_writelane_b32 v31, s41, 9 +; GFX8-NEXT: v_writelane_b32 v31, s42, 10 ; GFX8-NEXT: v_and_b32_e32 v13, 1, v13 ; GFX8-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX8-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX8-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX8-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX8-NEXT: v_writelane_b32 v31, s43, 11 +; GFX8-NEXT: v_and_b32_e32 v20, 1, v20 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v13 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v12 +; GFX8-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:80 +; GFX8-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:16 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v20 +; GFX8-NEXT: buffer_load_ushort v20, off, s[0:3], s32 +; GFX8-NEXT: v_writelane_b32 v31, s44, 12 +; GFX8-NEXT: v_writelane_b32 v31, s45, 13 +; GFX8-NEXT: v_writelane_b32 v31, s46, 14 +; GFX8-NEXT: v_writelane_b32 v31, s47, 15 +; GFX8-NEXT: v_writelane_b32 v31, s48, 16 +; GFX8-NEXT: v_writelane_b32 v31, s49, 17 +; GFX8-NEXT: v_writelane_b32 v31, s50, 18 +; GFX8-NEXT: v_writelane_b32 v31, s51, 19 +; GFX8-NEXT: v_writelane_b32 v31, s52, 20 +; GFX8-NEXT: v_writelane_b32 v31, s53, 21 +; GFX8-NEXT: v_writelane_b32 v31, s54, 22 +; GFX8-NEXT: v_writelane_b32 v31, s55, 23 +; GFX8-NEXT: v_writelane_b32 v31, s56, 24 +; GFX8-NEXT: v_writelane_b32 v31, s57, 25 +; GFX8-NEXT: v_writelane_b32 v31, s58, 26 +; GFX8-NEXT: v_writelane_b32 v31, s59, 27 +; GFX8-NEXT: v_writelane_b32 v31, s60, 28 +; GFX8-NEXT: v_writelane_b32 v31, s61, 29 +; GFX8-NEXT: v_writelane_b32 v31, s62, 30 +; GFX8-NEXT: v_writelane_b32 v31, s63, 31 +; GFX8-NEXT: v_writelane_b32 v31, s64, 32 ; GFX8-NEXT: v_and_b32_e32 v8, 1, v8 ; GFX8-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX8-NEXT: v_writelane_b32 v31, s65, 33 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[50:51], 1, v8 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[52:53], 1, v7 +; GFX8-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 +; GFX8-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 ; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_writelane_b32 v31, s66, 34 +; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v2 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[64:65], 1, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_writelane_b32 v31, s67, 35 +; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v3 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[66:67], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[54:55], 1, v6 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v5 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v4 +; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 +; GFX8-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; GFX8-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX8-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX8-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v10 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[48:49], 1, v9 +; GFX8-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 +; GFX8-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 +; GFX8-NEXT: v_and_b32_e32 v25, 1, v25 +; GFX8-NEXT: v_and_b32_e32 v24, 1, v24 +; GFX8-NEXT: v_and_b32_e32 v23, 1, v23 ; GFX8-NEXT: v_and_b32_e32 v22, 1, v22 -; GFX8-NEXT: 
v_cmp_eq_u32_e64 s[4:5], 1, v22 -; GFX8-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 -; GFX8-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:124 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v22 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e64 v22, v23, v22, s[10:11] -; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GFX8-NEXT: v_cndmask_b32_e64 v23, v23, v24, s[12:13] -; GFX8-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:56 -; GFX8-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e64 v24, v25, v24, s[14:15] -; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GFX8-NEXT: v_cndmask_b32_e64 v25, v25, v26, s[16:17] -; GFX8-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:52 -; GFX8-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v28, 16, v26 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e64 v26, v27, v26, s[18:19] -; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GFX8-NEXT: v_cndmask_b32_e64 v27, v27, v28, s[20:21] -; GFX8-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:48 -; GFX8-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e64 v28, v29, v28, s[6:7] -; GFX8-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GFX8-NEXT: v_cndmask_b32_e64 v30, v29, v30, s[8:9] -; GFX8-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 -; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v29 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v29, v31, v29, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX8-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[4:5] -; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 -; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v21 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v32 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v33 -; GFX8-NEXT: v_cndmask_b32_e32 v21, v34, v21, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v20 -; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v32, vcc -; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 -; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v32 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v33 -; GFX8-NEXT: v_cndmask_b32_e32 v19, v34, v19, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 -; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v32, vcc -; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v32 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v33 -; GFX8-NEXT: v_cndmask_b32_e32 v17, v34, v17, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v33, v32, vcc -; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 -; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v32 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v33 -; GFX8-NEXT: v_cndmask_b32_e32 v15, v34, v15, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 -; GFX8-NEXT: v_cndmask_b32_e32 v14, v33, v32, vcc -; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v32 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v33 -; GFX8-NEXT: v_cndmask_b32_e32 v13, v34, v13, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 -; GFX8-NEXT: v_cndmask_b32_e32 v12, v33, v32, vcc -; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 -; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v32 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v33 -; GFX8-NEXT: v_cndmask_b32_e32 v11, v34, v11, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 -; GFX8-NEXT: v_cndmask_b32_e32 v10, v33, v32, vcc -; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 -; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v32 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v33 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v34, v9, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v33, v32, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX8-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16 -; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v7 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v32 -; GFX8-NEXT: v_cndmask_b32_e32 v33, v34, v33, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v32, v7, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 -; GFX8-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v32, v34, v32, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 -; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 +; GFX8-NEXT: v_and_b32_e32 v19, 1, v19 +; GFX8-NEXT: s_waitcnt vmcnt(14) +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX8-NEXT: s_waitcnt vmcnt(13) +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[64:65] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v18, v21, s[66:67] +; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:36 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: s_waitcnt vmcnt(13) +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v16 +; GFX8-NEXT: s_waitcnt vmcnt(12) +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v17 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[60:61] +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e64 v1, v16, v17, s[62:63] +; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:40 +; GFX8-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 +; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 +; GFX8-NEXT: s_waitcnt vmcnt(13) +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v15 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[56:57] +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e64 v2, v14, v15, s[58:59] +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_waitcnt vmcnt(11) +; GFX8-NEXT: v_cndmask_b32_e64 v3, v12, v13, s[54:55] +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v13 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v11 +; GFX8-NEXT: s_waitcnt vmcnt(10) +; GFX8-NEXT: v_and_b32_e32 v11, 1, v20 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[52:53] +; GFX8-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 +; GFX8-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v25 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v24 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v23 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v22 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v19 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v11 +; GFX8-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 +; GFX8-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:108 +; GFX8-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 +; GFX8-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:32 +; GFX8-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:104 +; GFX8-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 +; GFX8-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 +; GFX8-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 +; GFX8-NEXT: v_and_b32_e32 v26, 1, v26 +; GFX8-NEXT: v_and_b32_e32 v28, 1, v28 +; GFX8-NEXT: v_and_b32_e32 v27, 1, v27 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v26 +; GFX8-NEXT: v_and_b32_e32 v29, 1, v29 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v28 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v27 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v29 +; GFX8-NEXT: v_and_b32_e32 v30, 1, v30 +; GFX8-NEXT: s_waitcnt vmcnt(14) +; GFX8-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[50:51] +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[48:49] +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_or_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v30 +; GFX8-NEXT: v_readlane_b32 s67, v31, 35 +; GFX8-NEXT: v_readlane_b32 s66, v31, 34 +; GFX8-NEXT: v_readlane_b32 s65, v31, 33 +; GFX8-NEXT: v_readlane_b32 s64, v31, 32 +; GFX8-NEXT: v_readlane_b32 s63, v31, 31 +; GFX8-NEXT: v_readlane_b32 s62, v31, 30 +; GFX8-NEXT: v_readlane_b32 s61, v31, 29 +; GFX8-NEXT: v_readlane_b32 s60, v31, 28 +; GFX8-NEXT: v_readlane_b32 s59, v31, 27 +; GFX8-NEXT: v_readlane_b32 s58, v31, 26 +; GFX8-NEXT: 
v_cndmask_b32_e64 v7, v5, v6, s[46:47] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[44:45] +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_or_b32_sdwa v5, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v9, v10, s[42:43] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[40:41] +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_readlane_b32 s57, v31, 25 +; GFX8-NEXT: v_readlane_b32 s56, v31, 24 +; GFX8-NEXT: v_readlane_b32 s55, v31, 23 +; GFX8-NEXT: v_readlane_b32 s54, v31, 22 +; GFX8-NEXT: v_readlane_b32 s53, v31, 21 +; GFX8-NEXT: v_readlane_b32 s52, v31, 20 +; GFX8-NEXT: v_readlane_b32 s51, v31, 19 +; GFX8-NEXT: v_readlane_b32 s50, v31, 18 +; GFX8-NEXT: v_readlane_b32 s49, v31, 17 +; GFX8-NEXT: v_readlane_b32 s48, v31, 16 +; GFX8-NEXT: v_readlane_b32 s47, v31, 15 +; GFX8-NEXT: v_readlane_b32 s46, v31, 14 +; GFX8-NEXT: v_readlane_b32 s45, v31, 13 +; GFX8-NEXT: v_readlane_b32 s44, v31, 12 +; GFX8-NEXT: v_readlane_b32 s43, v31, 11 +; GFX8-NEXT: v_readlane_b32 s42, v31, 10 +; GFX8-NEXT: v_readlane_b32 s41, v31, 9 +; GFX8-NEXT: v_readlane_b32 s40, v31, 8 +; GFX8-NEXT: s_waitcnt vmcnt(6) +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v22 +; GFX8-NEXT: s_waitcnt vmcnt(5) +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v23 +; GFX8-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[36:37] +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v18 +; GFX8-NEXT: s_waitcnt vmcnt(3) +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v25 +; GFX8-NEXT: v_cndmask_b32_e64 v7, v22, v23, s[38:39] +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v9, v10, v9, s[30:31] +; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e64 v8, v25, v18, s[34:35] +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX8-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e64 v9, v24, v16, s[28:29] +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v24 +; GFX8-NEXT: v_cndmask_b32_e64 v10, v16, v10, s[26:27] +; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX8-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e64 v10, v11, v21, s[24:25] +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v21 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v16, s[22:23] +; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX8-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_waitcnt vmcnt(4) +; GFX8-NEXT: v_cndmask_b32_e64 v11, v19, v20, s[20:21] +; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX8-NEXT: v_cndmask_b32_e64 v19, v19, v20, s[16:17] +; GFX8-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 +; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX8-NEXT: v_or_b32_sdwa v11, v11, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_readlane_b32 s39, v31, 7 +; 
GFX8-NEXT: v_readlane_b32 s38, v31, 6 +; GFX8-NEXT: v_readlane_b32 s37, v31, 5 +; GFX8-NEXT: v_readlane_b32 s36, v31, 4 +; GFX8-NEXT: v_readlane_b32 s35, v31, 3 +; GFX8-NEXT: v_readlane_b32 s34, v31, 2 +; GFX8-NEXT: v_readlane_b32 s31, v31, 1 +; GFX8-NEXT: v_readlane_b32 s30, v31, 0 +; GFX8-NEXT: s_waitcnt vmcnt(2) +; GFX8-NEXT: v_cndmask_b32_e64 v19, v12, v18, s[14:15] +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX8-NEXT: v_cndmask_b32_e64 v12, v12, v18, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e64 v18, v13, v17, s[10:11] +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v17, s[8:9] ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v17, v14, v16, s[6:7] +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX8-NEXT: v_cndmask_b32_e64 v14, v14, v16, s[4:5] +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX8-NEXT: v_or_b32_sdwa v14, v17, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v34, v7, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 -; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v16, v15, v20, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v20 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX8-NEXT: v_cndmask_b32_e64 v15, v15, v17, s[18:19] +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX8-NEXT: v_or_b32_sdwa v12, v19, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v13, v18, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v34, v5, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v11 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v13 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v15 -; GFX8-NEXT: v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa 
v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v6, v12, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v7, v14, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v17 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v19 -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v30 -; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v27 -; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v25 -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v23 -; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v31 -; GFX8-NEXT: v_or_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v10, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v11, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v12, v26, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v13, v24, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v14, v22, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_vselect_v32bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v22, 1, v22 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v22 -; GFX9-NEXT: v_and_b32_e32 v22, 1, v23 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v22 -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 -; GFX9-NEXT: v_and_b32_e32 v28, 1, v28 -; GFX9-NEXT: v_and_b32_e32 v24, 1, v24 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v28 -; GFX9-NEXT: v_and_b32_e32 v28, 1, v29 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v24 -; GFX9-NEXT: v_and_b32_e32 v24, 1, v25 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v28 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v24 -; GFX9-NEXT: v_and_b32_e32 v26, 1, v26 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v26 -; GFX9-NEXT: v_and_b32_e32 v26, 1, v27 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v26 -; GFX9-NEXT: v_and_b32_e32 v30, 1, v30 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v30 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v31, s30, 0 +; GFX9-NEXT: v_writelane_b32 v31, s31, 1 +; GFX9-NEXT: v_writelane_b32 v31, s34, 2 +; GFX9-NEXT: v_writelane_b32 v31, s35, 3 +; GFX9-NEXT: v_writelane_b32 v31, s36, 4 +; GFX9-NEXT: v_writelane_b32 v31, s37, 5 +; GFX9-NEXT: v_writelane_b32 v31, s38, 6 +; GFX9-NEXT: v_writelane_b32 v31, s39, 7 +; GFX9-NEXT: v_writelane_b32 v31, s40, 8 +; GFX9-NEXT: v_writelane_b32 v31, s41, 9 +; GFX9-NEXT: v_writelane_b32 v31, s42, 10 +; GFX9-NEXT: v_writelane_b32 v31, s43, 11 +; GFX9-NEXT: v_writelane_b32 v31, s44, 12 +; GFX9-NEXT: v_writelane_b32 v31, s45, 13 +; GFX9-NEXT: v_writelane_b32 v31, s46, 14 +; GFX9-NEXT: v_writelane_b32 v31, s47, 15 +; GFX9-NEXT: v_writelane_b32 v31, s48, 16 +; GFX9-NEXT: v_writelane_b32 v31, s49, 17 +; GFX9-NEXT: v_writelane_b32 v31, s50, 18 +; GFX9-NEXT: v_writelane_b32 v31, s51, 19 ; GFX9-NEXT: v_and_b32_e32 v21, 1, v21 
-; GFX9-NEXT: v_and_b32_e32 v20, 1, v20 -; GFX9-NEXT: v_and_b32_e32 v19, 1, v19 ; GFX9-NEXT: v_and_b32_e32 v18, 1, v18 +; GFX9-NEXT: v_writelane_b32 v31, s52, 20 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v21 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v18 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; GFX9-NEXT: v_and_b32_e32 v17, 1, v17 ; GFX9-NEXT: v_and_b32_e32 v16, 1, v16 +; GFX9-NEXT: v_writelane_b32 v31, s53, 21 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v17 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v16 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 +; GFX9-NEXT: v_writelane_b32 v31, s54, 22 ; GFX9-NEXT: v_and_b32_e32 v15, 1, v15 ; GFX9-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX9-NEXT: v_writelane_b32 v31, s55, 23 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[36:37], 1, v15 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[38:39], 1, v14 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:12 +; GFX9-NEXT: v_writelane_b32 v31, s56, 24 ; GFX9-NEXT: v_and_b32_e32 v13, 1, v13 ; GFX9-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX9-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX9-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX9-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX9-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX9-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX9-NEXT: v_writelane_b32 v31, s57, 25 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v13 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v12 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:16 +; GFX9-NEXT: v_writelane_b32 v31, s58, 26 ; GFX9-NEXT: v_and_b32_e32 v5, 1, v5 ; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX9-NEXT: v_writelane_b32 v31, s59, 27 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v5 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v4 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 +; GFX9-NEXT: v_and_b32_e32 v20, 1, v20 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v20 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 +; GFX9-NEXT: v_writelane_b32 v31, s60, 28 +; GFX9-NEXT: v_writelane_b32 v31, s61, 29 +; GFX9-NEXT: v_writelane_b32 v31, s62, 30 +; GFX9-NEXT: v_writelane_b32 v31, s63, 31 +; GFX9-NEXT: v_writelane_b32 v31, s64, 32 +; GFX9-NEXT: v_writelane_b32 v31, s65, 33 +; GFX9-NEXT: v_writelane_b32 v31, s66, 34 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_writelane_b32 v31, s67, 35 +; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[64:65], 1, v1 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[66:67], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v3 +; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[54:55], 1, v6 +; GFX9-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[52:53], 1, v7 +; GFX9-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[50:51], 1, v8 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[48:49], 1, v9 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:24 +; GFX9-NEXT: v_and_b32_e32 v24, 1, v24 +; GFX9-NEXT: v_and_b32_e32 v11, 1, 
v11 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v24 +; GFX9-NEXT: v_and_b32_e32 v23, 1, v23 ; GFX9-NEXT: v_and_b32_e32 v22, 1, v22 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v22 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:124 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v22 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v22, v23, v22, s[10:11] -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GFX9-NEXT: v_cndmask_b32_e64 v23, v23, v24, s[12:13] -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v24, v25, v24, s[14:15] -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GFX9-NEXT: v_cndmask_b32_e64 v25, v25, v26, s[16:17] -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v26 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v26, v27, v26, s[18:19] -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GFX9-NEXT: v_cndmask_b32_e64 v27, v27, v28, s[20:21] -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v28, v29, v28, s[6:7] -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GFX9-NEXT: v_cndmask_b32_e64 v30, v29, v30, s[8:9] -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v29 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v29, v31, v29, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX9-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[4:5] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v21 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v32 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v33 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v34, v21, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v32, vcc -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v32 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v33 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v34, v19, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v32, vcc -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v32 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v33 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v34, v17, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16 -; GFX9-NEXT: v_cndmask_b32_e32 
v16, v33, v32, vcc -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v32 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v33 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v34, v15, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v33, v32, vcc -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v32 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v33 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v34, v13, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v33, v32, vcc -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v32 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v33 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v34, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v33, v32, vcc -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v32 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v33 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v34, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v33, v32, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v32 -; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v33, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v32, v7, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 +; GFX9-NEXT: v_and_b32_e32 v19, 1, v19 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v11 +; GFX9-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:48 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v23 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v22 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v19 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v10 +; GFX9-NEXT: v_and_b32_e32 v26, 1, v26 +; GFX9-NEXT: v_and_b32_e32 v25, 1, v25 +; GFX9-NEXT: v_and_b32_e32 v28, 1, v28 +; GFX9-NEXT: v_and_b32_e32 v27, 1, v27 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v26 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v25 +; GFX9-NEXT: v_and_b32_e32 v29, 1, v29 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v28 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v27 +; GFX9-NEXT: v_and_b32_e32 v30, 1, v30 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v18, v21, 
s[66:67] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[64:65] +; GFX9-NEXT: s_mov_b32 s64, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s64 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v16 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v17 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v16, v17, s[62:63] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[60:61] +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s64 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_cndmask_b32_e64 v2, v14, v15, s[58:59] +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v14, v3, s[56:57] +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s64 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_cndmask_b32_e64 v3, v12, v13, s[54:55] +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[52:53] +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 +; GFX9-NEXT: v_perm_b32 v3, v12, v3, s64 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v29 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_cndmask_b32_e64 v12, v4, v5, s[50:51] +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[48:49] +; GFX9-NEXT: v_perm_b32 v4, v4, v12, s64 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_and_b32_e32 v11, 1, v20 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v11 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:32 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v30 +; GFX9-NEXT: v_readlane_b32 s67, v31, 35 +; GFX9-NEXT: v_readlane_b32 s66, v31, 34 +; GFX9-NEXT: v_readlane_b32 s65, v31, 33 +; GFX9-NEXT: v_readlane_b32 s63, v31, 31 +; GFX9-NEXT: v_readlane_b32 s62, v31, 30 +; GFX9-NEXT: v_readlane_b32 s61, v31, 29 +; GFX9-NEXT: v_readlane_b32 s60, v31, 28 +; GFX9-NEXT: v_readlane_b32 s59, v31, 27 +; GFX9-NEXT: v_readlane_b32 s58, v31, 26 +; GFX9-NEXT: v_readlane_b32 s57, v31, 25 +; GFX9-NEXT: v_readlane_b32 s56, v31, 24 +; GFX9-NEXT: v_readlane_b32 s55, v31, 23 +; GFX9-NEXT: v_readlane_b32 s54, v31, 22 +; GFX9-NEXT: v_readlane_b32 s53, v31, 21 +; GFX9-NEXT: v_readlane_b32 s52, v31, 20 +; GFX9-NEXT: v_readlane_b32 s51, v31, 19 +; GFX9-NEXT: v_readlane_b32 s50, v31, 18 +; GFX9-NEXT: v_readlane_b32 s49, v31, 17 +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: v_cndmask_b32_e64 v5, v6, v7, s[46:47] +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[44:45] +; GFX9-NEXT: v_perm_b32 v5, v6, v5, s64 +; GFX9-NEXT: v_readlane_b32 s48, v31, 16 +; GFX9-NEXT: v_readlane_b32 s47, v31, 15 +; 
GFX9-NEXT: v_readlane_b32 s46, v31, 14 +; GFX9-NEXT: v_readlane_b32 s45, v31, 13 +; GFX9-NEXT: v_readlane_b32 s44, v31, 12 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v9, s[42:43] +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[40:41] +; GFX9-NEXT: v_perm_b32 v6, v7, v6, s64 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v22 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v34, v32, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v23 +; GFX9-NEXT: v_cndmask_b32_e64 v7, v22, v23, s[38:39] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[36:37] +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v20 +; GFX9-NEXT: v_perm_b32 v7, v8, v7, s64 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v20, v18, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[30:31] +; GFX9-NEXT: v_perm_b32 v8, v9, v8, s64 +; GFX9-NEXT: v_cndmask_b32_e64 v9, v11, v16, s[28:29] +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 +; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v16, s[26:27] +; GFX9-NEXT: v_perm_b32 v9, v11, v9, s64 +; GFX9-NEXT: v_cndmask_b32_e64 v11, v10, v21, s[24:25] +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v16, s[22:23] +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 +; GFX9-NEXT: v_perm_b32 v10, v10, v11, s64 +; GFX9-NEXT: v_cndmask_b32_e64 v11, v19, v24, s[20:21] +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX9-NEXT: v_cndmask_b32_e64 v19, v19, v20, s[16:17] +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 +; GFX9-NEXT: v_perm_b32 v11, v19, v11, s64 +; GFX9-NEXT: v_readlane_b32 s43, v31, 11 +; GFX9-NEXT: v_readlane_b32 s42, v31, 10 +; GFX9-NEXT: v_readlane_b32 s41, v31, 9 +; GFX9-NEXT: v_readlane_b32 s40, v31, 8 +; GFX9-NEXT: v_readlane_b32 s39, v31, 7 +; GFX9-NEXT: v_readlane_b32 s38, v31, 6 +; GFX9-NEXT: v_readlane_b32 s37, v31, 5 +; GFX9-NEXT: v_readlane_b32 s36, v31, 4 +; GFX9-NEXT: v_readlane_b32 s35, v31, 3 +; GFX9-NEXT: v_readlane_b32 s34, v31, 2 +; GFX9-NEXT: v_readlane_b32 s31, v31, 1 +; GFX9-NEXT: v_readlane_b32 s30, v31, 0 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_cndmask_b32_e64 v19, v12, v18, s[14:15] +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v18, s[12:13] +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_cndmask_b32_e64 v18, v13, v17, s[10:11] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v17, s[8:9] ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v17, v14, v16, s[6:7] +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, v16, s[4:5] +; 
GFX9-NEXT: v_perm_b32 v14, v14, v17, s64 +; GFX9-NEXT: v_perm_b32 v12, v12, v19, s64 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v34, v7, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v15, v20, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_cndmask_b32_e64 v15, v15, v17, s[18:19] +; GFX9-NEXT: v_perm_b32 v13, v13, v18, s64 +; GFX9-NEXT: v_perm_b32 v15, v15, v16, s64 +; GFX9-NEXT: v_readlane_b32 s64, v31, 32 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v34, v5, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v15 -; GFX9-NEXT: v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v12, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v14, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v30 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v31 -; GFX9-NEXT: v_or_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v10, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v26, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v24, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v22, v14 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_vselect_v32bf16: @@ -31226,233 +29820,205 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX10-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_and_b32_e32 v20, 1, v20 -; GFX10-NEXT: v_and_b32_e32 v22, 1, v22 -; GFX10-NEXT: v_and_b32_e32 v24, 1, v24 -; GFX10-NEXT: v_and_b32_e32 v30, 1, v30 +; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 1, v6 ; GFX10-NEXT: v_and_b32_e32 v8, 1, v8 ; GFX10-NEXT: v_and_b32_e32 v10, 1, v10 ; GFX10-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX10-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX10-NEXT: v_and_b32_e32 v16, 1, v16 ; GFX10-NEXT: s_clause 0x15 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 +; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 ; GFX10-NEXT: buffer_load_ushort v36, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:64 -; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:128 -; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 -; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:68 -; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 -; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 -; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:124 -; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:8 -; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 -; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 -; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:76 -; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:16 -; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:80 -; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 -; GFX10-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:84 -; GFX10-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:24 -; GFX10-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:88 -; GFX10-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:28 -; GFX10-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:92 -; GFX10-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:32 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v20 -; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22 -; GFX10-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:36 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v24 -; GFX10-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v30 -; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:40 -; GFX10-NEXT: v_cmp_eq_u32_e64 s7, 1, v0 -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 -; GFX10-NEXT: v_cmp_eq_u32_e64 s8, 1, v2 -; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 -; GFX10-NEXT: v_cmp_eq_u32_e64 s9, 1, v4 
-; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 -; GFX10-NEXT: v_cmp_eq_u32_e64 s10, 1, v6 -; GFX10-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:48 -; GFX10-NEXT: v_cmp_eq_u32_e64 s11, 1, v8 -; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120 -; GFX10-NEXT: v_cmp_eq_u32_e64 s12, 1, v10 -; GFX10-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:116 -; GFX10-NEXT: v_cmp_eq_u32_e64 s13, 1, v12 -; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:76 +; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 +; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 +; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 +; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 +; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:84 +; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:88 +; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 +; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 +; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:28 +; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 +; GFX10-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:32 +; GFX10-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:36 +; GFX10-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:104 +; GFX10-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:40 +; GFX10-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:100 +; GFX10-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:52 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v0 +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v2 +; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s7, 1, v4 +; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:120 +; GFX10-NEXT: v_cmp_eq_u32_e64 s8, 1, v3 +; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 +; GFX10-NEXT: v_cmp_eq_u32_e64 s9, 1, v8 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:116 +; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 +; GFX10-NEXT: v_cmp_eq_u32_e64 s10, 1, v10 +; GFX10-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 +; GFX10-NEXT: v_cmp_eq_u32_e64 s11, 1, v12 +; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:60 +; GFX10-NEXT: v_cmp_eq_u32_e64 s12, 1, v14 +; GFX10-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 +; GFX10-NEXT: v_cmp_eq_u32_e64 s13, 1, v16 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 +; GFX10-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 ; GFX10-NEXT: v_writelane_b32 v31, s30, 0 -; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX10-NEXT: v_and_b32_e32 v16, 1, v16 -; GFX10-NEXT: v_and_b32_e32 v18, 1, v18 -; GFX10-NEXT: v_writelane_b32 v31, s31, 1 -; GFX10-NEXT: v_and_b32_e32 v26, 1, v26 +; GFX10-NEXT: v_and_b32_e32 v30, 1, v30 ; GFX10-NEXT: v_and_b32_e32 v28, 1, v28 -; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX10-NEXT: v_writelane_b32 v31, s34, 2 -; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX10-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX10-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX10-NEXT: v_and_b32_e32 v13, 1, v13 -; GFX10-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX10-NEXT: v_and_b32_e32 v26, 1, v26 +; 
GFX10-NEXT: v_and_b32_e32 v24, 1, v24 +; GFX10-NEXT: v_writelane_b32 v31, s31, 1 +; GFX10-NEXT: v_and_b32_e32 v22, 1, v22 +; GFX10-NEXT: v_and_b32_e32 v20, 1, v20 ; GFX10-NEXT: v_and_b32_e32 v17, 1, v17 -; GFX10-NEXT: v_and_b32_e32 v19, 1, v19 -; GFX10-NEXT: v_and_b32_e32 v21, 1, v21 -; GFX10-NEXT: v_and_b32_e32 v23, 1, v23 -; GFX10-NEXT: v_and_b32_e32 v25, 1, v25 -; GFX10-NEXT: v_and_b32_e32 v27, 1, v27 +; GFX10-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX10-NEXT: v_writelane_b32 v31, s34, 2 ; GFX10-NEXT: v_and_b32_e32 v29, 1, v29 -; GFX10-NEXT: v_cmp_eq_u32_e64 s14, 1, v14 -; GFX10-NEXT: v_cmp_eq_u32_e64 s15, 1, v16 -; GFX10-NEXT: v_cmp_eq_u32_e64 s16, 1, v18 -; GFX10-NEXT: v_cmp_eq_u32_e64 s17, 1, v28 -; GFX10-NEXT: v_cmp_eq_u32_e64 s19, 1, v26 -; GFX10-NEXT: v_cmp_eq_u32_e64 s31, 1, v1 +; GFX10-NEXT: v_and_b32_e32 v27, 1, v27 +; GFX10-NEXT: v_and_b32_e32 v25, 1, v25 +; GFX10-NEXT: v_and_b32_e32 v23, 1, v23 +; GFX10-NEXT: v_and_b32_e32 v21, 1, v21 +; GFX10-NEXT: v_and_b32_e32 v19, 1, v19 +; GFX10-NEXT: v_and_b32_e32 v18, 1, v18 +; GFX10-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX10-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX10-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX10-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s15, 1, v20 +; GFX10-NEXT: v_cmp_eq_u32_e64 s16, 1, v22 +; GFX10-NEXT: v_cmp_eq_u32_e64 s17, 1, v24 +; GFX10-NEXT: v_cmp_eq_u32_e64 s18, 1, v26 +; GFX10-NEXT: v_cmp_eq_u32_e64 s19, 1, v28 +; GFX10-NEXT: v_cmp_eq_u32_e64 s20, 1, v30 +; GFX10-NEXT: v_cmp_eq_u32_e64 s22, 1, v7 +; GFX10-NEXT: v_cmp_eq_u32_e64 s23, 1, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 s27, 1, v17 ; GFX10-NEXT: v_writelane_b32 v31, s35, 3 -; GFX10-NEXT: v_cmp_eq_u32_e64 s18, 1, v29 -; GFX10-NEXT: v_cmp_eq_u32_e64 s20, 1, v27 -; GFX10-NEXT: v_cmp_eq_u32_e64 s21, 1, v25 -; GFX10-NEXT: v_cmp_eq_u32_e64 s22, 1, v23 -; GFX10-NEXT: v_cmp_eq_u32_e64 s23, 1, v21 -; GFX10-NEXT: v_cmp_eq_u32_e64 s24, 1, v19 -; GFX10-NEXT: v_cmp_eq_u32_e64 s25, 1, v17 +; GFX10-NEXT: v_cmp_eq_u32_e64 s14, 1, v18 +; GFX10-NEXT: v_cmp_eq_u32_e64 s21, 1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s24, 1, v11 +; GFX10-NEXT: v_cmp_eq_u32_e64 s25, 1, v13 ; GFX10-NEXT: v_cmp_eq_u32_e64 s26, 1, v15 -; GFX10-NEXT: v_cmp_eq_u32_e64 s27, 1, v13 -; GFX10-NEXT: v_cmp_eq_u32_e64 s28, 1, v11 -; GFX10-NEXT: v_cmp_eq_u32_e64 s29, 1, v7 -; GFX10-NEXT: v_cmp_eq_u32_e64 s30, 1, v3 -; GFX10-NEXT: v_cmp_eq_u32_e64 s34, 1, v5 -; GFX10-NEXT: v_cmp_eq_u32_e64 s35, 1, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 s28, 1, v19 +; GFX10-NEXT: v_cmp_eq_u32_e64 s29, 1, v21 +; GFX10-NEXT: v_cmp_eq_u32_e64 s30, 1, v23 +; GFX10-NEXT: v_cmp_eq_u32_e64 s31, 1, v25 +; GFX10-NEXT: v_cmp_eq_u32_e64 s34, 1, v27 +; GFX10-NEXT: v_cmp_eq_u32_e64 s35, 1, v29 ; GFX10-NEXT: s_waitcnt vmcnt(32) -; GFX10-NEXT: v_and_b32_e32 v1, 1, v36 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v32 ; GFX10-NEXT: s_waitcnt vmcnt(31) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v37 -; GFX10-NEXT: s_waitcnt vmcnt(30) -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v38 -; GFX10-NEXT: v_cndmask_b32_e64 v15, v38, v37, s6 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v33 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v32, v33, s4 ; GFX10-NEXT: s_waitcnt vmcnt(29) -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v34, v35, s5 ; GFX10-NEXT: s_waitcnt vmcnt(28) -; GFX10-NEXT: v_cndmask_b32_e64 v9, v48, v39, s7 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v48 -; GFX10-NEXT: s_waitcnt vmcnt(23) -; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v49 +; GFX10-NEXT: v_and_b32_e32 v17, 1, v36 +; GFX10-NEXT: 
v_lshrrev_b32_e32 v13, 16, v35 +; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v34 +; GFX10-NEXT: s_waitcnt vmcnt(26) +; GFX10-NEXT: v_cndmask_b32_e64 v18, v37, v38, s7 +; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v38 +; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v37 +; GFX10-NEXT: s_waitcnt vmcnt(24) +; GFX10-NEXT: v_cndmask_b32_e64 v21, v39, v48, s8 +; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v48 +; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v39 ; GFX10-NEXT: s_waitcnt vmcnt(22) -; GFX10-NEXT: v_cndmask_b32_e64 v14, v50, v49, s8 -; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v50 -; GFX10-NEXT: s_waitcnt vmcnt(21) -; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v51 +; GFX10-NEXT: v_cndmask_b32_e64 v24, v50, v49, s9 +; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v49 +; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v50 ; GFX10-NEXT: s_waitcnt vmcnt(20) -; GFX10-NEXT: v_cndmask_b32_e64 v18, v52, v51, s9 -; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v52 -; GFX10-NEXT: s_waitcnt vmcnt(19) -; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v53 +; GFX10-NEXT: v_cndmask_b32_e64 v27, v51, v52, s10 +; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v52 +; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v51 ; GFX10-NEXT: s_waitcnt vmcnt(18) -; GFX10-NEXT: v_cndmask_b32_e64 v23, v54, v53, s10 -; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v54 -; GFX10-NEXT: s_waitcnt vmcnt(17) -; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v55 +; GFX10-NEXT: v_cndmask_b32_e64 v30, v53, v54, s11 +; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v54 +; GFX10-NEXT: v_lshrrev_b32_e32 v33, 16, v53 ; GFX10-NEXT: s_waitcnt vmcnt(16) -; GFX10-NEXT: v_cndmask_b32_e64 v27, v64, v55, s11 -; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v64 -; GFX10-NEXT: s_waitcnt vmcnt(15) -; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v65 -; GFX10-NEXT: s_waitcnt vmcnt(14) -; GFX10-NEXT: v_cndmask_b32_e64 v36, v66, v65, s12 -; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v66 -; GFX10-NEXT: s_waitcnt vmcnt(13) -; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v67 +; GFX10-NEXT: v_cndmask_b32_e64 v34, v55, v64, s12 +; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v64 +; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v55 ; GFX10-NEXT: s_waitcnt vmcnt(12) -; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v68 -; GFX10-NEXT: s_waitcnt vmcnt(11) -; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v69 -; GFX10-NEXT: s_waitcnt vmcnt(10) -; GFX10-NEXT: v_cndmask_b32_e64 v50, v20, v69, s14 -; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX10-NEXT: v_cndmask_b32_e64 v37, v68, v65, s13 +; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v65 +; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v68 +; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v67 +; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v66 ; GFX10-NEXT: s_waitcnt vmcnt(9) -; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; GFX10-NEXT: s_waitcnt vmcnt(8) -; GFX10-NEXT: v_cndmask_b32_e64 v22, v24, v22, s15 -; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX10-NEXT: s_waitcnt vmcnt(7) -; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v30 -; GFX10-NEXT: s_waitcnt vmcnt(6) -; GFX10-NEXT: v_cndmask_b32_e64 v30, v0, v30, s16 +; GFX10-NEXT: v_cndmask_b32_e64 v52, v0, v2, s16 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_waitcnt vmcnt(5) -; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v34 -; GFX10-NEXT: v_cndmask_b32_e64 v34, v35, v34, s17 -; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX10-NEXT: s_waitcnt vmcnt(4) -; GFX10-NEXT: v_cndmask_b32_e64 v55, v4, v2, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX10-NEXT: s_waitcnt vmcnt(3) -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; 
GFX10-NEXT: v_lshrrev_b32_e32 v64, 16, v32 -; GFX10-NEXT: v_lshrrev_b32_e32 v65, 16, v33 -; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_cndmask_b32_e64 v33, v8, v33, s19 +; GFX10-NEXT: s_waitcnt vmcnt(6) +; GFX10-NEXT: v_cndmask_b32_e64 v53, v8, v69, s17 +; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v69 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cndmask_b32_e64 v32, v10, v32, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX10-NEXT: v_cndmask_b32_e64 v55, v4, v3, s18 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: v_cndmask_b32_e64 v64, v10, v12, s19 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v66, v12, v6, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v12 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v35, v54, s18 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v65, s20 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v64, s21 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v4, s22 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v53, s23 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v52, s24 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v24, v51, s25 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v20, v49, s26 -; GFX10-NEXT: v_cndmask_b32_e64 v20, v48, v38, s27 -; GFX10-NEXT: v_cndmask_b32_e64 v24, v37, v29, s28 -; GFX10-NEXT: v_cndmask_b32_e64 v21, v25, v21, s29 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v11, v7, s31 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v16, v13, s30 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v19, v17, s34 -; GFX10-NEXT: v_cndmask_b32_e64 v16, v28, v26, s35 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v39, v68, v67, s13 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v21 -; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v24 -; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v3 -; GFX10-NEXT: v_or_b32_sdwa v0, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v1, v14, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v2, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v3, v23, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v4, v27, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v5, v36, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v6, v39, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v7, v50, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v8, v22, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v9, v30, v21 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v10, v55, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v11, v66, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v12, v32, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v13, v33, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v14, v34, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v15, v15, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e64 v51, v1, v6, s15 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX10-NEXT: v_cndmask_b32_e64 v65, v14, v16, s20 +; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v9, v7, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v17 +; GFX10-NEXT: v_cndmask_b32_e64 v48, v66, v67, s14 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v15, v13, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v13, v20, v19, s21 +; GFX10-NEXT: v_cndmask_b32_e64 v15, v23, v22, s22 +; GFX10-NEXT: v_cndmask_b32_e64 v19, v26, v25, s23 +; GFX10-NEXT: v_cndmask_b32_e64 v20, v29, v28, s24 +; GFX10-NEXT: v_cndmask_b32_e64 v22, v33, v32, s25 +; GFX10-NEXT: v_cndmask_b32_e64 v23, v36, v35, s26 +; GFX10-NEXT: v_cndmask_b32_e64 v25, v39, v38, s27 +; GFX10-NEXT: v_cndmask_b32_e64 v26, v50, v49, s28 +; GFX10-NEXT: v_cndmask_b32_e64 v28, v1, v6, s29 +; GFX10-NEXT: v_cndmask_b32_e64 v17, v0, v2, s30 +; GFX10-NEXT: v_cndmask_b32_e64 v29, v8, v54, s31 +; GFX10-NEXT: v_cndmask_b32_e64 v32, v4, v3, s34 +; GFX10-NEXT: v_cndmask_b32_e64 v33, v10, v12, s35 +; GFX10-NEXT: v_cndmask_b32_e32 v16, v14, v16, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v7, v5, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v9, v11, 0x5040100 +; GFX10-NEXT: v_perm_b32 v2, v13, v18, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v15, v21, 0x5040100 +; GFX10-NEXT: v_perm_b32 v4, v19, v24, 0x5040100 +; GFX10-NEXT: v_perm_b32 v5, v20, v27, 0x5040100 +; GFX10-NEXT: v_perm_b32 v6, v22, v30, 0x5040100 +; GFX10-NEXT: v_perm_b32 v7, v23, v34, 0x5040100 +; GFX10-NEXT: v_perm_b32 v8, v25, v37, 0x5040100 +; GFX10-NEXT: v_perm_b32 v9, v26, v48, 0x5040100 +; GFX10-NEXT: v_perm_b32 v10, v28, v51, 0x5040100 +; GFX10-NEXT: v_perm_b32 v11, v17, v52, 0x5040100 +; GFX10-NEXT: v_perm_b32 v12, v29, v53, 0x5040100 +; GFX10-NEXT: v_perm_b32 v13, v32, v55, 0x5040100 +; GFX10-NEXT: v_perm_b32 v14, v33, v64, 0x5040100 +; GFX10-NEXT: v_perm_b32 v15, v16, v65, 0x5040100 ; GFX10-NEXT: v_readlane_b32 s35, v31, 3 ; GFX10-NEXT: v_readlane_b32 s34, v31, 2 ; GFX10-NEXT: v_readlane_b32 s31, v31, 1 @@ -31468,250 +30034,207 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x20 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 offset:4 +; GFX11-NEXT: scratch_load_u16 v31, off, s32 ; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:8 ; 
GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:12 ; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:16 ; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:20 ; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:24 ; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:28 ; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:32 ; GFX11-NEXT: scratch_load_b32 v64, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v65, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v65, off, s32 offset:36 ; GFX11-NEXT: scratch_load_b32 v66, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v67, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v67, off, s32 offset:40 ; GFX11-NEXT: scratch_load_b32 v68, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v69, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v69, off, s32 offset:44 ; GFX11-NEXT: scratch_load_b32 v70, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v71, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v71, off, s32 offset:48 ; GFX11-NEXT: scratch_load_b32 v80, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v81, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v81, off, s32 offset:52 ; GFX11-NEXT: scratch_load_b32 v82, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v83, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v83, off, s32 offset:56 ; GFX11-NEXT: scratch_load_b32 v84, off, s32 offset:124 -; GFX11-NEXT: scratch_load_b32 v85, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v85, off, s32 offset:60 ; GFX11-NEXT: scratch_load_b32 v86, off, s32 offset:128 -; GFX11-NEXT: scratch_load_u16 v87, off, s32 -; GFX11-NEXT: s_waitcnt vmcnt(32) -; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v31 -; GFX11-NEXT: s_waitcnt vmcnt(31) -; GFX11-NEXT: v_lshrrev_b32_e32 v97, 16, v32 +; GFX11-NEXT: scratch_load_b32 v87, off, s32 offset:64 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX11-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v27, 1, v27 +; GFX11-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX11-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX11-NEXT: v_and_b32_e32 v14, 1, v14 ; GFX11-NEXT: s_waitcnt vmcnt(30) -; GFX11-NEXT: v_lshrrev_b32_e32 v98, 16, v33 -; GFX11-NEXT: s_waitcnt vmcnt(29) -; GFX11-NEXT: v_lshrrev_b32_e32 v99, 16, v34 -; GFX11-NEXT: v_and_b32_e32 v28, 1, v28 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v32, v33, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX11-NEXT: v_and_b32_e32 v29, 1, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 ; GFX11-NEXT: v_and_b32_e32 v30, 1, v30 ; GFX11-NEXT: s_waitcnt vmcnt(28) -; GFX11-NEXT: v_lshrrev_b32_e32 v100, 16, v35 -; GFX11-NEXT: s_waitcnt vmcnt(27) -; GFX11-NEXT: v_lshrrev_b32_e32 v101, 16, v36 -; GFX11-NEXT: 
v_and_b32_e32 v26, 1, v26 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28 -; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v34, v35, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX11-NEXT: v_and_b32_e32 v23, 1, v23 +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-NEXT: v_and_b32_e32 v28, 1, v28 ; GFX11-NEXT: s_waitcnt vmcnt(26) -; GFX11-NEXT: v_lshrrev_b32_e32 v102, 16, v37 -; GFX11-NEXT: s_waitcnt vmcnt(25) -; GFX11-NEXT: v_lshrrev_b32_e32 v103, 16, v38 -; GFX11-NEXT: v_and_b32_e32 v24, 1, v24 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v36, v37, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX11-NEXT: v_and_b32_e32 v25, 1, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-NEXT: v_and_b32_e32 v26, 1, v26 ; GFX11-NEXT: s_waitcnt vmcnt(24) -; GFX11-NEXT: v_lshrrev_b32_e32 v112, 16, v39 -; GFX11-NEXT: s_waitcnt vmcnt(23) -; GFX11-NEXT: v_lshrrev_b32_e32 v113, 16, v48 -; GFX11-NEXT: v_and_b32_e32 v22, 1, v22 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v38, v39, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX11-NEXT: v_and_b32_e32 v19, 1, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GFX11-NEXT: v_and_b32_e32 v24, 1, v24 ; GFX11-NEXT: s_waitcnt vmcnt(22) -; GFX11-NEXT: v_lshrrev_b32_e32 v114, 16, v49 -; GFX11-NEXT: s_waitcnt vmcnt(21) -; GFX11-NEXT: v_lshrrev_b32_e32 v115, 16, v50 -; GFX11-NEXT: v_and_b32_e32 v20, 1, v20 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v48, v49, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 +; GFX11-NEXT: v_and_b32_e32 v21, 1, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v49 +; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; GFX11-NEXT: v_and_b32_e32 v22, 1, v22 ; GFX11-NEXT: s_waitcnt vmcnt(20) -; GFX11-NEXT: v_lshrrev_b32_e32 v116, 16, v51 -; GFX11-NEXT: s_waitcnt vmcnt(19) -; GFX11-NEXT: v_lshrrev_b32_e32 v117, 16, v52 -; GFX11-NEXT: v_and_b32_e32 v18, 1, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v50, v51, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 +; GFX11-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v51 +; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v50 +; GFX11-NEXT: v_and_b32_e32 v20, 1, v20 ; GFX11-NEXT: s_waitcnt vmcnt(18) -; GFX11-NEXT: v_lshrrev_b32_e32 v118, 16, v53 -; GFX11-NEXT: s_waitcnt vmcnt(17) -; GFX11-NEXT: v_lshrrev_b32_e32 v119, 16, v54 -; GFX11-NEXT: v_and_b32_e32 v16, 1, v16 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v52, v53, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 +; GFX11-NEXT: v_and_b32_e32 v17, 1, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v53 +; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v52 +; GFX11-NEXT: v_and_b32_e32 v18, 1, v18 ; GFX11-NEXT: s_waitcnt vmcnt(16) -; GFX11-NEXT: v_lshrrev_b32_e32 v128, 16, v55 -; GFX11-NEXT: s_waitcnt vmcnt(15) -; GFX11-NEXT: v_lshrrev_b32_e32 v129, 16, v64 +; GFX11-NEXT: v_cndmask_b32_e32 v14, v54, v55, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v55 +; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v54 +; GFX11-NEXT: v_and_b32_e32 v16, 1, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16 +; GFX11-NEXT: s_waitcnt vmcnt(14) +; GFX11-NEXT: v_dual_cndmask_b32 v16, v64, v65 :: v_dual_and_b32 v11, 1, v11 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v18 +; GFX11-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v65 +; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v64 +; GFX11-NEXT: s_waitcnt vmcnt(12) 
+; GFX11-NEXT: v_cndmask_b32_e32 v18, v66, v67, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v20 +; GFX11-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v67 +; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v66 +; GFX11-NEXT: s_waitcnt vmcnt(10) +; GFX11-NEXT: v_cndmask_b32_e32 v20, v68, v69, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22 +; GFX11-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v69 +; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v68 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: v_cndmask_b32_e32 v22, v70, v71, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24 +; GFX11-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v71 +; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v70 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: v_cndmask_b32_e32 v24, v80, v81, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26 +; GFX11-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v81 +; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v80 ; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v83 -; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: v_cndmask_b32_e32 v83, v84, v83, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v30 -; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v26, v82, v83, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28 +; GFX11-NEXT: v_and_b32_e32 v31, 1, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v83 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v82 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v85 -; GFX11-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v130, 16, v65 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cndmask_b32_e32 v85, v86, v85, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26 -; GFX11-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v131, 16, v66 -; GFX11-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v132, 16, v67 -; GFX11-NEXT: v_cndmask_b32_e32 v26, v82, v81, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24 +; GFX11-NEXT: v_cndmask_b32_e32 v28, v84, v85, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v30 ; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v133, 16, v68 -; GFX11-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v134, 16, v69 -; GFX11-NEXT: v_cndmask_b32_e32 v24, v80, v71, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22 -; GFX11-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v135, 16, v70 -; GFX11-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v144, 16, v71 -; GFX11-NEXT: v_cndmask_b32_e32 v22, v70, v69, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v20 -; GFX11-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v145, 16, v80 -; GFX11-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v146, 16, v81 -; GFX11-NEXT: v_cndmask_b32_e32 v20, v68, v67, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v18 -; GFX11-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v147, 16, v82 -; GFX11-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 16, v85 ; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v84 -; GFX11-NEXT: v_cndmask_b32_e32 v18, v66, v65, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16 -; GFX11-NEXT: v_and_b32_e32 v9, 1, v9 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v87, 1, v87 -; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v86 -; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18 
-; GFX11-NEXT: v_cndmask_b32_e32 v16, v64, v55, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 -; GFX11-NEXT: v_and_b32_e32 v15, 1, v15 -; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-NEXT: v_cndmask_b32_e32 v14, v54, v53, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 -; GFX11-NEXT: v_and_b32_e32 v13, 1, v13 -; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-NEXT: v_cndmask_b32_e32 v12, v52, v51, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 -; GFX11-NEXT: v_and_b32_e32 v19, 1, v19 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-NEXT: v_cndmask_b32_e32 v10, v50, v49, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 -; GFX11-NEXT: v_and_b32_e32 v17, 1, v17 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v48, v39, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX11-NEXT: v_and_b32_e32 v23, 1, v23 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v38, v37, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 -; GFX11-NEXT: v_and_b32_e32 v21, 1, v21 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v36, v35, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX11-NEXT: v_and_b32_e32 v27, 1, v27 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v34, v33, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_and_b32_e32 v25, 1, v25 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v32, v31, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v87 -; GFX11-NEXT: v_and_b32_e32 v29, 1, v29 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v85 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v30, v86, v30, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v29 -; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v83 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX11-NEXT: v_cndmask_b32_e32 v28, v84, v28, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v27 -; GFX11-NEXT: v_dual_cndmask_b32 v27, v147, v146 :: v_dual_lshlrev_b32 v28, 16, v28 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v25 +; GFX11-NEXT: v_cndmask_b32_e32 v30, v86, v87, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v87 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v86 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v32, v33, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-NEXT: v_cndmask_b32_e32 v25, v145, v144, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v23 -; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-NEXT: v_cndmask_b32_e32 v23, v135, v134, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v21 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v34, v35, 
vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 +; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v36, v37, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-NEXT: v_cndmask_b32_e32 v21, v133, v132, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v19 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-NEXT: v_cndmask_b32_e32 v19, v131, v130, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v17 +; GFX11-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v38, v39, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 +; GFX11-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v48, v49, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-NEXT: v_cndmask_b32_e32 v17, v129, v128, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-NEXT: v_cndmask_b32_e32 v15, v119, v118, vcc_lo +; GFX11-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v50, v51, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 +; GFX11-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v52, v53, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v117, v116, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v115, v114, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 +; GFX11-NEXT: v_perm_b32 v6, v13, v12, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v15, v54, v55, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v17 +; GFX11-NEXT: v_perm_b32 v7, v15, v14, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v64, v65, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v19 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v113, v112, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v103, v102, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 +; GFX11-NEXT: v_perm_b32 v8, v17, v16, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v66, v67, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v21 +; GFX11-NEXT: v_perm_b32 v9, v19, v18, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v21, v68, v69, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v23 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v101, v100, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v99, v98, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11-NEXT: v_perm_b32 v10, v21, v20, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v23, v70, v71, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v25 +; GFX11-NEXT: v_perm_b32 v11, v23, v22, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v25, v80, v81, vcc_lo +; GFX11-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 1, v27 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v12, v25, v24, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v27, v82, v83, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v29 +; GFX11-NEXT: v_perm_b32 v13, v27, v26, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v84, v85, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v31 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v97, v96, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 -; GFX11-NEXT: v_or_b32_e32 v3, v6, v7 -; GFX11-NEXT: v_or_b32_e32 v4, v8, v9 -; GFX11-NEXT: v_or_b32_e32 v5, v10, v11 -; GFX11-NEXT: v_or_b32_e32 v6, v12, v13 -; GFX11-NEXT: v_or_b32_e32 v7, v14, v15 -; GFX11-NEXT: v_or_b32_e32 v8, v16, v17 -; GFX11-NEXT: v_or_b32_e32 v9, v18, v19 -; GFX11-NEXT: v_or_b32_e32 v10, v20, v21 -; GFX11-NEXT: v_or_b32_e32 v11, v22, v23 -; GFX11-NEXT: v_or_b32_e32 v12, v24, v25 -; GFX11-NEXT: v_or_b32_e32 v13, v26, v27 -; GFX11-NEXT: v_or_b32_e32 v14, v29, v28 -; GFX11-NEXT: v_or_b32_e32 v15, v31, v30 +; GFX11-NEXT: v_perm_b32 v14, v29, v28, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v86, v87, vcc_lo +; GFX11-NEXT: v_perm_b32 v15, v31, v30, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = select <32 x i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b ret <32 x bfloat> %op @@ -31746,42 +30269,42 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX8-LABEL: v_fma_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_fma_f32 v0, v0, v1, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fma_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fma_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fma_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, 
v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_fmac_f32_e32 v2, v0, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c) ret bfloat %op @@ -31821,59 +30344,59 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> ; GFX8-LABEL: v_fma_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_fma_f32 v3, v5, v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_fma_f32 v0, v0, v1, v2 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX8-NEXT: v_fma_f32 v3, v5, v4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fma_v2bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_fma_f32 v3, v5, v4, v3 ; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fma_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v4 ; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1 -; GFX10-NEXT: v_perm_b32 v0, v2, v3, 0x3020706 +; GFX10-NEXT: v_perm_b32 v0, v2, v3, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fma_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; 
GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_dual_fmac_f32 v2, v0, v1 :: v_dual_fmac_f32 v3, v5, v4 -; GFX11-NEXT: v_perm_b32 v0, v2, v3, 0x3020706 +; GFX11-NEXT: v_perm_b32 v0, v2, v3, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) ret <2 x bfloat> %op @@ -31923,80 +30446,97 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> ; GFX8-LABEL: v_fma_v3bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_fma_f32 v1, v1, v3, v5 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_fma_f32 v3, v6, v5, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_fma_f32 v0, v0, v2, v4 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 +; GFX8-NEXT: v_fma_f32 v6, v8, v7, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_fma_f32 v2, v6, v4, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_fma_f32 v1, v1, v3, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fma_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_fma_f32 v3, v6, v5, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_fma_f32 v6, v8, v7, v6 ; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 
+; GFX9-NEXT: v_perm_b32 v0, v0, v6, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX9-NEXT: v_fma_f32 v2, v6, v4, v2 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_fma_f32 v1, v1, v3, v4 +; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fma_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v7 ; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2 +; GFX10-NEXT: v_fmac_f32_e32 v9, v11, v10 ; GFX10-NEXT: v_fmac_f32_e32 v5, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v4, v6, 0x3020706 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX10-NEXT: v_perm_b32 v0, v4, v6, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v5, v9, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fma_v3bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fmac_f32_e32 v6, v8, v7 -; GFX11-NEXT: v_dual_fmac_f32 v4, v0, v2 :: v_dual_fmac_f32 v5, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v4, v6, 0x3020706 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_fmac_f32 v5, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_fmac_f32_e32 v4, v0, v2 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_fmac_f32 v9, v11, v10 :: v_dual_fmac_f32 v6, v8, v7 +; GFX11-NEXT: v_perm_b32 v1, v5, v9, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v4, v6, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) ret <3 x bfloat> %op @@ -32056,100 +30596,97 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ; GFX8-LABEL: v_fma_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_fma_f32 v6, v8, v7, v6 -; GFX8-NEXT: v_fma_f32 v1, v1, v3, v5 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_fma_f32 v3, v7, v5, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_fma_f32 v0, v0, v2, v4 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX8-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX8-NEXT: v_fma_f32 v6, v8, v7, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_fma_f32 v2, v6, v4, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_fma_f32 v1, v1, v3, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fma_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_fma_f32 v6, v8, v7, v6 -; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_fma_f32 v3, v7, v5, v3 ; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v6, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX9-NEXT: 
v_lshlrev_b32_e32 v4, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX9-NEXT: v_fma_f32 v2, v6, v4, v2 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_fma_f32 v1, v1, v3, v4 +; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fma_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_fmac_f32_e32 v5, v1, v3 ; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v7 -; GFX10-NEXT: v_fmac_f32_e32 v9, v11, v10 ; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX10-NEXT: v_perm_b32 v0, v4, v9, 0x3020706 -; GFX10-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_fmac_f32_e32 v9, v11, v10 +; GFX10-NEXT: v_fmac_f32_e32 v5, v1, v3 +; GFX10-NEXT: v_perm_b32 v0, v4, v6, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v5, v9, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fma_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_fmac_f32_e32 v5, v1, v3 -; GFX11-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_and_b32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX11-NEXT: v_dual_fmac_f32 v7, v9, v8 :: v_dual_fmac_f32 v4, v0, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v4, v7, 0x3020706 -; GFX11-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_fmac_f32 v5, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_fmac_f32_e32 v4, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_fmac_f32 v9, v11, v10 :: v_dual_fmac_f32 v6, v8, v7 +; GFX11-NEXT: v_perm_b32 v1, v5, v9, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v4, v6, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) ret <4 x bfloat> %op @@ -32188,50 +30725,50 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX8-LABEL: v_fmuladd_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fmuladd_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmuladd_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fmuladd_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mul_f32 v0, v0, v1 :: v_dual_and_b32 v1, 0xffff0000, v2 +; GFX11-NEXT: v_dual_mul_f32 v0, v0, v1 :: v_dual_lshlrev_b32 v1, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.fmuladd.bf16(bfloat %a, bfloat %b, bfloat %c) ret bfloat %op @@ -32275,73 +30812,73 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: v_fmuladd_v2bf16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 +; +; GFX8-LABEL: v_fmuladd_v2bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fmuladd_v2bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX9-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmuladd_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, 
v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_add_f32_e32 v1, v3, v1 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x3020706 +; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fmuladd_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_mul_f32 v0, v0, v1 :: v_dual_and_b32 v1, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_dual_mul_f32 v0, v0, v1 :: v_dual_lshlrev_b32 v1, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 @@ -32349,7 +30886,7 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020706 +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) ret <2 x bfloat> %op @@ -32411,108 +30948,132 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX8-LABEL: v_fmuladd_v3bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX8-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v4 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX8-NEXT: 
v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fmuladd_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX9-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, v4, v2 ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX9-NEXT: v_add_f32_e32 v6, v6, v7 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_perm_b32 v0, v0, v6, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmuladd_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_mul_f32_e32 v6, v7, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_mul_f32_e32 v6, v8, v7 +; GFX10-NEXT: v_mul_f32_e32 v1, v1, 
v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v4 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v6 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 +; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fmuladd_v3bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX11-NEXT: v_mul_f32_e32 v6, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mul_f32 v6, v7, v6 :: v_dual_lshlrev_b32 v7, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_mul_f32_e32 v6, v8, v7 +; GFX11-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX11-NEXT: v_dual_add_f32 v0, v0, v3 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v2, v2, v7 :: v_dual_and_b32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_add_f32_e32 v3, v4, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) ret <3 x bfloat> %op @@ -32588,135 +31149,132 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX8-LABEL: v_fmuladd_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX8-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_mul_f32_e32 v2, v4, v2 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX8-NEXT: v_mul_f32_e32 v6, v7, v6 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX8-NEXT: v_add_f32_e32 v6, v6, v7 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX8-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fmuladd_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX9-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v3, 
0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, v4, v2 ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX9-NEXT: v_mul_f32_e32 v6, v7, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v5 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX9-NEXT: v_add_f32_e32 v6, v6, v7 -; GFX9-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_perm_b32 v0, v0, v6, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmuladd_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_mul_f32_e32 v6, v7, v6 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; GFX10-NEXT: v_mul_f32_e32 v6, v8, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v5 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX10-NEXT: v_mul_f32_e32 v6, v8, v7 +; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v4 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX10-NEXT: v_add_f32_e32 v2, v3, v7 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v6 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x3020706 -; GFX10-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v6 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fmuladd_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v5 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_mul_f32 v6, v7, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v2, v6, v9 :: v_dual_mul_f32 v3, v8, v7 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_dual_mul_f32 v6, v7, v6 :: v_dual_lshlrev_b32 v7, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_and_b32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x3020706 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_mul_f32_e32 v6, v8, v7 +; GFX11-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX11-NEXT: v_dual_add_f32 v0, v0, v3 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v2, v2, v7 :: v_dual_and_b32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_add_f32_e32 v3, v4, v6 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) ret <4 x bfloat> %op diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll index e62dfe100642b..9d21bf0fea716 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll @@ -1119,10 +1119,9 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out, ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_brev_b32 s4, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s3, s3, 16 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_bfi_b32 v2, s4, v0, v1 +; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s3 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_bfi_b32 v2, s4, v1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1133,9 +1132,8 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out, ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, s3 +; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll index e9bf515daabca..a69fb35f8f0cb 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll @@ -787,7 +787,7 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar ; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmed3_f32_fpext_f16_fptrunc_bf16: @@ -797,7 +797,7 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar ; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX9-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %arg0.ext = fpext half %arg0 to float %arg1.ext = fpext half %arg1 to float @@ -1021,6 +1021,9 @@ define half @fmed3_f32_fpext_bf16(bfloat %arg0, bfloat %arg1, bfloat %arg2) #1 { ; GFX8-LABEL: fmed3_f32_fpext_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2 ; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1028,6 +1031,9 @@ define half @fmed3_f32_fpext_bf16(bfloat %arg0, bfloat %arg1, bfloat %arg2) #1 { ; GFX9-LABEL: fmed3_f32_fpext_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; 
GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_med3_f32 v0, v0, v1, v2 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1053,6 +1059,7 @@ define half @fmed3_f32_fpext_f16_bf16_0(bfloat %arg0, half %arg1, half %arg2) #1 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2 ; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1062,6 +1069,7 @@ define half @fmed3_f32_fpext_f16_bf16_0(bfloat %arg0, half %arg1, half %arg2) #1 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_med3_f32 v0, v0, v1, v2 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1087,6 +1095,7 @@ define half @fmed3_f32_fpext_f16_bf16_1(half %arg0, bfloat %arg1, half %arg2) #1 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2 ; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1096,6 +1105,7 @@ define half @fmed3_f32_fpext_f16_bf16_1(half %arg0, bfloat %arg1, half %arg2) #1 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_med3_f32 v0, v0, v1, v2 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1121,6 +1131,7 @@ define half @fmed3_f32_fpext_f16_bf16_2(half %arg0, half %arg1, bfloat %arg2) #1 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2 ; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1130,6 +1141,7 @@ define half @fmed3_f32_fpext_f16_bf16_2(half %arg0, half %arg1, bfloat %arg2) #1 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_med3_f32 v0, v0, v1, v2 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index 62e7a232d2b25..9a8ddb5bd3831 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -1243,17 +1243,17 @@ define double @fneg_f64_bitcast_build_vector_v4bf16_to_f64(bfloat %elt0, bfloat ; GFX9-LABEL: fneg_f64_bitcast_build_vector_v4bf16_to_f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_perm_b32 v2, v2, v3, s4 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s4 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fneg_f64_bitcast_build_vector_v4bf16_to_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_perm_b32 v2, v2, v3, 0x3020706 -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 
0x3020706 +; GFX11-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll index 3a6ecc2149489..5f2a8c4ba9824 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll @@ -1875,7 +1875,7 @@ define void @void_func_bf16_inreg(bfloat inreg %arg0) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: global_store_short_d16_hi v[0:1], v0, off +; GFX9-NEXT: global_store_short v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1883,7 +1883,7 @@ define void @void_func_bf16_inreg(bfloat inreg %arg0) #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: global_store_d16_hi_b16 v[0:1], v0, off +; GFX11-NEXT: global_store_b16 v[0:1], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] store bfloat %arg0, ptr addrspace(1) undef ret void diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll index f24cc6f177d62..ef2be37d62d9d 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -2818,13 +2818,13 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 @@ -2833,18 +2833,17 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_and_b32_e32 v0, 1, v16 +; VI-NEXT: v_and_b32_e32 v0, 1, v20 ; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v17, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v16, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_short v17, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_short v18, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_short v19, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_short v20, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: void_func_v32i32_i1_i8_i16_bf16: @@ -2860,13 +2859,14 @@ define void 
@void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:20 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2876,33 +2876,32 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v16 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v20 ; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v17, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v16, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v17, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_short v18, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_short v19, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v20, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v32i32_i1_i8_i16_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x5 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:20 ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: scratch_load_u8 v33, off, s32 offset:4 -; GFX11-NEXT: scratch_load_u16 v34, off, s32 offset:8 -; GFX11-NEXT: scratch_load_u16 v35, off, s32 offset:12 -; GFX11-NEXT: scratch_load_u16 v36, off, s32 offset:16 +; GFX11-NEXT: scratch_load_u8 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_u16 v33, off, s32 offset:8 +; GFX11-NEXT: scratch_load_u16 v34, off, s32 offset:12 +; GFX11-NEXT: scratch_load_u16 v35, off, s32 offset:16 +; GFX11-NEXT: scratch_load_u16 v36, off, s32 offset:20 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: s_waitcnt vmcnt(5) ; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc @@ -2911,9 +2910,8 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: v_and_b32_e32 v16, 1, v33 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: v_and_b32_e32 v16, 1, v32 ; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc ; 
GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc @@ -2924,8 +2922,11 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b8 v16, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: buffer_store_b8 v33, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: buffer_store_b8 v34, off, s[0:3], 0 dlc +; GFX11-NEXT: buffer_store_b16 v34, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: buffer_store_b16 v35, off, s[0:3], 0 dlc @@ -2933,8 +2934,6 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_store_b16 v36, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b16 v32, off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef store volatile i1 %arg1, ptr addrspace(1) undef @@ -3166,35 +3165,29 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v20 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v20, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v19, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_short v13, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_short v16, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_short v12, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_short v20, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16: @@ -3212,55 +3205,45 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: 
buffer_load_dword v20, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v20 ; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v20, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v16, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v17, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v18, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v19, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v13, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v16, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v12, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v20, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x5 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:20 ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:16 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v32 -; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v33 -; GFX11-NEXT: s_waitcnt vmcnt(3) ; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc @@ -3277,6 +3260,12 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: buffer_store_b32 v32, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: buffer_store_b32 v33, off, s[0:3], 0 dlc +; 
GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: buffer_store_b32 v34, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3286,14 +3275,6 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_store_b32 v36, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b16 v38, off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b16 v33, off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b16 v37, off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b16 v32, off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef store volatile <2 x i16> %arg1, ptr addrspace(1) undef @@ -4656,20 +4637,28 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { define void @void_func_bf16(bfloat %arg0) #0 { -; CIGFX89-LABEL: void_func_bf16: -; CIGFX89: ; %bb.0: -; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIGFX89-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 -; CIGFX89-NEXT: s_mov_b32 s6, -1 -; CIGFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 -; CIGFX89-NEXT: s_waitcnt vmcnt(0) -; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: void_func_bf16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: void_func_bf16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll index d0a8e53905652..acadee2798171 100644 --- a/llvm/test/CodeGen/AMDGPU/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll @@ -2365,29 +2365,21 @@ define bfloat @bf16_func_void() #0 { ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: bf16_func_void: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bf16_func_void: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_short_d16_hi v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX89-LABEL: bf16_func_void: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt 
vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: bf16_func_void: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: global_load_d16_hi_b16 v0, v[0:1], off +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load bfloat, ptr addrspace(1) undef @@ -2440,28 +2432,14 @@ define <3 x bfloat> @v3bf16_func_void() #0 { ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; CI-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v3bf16_func_void: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v3bf16_func_void: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v2, v2, 0, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX89-LABEL: v3bf16_func_void: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v3bf16_func_void: ; GFX11: ; %bb.0: @@ -2470,11 +2448,6 @@ define <3 x bfloat> @v3bf16_func_void() #0 { ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_bfi_b32 v2, 0xffff, 0, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load <3 x bfloat>, ptr addrspace(1) undef ret <3 x bfloat> %val diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll index bdaa224439c53..145ab4ae6378b 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -16599,7 +16599,6 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 { ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_bf16@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_bf16@abs32@lo -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -16624,7 +16623,6 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s34, 2 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_bf16@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_bf16@abs32@lo ; GFX10-NEXT: s_addk_i32 s32, 0x200 @@ -16652,7 +16650,6 @@ define amdgpu_gfx void 
@test_call_external_void_func_bf16(i16 %arg) #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_mov_b32 s1, external_void_func_bf16@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, external_void_func_bf16@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 @@ -16681,7 +16678,6 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 { ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_bf16@abs32@hi ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_bf16@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 @@ -16943,7 +16939,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 { ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3bf16@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3bf16@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -16970,7 +16965,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_writelane_b32 v40, s34, 2 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3bf16@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3bf16@abs32@lo ; GFX10-NEXT: s_addk_i32 s32, 0x200 @@ -16998,7 +16992,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3bf16@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3bf16@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 @@ -17027,7 +17020,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 { ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3bf16@abs32@hi ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3bf16@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 @@ -17401,19 +17393,16 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg) ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 3 -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_bf16@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_bf16@abs32@lo -; GFX9-NEXT: s_lshl_b32 s4, s4, 16 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 2 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 2 -; GFX9-NEXT: v_readlane_b32 s30, v40, 1 -; 
GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 3 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -17431,19 +17420,16 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg) ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 3 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_bf16@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_bf16@abs32@lo ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_lshl_b32 s4, s4, 16 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-NEXT: v_readlane_b32 s30, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 3 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -17461,20 +17447,17 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg) ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: s_mov_b32 s1, external_void_func_bf16@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, external_void_func_bf16@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_lshl_b32 s4, s4, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 -; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 2 -; GFX11-NEXT: v_readlane_b32 s30, v40, 1 -; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 3 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -17492,19 +17475,16 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg) ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_bf16@abs32@hi ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_bf16@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_lshl_b32 s4, s4, 16 -; GFX10-SCRATCH-NEXT: 
v_writelane_b32 v40, s30, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -17755,19 +17735,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 3 -; GFX9-NEXT: v_writelane_b32 v40, s5, 0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 1 -; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3bf16@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3bf16@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 2 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 2 -; GFX9-NEXT: v_readlane_b32 s30, v40, 1 -; GFX9-NEXT: v_readlane_b32 s5, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 3 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -17785,19 +17762,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 3 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3bf16@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3bf16@abs32@lo ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 0 -; GFX10-NEXT: s_and_b32 s5, s5, 0xffff -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-NEXT: v_readlane_b32 s30, v40, 1 -; GFX10-NEXT: v_readlane_b32 s5, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 3 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -17815,20 +17789,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; 
GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3bf16@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3bf16@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s5, 0 -; GFX11-NEXT: s_and_b32 s5, s5, 0xffff -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 -; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 2 -; GFX11-NEXT: v_readlane_b32 s30, v40, 1 -; GFX11-NEXT: v_readlane_b32 s5, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 3 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -17846,19 +17817,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3bf16@abs32@hi ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3bf16@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 0 -; GFX10-SCRATCH-NEXT: s_and_b32 s5, s5, 0xffff -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll index 3f4b039a976ed..f3c31d562e8af 100644 --- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll @@ -613,11 +613,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a, ; DAGISEL-GFX11-NEXT: {{ $}} ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -65536 - ; DAGISEL-GFX11-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[S_MOV_B32_]], [[COPY]], implicit $exec - ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; DAGISEL-GFX11-NEXT: [[S_PACK_LH_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LH_B32_B16 killed [[S_MOV_B32_1]], [[COPY1]] - ; DAGISEL-GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_PACK_LH_B32_B16_]], 0, killed [[V_AND_B32_e64_]], 0, 0, implicit $mode, implicit $exec + 
; DAGISEL-GFX11-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc + ; DAGISEL-GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] ; DAGISEL-GFX11-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) @@ -629,11 +627,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a, ; DAGISEL-GFX10-NEXT: {{ $}} ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -65536 - ; DAGISEL-GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[S_MOV_B32_]], [[COPY]], implicit $exec - ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; DAGISEL-GFX10-NEXT: [[S_PACK_LH_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LH_B32_B16 killed [[S_MOV_B32_1]], [[COPY1]] - ; DAGISEL-GFX10-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_PACK_LH_B32_B16_]], 0, killed [[V_AND_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc + ; DAGISEL-GFX10-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] ; DAGISEL-GFX10-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll index c8570d6f279a6..c629432c8bec0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll @@ -5731,19 +5731,19 @@ define float @v_exp_f32_from_fpext_bf16(bfloat %src) { ; VI-LABEL: v_exp_f32_from_fpext_bf16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 -; VI-NEXT: v_sub_f32_e32 v4, v0, v1 -; VI-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v1 -; VI-NEXT: v_mul_f32_e32 v5, 0x39a3b295, v4 -; VI-NEXT: v_mul_f32_e32 v4, 0x3fb8a000, v4 -; VI-NEXT: v_rndne_f32_e32 v3, v2 -; VI-NEXT: v_add_f32_e32 v4, v4, v5 -; VI-NEXT: v_mul_f32_e32 v1, 0x39a3b295, v1 -; VI-NEXT: v_sub_f32_e32 v2, v2, v3 -; VI-NEXT: v_add_f32_e32 v1, v1, v4 -; VI-NEXT: v_add_f32_e32 v1, v2, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_sub_f32_e32 v3, v0, v0 +; VI-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v0 +; VI-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v3 +; VI-NEXT: v_mul_f32_e32 v3, 0x3fb8a000, v3 +; VI-NEXT: v_rndne_f32_e32 v2, v1 +; VI-NEXT: v_add_f32_e32 v3, v3, v4 +; VI-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v0 +; VI-NEXT: v_sub_f32_e32 v1, v1, v2 +; VI-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-NEXT: v_add_f32_e32 v1, v1, v3 ; VI-NEXT: v_exp_f32_e32 v1, v1 -; VI-NEXT: 
v_cvt_i32_f32_e32 v2, v3 +; VI-NEXT: v_cvt_i32_f32_e32 v2, v2 ; VI-NEXT: s_mov_b32 s4, 0xc2ce8ed0 ; VI-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0 ; VI-NEXT: s_mov_b32 s4, 0x42b17218 @@ -5757,6 +5757,7 @@ define float @v_exp_f32_from_fpext_bf16(bfloat %src) { ; GFX900-LABEL: v_exp_f32_from_fpext_bf16: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX900-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 ; GFX900-NEXT: s_mov_b32 s4, 0x3fb8aa3b ; GFX900-NEXT: v_rndne_f32_e32 v2, v1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll index eee254398aefc..09f51ee32c715 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll @@ -5809,19 +5809,19 @@ define float @v_exp10_f32_from_fpext_bf16(bfloat %src) { ; VI-LABEL: v_exp10_f32_from_fpext_bf16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 -; VI-NEXT: v_sub_f32_e32 v4, v0, v1 -; VI-NEXT: v_mul_f32_e32 v2, 0x40549000, v1 -; VI-NEXT: v_mul_f32_e32 v5, 0x3a2784bc, v4 -; VI-NEXT: v_mul_f32_e32 v4, 0x40549000, v4 -; VI-NEXT: v_rndne_f32_e32 v3, v2 -; VI-NEXT: v_add_f32_e32 v4, v4, v5 -; VI-NEXT: v_mul_f32_e32 v1, 0x3a2784bc, v1 -; VI-NEXT: v_sub_f32_e32 v2, v2, v3 -; VI-NEXT: v_add_f32_e32 v1, v1, v4 -; VI-NEXT: v_add_f32_e32 v1, v2, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_sub_f32_e32 v3, v0, v0 +; VI-NEXT: v_mul_f32_e32 v1, 0x40549000, v0 +; VI-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v3 +; VI-NEXT: v_mul_f32_e32 v3, 0x40549000, v3 +; VI-NEXT: v_rndne_f32_e32 v2, v1 +; VI-NEXT: v_add_f32_e32 v3, v3, v4 +; VI-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v0 +; VI-NEXT: v_sub_f32_e32 v1, v1, v2 +; VI-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-NEXT: v_add_f32_e32 v1, v1, v3 ; VI-NEXT: v_exp_f32_e32 v1, v1 -; VI-NEXT: v_cvt_i32_f32_e32 v2, v3 +; VI-NEXT: v_cvt_i32_f32_e32 v2, v2 ; VI-NEXT: s_mov_b32 s4, 0xc23369f4 ; VI-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0 ; VI-NEXT: s_mov_b32 s4, 0x421a209b @@ -5835,6 +5835,7 @@ define float @v_exp10_f32_from_fpext_bf16(bfloat %src) { ; GFX900-LABEL: v_exp10_f32_from_fpext_bf16: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX900-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0 ; GFX900-NEXT: s_mov_b32 s4, 0x40549a78 ; GFX900-NEXT: v_rndne_f32_e32 v2, v1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index 587109b9431a1..b6cea034b9de7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -1968,19 +1968,49 @@ define float @v_exp2_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) { } define float @v_exp2_f32_from_fpext_bf16(bfloat %src) { -; GCN-LABEL: v_exp2_f32_from_fpext_bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, 0xc2fc0000 -; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN-NEXT: v_add_f32_e32 v0, v0, v2 -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_exp2_f32_from_fpext_bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-NEXT: v_mov_b32_e32 v2, 0x42800000 +; 
SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-NEXT: v_add_f32_e32 v0, v0, v2 +; SI-NEXT: v_exp_f32_e32 v0, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_exp2_f32_from_fpext_bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: s_mov_b32 s4, 0xc2fc0000 +; VI-NEXT: v_mov_b32_e32 v1, 0x42800000 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-NEXT: v_add_f32_e32 v0, v0, v1 +; VI-NEXT: v_exp_f32_e32 v0, v0 +; VI-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; VI-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; VI-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_exp2_f32_from_fpext_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX900-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX900-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp2_f32_from_fpext_bf16: ; R600: ; %bb.0: @@ -2936,5 +2966,3 @@ declare <3 x half> @llvm.exp2.v3f16(<3 x half>) #2 attributes #0 = { "denormal-fp-math-f32"="ieee,preserve-sign" } attributes #1 = { "denormal-fp-math-f32"="dynamic,dynamic" } attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; SI: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll index 7723a724c6086..ca3336beccf77 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll @@ -173,8 +173,7 @@ define i1 @snan_bf16(bfloat %x) nounwind { ; GFX8CHECK-LABEL: snan_bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX8CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7fc0 ; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80 @@ -186,8 +185,7 @@ define i1 @snan_bf16(bfloat %x) nounwind { ; GFX9CHECK-LABEL: snan_bf16: ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fff -; GFX9CHECK-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fc0 ; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80 @@ -199,8 +197,7 @@ define i1 @snan_bf16(bfloat %x) nounwind { ; GFX10CHECK-LABEL: snan_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX10CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0 ; GFX10CHECK-NEXT: v_cmp_lt_i16_e64 s4, 0x7f80, v0 ; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo @@ -210,7 +207,6 @@ define i1 @snan_bf16(bfloat %x) nounwind { ; GFX11CHECK-LABEL: snan_bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0 ; GFX11CHECK-NEXT: v_cmp_lt_i16_e64 s0, 0x7f80, v0 @@ -234,8 +230,7 @@ define i1 @qnan_bf16(bfloat %x) nounwind { ; GFX8CHECK-LABEL: qnan_bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX8CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7fbf ; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -244,8 +239,7 @@ define i1 @qnan_bf16(bfloat %x) nounwind { ; GFX9CHECK-LABEL: qnan_bf16: ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fff -; GFX9CHECK-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fbf ; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 ; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -254,8 +248,7 @@ define i1 @qnan_bf16(bfloat %x) nounwind { ; GFX10CHECK-LABEL: qnan_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX10CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] @@ -263,7 +256,6 @@ define i1 @qnan_bf16(bfloat %x) nounwind { ; GFX11CHECK-LABEL: qnan_bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -285,7 +277,6 @@ define i1 @posinf_bf16(bfloat %x) nounwind { ; GFX8CHECK-LABEL: posinf_bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX8CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -295,22 +286,20 @@ define i1 @posinf_bf16(bfloat %x) nounwind { ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80 -; GFX9CHECK-NEXT: v_cmp_eq_u16_sdwa s[4:5], v0, s4 src0_sel:WORD_1 src1_sel:DWORD -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX10CHECK-LABEL: posinf_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0x7f80 -; GFX10CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v0, v1 src0_sel:WORD_1 src1_sel:DWORD -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX11CHECK-LABEL: posinf_bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] @@ -331,7 +320,6 @@ define i1 @neginf_bf16(bfloat %x) nounwind { ; GFX8CHECK-LABEL: neginf_bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0xff80 ; GFX8CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -341,22 +329,20 @@ define i1 @neginf_bf16(bfloat %x) nounwind { ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9CHECK-NEXT: s_movk_i32 s4, 0xff80 -; GFX9CHECK-NEXT: v_cmp_eq_u16_sdwa s[4:5], v0, s4 src0_sel:WORD_1 src1_sel:DWORD -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX10CHECK-LABEL: neginf_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0xffffff80 -; GFX10CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v0, v1 src0_sel:WORD_1 src1_sel:DWORD -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0xff80, v0 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX11CHECK-LABEL: neginf_bf16: ; GFX11CHECK: ; %bb.0: ; 
GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0xff80, v0 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] @@ -382,7 +368,6 @@ define i1 @posnormal_bf16(bfloat %x) nounwind { ; GFX8CHECK-LABEL: posnormal_bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, -1, v0 ; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: v_add_u16_e32 v0, 0xff80, v0 @@ -395,7 +380,6 @@ define i1 @posnormal_bf16(bfloat %x) nounwind { ; GFX9CHECK-LABEL: posnormal_bf16: ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, -1, v0 ; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: v_add_u16_e32 v0, 0xff80, v0 @@ -408,7 +392,6 @@ define i1 @posnormal_bf16(bfloat %x) nounwind { ; GFX10CHECK-LABEL: posnormal_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0 ; GFX10CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80 @@ -420,7 +403,6 @@ define i1 @posnormal_bf16(bfloat %x) nounwind { ; GFX11CHECK-LABEL: posnormal_bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0 ; GFX11CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80 @@ -450,7 +432,6 @@ define i1 @negnormal_bf16(bfloat %x) nounwind { ; GFX8CHECK-LABEL: negnormal_bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, 0, v0 ; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: v_add_u16_e32 v0, 0xff80, v0 @@ -463,7 +444,6 @@ define i1 @negnormal_bf16(bfloat %x) nounwind { ; GFX9CHECK-LABEL: negnormal_bf16: ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, 0, v0 ; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: v_add_u16_e32 v0, 0xff80, v0 @@ -476,7 +456,6 @@ define i1 @negnormal_bf16(bfloat %x) nounwind { ; GFX10CHECK-LABEL: negnormal_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 ; GFX10CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80 @@ -488,7 +467,6 @@ define i1 @negnormal_bf16(bfloat %x) nounwind { ; GFX11CHECK-LABEL: negnormal_bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 ; GFX11CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80 @@ -515,8 +493,7 @@ define i1 @possubnormal_bf16(bfloat %x) nounwind { ; GFX8CHECK-LABEL: possubnormal_bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v1, -1 -; GFX8CHECK-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8CHECK-NEXT: v_add_u16_e32 v0, -1, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f ; GFX8CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -525,8 +502,7 @@ define i1 @possubnormal_bf16(bfloat %x) nounwind { ; GFX9CHECK-LABEL: possubnormal_bf16: ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: v_mov_b32_e32 v1, -1 -; GFX9CHECK-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9CHECK-NEXT: v_add_u16_e32 v0, -1, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f ; GFX9CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0 ; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -535,7 +511,6 @@ define i1 @possubnormal_bf16(bfloat %x) nounwind { ; GFX10CHECK-LABEL: possubnormal_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10CHECK-NEXT: v_add_nc_u16 v0, v0, -1 ; GFX10CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f, v0 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -544,7 +519,6 @@ define i1 @possubnormal_bf16(bfloat %x) nounwind { ; GFX11CHECK-LABEL: possubnormal_bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_add_nc_u16 v0, v0, -1 ; GFX11CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f, v0 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -570,7 +544,6 @@ define i1 @negsubnormal_bf16(bfloat %x) nounwind { ; GFX8CHECK-LABEL: negsubnormal_bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, 0, v0 ; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: v_add_u16_e32 v0, -1, v0 @@ -583,7 +556,6 @@ define i1 @negsubnormal_bf16(bfloat %x) nounwind { ; GFX9CHECK-LABEL: negsubnormal_bf16: ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, 0, v0 ; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: v_add_u16_e32 v0, -1, v0 @@ -596,7 +568,6 @@ define i1 @negsubnormal_bf16(bfloat %x) nounwind { ; GFX10CHECK-LABEL: negsubnormal_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 ; GFX10CHECK-NEXT: v_add_nc_u16 v1, v1, -1 @@ -608,7 +579,6 @@ define i1 @negsubnormal_bf16(bfloat %x) nounwind { ; GFX11CHECK-LABEL: negsubnormal_bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 ; GFX11CHECK-NEXT: v_add_nc_u16 v1, v1, -1 @@ -632,7 +602,6 @@ define i1 @poszero_bf16(bfloat %x) nounwind { ; GFX8CHECK-LABEL: poszero_bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8CHECK-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8CHECK-NEXT: s_setpc_b64 s[30:31] @@ -640,23 +609,20 @@ define i1 @poszero_bf16(bfloat %x) nounwind { ; GFX9CHECK-LABEL: poszero_bf16: ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0 -; GFX9CHECK-NEXT: v_cmp_eq_u16_sdwa s[4:5], v0, v1 src0_sel:WORD_1 src1_sel:DWORD -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX10CHECK-LABEL: poszero_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0 -; GFX10CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v0, v1 src0_sel:WORD_1 src1_sel:DWORD -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX11CHECK-LABEL: poszero_bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] @@ -677,7 +643,6 @@ define i1 @negzero_bf16(bfloat %x) nounwind { ; GFX8CHECK-LABEL: negzero_bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x8000 ; GFX8CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -687,22 +652,20 @@ define i1 @negzero_bf16(bfloat %x) nounwind { ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x8000 -; GFX9CHECK-NEXT: v_cmp_eq_u16_sdwa s[4:5], v0, s4 src0_sel:WORD_1 src1_sel:DWORD -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX10CHECK-LABEL: negzero_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0xffff8000 -; GFX10CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v0, v1 src0_sel:WORD_1 src1_sel:DWORD -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX11CHECK-LABEL: negzero_bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] @@ -723,7 +686,6 @@ define i1 @posfinite_bf16(bfloat %x) nounwind { ; GFX8CHECK-LABEL: posfinite_bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX8CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -733,22 +695,20 @@ define i1 @posfinite_bf16(bfloat %x) nounwind { ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80 -; GFX9CHECK-NEXT: v_cmp_lt_u16_sdwa s[4:5], v0, s4 src0_sel:WORD_1 src1_sel:DWORD -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX10CHECK-LABEL: 
posfinite_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0x7f80 -; GFX10CHECK-NEXT: v_cmp_lt_u16_sdwa s4, v0, v1 src0_sel:WORD_1 src1_sel:DWORD -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f80, v0 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX11CHECK-LABEL: posfinite_bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f80, v0 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] @@ -772,7 +732,6 @@ define i1 @negfinite_bf16(bfloat %x) nounwind { ; GFX8CHECK-LABEL: negfinite_bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, 0, v0 ; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80 @@ -784,7 +743,6 @@ define i1 @negfinite_bf16(bfloat %x) nounwind { ; GFX9CHECK-LABEL: negfinite_bf16: ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, 0, v0 ; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80 @@ -796,7 +754,6 @@ define i1 @negfinite_bf16(bfloat %x) nounwind { ; GFX10CHECK-LABEL: negfinite_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 ; GFX10CHECK-NEXT: v_cmp_gt_i16_e64 s4, 0x7f80, v1 @@ -807,7 +764,6 @@ define i1 @negfinite_bf16(bfloat %x) nounwind { ; GFX11CHECK-LABEL: negfinite_bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 ; GFX11CHECK-NEXT: v_cmp_gt_i16_e64 s0, 0x7f80, v1 @@ -831,8 +787,7 @@ define i1 @isnan_bf16(bfloat %x) nounwind { ; GFX8CHECK-LABEL: isnan_bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX8CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -841,8 +796,7 @@ define i1 @isnan_bf16(bfloat %x) nounwind { ; GFX9CHECK-LABEL: isnan_bf16: ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fff -; GFX9CHECK-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 ; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -851,8 +805,7 @@ define i1 @isnan_bf16(bfloat %x) nounwind { ; GFX10CHECK-LABEL: isnan_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX10CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] @@ -860,7 +813,6 @@ define i1 @isnan_bf16(bfloat %x) nounwind { ; GFX11CHECK-LABEL: isnan_bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -882,8 +834,7 @@ define i1 @not_isnan_bf16(bfloat %x) { ; GFX8CHECK-LABEL: not_isnan_bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX8CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f81 ; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -892,8 +843,7 @@ define i1 @not_isnan_bf16(bfloat %x) { ; GFX9CHECK-LABEL: not_isnan_bf16: ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fff -; GFX9CHECK-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f81 ; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 ; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -902,8 +852,7 @@ define i1 @not_isnan_bf16(bfloat %x) { ; GFX10CHECK-LABEL: not_isnan_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX10CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] @@ -911,7 +860,6 @@ define i1 @not_isnan_bf16(bfloat %x) { ; GFX11CHECK-LABEL: not_isnan_bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -1172,8 +1120,7 @@ define i1 @isnan_bf16_strictfp(bfloat %x) strictfp nounwind { ; GFX8CHECK-LABEL: isnan_bf16_strictfp: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX8CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -1182,8 +1129,7 @@ define i1 @isnan_bf16_strictfp(bfloat %x) strictfp nounwind { ; GFX9CHECK-LABEL: isnan_bf16_strictfp: ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fff -; GFX9CHECK-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: 
s_movk_i32 s4, 0x7f80 ; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 ; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -1192,8 +1138,7 @@ define i1 @isnan_bf16_strictfp(bfloat %x) strictfp nounwind { ; GFX10CHECK-LABEL: isnan_bf16_strictfp: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX10CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1201,7 +1146,6 @@ define i1 @isnan_bf16_strictfp(bfloat %x) strictfp nounwind { ; GFX11CHECK-LABEL: isnan_bf16_strictfp: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -1223,8 +1167,7 @@ define i1 @isinf_bf16(bfloat %x) nounwind { ; GFX8CHECK-LABEL: isinf_bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX8CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX8CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -1233,8 +1176,7 @@ define i1 @isinf_bf16(bfloat %x) nounwind { ; GFX9CHECK-LABEL: isinf_bf16: ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fff -; GFX9CHECK-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 ; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -1243,8 +1185,7 @@ define i1 @isinf_bf16(bfloat %x) nounwind { ; GFX10CHECK-LABEL: isinf_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX10CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1252,7 +1193,6 @@ define i1 @isinf_bf16(bfloat %x) nounwind { ; GFX11CHECK-LABEL: isinf_bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -1274,8 +1214,7 @@ define i1 @isfinite_bf16(bfloat %x) nounwind { ; GFX8CHECK-LABEL: isfinite_bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX8CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, vcc @@ -1284,8 +1223,7 @@ define i1 @isfinite_bf16(bfloat %x) nounwind { ; GFX9CHECK-LABEL: isfinite_bf16: ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fff -; GFX9CHECK-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 ; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -1294,8 +1232,7 @@ define i1 @isfinite_bf16(bfloat %x) nounwind { ; GFX10CHECK-LABEL: isfinite_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX10CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v0 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1303,7 +1240,6 @@ define i1 @isfinite_bf16(bfloat %x) nounwind { ; GFX11CHECK-LABEL: isfinite_bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v0 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -1325,8 +1261,7 @@ define i1 @issubnormal_or_zero_bf16(bfloat %x) { ; GFX8CHECK-LABEL: issubnormal_or_zero_bf16: ; GFX8CHECK: ; %bb.0: ; %entry ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x7f80 -; GFX8CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7f80, v0 ; GFX8CHECK-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1334,8 +1269,7 @@ define i1 @issubnormal_or_zero_bf16(bfloat %x) { ; GFX9CHECK-LABEL: issubnormal_or_zero_bf16: ; GFX9CHECK: ; %bb.0: ; %entry ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80 -; GFX9CHECK-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7f80, v0 ; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 ; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1343,8 +1277,7 @@ define i1 @issubnormal_or_zero_bf16(bfloat %x) { ; GFX10CHECK-LABEL: issubnormal_or_zero_bf16: ; GFX10CHECK: ; %bb.0: ; %entry ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0x7f80 -; GFX10CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7f80, v0 ; GFX10CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1352,7 +1285,6 @@ define i1 @issubnormal_or_zero_bf16(bfloat %x) { ; GFX11CHECK-LABEL: issubnormal_or_zero_bf16: ; GFX11CHECK: ; %bb.0: ; %entry ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7f80, v0 ; GFX11CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -1375,8 
+1307,7 @@ define i1 @not_issubnormal_or_zero_bf16(bfloat %x) { ; GFX8CHECK-LABEL: not_issubnormal_or_zero_bf16: ; GFX8CHECK: ; %bb.0: ; %entry ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x7f80 -; GFX8CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7f80, v0 ; GFX8CHECK-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1384,8 +1315,7 @@ define i1 @not_issubnormal_or_zero_bf16(bfloat %x) { ; GFX9CHECK-LABEL: not_issubnormal_or_zero_bf16: ; GFX9CHECK: ; %bb.0: ; %entry ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80 -; GFX9CHECK-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7f80, v0 ; GFX9CHECK-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 ; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1393,8 +1323,7 @@ define i1 @not_issubnormal_or_zero_bf16(bfloat %x) { ; GFX10CHECK-LABEL: not_issubnormal_or_zero_bf16: ; GFX10CHECK: ; %bb.0: ; %entry ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0x7f80 -; GFX10CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7f80, v0 ; GFX10CHECK-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1402,7 +1331,6 @@ define i1 @not_issubnormal_or_zero_bf16(bfloat %x) { ; GFX11CHECK-LABEL: not_issubnormal_or_zero_bf16: ; GFX11CHECK: ; %bb.0: ; %entry ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7f80, v0 ; GFX11CHECK-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -1427,8 +1355,7 @@ define i1 @isnormal_bf16(bfloat %x) { ; GFX8CHECK-LABEL: isnormal_bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX8CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: v_add_u16_e32 v0, 0xff80, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f00 ; GFX8CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0 @@ -1438,8 +1365,7 @@ define i1 @isnormal_bf16(bfloat %x) { ; GFX9CHECK-LABEL: isnormal_bf16: ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fff -; GFX9CHECK-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: v_add_u16_e32 v0, 0xff80, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f00 ; GFX9CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0 @@ -1449,8 +1375,7 @@ define i1 @isnormal_bf16(bfloat %x) { ; GFX10CHECK-LABEL: isnormal_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX10CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80 ; GFX10CHECK-NEXT: 
v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -1459,7 +1384,6 @@ define i1 @isnormal_bf16(bfloat %x) { ; GFX11CHECK-LABEL: isnormal_bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80 ; GFX11CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0 @@ -1484,8 +1408,7 @@ define i1 @not_isnormal_bf16(bfloat %x) { ; GFX8CHECK-LABEL: not_isnormal_bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX8CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: v_add_u16_e32 v0, 0xff80, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7eff ; GFX8CHECK-NEXT: v_cmp_lt_u16_e32 vcc, s4, v0 @@ -1495,8 +1418,7 @@ define i1 @not_isnormal_bf16(bfloat %x) { ; GFX9CHECK-LABEL: not_isnormal_bf16: ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fff -; GFX9CHECK-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: v_add_u16_e32 v0, 0xff80, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7eff ; GFX9CHECK-NEXT: v_cmp_lt_u16_e32 vcc, s4, v0 @@ -1506,8 +1428,7 @@ define i1 @not_isnormal_bf16(bfloat %x) { ; GFX10CHECK-LABEL: not_isnormal_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX10CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80 ; GFX10CHECK-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x7eff, v0 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -1516,7 +1437,6 @@ define i1 @not_isnormal_bf16(bfloat %x) { ; GFX11CHECK-LABEL: not_isnormal_bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80 ; GFX11CHECK-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x7eff, v0 @@ -1544,7 +1464,6 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) { ; GFX8CHECK-LABEL: not_is_plus_normal_bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, 0, v0 ; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: v_add_u16_e32 v0, 0xff80, v0 @@ -1557,7 +1476,6 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) { ; GFX9CHECK-LABEL: not_is_plus_normal_bf16: ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, 0, v0 ; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: v_add_u16_e32 v0, 0xff80, v0 @@ -1570,7 +1488,6 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) { ; GFX10CHECK-LABEL: not_is_plus_normal_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 ; 
GFX10CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80 @@ -1582,7 +1499,6 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) { ; GFX11CHECK-LABEL: not_is_plus_normal_bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 ; GFX11CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80 @@ -1612,7 +1528,6 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) { ; GFX8CHECK-LABEL: not_is_neg_normal_bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, -1, v0 ; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: v_add_u16_e32 v0, 0xff80, v0 @@ -1625,7 +1540,6 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) { ; GFX9CHECK-LABEL: not_is_neg_normal_bf16: ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, -1, v0 ; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: v_add_u16_e32 v0, 0xff80, v0 @@ -1638,7 +1552,6 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) { ; GFX10CHECK-LABEL: not_is_neg_normal_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0 ; GFX10CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80 @@ -1650,7 +1563,6 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) { ; GFX11CHECK-LABEL: not_is_neg_normal_bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0 ; GFX11CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80 @@ -1676,8 +1588,7 @@ define i1 @issubnormal_bf16(bfloat %x) { ; GFX8CHECK-LABEL: issubnormal_bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX8CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: v_add_u16_e32 v0, -1, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f ; GFX8CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0 @@ -1687,8 +1598,7 @@ define i1 @issubnormal_bf16(bfloat %x) { ; GFX9CHECK-LABEL: issubnormal_bf16: ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fff -; GFX9CHECK-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: v_add_u16_e32 v0, -1, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f ; GFX9CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0 @@ -1698,8 +1608,7 @@ define i1 @issubnormal_bf16(bfloat %x) { ; GFX10CHECK-LABEL: issubnormal_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX10CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_add_nc_u16 v0, v0, -1 ; GFX10CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f, v0 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -1708,7 
+1617,6 @@ define i1 @issubnormal_bf16(bfloat %x) { ; GFX11CHECK-LABEL: issubnormal_bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11CHECK-NEXT: v_add_nc_u16 v0, v0, -1 ; GFX11CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f, v0 @@ -1732,8 +1640,7 @@ define i1 @not_issubnormal_bf16(bfloat %x) { ; GFX8CHECK-LABEL: not_issubnormal_bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX8CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: v_add_u16_e32 v0, -1, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7e ; GFX8CHECK-NEXT: v_cmp_lt_u16_e32 vcc, s4, v0 @@ -1743,8 +1650,7 @@ define i1 @not_issubnormal_bf16(bfloat %x) { ; GFX9CHECK-LABEL: not_issubnormal_bf16: ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fff -; GFX9CHECK-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: v_add_u16_e32 v0, -1, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7e ; GFX9CHECK-NEXT: v_cmp_lt_u16_e32 vcc, s4, v0 @@ -1754,8 +1660,7 @@ define i1 @not_issubnormal_bf16(bfloat %x) { ; GFX10CHECK-LABEL: not_issubnormal_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX10CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_add_nc_u16 v0, v0, -1 ; GFX10CHECK-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x7e, v0 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -1764,7 +1669,6 @@ define i1 @not_issubnormal_bf16(bfloat %x) { ; GFX11CHECK-LABEL: not_issubnormal_bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11CHECK-NEXT: v_add_nc_u16 v0, v0, -1 ; GFX11CHECK-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x7e, v0 @@ -1786,8 +1690,7 @@ define i1 @iszero_bf16(bfloat %x) { ; GFX8CHECK-LABEL: iszero_bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX8CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1795,8 +1698,7 @@ define i1 @iszero_bf16(bfloat %x) { ; GFX9CHECK-LABEL: iszero_bf16: ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fff -; GFX9CHECK-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 ; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1804,8 +1706,7 @@ define i1 @iszero_bf16(bfloat %x) { ; GFX10CHECK-LABEL: iszero_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: 
v_mov_b32_e32 v1, 0x7fff -; GFX10CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1813,7 +1714,6 @@ define i1 @iszero_bf16(bfloat %x) { ; GFX11CHECK-LABEL: iszero_bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -1834,8 +1734,7 @@ define i1 @not_iszero_bf16(bfloat %x) { ; GFX8CHECK-LABEL: not_iszero_bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX8CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1843,8 +1742,7 @@ define i1 @not_iszero_bf16(bfloat %x) { ; GFX9CHECK-LABEL: not_iszero_bf16: ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fff -; GFX9CHECK-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 ; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1852,8 +1750,7 @@ define i1 @not_iszero_bf16(bfloat %x) { ; GFX10CHECK-LABEL: not_iszero_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX10CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1861,7 +1758,6 @@ define i1 @not_iszero_bf16(bfloat %x) { ; GFX11CHECK-LABEL: not_iszero_bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -1883,7 +1779,6 @@ define i1 @ispositive_bf16(bfloat %x) { ; GFX8CHECK-LABEL: ispositive_bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f81 ; GFX8CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -1893,22 +1788,20 @@ define i1 @ispositive_bf16(bfloat %x) { ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f81 -; GFX9CHECK-NEXT: v_cmp_lt_u16_sdwa s[4:5], v0, s4 src0_sel:WORD_1 src1_sel:DWORD -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX10CHECK-LABEL: ispositive_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0x7f81 -; GFX10CHECK-NEXT: v_cmp_lt_u16_sdwa s4, v0, v1 src0_sel:WORD_1 src1_sel:DWORD -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f81, v0 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX11CHECK-LABEL: ispositive_bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f81, v0 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1938,7 +1831,6 @@ define i1 @not_ispositive_bf16(bfloat %x) { ; GFX8CHECK-LABEL: not_ispositive_bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s6, 0x7f80 ; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, 0, v0 @@ -1955,7 +1847,6 @@ define i1 @not_ispositive_bf16(bfloat %x) { ; GFX9CHECK-LABEL: not_ispositive_bf16: ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s6, 0x7f80 ; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, 0, v0 @@ -1972,7 +1863,6 @@ define i1 @not_ispositive_bf16(bfloat %x) { ; GFX10CHECK-LABEL: not_ispositive_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 ; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s5, 0xff80, v0 @@ -1987,7 +1877,6 @@ define i1 @not_ispositive_bf16(bfloat %x) { ; GFX11CHECK-LABEL: not_ispositive_bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 ; GFX11CHECK-NEXT: v_cmp_eq_u16_e64 s1, 0xff80, v0 @@ -2022,7 +1911,6 @@ define i1 @isnegative_bf16(bfloat %x) { ; GFX8CHECK-LABEL: isnegative_bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, 0, v0 @@ -2037,7 +1925,6 @@ define i1 @isnegative_bf16(bfloat %x) { ; GFX9CHECK-LABEL: isnegative_bf16: ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, 0, v0 @@ -2052,7 +1939,6 @@ define i1 @isnegative_bf16(bfloat %x) { ; GFX10CHECK-LABEL: isnegative_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 ; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s5, 0xff80, v0 @@ -2065,7 +1951,6 @@ define i1 @isnegative_bf16(bfloat %x) { ; GFX11CHECK-LABEL: isnegative_bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 
v1, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 ; GFX11CHECK-NEXT: v_cmp_eq_u16_e64 s1, 0xff80, v0 @@ -2095,7 +1980,6 @@ define i1 @not_isnegative_bf16(bfloat %x) { ; GFX8CHECK-LABEL: not_isnegative_bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f81 ; GFX8CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 @@ -2108,7 +1992,6 @@ define i1 @not_isnegative_bf16(bfloat %x) { ; GFX9CHECK-LABEL: not_isnegative_bf16: ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f81 ; GFX9CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0 ; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 @@ -2121,7 +2004,6 @@ define i1 @not_isnegative_bf16(bfloat %x) { ; GFX10CHECK-LABEL: not_isnegative_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f81, v0 ; GFX10CHECK-NEXT: v_cmp_lt_i16_e64 s4, 0x7f80, v1 @@ -2132,7 +2014,6 @@ define i1 @not_isnegative_bf16(bfloat %x) { ; GFX11CHECK-LABEL: not_isnegative_bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f81, v0 ; GFX11CHECK-NEXT: v_cmp_lt_i16_e64 s0, 0x7f80, v1 @@ -2158,8 +2039,7 @@ define i1 @iszero_or_nan_bf16(bfloat %x) { ; GFX8CHECK-LABEL: iszero_or_nan_bf16: ; GFX8CHECK: ; %bb.0: ; %entry ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX8CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], 0, v0 @@ -2170,8 +2050,7 @@ define i1 @iszero_or_nan_bf16(bfloat %x) { ; GFX9CHECK-LABEL: iszero_or_nan_bf16: ; GFX9CHECK: ; %bb.0: ; %entry ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fff -; GFX9CHECK-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 ; GFX9CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], 0, v0 @@ -2182,8 +2061,7 @@ define i1 @iszero_or_nan_bf16(bfloat %x) { ; GFX10CHECK-LABEL: iszero_or_nan_bf16: ; GFX10CHECK: ; %bb.0: ; %entry ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX10CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 ; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s4, 0, v0 ; GFX10CHECK-NEXT: s_or_b32 s4, s4, vcc_lo @@ -2193,7 +2071,6 @@ define i1 @iszero_or_nan_bf16(bfloat %x) { ; GFX11CHECK-LABEL: iszero_or_nan_bf16: ; GFX11CHECK: ; %bb.0: ; %entry ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; 
GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 ; GFX11CHECK-NEXT: v_cmp_eq_u16_e64 s0, 0, v0 @@ -2220,8 +2097,7 @@ define i1 @iszero_or_nan_f_daz(bfloat %x) #0 { ; GFX8CHECK-LABEL: iszero_or_nan_f_daz: ; GFX8CHECK: ; %bb.0: ; %entry ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX8CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], 0, v0 @@ -2232,8 +2108,7 @@ define i1 @iszero_or_nan_f_daz(bfloat %x) #0 { ; GFX9CHECK-LABEL: iszero_or_nan_f_daz: ; GFX9CHECK: ; %bb.0: ; %entry ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fff -; GFX9CHECK-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 ; GFX9CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], 0, v0 @@ -2244,8 +2119,7 @@ define i1 @iszero_or_nan_f_daz(bfloat %x) #0 { ; GFX10CHECK-LABEL: iszero_or_nan_f_daz: ; GFX10CHECK: ; %bb.0: ; %entry ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX10CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 ; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s4, 0, v0 ; GFX10CHECK-NEXT: s_or_b32 s4, s4, vcc_lo @@ -2255,7 +2129,6 @@ define i1 @iszero_or_nan_f_daz(bfloat %x) #0 { ; GFX11CHECK-LABEL: iszero_or_nan_f_daz: ; GFX11CHECK: ; %bb.0: ; %entry ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 ; GFX11CHECK-NEXT: v_cmp_eq_u16_e64 s0, 0, v0 @@ -2282,8 +2155,7 @@ define i1 @iszero_or_nan_f_maybe_daz(bfloat %x) #1 { ; GFX8CHECK-LABEL: iszero_or_nan_f_maybe_daz: ; GFX8CHECK: ; %bb.0: ; %entry ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX8CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], 0, v0 @@ -2294,8 +2166,7 @@ define i1 @iszero_or_nan_f_maybe_daz(bfloat %x) #1 { ; GFX9CHECK-LABEL: iszero_or_nan_f_maybe_daz: ; GFX9CHECK: ; %bb.0: ; %entry ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fff -; GFX9CHECK-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 ; GFX9CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], 0, v0 @@ -2306,8 +2177,7 @@ define i1 @iszero_or_nan_f_maybe_daz(bfloat %x) #1 { ; GFX10CHECK-LABEL: iszero_or_nan_f_maybe_daz: ; GFX10CHECK: ; %bb.0: ; %entry ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX10CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 ; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s4, 0, v0 ; GFX10CHECK-NEXT: s_or_b32 s4, s4, vcc_lo @@ -2317,7 +2187,6 @@ define i1 @iszero_or_nan_f_maybe_daz(bfloat %x) #1 { ; GFX11CHECK-LABEL: iszero_or_nan_f_maybe_daz: ; GFX11CHECK: ; %bb.0: ; %entry ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 ; GFX11CHECK-NEXT: v_cmp_eq_u16_e64 s0, 0, v0 @@ -2344,8 +2213,7 @@ define i1 @not_iszero_or_nan_bf16(bfloat %x) { ; GFX8CHECK-LABEL: not_iszero_or_nan_bf16: ; GFX8CHECK: ; %bb.0: ; %entry ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX8CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f81 ; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], 0, v0 @@ -2356,8 +2224,7 @@ define i1 @not_iszero_or_nan_bf16(bfloat %x) { ; GFX9CHECK-LABEL: not_iszero_or_nan_bf16: ; GFX9CHECK: ; %bb.0: ; %entry ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fff -; GFX9CHECK-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f81 ; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 ; GFX9CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], 0, v0 @@ -2368,8 +2235,7 @@ define i1 @not_iszero_or_nan_bf16(bfloat %x) { ; GFX10CHECK-LABEL: not_iszero_or_nan_bf16: ; GFX10CHECK: ; %bb.0: ; %entry ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX10CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0 ; GFX10CHECK-NEXT: v_cmp_ne_u16_e64 s4, 0, v0 ; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo @@ -2379,7 +2245,6 @@ define i1 @not_iszero_or_nan_bf16(bfloat %x) { ; GFX11CHECK-LABEL: not_iszero_or_nan_bf16: ; GFX11CHECK: ; %bb.0: ; %entry ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0 ; GFX11CHECK-NEXT: v_cmp_ne_u16_e64 s0, 0, v0 @@ -2406,8 +2271,7 @@ define i1 @not_iszero_or_nan_f_daz(bfloat %x) #0 { ; GFX8CHECK-LABEL: not_iszero_or_nan_f_daz: ; GFX8CHECK: ; %bb.0: ; %entry ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX8CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f81 ; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], 0, v0 @@ -2418,8 +2282,7 @@ define i1 @not_iszero_or_nan_f_daz(bfloat %x) #0 { ; GFX9CHECK-LABEL: not_iszero_or_nan_f_daz: ; GFX9CHECK: ; %bb.0: ; %entry ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fff -; GFX9CHECK-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f81 ; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 ; GFX9CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], 0, v0 @@ -2430,8 +2293,7 @@ define i1 @not_iszero_or_nan_f_daz(bfloat %x) #0 { ; GFX10CHECK-LABEL: not_iszero_or_nan_f_daz: ; GFX10CHECK: ; %bb.0: ; %entry ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX10CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0 ; GFX10CHECK-NEXT: v_cmp_ne_u16_e64 s4, 0, v0 ; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo @@ -2441,7 +2303,6 @@ define i1 @not_iszero_or_nan_f_daz(bfloat %x) #0 { ; GFX11CHECK-LABEL: not_iszero_or_nan_f_daz: ; GFX11CHECK: ; %bb.0: ; %entry ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0 ; GFX11CHECK-NEXT: v_cmp_ne_u16_e64 s0, 0, v0 @@ -2468,8 +2329,7 @@ define i1 @not_iszero_or_nan_f_maybe_daz(bfloat %x) #1 { ; GFX8CHECK-LABEL: not_iszero_or_nan_f_maybe_daz: ; GFX8CHECK: ; %bb.0: ; %entry ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX8CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f81 ; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], 0, v0 @@ -2480,8 +2340,7 @@ define i1 @not_iszero_or_nan_f_maybe_daz(bfloat %x) #1 { ; GFX9CHECK-LABEL: not_iszero_or_nan_f_maybe_daz: ; GFX9CHECK: ; %bb.0: ; %entry ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fff -; GFX9CHECK-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f81 ; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 ; GFX9CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], 0, v0 @@ -2492,8 +2351,7 @@ define i1 @not_iszero_or_nan_f_maybe_daz(bfloat %x) #1 { ; GFX10CHECK-LABEL: not_iszero_or_nan_f_maybe_daz: ; GFX10CHECK: ; %bb.0: ; %entry ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX10CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0 ; GFX10CHECK-NEXT: v_cmp_ne_u16_e64 s4, 0, v0 ; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo @@ -2503,7 +2361,6 @@ define i1 @not_iszero_or_nan_f_maybe_daz(bfloat %x) #1 { ; GFX11CHECK-LABEL: not_iszero_or_nan_f_maybe_daz: ; GFX11CHECK: ; %bb.0: ; %entry ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0 ; GFX11CHECK-NEXT: v_cmp_ne_u16_e64 s0, 0, v0 @@ -2530,8 +2387,7 @@ define i1 @iszero_or_qnan_bf16(bfloat %x) { ; GFX8CHECK-LABEL: iszero_or_qnan_bf16: ; GFX8CHECK: ; %bb.0: ; %entry ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; 
GFX8CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7fbf ; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], 0, v0 @@ -2542,8 +2398,7 @@ define i1 @iszero_or_qnan_bf16(bfloat %x) { ; GFX9CHECK-LABEL: iszero_or_qnan_bf16: ; GFX9CHECK: ; %bb.0: ; %entry ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fff -; GFX9CHECK-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fbf ; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 ; GFX9CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], 0, v0 @@ -2554,8 +2409,7 @@ define i1 @iszero_or_qnan_bf16(bfloat %x) { ; GFX10CHECK-LABEL: iszero_or_qnan_bf16: ; GFX10CHECK: ; %bb.0: ; %entry ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX10CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0 ; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s4, 0, v0 ; GFX10CHECK-NEXT: s_or_b32 s4, s4, vcc_lo @@ -2565,7 +2419,6 @@ define i1 @iszero_or_qnan_bf16(bfloat %x) { ; GFX11CHECK-LABEL: iszero_or_qnan_bf16: ; GFX11CHECK: ; %bb.0: ; %entry ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0 ; GFX11CHECK-NEXT: v_cmp_eq_u16_e64 s0, 0, v0 @@ -2595,8 +2448,7 @@ define i1 @iszero_or_snan_bf16(bfloat %x) { ; GFX8CHECK-LABEL: iszero_or_snan_bf16: ; GFX8CHECK: ; %bb.0: ; %entry ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX8CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7fc0 ; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80 @@ -2610,8 +2462,7 @@ define i1 @iszero_or_snan_bf16(bfloat %x) { ; GFX9CHECK-LABEL: iszero_or_snan_bf16: ; GFX9CHECK: ; %bb.0: ; %entry ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fff -; GFX9CHECK-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fc0 ; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80 @@ -2625,8 +2476,7 @@ define i1 @iszero_or_snan_bf16(bfloat %x) { ; GFX10CHECK-LABEL: iszero_or_snan_bf16: ; GFX10CHECK: ; %bb.0: ; %entry ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX10CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0 ; GFX10CHECK-NEXT: v_cmp_lt_i16_e64 s4, 0x7f80, v0 ; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s5, 0, v0 @@ -2638,7 +2488,6 @@ define i1 @iszero_or_snan_bf16(bfloat %x) { ; GFX11CHECK-LABEL: iszero_or_snan_bf16: ; GFX11CHECK: ; %bb.0: ; %entry ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0 ; GFX11CHECK-NEXT: v_cmp_lt_i16_e64 s0, 0x7f80, v0 @@ -2679,8 +2528,7 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) { ; GFX8CHECK-LABEL: not_iszero_or_qnan_bf16: ; GFX8CHECK: ; %bb.0: ; %entry ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX8CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7fc0 ; GFX8CHECK-NEXT: s_movk_i32 s8, 0x7f80 ; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 @@ -2702,8 +2550,7 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) { ; GFX9CHECK-LABEL: not_iszero_or_qnan_bf16: ; GFX9CHECK: ; %bb.0: ; %entry ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fff -; GFX9CHECK-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fc0 ; GFX9CHECK-NEXT: s_movk_i32 s8, 0x7f80 ; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 @@ -2725,8 +2572,7 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) { ; GFX10CHECK-LABEL: not_iszero_or_qnan_bf16: ; GFX10CHECK: ; %bb.0: ; %entry ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX10CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_add_nc_u16 v1, v0, -1 ; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0 ; GFX10CHECK-NEXT: v_cmp_lt_i16_e64 s4, 0x7f80, v0 @@ -2744,7 +2590,6 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) { ; GFX11CHECK-LABEL: not_iszero_or_qnan_bf16: ; GFX11CHECK: ; %bb.0: ; %entry ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11CHECK-NEXT: v_add_nc_u16 v1, v0, -1 ; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0 @@ -2789,8 +2634,7 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) { ; GFX8CHECK-LABEL: not_iszero_or_snan_bf16: ; GFX8CHECK: ; %bb.0: ; %entry ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX8CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX8CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: v_add_u16_e32 v1, -1, v0 @@ -2810,8 +2654,7 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) { ; GFX9CHECK-LABEL: not_iszero_or_snan_bf16: ; GFX9CHECK: ; %bb.0: ; %entry ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fff -; GFX9CHECK-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 ; GFX9CHECK-NEXT: v_add_u16_e32 v1, -1, v0 @@ -2831,8 +2674,7 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) { ; GFX10CHECK-LABEL: not_iszero_or_snan_bf16: ; GFX10CHECK: ; %bb.0: ; %entry ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: 
v_mov_b32_e32 v1, 0x7fff -; GFX10CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_add_nc_u16 v1, v0, -1 ; GFX10CHECK-NEXT: v_add_nc_u16 v2, v0, 0xff80 ; GFX10CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0 @@ -2848,7 +2690,6 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) { ; GFX11CHECK-LABEL: not_iszero_or_snan_bf16: ; GFX11CHECK: ; %bb.0: ; %entry ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11CHECK-NEXT: v_add_nc_u16 v1, v0, -1 ; GFX11CHECK-NEXT: v_add_nc_u16 v2, v0, 0xff80 @@ -2879,8 +2720,7 @@ define i1 @isinf_or_nan_bf16(bfloat %x) { ; GFX8CHECK-LABEL: isinf_or_nan_bf16: ; GFX8CHECK: ; %bb.0: ; %entry ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX8CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f7f ; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -2889,8 +2729,7 @@ define i1 @isinf_or_nan_bf16(bfloat %x) { ; GFX9CHECK-LABEL: isinf_or_nan_bf16: ; GFX9CHECK: ; %bb.0: ; %entry ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fff -; GFX9CHECK-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f7f ; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 ; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -2899,8 +2738,7 @@ define i1 @isinf_or_nan_bf16(bfloat %x) { ; GFX10CHECK-LABEL: isinf_or_nan_bf16: ; GFX10CHECK: ; %bb.0: ; %entry ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX10CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f7f, v0 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] @@ -2908,7 +2746,6 @@ define i1 @isinf_or_nan_bf16(bfloat %x) { ; GFX11CHECK-LABEL: isinf_or_nan_bf16: ; GFX11CHECK: ; %bb.0: ; %entry ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f7f, v0 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -2931,8 +2768,7 @@ define i1 @not_isinf_or_nan_bf16(bfloat %x) { ; GFX8CHECK-LABEL: not_isinf_or_nan_bf16: ; GFX8CHECK: ; %bb.0: ; %entry ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX8CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -2941,8 +2777,7 @@ define i1 @not_isinf_or_nan_bf16(bfloat %x) { ; GFX9CHECK-LABEL: not_isinf_or_nan_bf16: ; GFX9CHECK: ; %bb.0: ; %entry ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fff -; GFX9CHECK-NEXT: v_and_b32_sdwa v0, 
v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 ; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -2951,8 +2786,7 @@ define i1 @not_isinf_or_nan_bf16(bfloat %x) { ; GFX10CHECK-LABEL: not_isinf_or_nan_bf16: ; GFX10CHECK: ; %bb.0: ; %entry ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX10CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v0 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] @@ -2960,7 +2794,6 @@ define i1 @not_isinf_or_nan_bf16(bfloat %x) { ; GFX11CHECK-LABEL: not_isinf_or_nan_bf16: ; GFX11CHECK: ; %bb.0: ; %entry ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v0 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -2983,8 +2816,7 @@ define i1 @isfinite_or_nan_f(bfloat %x) { ; GFX8CHECK-LABEL: isfinite_or_nan_f: ; GFX8CHECK: ; %bb.0: ; %entry ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX8CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX8CHECK-NEXT: v_cmp_ne_u16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -2993,8 +2825,7 @@ define i1 @isfinite_or_nan_f(bfloat %x) { ; GFX9CHECK-LABEL: isfinite_or_nan_f: ; GFX9CHECK: ; %bb.0: ; %entry ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fff -; GFX9CHECK-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX9CHECK-NEXT: v_cmp_ne_u16_e32 vcc, s4, v0 ; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -3003,8 +2834,7 @@ define i1 @isfinite_or_nan_f(bfloat %x) { ; GFX10CHECK-LABEL: isfinite_or_nan_f: ; GFX10CHECK: ; %bb.0: ; %entry ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX10CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0x7f80, v0 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] @@ -3012,7 +2842,6 @@ define i1 @isfinite_or_nan_f(bfloat %x) { ; GFX11CHECK-LABEL: isfinite_or_nan_f: ; GFX11CHECK: ; %bb.0: ; %entry ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0x7f80, v0 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -3035,8 +2864,7 @@ define i1 @not_isfinite_or_nan_f(bfloat %x) { ; GFX8CHECK-LABEL: not_isfinite_or_nan_f: ; GFX8CHECK: ; %bb.0: ; %entry ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX8CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX8CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -3045,8 +2873,7 @@ define i1 @not_isfinite_or_nan_f(bfloat %x) { ; GFX9CHECK-LABEL: not_isfinite_or_nan_f: ; GFX9CHECK: ; %bb.0: ; %entry ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fff -; GFX9CHECK-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 ; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -3055,8 +2882,7 @@ define i1 @not_isfinite_or_nan_f(bfloat %x) { ; GFX10CHECK-LABEL: not_isfinite_or_nan_f: ; GFX10CHECK: ; %bb.0: ; %entry ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX10CHECK-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] @@ -3064,7 +2890,6 @@ define i1 @not_isfinite_or_nan_f(bfloat %x) { ; GFX11CHECK-LABEL: not_isfinite_or_nan_f: ; GFX11CHECK: ; %bb.0: ; %entry ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -3090,4 +2915,3 @@ attributes #1 = { "denormal-fp-math"="ieee,dynamic" } ; GFX7SELDAG: {{.*}} ; GFX8SELDAG: {{.*}} ; GFX9SELDAG: {{.*}} -; GFX7GLISEL: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index a0b2d3b32b795..1d5d39c6c0ba9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -6234,6 +6234,7 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) { ; VI-LABEL: v_log_f32_from_fpext_bf16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: s_mov_b32 s4, 0x800000 ; VI-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 @@ -6260,6 +6261,7 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) { ; GFX900-LABEL: v_log_f32_from_fpext_bf16: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX900-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 @@ -6283,22 +6285,23 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) { ; GFX1100-LABEL: v_log_f32_from_fpext_bf16: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 ; GFX1100-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-NEXT: 
s_waitcnt_depctr 0xfff ; GFX1100-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 ; GFX1100-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index 5ba72612321a6..dc78886a8c108 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -6234,6 +6234,7 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) { ; VI-LABEL: v_log10_f32_from_fpext_bf16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: s_mov_b32 s4, 0x800000 ; VI-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 @@ -6260,6 +6261,7 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) { ; GFX900-LABEL: v_log10_f32_from_fpext_bf16: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX900-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 @@ -6283,22 +6285,23 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) { ; GFX1100-LABEL: v_log10_f32_from_fpext_bf16: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 ; GFX1100-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX1100-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 ; GFX1100-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index 6ccef4c02ab3b..694acb42e6b03 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -2727,28 +2727,60 @@ 
define float @v_log2_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) { } define float @v_log2_f32_from_fpext_bf16(bfloat %src) { -; GFX689-LABEL: v_log2_f32_from_fpext_bf16: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: s_mov_b32 s4, 0x800000 -; GFX689-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX689-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX689-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log2_f32_from_fpext_bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, 0x800000 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; SI-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_log2_f32_from_fpext_bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: s_mov_b32 s4, 0x800000 +; VI-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; VI-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-NEXT: v_log_f32_e32 v0, v0 +; VI-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_log2_f32_from_fpext_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x800000 +; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX900-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_log_f32_e32 v0, v0 +; GFX900-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: v_log2_f32_from_fpext_bf16: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1100-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 ; GFX1100-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo ; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -3973,5 +4005,3 @@ declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #2 attributes #0 = { "denormal-fp-math-f32"="ieee,preserve-sign" } attributes #1 = { "denormal-fp-math-f32"="dynamic,dynamic" } attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; SI: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll index 5296ef1f88678..7367385351b99 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll @@ -1422,7 +1422,7 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; VI-NEXT: s_cbranch_execnz .LBB10_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshrrev_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: lds_atomic_fadd_ret_bf16: @@ -1452,7 +1452,7 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX9-NEXT: s_cbranch_execnz .LBB10_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshrrev_b32_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v0, v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: lds_atomic_fadd_ret_bf16: diff --git a/llvm/test/CodeGen/AMDGPU/select-undef.ll b/llvm/test/CodeGen/AMDGPU/select-undef.ll index 5b9866a3c9157..8ca33dce73f88 100644 --- a/llvm/test/CodeGen/AMDGPU/select-undef.ll +++ b/llvm/test/CodeGen/AMDGPU/select-undef.ll @@ -303,8 +303,9 @@ ret: ret void } +; FIXME: This shouldn't have the 0 initialization ; GCN-LABEL: {{^}}undef_v3bf16: -; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} +; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}} ; GCN: s_cbranch_vccnz define amdgpu_kernel void @undef_v3bf16(ptr addrspace(3) %ptr, i1 %cond) { diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll index 844aa57de05ce..dfe98bbbaddf9 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -2677,10 +2677,7 @@ define <4 x bfloat> @shuffle_v4bf16_23uu(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_23uu: @@ -2688,8 +2685,6 @@ define <4 x bfloat> @shuffle_v4bf16_23uu(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_23uu: @@ -2697,9 +2692,6 @@ define <4 x bfloat> @shuffle_v4bf16_23uu(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 @@ -2713,13 +2705,10 @@ define <4 x bfloat> @shuffle_v4bf16_234u(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dword v5, v[2:3], off -; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX9-NEXT: v_and_or_b32 v0, v4, s4, v0 -; GFX9-NEXT: v_and_or_b32 v1, v5, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_234u: @@ -2728,11 +2717,9 @@ define <4 x bfloat> @shuffle_v4bf16_234u(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dword v5, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v4, v0 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_234u: @@ -2740,13 +2727,7 @@ define <4 x bfloat> @shuffle_v4bf16_234u(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 -; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 @@ -2786,12 +2767,8 @@ define <4 x bfloat> @shuffle_v4bf16_u3u1(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX9-NEXT: v_and_or_b32 v0, v2, s4, v0 -; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v3 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_u3u1: @@ -2799,10 +2776,7 @@ define <4 x bfloat> @shuffle_v4bf16_u3u1(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_u3u1: @@ -2810,11 +2784,7 @@ define <4 x bfloat> @shuffle_v4bf16_u3u1(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 -; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX11-NEXT: 
v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 @@ -2827,10 +2797,7 @@ define <4 x bfloat> @shuffle_v4bf16_u3uu(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_u3uu: @@ -2838,8 +2805,6 @@ define <4 x bfloat> @shuffle_v4bf16_u3uu(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_u3uu: @@ -2847,9 +2812,6 @@ define <4 x bfloat> @shuffle_v4bf16_u3uu(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 @@ -2864,7 +2826,7 @@ define <4 x bfloat> @shuffle_v4bf16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX9-NEXT: v_alignbit_b32 v0, s4, v5, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2875,7 +2837,7 @@ define <4 x bfloat> @shuffle_v4bf16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX10-NEXT: v_alignbit_b32 v0, s4, v5, 16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -2886,7 +2848,7 @@ define <4 x bfloat> @shuffle_v4bf16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, 16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 @@ -2902,7 +2864,7 @@ define <4 x bfloat> @shuffle_v4bf16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX9-NEXT: v_alignbit_b32 v0, s4, v5, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2913,7 +2875,7 @@ define <4 x bfloat> @shuffle_v4bf16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX10-NEXT: s_waitcnt 
vmcnt(1) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX10-NEXT: v_alignbit_b32 v0, s4, v5, 16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -2924,7 +2886,7 @@ define <4 x bfloat> @shuffle_v4bf16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, 16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 @@ -2937,29 +2899,22 @@ define <4 x bfloat> @shuffle_v4bf16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-LABEL: shuffle_v4bf16_35u5: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dword v5, v[2:3], off -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: s_mov_b32 s5, 0xffff +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v0, v4, v5, s4 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX9-NEXT: v_and_or_b32 v1, v5, s5, v1 -; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_35u5: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x3020706 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v5, v2 +; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_35u5: @@ -2968,13 +2923,7 @@ define <4 x bfloat> @shuffle_v4bf16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020706 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 @@ -2988,13 +2937,10 @@ define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: s_mov_b32 s5, 0xffff +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v6, v4, s4 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; 
GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v2 +; GFX9-NEXT: v_perm_b32 v0, v4, v6, s4 +; GFX9-NEXT: v_alignbit_b32 v1, s4, v5, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_357u: @@ -3003,10 +2949,8 @@ define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v6, v4, 0x3020706 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX10-NEXT: v_perm_b32 v0, v4, v6, 0x7060302 +; GFX10-NEXT: v_alignbit_b32 v1, s4, v5, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_357u: @@ -3015,11 +2959,8 @@ define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:4 ; GFX11-NEXT: global_load_b64 v[0:1], v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_perm_b32 v0, v4, v0, 0x3020706 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX11-NEXT: v_alignbit_b32 v1, s0, v1, 16 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 @@ -3032,10 +2973,7 @@ define <4 x bfloat> @shuffle_v4bf16_0101(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3044,8 +2982,6 @@ define <4 x bfloat> @shuffle_v4bf16_0101(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3054,9 +2990,6 @@ define <4 x bfloat> @shuffle_v4bf16_0101(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 @@ -3098,13 +3031,10 @@ define <4 x bfloat> @shuffle_v4bf16_0145(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: global_load_dword v5, v[2:3], off -; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX9-NEXT: v_and_or_b32 v0, v4, s4, v0 -; GFX9-NEXT: v_and_or_b32 v1, v5, s4, v1 +; GFX9-NEXT: 
v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_0145: @@ -3113,11 +3043,9 @@ define <4 x bfloat> @shuffle_v4bf16_0145(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v5, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v4, v0 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_0145: @@ -3125,13 +3053,7 @@ define <4 x bfloat> @shuffle_v4bf16_0145(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: global_load_b32 v1, v[2:3], off -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 -; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 @@ -3143,26 +3065,23 @@ define <4 x bfloat> @shuffle_v4bf16_0167(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-LABEL: shuffle_v4bf16_0167: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v5, v[0:1], off -; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 -; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; GFX9-NEXT: v_and_or_b32 v0, v5, s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_0167: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v5, v[0:1], off -; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_0167: @@ -3170,10 +3089,6 @@ define <4 x bfloat> @shuffle_v4bf16_0167(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 @@ -3187,12 +3102,8 @@ define <4 x bfloat> @shuffle_v4bf16_2301(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX9-NEXT: v_and_or_b32 v0, v2, s4, v0 -; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v3 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_2301: @@ -3200,10 +3111,7 @@ define <4 x bfloat> @shuffle_v4bf16_2301(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_2301: @@ -3211,11 +3119,7 @@ define <4 x bfloat> @shuffle_v4bf16_2301(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 -; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 @@ -3228,10 +3132,7 @@ define <4 x bfloat> @shuffle_v4bf16_2323(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3240,8 +3141,6 @@ define <4 x bfloat> @shuffle_v4bf16_2323(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3250,9 +3149,6 @@ define <4 x bfloat> @shuffle_v4bf16_2323(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 @@ -3267,13 +3163,10 @@ define <4 x bfloat> @shuffle_v4bf16_2345(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dword v5, v[2:3], off -; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 
-; GFX9-NEXT: v_and_or_b32 v0, v4, s4, v0 -; GFX9-NEXT: v_and_or_b32 v1, v5, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_2345: @@ -3282,11 +3175,9 @@ define <4 x bfloat> @shuffle_v4bf16_2345(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dword v5, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v4, v0 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_2345: @@ -3294,13 +3185,7 @@ define <4 x bfloat> @shuffle_v4bf16_2345(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 -; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 @@ -3312,26 +3197,23 @@ define <4 x bfloat> @shuffle_v4bf16_2367(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-LABEL: shuffle_v4bf16_2367: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 -; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; GFX9-NEXT: v_and_or_b32 v0, v5, s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_2367: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 -; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_2367: @@ -3339,10 +3221,6 @@ define <4 x bfloat> @shuffle_v4bf16_2367(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 @@ 
-3357,13 +3235,10 @@ define <4 x bfloat> @shuffle_v4bf16_4501(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[2:3], off ; GFX9-NEXT: global_load_dword v5, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX9-NEXT: v_and_or_b32 v0, v4, s4, v0 -; GFX9-NEXT: v_and_or_b32 v1, v5, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_4501: @@ -3372,11 +3247,9 @@ define <4 x bfloat> @shuffle_v4bf16_4501(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v4, v0 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_4501: @@ -3385,12 +3258,8 @@ define <4 x bfloat> @shuffle_v4bf16_4501(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: global_load_b32 v2, v[2:3], off ; GFX11-NEXT: global_load_b32 v1, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 -; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 @@ -3402,26 +3271,23 @@ define <4 x bfloat> @shuffle_v4bf16_4523(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-LABEL: shuffle_v4bf16_4523: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v5, v[2:3], off -; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; GFX9-NEXT: v_and_or_b32 v0, v5, s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_4523: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v5, v[2:3], off -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX10-NEXT: global_load_dword v4, v[2:3], off +; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_4523: @@ -3430,9 +3296,7 @@ define <4 x bfloat> @shuffle_v4bf16_4523(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: global_load_b32 v2, v[2:3], off ; GFX11-NEXT: 
global_load_b32 v1, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 @@ -3446,10 +3310,7 @@ define <4 x bfloat> @shuffle_v4bf16_4545(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[2:3], off -; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3458,8 +3319,6 @@ define <4 x bfloat> @shuffle_v4bf16_4545(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3468,9 +3327,6 @@ define <4 x bfloat> @shuffle_v4bf16_4545(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 @@ -3512,13 +3368,10 @@ define <4 x bfloat> @shuffle_v4bf16_6701(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX9-NEXT: global_load_dword v5, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX9-NEXT: v_and_or_b32 v0, v4, s4, v0 -; GFX9-NEXT: v_and_or_b32 v1, v5, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_6701: @@ -3527,11 +3380,9 @@ define <4 x bfloat> @shuffle_v4bf16_6701(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v4, v0 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_6701: @@ -3540,12 +3391,8 @@ define <4 x bfloat> @shuffle_v4bf16_6701(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4 ; GFX11-NEXT: global_load_b32 v1, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: 
v_and_or_b32 v0, 0xffff, v2, v0 -; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 @@ -3557,26 +3404,23 @@ define <4 x bfloat> @shuffle_v4bf16_6723(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-LABEL: shuffle_v4bf16_6723: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 -; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; GFX9-NEXT: v_and_or_b32 v0, v5, s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_6723: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_6723: @@ -3585,9 +3429,7 @@ define <4 x bfloat> @shuffle_v4bf16_6723(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4 ; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 @@ -3601,12 +3443,8 @@ define <4 x bfloat> @shuffle_v4bf16_6745(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off -; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX9-NEXT: v_and_or_b32 v0, v2, s4, v0 -; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v3 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_6745: @@ -3614,10 +3452,7 @@ define <4 x bfloat> @shuffle_v4bf16_6745(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[1:2], v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_6745: @@ -3625,11 +3460,7 @@ define <4 x bfloat> @shuffle_v4bf16_6745(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off ; GFX11-NEXT: 
s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 -; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 @@ -3642,10 +3473,7 @@ define <4 x bfloat> @shuffle_v4bf16_6767(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[2:3], off offset:4 -; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3654,8 +3482,6 @@ define <4 x bfloat> @shuffle_v4bf16_6767(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v[2:3], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3664,9 +3490,6 @@ define <4 x bfloat> @shuffle_v4bf16_6767(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v[2:3], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 @@ -3679,43 +3502,33 @@ define <4 x bfloat> @shuffle_v4bf16_2356(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-LABEL: shuffle_v4bf16_2356: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 -; GFX9-NEXT: s_mov_b32 s4, 0xffff -; GFX9-NEXT: s_mov_b32 s5, 0x3020706 +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_alignbit_b32 v1, v5, v4, 16 +; GFX9-NEXT: v_alignbit_b32 v1, v6, v5, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 -; GFX9-NEXT: v_and_or_b32 v0, v6, s4, v0 -; GFX9-NEXT: v_perm_b32 v1, v4, v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_2356: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 -; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GFX10-NEXT: v_alignbit_b32 v1, v6, v5, 16 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v6, v0 -; GFX10-NEXT: v_perm_b32 v1, v4, v1, 0x3020706 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_2356: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 -; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX11-NEXT: v_alignbit_b32 v1, v3, v2, 16 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v3 -; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x3020706 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 @@ -3729,10 +3542,8 @@ define <4 x bfloat> @shuffle_v4bf16_5623(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_alignbit_b32 v0, v6, v5, 16 -; GFX9-NEXT: v_perm_b32 v0, v5, v0, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3746,7 +3557,6 @@ define <4 x bfloat> @shuffle_v4bf16_5623(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: v_alignbit_b32 v0, v6, v5, 16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x3020706 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_5623: @@ -3756,8 +3566,6 @@ define <4 x bfloat> @shuffle_v4bf16_5623(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_alignbit_b32 v0, v3, v2, 16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x3020706 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 @@ -3770,17 +3578,11 @@ define <4 x bfloat> @shuffle_v4bf16_3456(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-LABEL: shuffle_v4bf16_3456: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 -; GFX9-NEXT: s_mov_b32 s4, 0xffff -; GFX9-NEXT: s_mov_b32 s5, 0x3020706 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_alignbit_b32 v2, v4, v6, 16 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX9-NEXT: v_perm_b32 v1, v4, v0, s5 -; GFX9-NEXT: v_and_or_b32 v0, v2, s4, v3 +; GFX9-NEXT: v_alignbit_b32 v0, v4, v6, 16 +; GFX9-NEXT: v_alignbit_b32 v1, v5, v4, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_3456: @@ -3790,10 +3592,7 @@ define <4 x bfloat> @shuffle_v4bf16_3456(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v0, v4, v6, 16 -; GFX10-NEXT: v_alignbit_b32 v2, v5, v4, 16 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 -; GFX10-NEXT: v_perm_b32 v1, v4, v2, 0x3020706 +; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_3456: @@ -3803,12 +3602,7 @@ define <4 x bfloat> @shuffle_v4bf16_3456(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: global_load_b64 v[1:2], 
v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GFX11-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x3020706 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v3 +; GFX11-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 @@ -3822,43 +3616,32 @@ define <4 x bfloat> @shuffle_v4bf16_5634(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 -; GFX9-NEXT: s_mov_b32 s4, 0xffff -; GFX9-NEXT: s_mov_b32 s5, 0x3020706 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_alignbit_b32 v1, v4, v6, 16 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX9-NEXT: v_perm_b32 v0, v4, v0, s5 -; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_5634: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_alignbit_b32 v0, v5, v4, 16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, v4, v6, 16 -; GFX10-NEXT: v_alignbit_b32 v0, v5, v4, 16 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x3020706 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_5634: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:4 -; GFX11-NEXT: global_load_b64 v[0:1], v[2:3], off +; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off +; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_alignbit_b32 v0, v3, v2, 16 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v2, v0, v4, 16 -; GFX11-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020706 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 +; GFX11-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 @@ -3872,43 +3655,33 @@ define <4 x bfloat> @shuffle_v4bf16_5734(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: s_mov_b32 s5, 0xffff +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_perm_b32 v0, v4, v5, s4 +; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_alignbit_b32 v1, v4, v6, 16 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 -; GFX9-NEXT: 
v_and_or_b32 v1, v1, s5, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_5734: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x7060302 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, v4, v6, 16 -; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x3020706 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x3020706 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_5734: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:4 -; GFX11-NEXT: global_load_b64 v[0:1], v[2:3], off +; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off +; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_perm_b32 v0, v3, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v2, v0, v4, 16 -; GFX11-NEXT: v_perm_b32 v1, v0, v1, 0x3020706 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020706 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 +; GFX11-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 @@ -3921,10 +3694,9 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3933,8 +3705,7 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040100 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3943,9 +3714,8 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 @@ -3959,11 +3729,8 @@ define <4 x bfloat> @shuffle_v4bf16_1010(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: 
s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_alignbit_b32 v0, v0, v0, 16 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3973,8 +3740,6 @@ define <4 x bfloat> @shuffle_v4bf16_1010(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3984,9 +3749,6 @@ define <4 x bfloat> @shuffle_v4bf16_1010(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v0, v0, v0, 16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4001,16 +3763,11 @@ define <4 x bfloat> @shuffle_v4bf16_1100(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0xffff -; GFX9-NEXT: s_mov_b32 s5, 0x7060706 -; GFX9-NEXT: s_mov_b32 s6, 0x3020706 -; GFX9-NEXT: s_mov_b32 s7, 0x3020504 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX9-NEXT: v_perm_b32 v2, v1, v1, s5 -; GFX9-NEXT: v_and_or_b32 v3, v1, s4, v0 -; GFX9-NEXT: v_perm_b32 v0, v1, v2, s6 -; GFX9-NEXT: v_perm_b32 v1, v1, v3, s7 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: s_mov_b32 s5, 0x5040100 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v1, s5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_1100: @@ -4018,11 +3775,8 @@ define <4 x bfloat> @shuffle_v4bf16_1100(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX10-NEXT: v_perm_b32 v2, v1, v1, 0x7060706 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v1, v0 -; GFX10-NEXT: v_perm_b32 v0, v1, v2, 0x3020706 -; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x3020504 +; GFX10-NEXT: v_perm_b32 v0, v1, v1, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_1100: @@ -4030,13 +3784,8 @@ define <4 x bfloat> @shuffle_v4bf16_1100(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX11-NEXT: v_perm_b32 v2, v1, v1, 0x7060706 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v3, 0xffff, v1, v0 -; GFX11-NEXT: v_perm_b32 v0, v1, v2, 0x3020706 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x3020504 +; GFX11-NEXT: v_perm_b32 v0, v1, v1, 0x7060302 +; GFX11-NEXT: v_perm_b32 v1, v1, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 @@ -4051,10 +3800,8 @@ define <4 
x bfloat> @shuffle_v4bf16_6161(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 ; GFX9-NEXT: s_mov_b32 s4, 0xffff -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_or_b32 v0, v5, s4, v0 +; GFX9-NEXT: v_bfi_b32 v0, s4, v5, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4063,10 +3810,8 @@ define <4 x bfloat> @shuffle_v4bf16_6161(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v5, v0 +; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v5, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4075,11 +3820,9 @@ define <4 x bfloat> @shuffle_v4bf16_6161(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 @@ -4093,14 +3836,9 @@ define <4 x bfloat> @shuffle_v4bf16_2333(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: s_mov_b32 s4, 0x7060706 -; GFX9-NEXT: s_mov_b32 s5, 0xffff +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v1, v0, v0, s4 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX9-NEXT: v_and_or_b32 v1, v1, s5, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_2333: @@ -4108,11 +3846,7 @@ define <4 x bfloat> @shuffle_v4bf16_2333(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v1, v0, v0, 0x7060706 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX10-NEXT: v_perm_b32 v1, v0, v0, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_2333: @@ -4120,13 +3854,7 @@ define <4 x bfloat> @shuffle_v4bf16_2333(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_perm_b32 v1, v0, v0, 0x7060706 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX11-NEXT: v_perm_b32 v1, v0, v0, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 @@ -4139,14 +3867,9 @@ define <4 x bfloat> @shuffle_v4bf16_6667(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: s_mov_b32 s4, 0x7060706 -; GFX9-NEXT: s_mov_b32 s5, 0xffff +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v1, v0, v0, s4 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX9-NEXT: v_and_or_b32 v1, v1, s5, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_6667: @@ -4154,11 +3877,7 @@ define <4 x bfloat> @shuffle_v4bf16_6667(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v1, v0, v0, 0x7060706 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX10-NEXT: v_perm_b32 v1, v0, v0, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_6667: @@ -4166,13 +3885,7 @@ define <4 x bfloat> @shuffle_v4bf16_6667(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_perm_b32 v1, v0, v0, 0x7060706 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX11-NEXT: v_perm_b32 v1, v0, v0, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 @@ -4185,10 +3898,7 @@ define <4 x bfloat> @shuffle_v8bf16_0101(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4197,8 +3907,6 @@ define <4 x bfloat> @shuffle_v8bf16_0101(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4207,9 +3915,6 @@ define <4 x bfloat> @shuffle_v8bf16_0101(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0 @@ -4251,13 +3956,10 @@ define <4 x bfloat> @shuffle_v8bf16_4589(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:8 ; GFX9-NEXT: global_load_dword v5, v[2:3], off -; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX9-NEXT: v_and_or_b32 v0, v4, s4, v0 -; GFX9-NEXT: v_and_or_b32 v1, v5, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v8bf16_4589: @@ -4266,11 +3968,9 @@ define <4 x bfloat> @shuffle_v8bf16_4589(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:8 ; GFX10-NEXT: global_load_dword v5, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v4, v0 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v8bf16_4589: @@ -4278,13 +3978,7 @@ define <4 x bfloat> @shuffle_v8bf16_4589(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:8 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 -; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0 %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1 @@ -4296,26 +3990,23 @@ define <4 x bfloat> @shuffle_v8bf16_10_11_2_3(ptr addrspace(1) %arg0, ptr addrsp ; GFX9-LABEL: shuffle_v8bf16_10_11_2_3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 -; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; GFX9-NEXT: v_and_or_b32 v0, v5, s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v8bf16_10_11_2_3: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v5, v0 +; 
GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v8bf16_10_11_2_3: @@ -4324,9 +4015,7 @@ define <4 x bfloat> @shuffle_v8bf16_10_11_2_3(ptr addrspace(1) %arg0, ptr addrsp ; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4 ; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0 @@ -4341,10 +4030,8 @@ define <4 x bfloat> @shuffle_v8bf16_13_14_2_3(ptr addrspace(1) %arg0, ptr addrsp ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_alignbit_b32 v0, v6, v5, 16 -; GFX9-NEXT: v_perm_b32 v0, v5, v0, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4358,7 +4045,6 @@ define <4 x bfloat> @shuffle_v8bf16_13_14_2_3(ptr addrspace(1) %arg0, ptr addrsp ; GFX10-NEXT: v_alignbit_b32 v0, v6, v5, 16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x3020706 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v8bf16_13_14_2_3: @@ -4368,8 +4054,6 @@ define <4 x bfloat> @shuffle_v8bf16_13_14_2_3(ptr addrspace(1) %arg0, ptr addrsp ; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_alignbit_b32 v0, v3, v2, 16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x3020706 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0 @@ -4383,10 +4067,9 @@ define <4 x bfloat> @shuffle_v3bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2 +; GFX9-NEXT: v_perm_b32 v1, v1, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v3bf16_0122: @@ -4394,8 +4077,7 @@ define <4 x bfloat> @shuffle_v3bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10-NEXT: v_perm_b32 v1, v1, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v3bf16_0122: @@ -4403,9 +4085,7 @@ define <4 x bfloat> @shuffle_v3bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX11-NEXT: v_perm_b32 v1, v1, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <3 x bfloat>, ptr addrspace(1) %arg0 %val1 = load <3 x bfloat>, ptr addrspace(1) %arg1 @@ -4418,11 +4098,8 @@ define <4 x 
bfloat> @shuffle_v2bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_alignbit_b32 v1, v0, v0, 16 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v2bf16_0122: @@ -4431,8 +4108,6 @@ define <4 x bfloat> @shuffle_v2bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, v0, v0, 16 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v2bf16_0122: @@ -4441,9 +4116,6 @@ define <4 x bfloat> @shuffle_v2bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, v0, v0, 16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <2 x bfloat>, ptr addrspace(1) %arg0 %val1 = load <2 x bfloat>, ptr addrspace(1) %arg1 @@ -4507,38 +4179,38 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1] -; GFX9-NEXT: s_mov_b32 s0, 0x3020706 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[2:3] +; GFX9-NEXT: s_mov_b32 s0, 0x7060302 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v4 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_fma_f32 v7, v9, v8, v7 -; GFX9-NEXT: v_fma_f32 v0, v9, v2, v0 -; GFX9-NEXT: v_fma_f32 v8, v11, v8, v12 -; GFX9-NEXT: v_fma_f32 v1, v11, v2, v1 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_fma_f32 v7, v8, v9, v7 +; GFX9-NEXT: v_fma_f32 v0, v8, v4, v0 +; GFX9-NEXT: v_fma_f32 v4, v11, v4, v12 +; GFX9-NEXT: v_fma_f32 v1, v11, v9, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 -; GFX9-NEXT: v_fma_f32 v0, v4, v10, v0 -; GFX9-NEXT: v_fma_f32 v2, v4, v3, v2 -; 
GFX9-NEXT: v_fma_f32 v1, v5, v10, v1 -; GFX9-NEXT: v_fma_f32 v3, v5, v3, v7 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_fma_f32 v0, v2, v10, v0 +; GFX9-NEXT: v_fma_f32 v2, v2, v5, v7 +; GFX9-NEXT: v_fma_f32 v1, v3, v5, v1 +; GFX9-NEXT: v_fma_f32 v3, v3, v10, v4 ; GFX9-NEXT: v_perm_b32 v0, v2, v0, s0 -; GFX9-NEXT: v_perm_b32 v1, v3, v1, s0 +; GFX9-NEXT: v_perm_b32 v1, v1, v3, s0 ; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -4551,37 +4223,37 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x2 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v3 ; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_fmac_f32_e32 v7, v9, v8 -; GFX10-NEXT: v_fmac_f32_e32 v0, v9, v2 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v3 -; GFX10-NEXT: v_fmac_f32_e32 v12, v11, v2 -; GFX10-NEXT: v_fmac_f32_e32 v1, v11, v8 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; GFX10-NEXT: v_fmac_f32_e32 v7, v8, v9 +; GFX10-NEXT: v_fmac_f32_e32 v0, v8, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_fmac_f32_e32 v12, v11, v9 +; GFX10-NEXT: v_fmac_f32_e32 v1, v11, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v12 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_fmac_f32_e32 v0, v4, v10 -; GFX10-NEXT: v_fmac_f32_e32 v5, v4, v3 -; GFX10-NEXT: v_fmac_f32_e32 v7, v2, v10 -; GFX10-NEXT: v_fmac_f32_e32 v1, v2, v3 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x3020706 -; GFX10-NEXT: v_perm_b32 v1, v1, v7, 0x3020706 +; GFX10-NEXT: v_fmac_f32_e32 v0, v2, v10 +; GFX10-NEXT: v_fmac_f32_e32 v4, v2, v5 +; GFX10-NEXT: v_fmac_f32_e32 v7, v3, v5 +; GFX10-NEXT: v_fmac_f32_e32 v1, v3, v10 +; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v7, v1, 0x7060302 ; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; @@ -4594,37 +4266,39 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[4:5] -; GFX11-NEXT: global_load_b64 v[2:3], v6, s[2:3] -; GFX11-NEXT: global_load_b64 v[4:5], v6, s[0:1] +; GFX11-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX11-NEXT: global_load_b64 
v[4:5], v6, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v3 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v5 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_fmac_f32_e32 v1, v11, v8 -; GFX11-NEXT: v_dual_fmac_f32 v12, v11, v2 :: v_dual_lshlrev_b32 v7, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_fmac_f32 v1, v11, v4 :: v_dual_and_b32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_fmac_f32 v1, v3, v10 :: v_dual_fmac_f32 v0, v8, v4 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_fmac_f32 v0, v9, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX11-NEXT: v_dual_fmac_f32 v1, v2, v3 :: v_dual_and_b32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f32_e32 v0, v2, v10 +; GFX11-NEXT: v_fmac_f32_e32 v12, v11, v9 +; GFX11-NEXT: v_fmac_f32_e32 v7, v8, v9 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_fmac_f32 v7, v9, v8 :: v_dual_fmac_f32 v0, v4, v10 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v12 +; GFX11-NEXT: v_dual_fmac_f32 v4, v2, v5 :: v_dual_and_b32 v7, 0xffff0000, v12 +; GFX11-NEXT: v_fmac_f32_e32 v7, v3, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fmac_f32_e32 v5, v4, v3 -; GFX11-NEXT: v_fmac_f32_e32 v7, v2, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x3020706 -; GFX11-NEXT: v_perm_b32 v1, v1, v7, 0x3020706 +; GFX11-NEXT: v_perm_b32 v0, v4, v0, 0x7060302 +; GFX11-NEXT: v_perm_b32 v1, v7, v1, 0x7060302 ; GFX11-NEXT: global_store_b64 v6, v[0:1], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -4662,40 +4336,32 @@ define <4 x bfloat> @shuffle_v4bf16_0456(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-LABEL: shuffle_v4bf16_0456: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX9-NEXT: global_load_dword v6, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0x1000504 -; GFX9-NEXT: s_mov_b32 s5, 0x3020706 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_alignbit_b32 v1, v5, v4, 16 +; GFX9-NEXT: 
global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v6, v4, s4 -; GFX9-NEXT: v_perm_b32 v1, v4, v1, s5 +; GFX9-NEXT: v_perm_b32 v0, v4, v6, s4 +; GFX9-NEXT: v_alignbit_b32 v1, v5, v4, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_0456: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX10-NEXT: global_load_dword v6, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16 +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v6, v4, 0x1000504 -; GFX10-NEXT: v_perm_b32 v1, v4, v1, 0x3020706 +; GFX10-NEXT: v_perm_b32 v0, v4, v6, 0x5040100 +; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_0456: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x1000504 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x3020706 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 @@ -4709,9 +4375,9 @@ define <2 x bfloat> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: global_load_dword v5, v[2:3], off -; GFX9-NEXT: s_mov_b32 s4, 0x1000504 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v4, v5, s4 +; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: low16bits: @@ -4720,7 +4386,7 @@ define <2 x bfloat> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v5, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x1000504 +; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: low16bits: @@ -4729,7 +4395,7 @@ define <2 x bfloat> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x1000504 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4 @@ -4745,9 +4411,9 @@ define <2 x bfloat> @hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: global_load_dword v5, v[2:3], off -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v4, v5, s4 +; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: hi16bits_v2bf16: @@ -4756,7 +4422,7 @@ define <2 x bfloat> @hi16bits_v2bf16(ptr addrspace(1) %x0, ptr 
addrspace(1) %x1) ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v5, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x3020706 +; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: hi16bits_v2bf16: @@ -4765,7 +4431,7 @@ define <2 x bfloat> @hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020706 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4 @@ -4779,29 +4445,29 @@ define <2 x bfloat> @low16hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) ; GFX9-LABEL: low16hi16bits_v2bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: global_load_dword v5, v[2:3], off -; GFX9-NEXT: s_mov_b32 s4, 0x3020504 +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: global_load_dword v5, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v4, v5, s4 +; GFX9-NEXT: v_bfi_b32 v0, s4, v5, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: low16hi16bits_v2bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: global_load_dword v4, v[2:3], off +; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x3020504 +; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v5, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: low16hi16bits_v2bf16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v[2:3], off ; GFX11-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020504 +; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4 From 2aff7f39199f214ee606e4e18a8eba178ffe0d94 Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Thu, 4 Jan 2024 09:33:09 -0600 Subject: [PATCH 242/313] [mlir][LLVM] Add !invariant.load metadata support to llvm.load (#76754) Add support for !invariant.load metadata (by way of a unit attribute) to the MLIR representation of llvm.load. 
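
As a rough illustration of how the new flag can be set programmatically, here is a minimal sketch using the extended LoadOp builder from this change; only the builder signature comes from the patch, while the helper name emitInvariantLoad, the i32 result type, and the alignment value are hypothetical choices for the example:

  // Sketch only: create an llvm.load marked invariant through the new
  // isInvariant builder argument (alignment, isVolatile, isNonTemporal,
  // isInvariant are the positional flags after the address operand).
  #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
  #include "mlir/IR/Builders.h"

  static mlir::Value emitInvariantLoad(mlir::OpBuilder &builder,
                                       mlir::Location loc, mlir::Value addr) {
    mlir::Type i32Ty = builder.getIntegerType(32);
    return builder.create<mlir::LLVM::LoadOp>(loc, i32Ty, addr,
                                              /*alignment=*/4,
                                              /*isVolatile=*/false,
                                              /*isNonTemporal=*/false,
                                              /*isInvariant=*/true);
  }

In the textual IR this round-trips as llvm.load %ptr invariant {alignment = 4 : i64} : !llvm.ptr -> i32, and on export to LLVM IR the unit attribute becomes an empty !invariant.load metadata node.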
--- mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 9 ++++++++- mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 6 +++--- mlir/test/Dialect/LLVMIR/roundtrip.mlir | 7 +++++++ mlir/test/Target/LLVMIR/Import/instructions.ll | 13 +++++++++++++ mlir/test/Target/LLVMIR/llvmir.mlir | 11 +++++++++++ 5 files changed, 42 insertions(+), 4 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index 9e65898154bd6..2376f1f0e2dc7 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -330,6 +330,7 @@ def LLVM_LoadOp : LLVM_MemAccessOpBase<"load", OptionalAttr:$alignment, UnitAttr:$volatile_, UnitAttr:$nontemporal, + UnitAttr:$invariant, DefaultValuedAttr< AtomicOrdering, "AtomicOrdering::not_atomic">:$ordering, OptionalAttr:$syncscope); @@ -364,11 +365,16 @@ def LLVM_LoadOp : LLVM_MemAccessOpBase<"load", let assemblyFormat = [{ (`volatile` $volatile_^)? $addr (`atomic` (`syncscope` `(` $syncscope^ `)`)? $ordering^)? + (`invariant` $invariant^)? attr-dict `:` qualified(type($addr)) `->` type($res) }]; string llvmBuilder = [{ auto *inst = builder.CreateLoad($_resultType, $addr, $volatile_); $res = inst; + if ($invariant) { + llvm::MDNode *metadata = llvm::MDNode::get(inst->getContext(), std::nullopt); + inst->setMetadata(llvm::LLVMContext::MD_invariant_load, metadata); + } }] # setOrderingCode # setSyncScopeCode # setAlignmentCode @@ -381,13 +387,14 @@ def LLVM_LoadOp : LLVM_MemAccessOpBase<"load", $res = $_builder.create($_location, $_resultType, $addr, alignment, loadInst->isVolatile(), loadInst->hasMetadata(llvm::LLVMContext::MD_nontemporal), + loadInst->hasMetadata(llvm::LLVMContext::MD_invariant_load), convertAtomicOrderingFromLLVM(loadInst->getOrdering()), getLLVMSyncScope(loadInst)); }]; let builders = [ OpBuilder<(ins "Type":$type, "Value":$addr, CArg<"unsigned", "0">:$alignment, CArg<"bool", "false">:$isVolatile, - CArg<"bool", "false">:$isNonTemporal, + CArg<"bool", "false">:$isNonTemporal, CArg<"bool", "false">:$isInvariant, CArg<"AtomicOrdering", "AtomicOrdering::not_atomic">:$ordering, CArg<"StringRef", "StringRef()">:$syncscope)> ]; diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index 64388a9a01812..c2336f44b33e3 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -822,11 +822,11 @@ LogicalResult LoadOp::verify() { void LoadOp::build(OpBuilder &builder, OperationState &state, Type type, Value addr, unsigned alignment, bool isVolatile, - bool isNonTemporal, AtomicOrdering ordering, - StringRef syncscope) { + bool isNonTemporal, bool isInvariant, + AtomicOrdering ordering, StringRef syncscope) { build(builder, state, type, addr, alignment ? builder.getI64IntegerAttr(alignment) : nullptr, isVolatile, - isNonTemporal, ordering, + isNonTemporal, isInvariant, ordering, syncscope.empty() ? 
nullptr : builder.getStringAttr(syncscope), /*access_groups=*/nullptr, /*alias_scopes=*/nullptr, /*noalias_scopes=*/nullptr, diff --git a/mlir/test/Dialect/LLVMIR/roundtrip.mlir b/mlir/test/Dialect/LLVMIR/roundtrip.mlir index 49f34785ebad5..1958dd56bab7a 100644 --- a/mlir/test/Dialect/LLVMIR/roundtrip.mlir +++ b/mlir/test/Dialect/LLVMIR/roundtrip.mlir @@ -401,6 +401,13 @@ func.func @cmpxchg(%ptr : !llvm.ptr, %cmp : i32, %new : i32) { llvm.return } +// CHECK-LABEL: @invariant_load +func.func @invariant_load(%ptr : !llvm.ptr) -> i32 { + // CHECK: llvm.load %{{.+}} invariant {alignment = 4 : i64} : !llvm.ptr -> i32 + %0 = llvm.load %ptr invariant {alignment = 4 : i64} : !llvm.ptr -> i32 + func.return %0 : i32 +} + llvm.mlir.global external constant @_ZTIi() : !llvm.ptr llvm.func @bar(!llvm.ptr, !llvm.ptr, !llvm.ptr) llvm.func @__gxx_personality_v0(...) -> i32 diff --git a/mlir/test/Target/LLVMIR/Import/instructions.ll b/mlir/test/Target/LLVMIR/Import/instructions.ll index 036efad0d099a..005aafb20a510 100644 --- a/mlir/test/Target/LLVMIR/Import/instructions.ll +++ b/mlir/test/Target/LLVMIR/Import/instructions.ll @@ -370,6 +370,19 @@ define void @load_store(ptr %ptr) { ; // ----- +; CHECK-LABEL: @invariant_load +; CHECK-SAME: %[[PTR:[a-zA-Z0-9]+]] +define float @invariant_load(ptr %ptr) { + ; CHECK: %[[V:[0-9]+]] = llvm.load %[[PTR]] invariant {alignment = 4 : i64} : !llvm.ptr -> f32 + %1 = load float, ptr %ptr, align 4, !invariant.load !0 + ; CHECK: llvm.return %[[V]] + ret float %1 +} + +!0 = !{} + +; // ----- + ; CHECK-LABEL: @atomic_load_store ; CHECK-SAME: %[[PTR:[a-zA-Z0-9]+]] define void @atomic_load_store(ptr %ptr) { diff --git a/mlir/test/Target/LLVMIR/llvmir.mlir b/mlir/test/Target/LLVMIR/llvmir.mlir index 13e61b6ce10b2..4a036b0497fff 100644 --- a/mlir/test/Target/LLVMIR/llvmir.mlir +++ b/mlir/test/Target/LLVMIR/llvmir.mlir @@ -1911,6 +1911,17 @@ llvm.func @nontemporal_store_and_load() { // ----- +// Check that invariantLoad attribute is exported as metadata node. +llvm.func @nontemporal_store_and_load(%ptr : !llvm.ptr) -> i32 { + // CHECK: !invariant.load ![[NODE:[0-9]+]] + %1 = llvm.load %ptr invariant : !llvm.ptr -> i32 + llvm.return %1 : i32 +} + +// CHECK: ![[NODE]] = !{} + +// ----- + llvm.func @atomic_store_and_load(%ptr : !llvm.ptr) { // CHECK: load atomic // CHECK-SAME: acquire, align 4 From b336ab42dcc81a351b2f875f28c70b74d8814611 Mon Sep 17 00:00:00 2001 From: "Oleksandr \"Alex\" Zinenko" Date: Thu, 4 Jan 2024 16:40:13 +0100 Subject: [PATCH 243/313] [mlir] add a way to query non-property attributes (#76959) This helps support generic manipulation of operations that don't (yet) use properties to store inherent attributes. Use this mechanism in type inference and operation equivalence. Note that only minimal unit tests are introduced as all the upstream dialects seem to have been updated to use properties and the non-property behavior is essentially deprecated and untested. 
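
As a small sketch of the intended use (the helper name hashInherentState is invented for this example; only getRawDictionaryAttrs and hashProperties come from the patch), generic code can now hash or compare the inherent state of an operation without caring whether that state lives in properties or in the attribute dictionary:

  // Sketch: fold the raw (non-property) attribute dictionary and the
  // properties hash together, mirroring OperationEquivalence::computeHash.
  #include "mlir/IR/Operation.h"
  #include "llvm/ADT/Hashing.h"

  static llvm::hash_code hashInherentState(mlir::Operation *op) {
    return llvm::hash_combine(op->getName(),
                              op->getRawDictionaryAttrs(), // attrs not stored as properties
                              op->hashProperties());
  }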
--- mlir/include/mlir/IR/Operation.h | 3 +++ mlir/lib/IR/OperationSupport.cpp | 9 ++++----- mlir/lib/Interfaces/InferTypeOpInterface.cpp | 5 ++--- mlir/unittests/Bytecode/BytecodeTest.cpp | 3 +++ mlir/unittests/IR/OpPropertiesTest.cpp | 10 ++++++++++ 5 files changed, 22 insertions(+), 8 deletions(-) diff --git a/mlir/include/mlir/IR/Operation.h b/mlir/include/mlir/IR/Operation.h index d2f52cf1afee8..3ffd3517fe5a6 100644 --- a/mlir/include/mlir/IR/Operation.h +++ b/mlir/include/mlir/IR/Operation.h @@ -500,6 +500,9 @@ class alignas(8) Operation final llvm::to_vector(getDiscardableAttrs())); } + /// Return all attributes that are not stored as properties. + DictionaryAttr getRawDictionaryAttrs() { return attrs; } + /// Return all of the attributes on this operation. ArrayRef getAttrs() { return getAttrDictionary().getValue(); } diff --git a/mlir/lib/IR/OperationSupport.cpp b/mlir/lib/IR/OperationSupport.cpp index eaad6f2891608..e10cd748e03ba 100644 --- a/mlir/lib/IR/OperationSupport.cpp +++ b/mlir/lib/IR/OperationSupport.cpp @@ -675,7 +675,7 @@ llvm::hash_code OperationEquivalence::computeHash( // - Attributes // - Result Types llvm::hash_code hash = - llvm::hash_combine(op->getName(), op->getDiscardableAttrDictionary(), + llvm::hash_combine(op->getName(), op->getRawDictionaryAttrs(), op->getResultTypes(), op->hashProperties()); // - Location if required @@ -831,14 +831,13 @@ OperationEquivalence::isRegionEquivalentTo(Region *lhs, Region *rhs, // 1. Compare the operation properties. if (lhs->getName() != rhs->getName() || - lhs->getDiscardableAttrDictionary() != - rhs->getDiscardableAttrDictionary() || + lhs->getRawDictionaryAttrs() != rhs->getRawDictionaryAttrs() || lhs->getNumRegions() != rhs->getNumRegions() || lhs->getNumSuccessors() != rhs->getNumSuccessors() || lhs->getNumOperands() != rhs->getNumOperands() || lhs->getNumResults() != rhs->getNumResults() || !lhs->getName().compareOpProperties(lhs->getPropertiesStorage(), - rhs->getPropertiesStorage())) + rhs->getPropertiesStorage())) return false; if (!(flags & IgnoreLocations) && lhs->getLoc() != rhs->getLoc()) return false; @@ -923,7 +922,7 @@ OperationFingerPrint::OperationFingerPrint(Operation *topOp) { if (op != topOp) addDataToHash(hasher, op->getParentOp()); // - Attributes - addDataToHash(hasher, op->getDiscardableAttrDictionary()); + addDataToHash(hasher, op->getRawDictionaryAttrs()); // - Properties addDataToHash(hasher, op->hashProperties()); // - Blocks in Regions diff --git a/mlir/lib/Interfaces/InferTypeOpInterface.cpp b/mlir/lib/Interfaces/InferTypeOpInterface.cpp index ee4c0519b9f54..e52d0e17cda22 100644 --- a/mlir/lib/Interfaces/InferTypeOpInterface.cpp +++ b/mlir/lib/Interfaces/InferTypeOpInterface.cpp @@ -240,9 +240,8 @@ LogicalResult mlir::detail::verifyInferredResultTypes(Operation *op) { auto retTypeFn = cast(op); auto result = retTypeFn.refineReturnTypes( op->getContext(), op->getLoc(), op->getOperands(), - op->getPropertiesStorage() ? 
op->getDiscardableAttrDictionary() - : op->getAttrDictionary(), - op->getPropertiesStorage(), op->getRegions(), inferredReturnTypes); + op->getRawDictionaryAttrs(), op->getPropertiesStorage(), op->getRegions(), + inferredReturnTypes); if (failed(result)) op->emitOpError() << "failed to infer returned types"; diff --git a/mlir/unittests/Bytecode/BytecodeTest.cpp b/mlir/unittests/Bytecode/BytecodeTest.cpp index 76ff1a8194db7..bb7241c2d5196 100644 --- a/mlir/unittests/Bytecode/BytecodeTest.cpp +++ b/mlir/unittests/Bytecode/BytecodeTest.cpp @@ -144,4 +144,7 @@ TEST(Bytecode, OpWithoutProperties) { EXPECT_EQ(roundtripped->getAttrs().size(), 2u); EXPECT_TRUE(roundtripped->getInherentAttr("inherent_attr") != std::nullopt); EXPECT_TRUE(roundtripped->getDiscardableAttr("other_attr") != Attribute()); + + EXPECT_TRUE(OperationEquivalence::computeHash(op.get()) == + OperationEquivalence::computeHash(roundtripped)); } diff --git a/mlir/unittests/IR/OpPropertiesTest.cpp b/mlir/unittests/IR/OpPropertiesTest.cpp index bb1b741d1cc22..365775d541ec3 100644 --- a/mlir/unittests/IR/OpPropertiesTest.cpp +++ b/mlir/unittests/IR/OpPropertiesTest.cpp @@ -8,6 +8,7 @@ #include "mlir/IR/Attributes.h" #include "mlir/IR/OpDefinition.h" +#include "mlir/IR/OperationSupport.h" #include "mlir/Parser/Parser.h" #include "gtest/gtest.h" #include @@ -401,6 +402,15 @@ TEST(OpPropertiesTest, withoutPropertiesDiscardableAttrs) { op->print(os); EXPECT_TRUE(StringRef(os.str()).contains("inherent_attr = 42")); EXPECT_TRUE(StringRef(os.str()).contains("other_attr = 56")); + + OwningOpRef reparsed = parseSourceString(os.str(), config); + auto trivialHash = [](Value v) { return hash_value(v); }; + auto hash = [&](Operation *operation) { + return OperationEquivalence::computeHash( + operation, trivialHash, trivialHash, + OperationEquivalence::Flags::IgnoreLocations); + }; + EXPECT_TRUE(hash(op.get()) == hash(reparsed.get())); } } // namespace From a87fa7f0caa1b8bf328524e0aec80a10ed4af7f9 Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Thu, 4 Jan 2024 13:04:31 -0300 Subject: [PATCH 244/313] [InstCombine] Dont throw away noalias/alias scope metadata when inlining memcpys (#74805) This was found in julia when we changed some operations from explicit loads + stores to memcpys. While applying it to both the src and the dest seems weird, thats what we do for normal TBAA. --- .../Transforms/InstCombine/InstCombineCalls.cpp | 16 +++++++--------- .../Transforms/InstCombine/memcpy-to-load.ll | 16 ++++++++++++++++ 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 3da9b89a6409c..40b48699f7585 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -172,10 +172,10 @@ Instruction *InstCombinerImpl::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { // If the memcpy has metadata describing the members, see if we can get the // TBAA tag describing our copy. 
- MDNode *CopyMD = nullptr; - if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa)) { - CopyMD = M; - } else if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) { + AAMDNodes AACopyMD = MI->getAAMetadata(); + + if (MDNode *M = AACopyMD.TBAAStruct) { + AACopyMD.TBAAStruct = nullptr; if (M->getNumOperands() == 3 && M->getOperand(0) && mdconst::hasa(M->getOperand(0)) && mdconst::extract(M->getOperand(0))->isZero() && @@ -184,7 +184,7 @@ Instruction *InstCombinerImpl::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { mdconst::extract(M->getOperand(1))->getValue() == Size && M->getOperand(2) && isa(M->getOperand(2))) - CopyMD = cast(M->getOperand(2)); + AACopyMD.TBAA = cast(M->getOperand(2)); } Value *Src = MI->getArgOperand(1); @@ -192,8 +192,7 @@ Instruction *InstCombinerImpl::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { LoadInst *L = Builder.CreateLoad(IntType, Src); // Alignment from the mem intrinsic will be better, so use it. L->setAlignment(*CopySrcAlign); - if (CopyMD) - L->setMetadata(LLVMContext::MD_tbaa, CopyMD); + L->setAAMetadata(AACopyMD); MDNode *LoopMemParallelMD = MI->getMetadata(LLVMContext::MD_mem_parallel_loop_access); if (LoopMemParallelMD) @@ -205,8 +204,7 @@ Instruction *InstCombinerImpl::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { StoreInst *S = Builder.CreateStore(L, Dest); // Alignment from the mem intrinsic will be better, so use it. S->setAlignment(*CopyDstAlign); - if (CopyMD) - S->setMetadata(LLVMContext::MD_tbaa, CopyMD); + S->setAAMetadata(AACopyMD); if (LoopMemParallelMD) S->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD); if (AccessGroupMD) diff --git a/llvm/test/Transforms/InstCombine/memcpy-to-load.ll b/llvm/test/Transforms/InstCombine/memcpy-to-load.ll index 2f13394b5e477..f8e8ab85f935a 100644 --- a/llvm/test/Transforms/InstCombine/memcpy-to-load.ll +++ b/llvm/test/Transforms/InstCombine/memcpy-to-load.ll @@ -79,3 +79,19 @@ define void @copy_16_bytes(ptr %d, ptr %s) { ret void } +define void @copy_8_bytes_noalias(ptr %d, ptr %s) { +; CHECK-LABEL: @copy_8_bytes_noalias( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[S:%.*]], align 1, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] +; CHECK-NEXT: store i64 [[TMP1]], ptr [[D:%.*]], align 1, !alias.scope [[META0]], !noalias [[META3]] +; CHECK-NEXT: ret void +; + call void @llvm.memcpy.p0.p0.i32(ptr %d, ptr %s, i32 8, i1 false), !alias.scope !4, !noalias !5 + ret void +} + +!0 = distinct !{!0, !"The domain"} +!1 = distinct !{!1} +!2 = !{!2, !0} +!3 = !{!3, !1} +!4 = !{!2} +!5 = !{!3} From 9215741726e295d09ae7db4d235b26c1214a19ae Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Thu, 4 Jan 2024 11:08:36 -0500 Subject: [PATCH 245/313] [mlir] Make fold result type check more verbose (#76867) Print the op and its types when the fold type check fails. This is to speed up debuging as it should be trivial to map the offending op to its folder based on the op name. --- mlir/lib/IR/Operation.cpp | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/mlir/lib/IR/Operation.cpp b/mlir/lib/IR/Operation.cpp index a726790391a0c..311f5bb5ef77c 100644 --- a/mlir/lib/IR/Operation.cpp +++ b/mlir/lib/IR/Operation.cpp @@ -20,6 +20,7 @@ #include "mlir/Interfaces/FoldInterfaces.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/Support/ErrorHandling.h" #include #include @@ -611,11 +612,19 @@ void Operation::setSuccessor(Block *block, unsigned index) { /// the results of the given op. 
static void checkFoldResultTypes(Operation *op, SmallVectorImpl &results) { - if (!results.empty()) - for (auto [ofr, opResult] : llvm::zip_equal(results, op->getResults())) - if (auto value = ofr.dyn_cast()) - assert(value.getType() == opResult.getType() && - "folder produced value of incorrect type"); + if (results.empty()) + return; + + for (auto [ofr, opResult] : llvm::zip_equal(results, op->getResults())) { + if (auto value = dyn_cast(ofr)) { + if (value.getType() != opResult.getType()) { + op->emitOpError() << "folder produced a value of incorrect type: " + << opResult.getType() + << ", expected: " << value.getType(); + assert(false && "incorrect fold result type"); + } + } + } } #endif // NDEBUG From 640ef55bbbc081b72a87f71cab1bce08762e48b0 Mon Sep 17 00:00:00 2001 From: Krystian Stasiowski Date: Thu, 4 Jan 2024 11:30:48 -0500 Subject: [PATCH 246/313] Reapply "[Clang][Sema] Diagnose unexpanded packs in the template argument lists of function template specializations" (#76876) (#76915) This reapplies f034044ad94d6f7ccec13d89f08acac257ed28bb after it was reverted by 687396b5f4ba0713d103ebd172b308e92eb930cc due to a test failure in clang-doc. The test in question declares a partial specialization of a function template, as well as an explicit specialization of the same function template. Both declarations are now set as invalid, meaning neither is emitted by clang-doc. Since this is the sole test of function template specializations in clang-doc, I presume the intent is for the partial specialization to actually be the primary template. Doing so results in the expected output. --- .../test/clang-doc/templates.cpp | 2 +- clang/docs/ReleaseNotes.rst | 1 + clang/lib/Sema/SemaDecl.cpp | 89 +++++++++---------- .../CXX/temp/temp.decls/temp.variadic/p5.cpp | 12 +++ 4 files changed, 57 insertions(+), 47 deletions(-) diff --git a/clang-tools-extra/test/clang-doc/templates.cpp b/clang-tools-extra/test/clang-doc/templates.cpp index eb7f4599629f4..2e04a77ac9e62 100644 --- a/clang-tools-extra/test/clang-doc/templates.cpp +++ b/clang-tools-extra/test/clang-doc/templates.cpp @@ -7,7 +7,7 @@ // RUN: rm -rf %t template -void function(T x) {} +void function(T x) {} template<> void function(bool x) {} diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 30cfe66703f5a..c7bf162426a68 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -518,6 +518,7 @@ Improvements to Clang's diagnostics - Clang now diagnoses definitions of friend function specializations, e.g. ``friend void f<>(int) {}``. - Clang now diagnoses narrowing conversions involving const references. (`#63151: `_). +- Clang now diagnoses unexpanded packs within the template argument lists of function template specializations. Improvements to Clang's time-trace diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 2de631941325f..8e46c4984d93d 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -9900,15 +9900,15 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC, // Match up the template parameter lists with the scope specifier, then // determine whether we have a template or a template specialization. bool Invalid = false; + TemplateIdAnnotation *TemplateId = + D.getName().getKind() == UnqualifiedIdKind::IK_TemplateId + ? 
D.getName().TemplateId + : nullptr; TemplateParameterList *TemplateParams = MatchTemplateParametersToScopeSpecifier( D.getDeclSpec().getBeginLoc(), D.getIdentifierLoc(), - D.getCXXScopeSpec(), - D.getName().getKind() == UnqualifiedIdKind::IK_TemplateId - ? D.getName().TemplateId - : nullptr, - TemplateParamLists, isFriend, isMemberSpecialization, - Invalid); + D.getCXXScopeSpec(), TemplateId, TemplateParamLists, isFriend, + isMemberSpecialization, Invalid); if (TemplateParams) { // Check that we can declare a template here. if (CheckTemplateDeclScope(S, TemplateParams)) @@ -9921,6 +9921,11 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC, if (Name.getNameKind() == DeclarationName::CXXDestructorName) { Diag(NewFD->getLocation(), diag::err_destructor_template); NewFD->setInvalidDecl(); + // Function template with explicit template arguments. + } else if (TemplateId) { + Diag(D.getIdentifierLoc(), diag::err_function_template_partial_spec) + << SourceRange(TemplateId->LAngleLoc, TemplateId->RAngleLoc); + NewFD->setInvalidDecl(); } // If we're adding a template to a dependent context, we may need to @@ -9973,6 +9978,11 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC, << FixItHint::CreateRemoval(RemoveRange) << FixItHint::CreateInsertion(InsertLoc, "<>"); Invalid = true; + + // Recover by faking up an empty template argument list. + HasExplicitTemplateArgs = true; + TemplateArgs.setLAngleLoc(InsertLoc); + TemplateArgs.setRAngleLoc(InsertLoc); } } } else { @@ -9986,6 +9996,33 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC, if (TemplateParamLists.size() > 0) // For source fidelity, store all the template param lists. NewFD->setTemplateParameterListsInfo(Context, TemplateParamLists); + + // "friend void foo<>(int);" is an implicit specialization decl. + if (isFriend && TemplateId) + isFunctionTemplateSpecialization = true; + } + + // If this is a function template specialization and the unqualified-id of + // the declarator-id is a template-id, convert the template argument list + // into our AST format and check for unexpanded packs. + if (isFunctionTemplateSpecialization && TemplateId) { + HasExplicitTemplateArgs = true; + + TemplateArgs.setLAngleLoc(TemplateId->LAngleLoc); + TemplateArgs.setRAngleLoc(TemplateId->RAngleLoc); + ASTTemplateArgsPtr TemplateArgsPtr(TemplateId->getTemplateArgs(), + TemplateId->NumArgs); + translateTemplateArguments(TemplateArgsPtr, TemplateArgs); + + // FIXME: Should we check for unexpanded packs if this was an (invalid) + // declaration of a function template partial specialization? Should we + // consider the unexpanded pack context to be a partial specialization? + for (const TemplateArgumentLoc &ArgLoc : TemplateArgs.arguments()) { + if (DiagnoseUnexpandedParameterPack( + ArgLoc, isFriend ? UPPC_FriendDeclaration + : UPPC_ExplicitSpecialization)) + NewFD->setInvalidDecl(); + } } if (Invalid) { @@ -10438,46 +10475,6 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC, diag::ext_operator_new_delete_declared_inline) << NewFD->getDeclName(); - // If the declarator is a template-id, translate the parser's template - // argument list into our AST format. 
- if (D.getName().getKind() == UnqualifiedIdKind::IK_TemplateId) { - TemplateIdAnnotation *TemplateId = D.getName().TemplateId; - TemplateArgs.setLAngleLoc(TemplateId->LAngleLoc); - TemplateArgs.setRAngleLoc(TemplateId->RAngleLoc); - ASTTemplateArgsPtr TemplateArgsPtr(TemplateId->getTemplateArgs(), - TemplateId->NumArgs); - translateTemplateArguments(TemplateArgsPtr, - TemplateArgs); - - HasExplicitTemplateArgs = true; - - if (NewFD->isInvalidDecl()) { - HasExplicitTemplateArgs = false; - } else if (FunctionTemplate) { - // Function template with explicit template arguments. - Diag(D.getIdentifierLoc(), diag::err_function_template_partial_spec) - << SourceRange(TemplateId->LAngleLoc, TemplateId->RAngleLoc); - - HasExplicitTemplateArgs = false; - } else if (isFriend) { - // "friend void foo<>(int);" is an implicit specialization decl. - isFunctionTemplateSpecialization = true; - } else { - assert(isFunctionTemplateSpecialization && - "should have a 'template<>' for this decl"); - } - } else if (isFriend && isFunctionTemplateSpecialization) { - // This combination is only possible in a recovery case; the user - // wrote something like: - // template <> friend void foo(int); - // which we're recovering from as if the user had written: - // friend void foo<>(int); - // Go ahead and fake up a template id. - HasExplicitTemplateArgs = true; - TemplateArgs.setLAngleLoc(D.getIdentifierLoc()); - TemplateArgs.setRAngleLoc(D.getIdentifierLoc()); - } - // We do not add HD attributes to specializations here because // they may have different constexpr-ness compared to their // templates and, after maybeAddCUDAHostDeviceAttrs() is applied, diff --git a/clang/test/CXX/temp/temp.decls/temp.variadic/p5.cpp b/clang/test/CXX/temp/temp.decls/temp.variadic/p5.cpp index 30ce6b40e1fb5..3c500c2c4dc4a 100644 --- a/clang/test/CXX/temp/temp.decls/temp.variadic/p5.cpp +++ b/clang/test/CXX/temp/temp.decls/temp.variadic/p5.cpp @@ -376,6 +376,11 @@ namespace Specializations { template struct PrimaryClass; // expected-error{{partial specialization contains unexpanded parameter pack 'Ts'}} + template + void PrimaryFunction(); + template + void PrimaryFunction(); // expected-error{{function template partial specialization is not allowed}} + #if __cplusplus >= 201402L template constexpr int PrimaryVar = 0; @@ -392,6 +397,13 @@ namespace Specializations { template struct InnerClass; // expected-error{{partial specialization contains unexpanded parameter pack 'Ts'}} + template + void InnerFunction(); + template<> + void InnerFunction(); // expected-error{{explicit specialization contains unexpanded parameter pack 'Ts'}} + + friend void PrimaryFunction(); // expected-error{{friend declaration contains unexpanded parameter pack 'Ts'}} + #if __cplusplus >= 201402L template constexpr static int InnerVar = 0; From 569ec185f5dc4a9e4a239948191977ecc2b2b475 Mon Sep 17 00:00:00 2001 From: Dmitry Vasilyev Date: Thu, 4 Jan 2024 20:42:51 +0400 Subject: [PATCH 247/313] [llvm-cxxfilt] Added the option --no-params (#75348) Added -p / --no-params flag to skip demangling function parameters similar to how it is supported by GNU c++filt tool. There are cases when users want to demangle a large number of symbols in bulk, for example, at startup, and do not care about function parameters and overloads at that time. Skipping the demangling of parameter types led to a measurable improvement in performance. Our users reported about 15% speed up with GNU c++filt and we expect similar results with llvm-cxxfilt with this patch. 
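
The same behavior is also reachable through the demangling library; the program below is a minimal sketch (the sample mangled name _Z3fooif and the printing are illustrative only), using the ParseParams argument that this patch adds to itaniumDemangle:

  // Sketch: demangle a symbol while skipping its parameter list.
  #include "llvm/Demangle/Demangle.h"
  #include <cstdio>
  #include <cstdlib>

  int main() {
    // With ParseParams = false, "_Z3fooif" comes back as just "foo"
    // instead of "foo(int, float)".
    if (char *Demangled = llvm::itaniumDemangle("_Z3fooif",
                                                /*ParseParams=*/false)) {
      std::puts(Demangled);
      std::free(Demangled); // the result is heap-allocated by the demangler
    }
    return 0;
  }

Invoking the tool as llvm-cxxfilt --no-params _Z3fooif should print the same shortened form.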
--- libcxxabi/src/demangle/ItaniumDemangle.h | 22 +++++++++---- llvm/docs/CommandGuide/llvm-cxxfilt.rst | 4 +++ llvm/include/llvm/Demangle/Demangle.h | 5 +-- llvm/include/llvm/Demangle/ItaniumDemangle.h | 22 +++++++++---- llvm/lib/Demangle/Demangle.cpp | 5 +-- llvm/lib/Demangle/ItaniumDemangle.cpp | 4 +-- llvm/test/tools/llvm-cxxfilt/no-params.test | 34 ++++++++++++++++++++ llvm/tools/llvm-cxxfilt/Opts.td | 2 ++ llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp | 10 ++++-- 9 files changed, 87 insertions(+), 21 deletions(-) create mode 100644 llvm/test/tools/llvm-cxxfilt/no-params.test diff --git a/libcxxabi/src/demangle/ItaniumDemangle.h b/libcxxabi/src/demangle/ItaniumDemangle.h index 90f25519fad1c..5a53a18bcc5fe 100644 --- a/libcxxabi/src/demangle/ItaniumDemangle.h +++ b/libcxxabi/src/demangle/ItaniumDemangle.h @@ -2794,7 +2794,7 @@ template struct AbstractManglingParser { Node *parseClassEnumType(); Node *parseQualifiedType(); - Node *parseEncoding(); + Node *parseEncoding(bool ParseParams = true); bool parseCallOffset(); Node *parseSpecialName(); @@ -2911,7 +2911,7 @@ template struct AbstractManglingParser { Node *parseDestructorName(); /// Top-level entry point into the parser. - Node *parse(); + Node *parse(bool ParseParams = true); }; const char* parse_discriminator(const char* first, const char* last); @@ -5405,7 +5405,7 @@ Node *AbstractManglingParser::parseSpecialName() { // ::= // ::= template -Node *AbstractManglingParser::parseEncoding() { +Node *AbstractManglingParser::parseEncoding(bool ParseParams) { // The template parameters of an encoding are unrelated to those of the // enclosing context. SaveTemplateParams SaveTemplateParamsScope(this); @@ -5431,6 +5431,16 @@ Node *AbstractManglingParser::parseEncoding() { if (IsEndOfEncoding()) return Name; + // ParseParams may be false at the top level only, when called from parse(). + // For example in the mangled name _Z3fooILZ3BarEET_f, ParseParams may be + // false when demangling 3fooILZ3BarEET_f but is always true when demangling + // 3Bar. + if (!ParseParams) { + while (consume()) + ; + return Name; + } + Node *Attrs = nullptr; if (consumeIf("Ua9enable_ifI")) { size_t BeforeArgs = Names.size(); @@ -5895,9 +5905,9 @@ AbstractManglingParser::parseTemplateArgs(bool TagTemplates) { // extension ::= ___Z _block_invoke+ // extension ::= ___Z _block_invoke_+ template -Node *AbstractManglingParser::parse() { +Node *AbstractManglingParser::parse(bool ParseParams) { if (consumeIf("_Z") || consumeIf("__Z")) { - Node *Encoding = getDerived().parseEncoding(); + Node *Encoding = getDerived().parseEncoding(ParseParams); if (Encoding == nullptr) return nullptr; if (look() == '.') { @@ -5911,7 +5921,7 @@ Node *AbstractManglingParser::parse() { } if (consumeIf("___Z") || consumeIf("____Z")) { - Node *Encoding = getDerived().parseEncoding(); + Node *Encoding = getDerived().parseEncoding(ParseParams); if (Encoding == nullptr || !consumeIf("_block_invoke")) return nullptr; bool RequireNumber = consumeIf('_'); diff --git a/llvm/docs/CommandGuide/llvm-cxxfilt.rst b/llvm/docs/CommandGuide/llvm-cxxfilt.rst index 3f7deb1719511..0933f0b5bed87 100644 --- a/llvm/docs/CommandGuide/llvm-cxxfilt.rst +++ b/llvm/docs/CommandGuide/llvm-cxxfilt.rst @@ -48,6 +48,10 @@ OPTIONS Print a summary of command line options. +.. option:: --no-params, -p + + Do not demangle function parameters or return types. + .. option:: --no-strip-underscore, -n Do not strip a leading underscore. 
This is the default for all platforms diff --git a/llvm/include/llvm/Demangle/Demangle.h b/llvm/include/llvm/Demangle/Demangle.h index 70cfc1418f0c7..fe129603c0785 100644 --- a/llvm/include/llvm/Demangle/Demangle.h +++ b/llvm/include/llvm/Demangle/Demangle.h @@ -32,7 +32,7 @@ enum : int { /// Returns a non-NULL pointer to a NUL-terminated C style string /// that should be explicitly freed, if successful. Otherwise, may return /// nullptr if mangled_name is not a valid mangling or is nullptr. -char *itaniumDemangle(std::string_view mangled_name); +char *itaniumDemangle(std::string_view mangled_name, bool ParseParams = true); enum MSDemangleFlags { MSDF_None = 0, @@ -68,7 +68,8 @@ char *dlangDemangle(std::string_view MangledName); std::string demangle(std::string_view MangledName); bool nonMicrosoftDemangle(std::string_view MangledName, std::string &Result, - bool CanHaveLeadingDot = true); + bool CanHaveLeadingDot = true, + bool ParseParams = true); /// "Partial" demangler. This supports demangling a string into an AST /// (typically an intermediate stage in itaniumDemangle) and querying certain diff --git a/llvm/include/llvm/Demangle/ItaniumDemangle.h b/llvm/include/llvm/Demangle/ItaniumDemangle.h index e0ff035d47cfb..06956f47c1f0b 100644 --- a/llvm/include/llvm/Demangle/ItaniumDemangle.h +++ b/llvm/include/llvm/Demangle/ItaniumDemangle.h @@ -2793,7 +2793,7 @@ template struct AbstractManglingParser { Node *parseClassEnumType(); Node *parseQualifiedType(); - Node *parseEncoding(); + Node *parseEncoding(bool ParseParams = true); bool parseCallOffset(); Node *parseSpecialName(); @@ -2910,7 +2910,7 @@ template struct AbstractManglingParser { Node *parseDestructorName(); /// Top-level entry point into the parser. - Node *parse(); + Node *parse(bool ParseParams = true); }; const char* parse_discriminator(const char* first, const char* last); @@ -5404,7 +5404,7 @@ Node *AbstractManglingParser::parseSpecialName() { // ::= // ::= template -Node *AbstractManglingParser::parseEncoding() { +Node *AbstractManglingParser::parseEncoding(bool ParseParams) { // The template parameters of an encoding are unrelated to those of the // enclosing context. SaveTemplateParams SaveTemplateParamsScope(this); @@ -5430,6 +5430,16 @@ Node *AbstractManglingParser::parseEncoding() { if (IsEndOfEncoding()) return Name; + // ParseParams may be false at the top level only, when called from parse(). + // For example in the mangled name _Z3fooILZ3BarEET_f, ParseParams may be + // false when demangling 3fooILZ3BarEET_f but is always true when demangling + // 3Bar. 
+ if (!ParseParams) { + while (consume()) + ; + return Name; + } + Node *Attrs = nullptr; if (consumeIf("Ua9enable_ifI")) { size_t BeforeArgs = Names.size(); @@ -5894,9 +5904,9 @@ AbstractManglingParser::parseTemplateArgs(bool TagTemplates) { // extension ::= ___Z _block_invoke+ // extension ::= ___Z _block_invoke_+ template -Node *AbstractManglingParser::parse() { +Node *AbstractManglingParser::parse(bool ParseParams) { if (consumeIf("_Z") || consumeIf("__Z")) { - Node *Encoding = getDerived().parseEncoding(); + Node *Encoding = getDerived().parseEncoding(ParseParams); if (Encoding == nullptr) return nullptr; if (look() == '.') { @@ -5910,7 +5920,7 @@ Node *AbstractManglingParser::parse() { } if (consumeIf("___Z") || consumeIf("____Z")) { - Node *Encoding = getDerived().parseEncoding(); + Node *Encoding = getDerived().parseEncoding(ParseParams); if (Encoding == nullptr || !consumeIf("_block_invoke")) return nullptr; bool RequireNumber = consumeIf('_'); diff --git a/llvm/lib/Demangle/Demangle.cpp b/llvm/lib/Demangle/Demangle.cpp index 83f3cdc88c01e..117b849d1c784 100644 --- a/llvm/lib/Demangle/Demangle.cpp +++ b/llvm/lib/Demangle/Demangle.cpp @@ -47,7 +47,8 @@ static bool isRustEncoding(std::string_view S) { return starts_with(S, "_R"); } static bool isDLangEncoding(std::string_view S) { return starts_with(S, "_D"); } bool llvm::nonMicrosoftDemangle(std::string_view MangledName, - std::string &Result, bool CanHaveLeadingDot) { + std::string &Result, bool CanHaveLeadingDot, + bool ParseParams) { char *Demangled = nullptr; // Do not consider the dot prefix as part of the demangled symbol name. @@ -57,7 +58,7 @@ bool llvm::nonMicrosoftDemangle(std::string_view MangledName, } if (isItaniumEncoding(MangledName)) - Demangled = itaniumDemangle(MangledName); + Demangled = itaniumDemangle(MangledName, ParseParams); else if (isRustEncoding(MangledName)) Demangled = rustDemangle(MangledName); else if (isDLangEncoding(MangledName)) diff --git a/llvm/lib/Demangle/ItaniumDemangle.cpp b/llvm/lib/Demangle/ItaniumDemangle.cpp index e3f208f0adf8d..5c21b06a1d095 100644 --- a/llvm/lib/Demangle/ItaniumDemangle.cpp +++ b/llvm/lib/Demangle/ItaniumDemangle.cpp @@ -366,13 +366,13 @@ class DefaultAllocator { using Demangler = itanium_demangle::ManglingParser; -char *llvm::itaniumDemangle(std::string_view MangledName) { +char *llvm::itaniumDemangle(std::string_view MangledName, bool ParseParams) { if (MangledName.empty()) return nullptr; Demangler Parser(MangledName.data(), MangledName.data() + MangledName.length()); - Node *AST = Parser.parse(); + Node *AST = Parser.parse(ParseParams); if (!AST) return nullptr; diff --git a/llvm/test/tools/llvm-cxxfilt/no-params.test b/llvm/test/tools/llvm-cxxfilt/no-params.test new file mode 100644 index 0000000000000..cf1eac27aa02d --- /dev/null +++ b/llvm/test/tools/llvm-cxxfilt/no-params.test @@ -0,0 +1,34 @@ +RUN: llvm-cxxfilt -n _Z3fooILZ3BarEET_f _Z3fooIPFcfEET_d _ZN1f2baC2ERKNS_2baIT_EE _Z3foov.123 | FileCheck %s --check-prefix=CHECK-PARAMS +RUN: llvm-cxxfilt -p -n _Z3fooILZ3BarEET_f _Z3fooIPFcfEET_d _ZN1f2baC2ERKNS_2baIT_EE _Z3foov.123 | FileCheck %s --check-prefix=CHECK-NO-PARAMS --match-full-lines +RUN: llvm-cxxfilt --no-params -n _Z3fooILZ3BarEET_f _Z3fooIPFcfEET_d _ZN1f2baC2ERKNS_2baIT_EE _Z3foov.123 | FileCheck %s --check-prefix=CHECK-NO-PARAMS --match-full-lines + +# Check that -p or --no-params flag omits function parameters and the return +# type. 
+ +CHECK-PARAMS: Bar foo(float) +CHECK-NO-PARAMS: foo + +# Check that only the top-level function is impacted by the switch, and that +# nested function types in the encoding (e.g. where a function type is being +# used as a template parameter) still include their parameters. +# +# template T foo(double); +# typedef char (*F)(float); +# F foo(double) + +CHECK-PARAMS: char (*foo(double))(float) +CHECK-NO-PARAMS: foo + +# Use an invalid mangled name broken in the function parameters to check how -p +# or --no-params flag works. If the option is given we should be able to +# demangle the function name just fine. If it is not given, demangling will fail +# because of the invalid params. + +CHECK-PARAMS: _ZN1f2baC2ERKNS_2baIT_EE +CHECK-NO-PARAMS: f::ba::ba + +# Check that a vendor specific suffix is also omitted when --no-params is +# specified. This matches c++filt's behaviour. + +CHECK-PARAMS: foo() (.123) +CHECK-NO-PARAMS: foo diff --git a/llvm/tools/llvm-cxxfilt/Opts.td b/llvm/tools/llvm-cxxfilt/Opts.td index f652a1a7f88bb..034cb267aab80 100644 --- a/llvm/tools/llvm-cxxfilt/Opts.td +++ b/llvm/tools/llvm-cxxfilt/Opts.td @@ -17,6 +17,7 @@ multiclass Eq { def help : FF<"help", "Display this help">; defm strip_underscore : BB<"strip-underscore", "Strip the leading underscore", "Don't strip the leading underscore">; def types : FF<"types", "Attempt to demangle types as well as function names">; +def no_params : FF<"no-params", "Skip function parameters and return types">; def version : FF<"version", "Display the version">; defm : Eq<"format", "Specify mangling format. Currently ignored because only 'gnu' is supported">; @@ -25,4 +26,5 @@ def : F<"s", "Alias for --format">; def : F<"_", "Alias for --strip-underscore">, Alias; def : F<"h", "Alias for --help">, Alias; def : F<"n", "Alias for --no-strip-underscore">, Alias; +def : F<"p", "Alias for --no-params">, Alias; def : F<"t", "Alias for --types">, Alias; diff --git a/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp b/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp index 4b9d88a650666..26a1f2f4afebd 100644 --- a/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp +++ b/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp @@ -54,6 +54,7 @@ class CxxfiltOptTable : public opt::GenericOptTable { }; } // namespace +static bool ParseParams; static bool StripUnderscore; static bool Types; @@ -74,18 +75,19 @@ static std::string demangle(const std::string &Mangled) { } std::string Result; - if (nonMicrosoftDemangle(DecoratedStr, Result, CanHaveLeadingDot)) + if (nonMicrosoftDemangle(DecoratedStr, Result, CanHaveLeadingDot, + ParseParams)) return Result; std::string Prefix; char *Undecorated = nullptr; if (Types) - Undecorated = itaniumDemangle(DecoratedStr); + Undecorated = itaniumDemangle(DecoratedStr, ParseParams); if (!Undecorated && starts_with(DecoratedStr, "__imp_")) { Prefix = "import thunk for "; - Undecorated = itaniumDemangle(DecoratedStr.substr(6)); + Undecorated = itaniumDemangle(DecoratedStr.substr(6), ParseParams); } Result = Undecorated ? 
Prefix + Undecorated : Mangled; @@ -173,6 +175,8 @@ int llvm_cxxfilt_main(int argc, char **argv, const llvm::ToolContext &) { else StripUnderscore = Triple(sys::getProcessTriple()).isOSBinFormatMachO(); + ParseParams = !Args.hasArg(OPT_no_params); + Types = Args.hasArg(OPT_types); std::vector Decorated = Args.getAllArgValues(OPT_INPUT); From 8f8152091cae186b35f73331df13f36b0f905eb4 Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Thu, 4 Jan 2024 16:50:31 +0000 Subject: [PATCH 248/313] [Clang][SME] Add IsStreamingOrSVE2p1 (#75958) This patch adds IsStreamingOrSVE2p1 to the applicable builtins and a warning for when those builtins are not used in a streaming or sve2p1 function. --- clang/include/clang/Basic/arm_sve.td | 123 ++++++++-------- clang/include/clang/Basic/arm_sve_sme_incl.td | 1 + clang/lib/Basic/Targets/AArch64.h | 1 + clang/lib/Sema/SemaChecking.cpp | 17 ++- .../aarch64-sve2-intrinsics/acle_sve2_revd.c | 4 +- .../acle_sve2p1_bfmlsl.c | 8 +- .../acle_sve2p1_cntp.c | 27 ++-- .../acle_sve2p1_fclamp.c | 14 +- .../acle_sve2p1_ld1.c | 2 + .../acle_sve2p1_pext.c | 5 +- .../acle_sve2p1_pfalse.c | 14 +- .../acle_sve2p1_psel.c | 26 +++- .../acle_sve2p1_sclamp.c | 20 ++- .../acle_sve2p1_stnt1.c | 2 - .../acle_sve2p1_uclamp.c | 20 ++- .../acle_sve2p1_while_pn.c | 137 +++++++++--------- .../aarch64-sme2-intrinsics/acle_sme2_imm.cpp | 2 +- .../Sema/aarch64-sme2-sve2p1-diagnostics.c | 37 +++++ clang/utils/TableGen/SveEmitter.cpp | 3 + 19 files changed, 289 insertions(+), 174 deletions(-) create mode 100644 clang/test/Sema/aarch64-sme2-sve2p1-diagnostics.c diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index 91f62c4c76339..17ee82d9304ff 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -1296,9 +1296,9 @@ def SVCREATE_3_BF16 : SInst<"svcreate3[_{d}]", "3ddd", "b", MergeNone, "", [IsT def SVCREATE_4_BF16 : SInst<"svcreate4[_{d}]", "4dddd", "b", MergeNone, "", [IsTupleCreate]>; } -let TargetGuard = "sve2p1" in { - def SVCREATE_2_B : SInst<"svcreate2[_{d}]", "2dd", "Pc", MergeNone, "", [IsTupleCreate]>; - def SVCREATE_4_B : SInst<"svcreate4[_{d}]", "4dddd", "Pc", MergeNone, "", [IsTupleCreate]>; +let TargetGuard = "sve2p1|sme2" in { + def SVCREATE_2_B : SInst<"svcreate2[_{d}]", "2dd", "Pc", MergeNone, "", [IsTupleCreate, IsStreamingCompatible]>; + def SVCREATE_4_B : SInst<"svcreate4[_{d}]", "4dddd", "Pc", MergeNone, "", [IsTupleCreate, IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// @@ -1321,7 +1321,7 @@ def SVSET_3_BF16 : SInst<"svset3[_{d}]", "33id", "b", MergeNone, "", [IsTupleSet def SVSET_4_BF16 : SInst<"svset4[_{d}]", "44id", "b", MergeNone, "", [IsTupleSet], [ImmCheck<1, ImmCheck0_3>]>; } -let TargetGuard = "sve2p1" in { +let TargetGuard = "sve2p1|sme2" in { def SVGET_2_B : SInst<"svget2[_{d}]", "d2i", "Pc", MergeNone, "", [IsTupleGet], [ImmCheck<1, ImmCheck0_1>]>; def SVGET_4_B : SInst<"svget4[_{d}]", "d4i", "Pc", MergeNone, "", [IsTupleGet], [ImmCheck<1, ImmCheck0_3>]>; @@ -1976,39 +1976,37 @@ def SVFMINQV: SInst<"svminqv[_{d}]", "{Pd", "hfd", MergeNone, "aarch64_sve_fminq } let TargetGuard = "sve2p1|sme2" in { -//FIXME: Replace IsStreamingCompatible with IsStreamingOrHasSVE2p1 when available -def SVPEXT_SINGLE : SInst<"svpext_lane_{d}", "P}i", "QcQsQiQl", MergeNone, "aarch64_sve_pext", [IsStreamingCompatible], [ImmCheck<1, ImmCheck0_3>]>; -def SVPEXT_X2 : SInst<"svpext_lane_{d}_x2", "2.P}i", "QcQsQiQl", MergeNone, "aarch64_sve_pext_x2", 
[IsStreamingCompatible], [ImmCheck<1, ImmCheck0_1>]>; +def SVPEXT_SINGLE : SInst<"svpext_lane_{d}", "P}i", "QcQsQiQl", MergeNone, "aarch64_sve_pext", [IsStreamingOrSVE2p1], [ImmCheck<1, ImmCheck0_3>]>; +def SVPEXT_X2 : SInst<"svpext_lane_{d}_x2", "2.P}i", "QcQsQiQl", MergeNone, "aarch64_sve_pext_x2", [IsStreamingOrSVE2p1], [ImmCheck<1, ImmCheck0_1>]>; -def SVWHILEGE_COUNT : SInst<"svwhilege_{d}[_{1}]", "}lli", "QcQsQiQl", MergeNone, "aarch64_sve_whilege_{d}", [IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; -def SVWHILEGT_COUNT : SInst<"svwhilegt_{d}[_{1}]", "}lli", "QcQsQiQl", MergeNone, "aarch64_sve_whilegt_{d}", [IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; -def SVWHILELE_COUNT : SInst<"svwhilele_{d}[_{1}]", "}lli", "QcQsQiQl", MergeNone, "aarch64_sve_whilele_{d}", [IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; -def SVWHILELT_COUNT : SInst<"svwhilelt_{d}[_{1}]", "}lli", "QcQsQiQl", MergeNone, "aarch64_sve_whilelt_{d}", [IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; -def SVWHILELO_COUNT : SInst<"svwhilelt_{d}[_{1}]", "}nni", "QcQsQiQl", MergeNone, "aarch64_sve_whilelo_{d}", [IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; -def SVWHILELS_COUNT : SInst<"svwhilele_{d}[_{1}]", "}nni", "QcQsQiQl", MergeNone, "aarch64_sve_whilels_{d}", [IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; -def SVWHILEHI_COUNT : SInst<"svwhilegt_{d}[_{1}]", "}nni", "QcQsQiQl", MergeNone, "aarch64_sve_whilehi_{d}", [IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; -def SVWHILEHS_COUNT : SInst<"svwhilege_{d}[_{1}]", "}nni", "QcQsQiQl", MergeNone, "aarch64_sve_whilehs_{d}", [IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILEGE_COUNT : SInst<"svwhilege_{d}[_{1}]", "}lli", "QcQsQiQl", MergeNone, "aarch64_sve_whilege_{d}", [IsOverloadNone, IsStreamingOrSVE2p1], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILEGT_COUNT : SInst<"svwhilegt_{d}[_{1}]", "}lli", "QcQsQiQl", MergeNone, "aarch64_sve_whilegt_{d}", [IsOverloadNone, IsStreamingOrSVE2p1], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILELE_COUNT : SInst<"svwhilele_{d}[_{1}]", "}lli", "QcQsQiQl", MergeNone, "aarch64_sve_whilele_{d}", [IsOverloadNone, IsStreamingOrSVE2p1], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILELT_COUNT : SInst<"svwhilelt_{d}[_{1}]", "}lli", "QcQsQiQl", MergeNone, "aarch64_sve_whilelt_{d}", [IsOverloadNone, IsStreamingOrSVE2p1], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILELO_COUNT : SInst<"svwhilelt_{d}[_{1}]", "}nni", "QcQsQiQl", MergeNone, "aarch64_sve_whilelo_{d}", [IsOverloadNone, IsStreamingOrSVE2p1], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILELS_COUNT : SInst<"svwhilele_{d}[_{1}]", "}nni", "QcQsQiQl", MergeNone, "aarch64_sve_whilels_{d}", [IsOverloadNone, IsStreamingOrSVE2p1], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILEHI_COUNT : SInst<"svwhilegt_{d}[_{1}]", "}nni", "QcQsQiQl", MergeNone, "aarch64_sve_whilehi_{d}", [IsOverloadNone, IsStreamingOrSVE2p1], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILEHS_COUNT : SInst<"svwhilege_{d}[_{1}]", "}nni", "QcQsQiQl", MergeNone, "aarch64_sve_whilehs_{d}", [IsOverloadNone, IsStreamingOrSVE2p1], [ImmCheck<2, ImmCheck2_4_Mul2>]>; } multiclass MultiVecLoad { - // FIXME: Replace IsStreamingCompatible with IsStreamingOrHasSVE2p1 when available (SME2 requires __arm_streaming) - def SV # NAME # B_X2 : MInst<"sv" # i # "[_{2}]_x2", "2}c", "cUc", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # H_X2 : MInst<"sv" # i # "[_{2}]_x2", "2}c", "sUshb", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, 
"aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # W_X2 : MInst<"sv" # i # "[_{2}]_x2", "2}c", "iUif", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # D_X2 : MInst<"sv" # i # "[_{2}]_x2", "2}c", "lUld", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # B_X4 : MInst<"sv" # i # "[_{2}]_x4", "4}c", "cUc", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; - def SV # NAME # H_X4 : MInst<"sv" # i # "[_{2}]_x4", "4}c", "sUshb", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; - def SV # NAME # W_X4 : MInst<"sv" # i # "[_{2}]_x4", "4}c", "iUif", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; - def SV # NAME # D_X4 : MInst<"sv" # i # "[_{2}]_x4", "4}c", "lUld", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; - - def SV # NAME # B_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}]_x2", "2}cl", "cUc", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # H_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}]_x2", "2}cl", "sUshb", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # W_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}]_x2", "2}cl", "iUif", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # D_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}]_x2", "2}cl", "lUld", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # B_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}]_x4", "4}cl", "cUc", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; - def SV # NAME # H_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}]_x4", "4}cl", "sUshb", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; - def SV # NAME # W_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}]_x4", "4}cl", "iUif", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; - def SV # NAME # D_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}]_x4", "4}cl", "lUld", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # B_X2 : MInst<"sv" # i # "[_{2}]_x2", "2}c", "cUc", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # H_X2 : MInst<"sv" # i # "[_{2}]_x2", "2}c", "sUshb", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # W_X2 : MInst<"sv" # i # "[_{2}]_x2", "2}c", "iUif", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # D_X2 : MInst<"sv" # i # "[_{2}]_x2", "2}c", "lUld", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # B_X4 : MInst<"sv" # i # "[_{2}]_x4", "4}c", "cUc", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # H_X4 : MInst<"sv" # i # "[_{2}]_x4", "4}c", "sUshb", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # W_X4 : MInst<"sv" # i # "[_{2}]_x4", "4}c", "iUif", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # D_X4 : MInst<"sv" # i # "[_{2}]_x4", "4}c", "lUld", [IsStructLoad, 
IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + + def SV # NAME # B_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}]_x2", "2}cl", "cUc", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # H_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}]_x2", "2}cl", "sUshb", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # W_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}]_x2", "2}cl", "iUif", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # D_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}]_x2", "2}cl", "lUld", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # B_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}]_x4", "4}cl", "cUc", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # H_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}]_x4", "4}cl", "sUshb", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # W_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}]_x4", "4}cl", "iUif", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # D_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}]_x4", "4}cl", "lUld", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; } let TargetGuard = "sve2p1|sme2" in { @@ -2017,24 +2015,23 @@ let TargetGuard = "sve2p1|sme2" in { } multiclass MultiVecStore { - // FIXME: Replace IsStreamingCompatible with IsStreamingOrHasSVE2p1 when available (SME2 requires __arm_streaming) - def SV # NAME # B_X2 : MInst<"sv" # i # "[_{2}_x2]", "v}p2", "cUc", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # H_X2 : MInst<"sv" # i # "[_{2}_x2]", "v}p2", "sUshb", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # W_X2 : MInst<"sv" # i # "[_{2}_x2]", "v}p2", "iUif", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # D_X2 : MInst<"sv" # i # "[_{2}_x2]", "v}p2", "lUld", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # B_X4 : MInst<"sv" # i # "[_{2}_x4]", "v}p4", "cUc", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; - def SV # NAME # H_X4 : MInst<"sv" # i # "[_{2}_x4]", "v}p4", "sUshb", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; - def SV # NAME # W_X4 : MInst<"sv" # i # "[_{2}_x4]", "v}p4", "iUif", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; - def SV # NAME # D_X4 : MInst<"sv" # i # "[_{2}_x4]", "v}p4", "lUld", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; - - def SV # NAME # B_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}_x2]", "v}pl2", "cUc", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # H_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}_x2]", "v}pl2", "sUshb", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # W_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}_x2]", "v}pl2", "iUif", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # D_VNUM_X2 : MInst<"sv" # i # 
"_vnum" # "[_{2}_x2]", "v}pl2", "lUld", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # B_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}_x4]", "v}pl4", "cUc", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; - def SV # NAME # H_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}_x4]", "v}pl4", "sUshb", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; - def SV # NAME # W_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}_x4]", "v}pl4", "iUif", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; - def SV # NAME # D_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}_x4]", "v}pl4", "lUld", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # B_X2 : MInst<"sv" # i # "[_{2}_x2]", "v}p2", "cUc", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # H_X2 : MInst<"sv" # i # "[_{2}_x2]", "v}p2", "sUshb", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # W_X2 : MInst<"sv" # i # "[_{2}_x2]", "v}p2", "iUif", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # D_X2 : MInst<"sv" # i # "[_{2}_x2]", "v}p2", "lUld", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # B_X4 : MInst<"sv" # i # "[_{2}_x4]", "v}p4", "cUc", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # H_X4 : MInst<"sv" # i # "[_{2}_x4]", "v}p4", "sUshb", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # W_X4 : MInst<"sv" # i # "[_{2}_x4]", "v}p4", "iUif", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # D_X4 : MInst<"sv" # i # "[_{2}_x4]", "v}p4", "lUld", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + + def SV # NAME # B_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}_x2]", "v}pl2", "cUc", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # H_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}_x2]", "v}pl2", "sUshb", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # W_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}_x2]", "v}pl2", "iUif", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # D_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}_x2]", "v}pl2", "lUld", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # B_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}_x4]", "v}pl4", "cUc", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # H_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}_x4]", "v}pl4", "sUshb", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # W_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}_x4]", "v}pl4", "iUif", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # D_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}_x4]", "v}pl4", "lUld", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; } let TargetGuard = "sve2p1|sme2" in { @@ -2051,21 +2048,20 @@ 
def SVDOT_LANE_X2_U : SInst<"svdot_lane[_{d}_{2}_{3}]", "ddhhi", "Ui", MergeNone def SVDOT_LANE_X2_F : SInst<"svdot_lane[_{d}_{2}_{3}]", "ddhhi", "f", MergeNone, "aarch64_sve_fdot_lane_x2", [], [ImmCheck<3, ImmCheck0_3>]>; } -let TargetGuard = "sve2p1|sme" in { -def SVSCLAMP : SInst<"svclamp[_{d}]", "dddd", "csil", MergeNone, "aarch64_sve_sclamp", [], []>; -def SVUCLAMP : SInst<"svclamp[_{d}]", "dddd", "UcUsUiUl", MergeNone, "aarch64_sve_uclamp", [], []>; +let TargetGuard = "sve2p1|sme2" in { +def SVSCLAMP : SInst<"svclamp[_{d}]", "dddd", "csil", MergeNone, "aarch64_sve_sclamp", [IsStreamingOrSVE2p1], []>; +def SVUCLAMP : SInst<"svclamp[_{d}]", "dddd", "UcUsUiUl", MergeNone, "aarch64_sve_uclamp", [IsStreamingOrSVE2p1], []>; defm SVREVD : SInstZPZ<"svrevd", "csilUcUsUiUlbhfd", "aarch64_sve_revd">; } let TargetGuard = "sve2p1|sme2" in { - //FIXME: Replace IsStreamingCompatible with IsStreamingOrHasSVE2p1 when available - def SVPTRUE_COUNT : SInst<"svptrue_{d}", "}v", "QcQsQiQl", MergeNone, "aarch64_sve_ptrue_{d}", [IsOverloadNone, IsStreamingCompatible], []>; + def SVPTRUE_COUNT : SInst<"svptrue_{d}", "}v", "QcQsQiQl", MergeNone, "aarch64_sve_ptrue_{d}", [IsOverloadNone, IsStreamingOrSVE2p1], []>; - def SVPFALSE_COUNT_ALIAS : SInst<"svpfalse_c", "}v", "", MergeNone, "", [IsOverloadNone, IsStreamingCompatible]>; + def SVPFALSE_COUNT_ALIAS : SInst<"svpfalse_c", "}v", "", MergeNone, "", [IsOverloadNone, IsStreamingOrSVE2p1]>; - def SVFCLAMP : SInst<"svclamp[_{d}]", "dddd", "hfd", MergeNone, "aarch64_sve_fclamp", [IsStreamingCompatible], []>; - def SVCNTP_COUNT : SInst<"svcntp_{d}", "n}i", "QcQsQiQl", MergeNone, "aarch64_sve_cntp_{d}", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<1, ImmCheck2_4_Mul2>]>; + def SVFCLAMP : SInst<"svclamp[_{d}]", "dddd", "hfd", MergeNone, "aarch64_sve_fclamp", [IsStreamingOrSVE2p1], []>; + def SVCNTP_COUNT : SInst<"svcntp_{d}", "n}i", "QcQsQiQl", MergeNone, "aarch64_sve_cntp_{d}", [IsOverloadNone, IsStreamingOrSVE2p1], [ImmCheck<1, ImmCheck2_4_Mul2>]>; } let TargetGuard = "(sve2|sme2),b16b16" in { @@ -2326,10 +2322,9 @@ let TargetGuard = "sme2" in { let TargetGuard = "sve2p1|sme2" in { // == BFloat16 multiply-subtract == -// FIXME: Make all of these IsStreamingOrSVE2p1 once that is added - def SVBFMLSLB : SInst<"svbfmlslb[_{d}]", "dd$$", "f", MergeNone, "aarch64_sve_bfmlslb", [IsOverloadNone, IsStreamingCompatible], []>; - def SVBFMLSLT : SInst<"svbfmlslt[_{d}]", "dd$$", "f", MergeNone, "aarch64_sve_bfmlslt", [IsOverloadNone, IsStreamingCompatible], []>; + def SVBFMLSLB : SInst<"svbfmlslb[_{d}]", "dd$$", "f", MergeNone, "aarch64_sve_bfmlslb", [IsOverloadNone, IsStreamingOrSVE2p1], []>; + def SVBFMLSLT : SInst<"svbfmlslt[_{d}]", "dd$$", "f", MergeNone, "aarch64_sve_bfmlslt", [IsOverloadNone, IsStreamingOrSVE2p1], []>; - def SVBFMLSLB_LANE : SInst<"svbfmlslb_lane[_{d}]", "dd$$i", "f", MergeNone, "aarch64_sve_bfmlslb_lane", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<3, ImmCheck0_7>]>; - def SVBFMLSLT_LANE : SInst<"svbfmlslt_lane[_{d}]", "dd$$i", "f", MergeNone, "aarch64_sve_bfmlslt_lane", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<3, ImmCheck0_7>]>; + def SVBFMLSLB_LANE : SInst<"svbfmlslb_lane[_{d}]", "dd$$i", "f", MergeNone, "aarch64_sve_bfmlslb_lane", [IsOverloadNone, IsStreamingOrSVE2p1], [ImmCheck<3, ImmCheck0_7>]>; + def SVBFMLSLT_LANE : SInst<"svbfmlslt_lane[_{d}]", "dd$$i", "f", MergeNone, "aarch64_sve_bfmlslt_lane", [IsOverloadNone, IsStreamingOrSVE2p1], [ImmCheck<3, ImmCheck0_7>]>; } diff --git 
a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td index 0dba8493bad2d..ad29864440c96 100644 --- a/clang/include/clang/Basic/arm_sve_sme_incl.td +++ b/clang/include/clang/Basic/arm_sve_sme_incl.td @@ -227,6 +227,7 @@ def IsPreservesZA : FlagType<0x10000000000>; def IsReadZA : FlagType<0x20000000000>; def IsWriteZA : FlagType<0x40000000000>; def IsReductionQV : FlagType<0x80000000000>; +def IsStreamingOrSVE2p1 : FlagType<0x80000000000>; // Use for intrinsics that are common between sme/sme2 and sve2p1. // These must be kept in sync with the flags in include/clang/Basic/TargetBuiltins.h class ImmCheckType { diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h index f0e0782e7abe9..e38fa5af56598 100644 --- a/clang/lib/Basic/Targets/AArch64.h +++ b/clang/lib/Basic/Targets/AArch64.h @@ -50,6 +50,7 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo { bool HasMatMul = false; bool HasBFloat16 = false; bool HasSVE2 = false; + bool HasSVE2p1 = false; bool HasSVE2AES = false; bool HasSVE2SHA3 = false; bool HasSVE2SM4 = false; diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 3168d38dd66c3..f13164dc06386 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -2998,7 +2998,12 @@ static QualType getNeonEltType(NeonTypeFlags Flags, ASTContext &Context, llvm_unreachable("Invalid NeonTypeFlag!"); } -enum ArmStreamingType { ArmNonStreaming, ArmStreaming, ArmStreamingCompatible }; +enum ArmStreamingType { + ArmNonStreaming, + ArmStreaming, + ArmStreamingCompatible, + ArmStreamingOrSVE2p1 +}; bool Sema::ParseSVEImmChecks( CallExpr *TheCall, SmallVector, 3> &ImmChecks) { @@ -3156,6 +3161,16 @@ static void checkArmStreamingBuiltin(Sema &S, CallExpr *TheCall, const FunctionDecl *FD, ArmStreamingType BuiltinType) { ArmStreamingType FnType = getArmStreamingFnType(FD); + if (BuiltinType == ArmStreamingOrSVE2p1) { + // Check intrinsics that are available in [sve2p1 or sme/sme2]. 
+ llvm::StringMap CallerFeatureMap; + S.Context.getFunctionFeatureMap(CallerFeatureMap, FD); + if (Builtin::evaluateRequiredTargetFeatures("sve2p1", CallerFeatureMap)) + BuiltinType = ArmStreamingCompatible; + else + BuiltinType = ArmStreaming; + } + if (FnType == ArmStreaming && BuiltinType == ArmNonStreaming) { S.Diag(TheCall->getBeginLoc(), diag::warn_attribute_arm_sm_incompat_builtin) << TheCall->getSourceRange() << "streaming"; diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_revd.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_revd.c index 74a90583a173a..d82d69442b8ff 100644 --- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_revd.c +++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_revd.c @@ -1,7 +1,7 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \ -// RUN: -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: -target-feature +sme2 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu \ // RUN: -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \ @@ -9,7 +9,7 @@ // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu \ // RUN: -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmlsl.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmlsl.c index c1d14e16ad17b..22d951c069bc8 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmlsl.c +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmlsl.c @@ -2,20 +2,20 @@ // REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -DTEST_SME2 -disable-O0-optnone 
-Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -target-feature -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifndef TEST_SME2 #define ATTR #else -#define ATTR __arm_streaming_compatible +#define ATTR __arm_streaming #endif #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_cntp.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_cntp.c index 56b1d99262214..9bf55eaa6a08b 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_cntp.c +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_cntp.c @@ -3,10 +3,19 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s #include +#ifndef TEST_SME2 +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // CHECK-LABEL: @test_svcntp_c8_vlx2( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c8(target("aarch64.svcount") [[PNN:%.*]], i32 2) @@ -17,7 +26,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c8(target("aarch64.svcount") [[PNN:%.*]], i32 2) // CPP-CHECK-NEXT: ret i64 [[TMP0]] // -uint64_t test_svcntp_c8_vlx2(svcount_t pnn) { +uint64_t test_svcntp_c8_vlx2(svcount_t pnn) ATTR { return svcntp_c8(pnn, 2); } @@ -31,7 +40,7 @@ uint64_t 
test_svcntp_c8_vlx2(svcount_t pnn) { // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c8(target("aarch64.svcount") [[PNN:%.*]], i32 4) // CPP-CHECK-NEXT: ret i64 [[TMP0]] // -uint64_t test_svcntp_c8_vlx4(svcount_t pnn) { +uint64_t test_svcntp_c8_vlx4(svcount_t pnn) ATTR { return svcntp_c8(pnn, 4); } @@ -45,7 +54,7 @@ uint64_t test_svcntp_c8_vlx4(svcount_t pnn) { // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c16(target("aarch64.svcount") [[PNN:%.*]], i32 2) // CPP-CHECK-NEXT: ret i64 [[TMP0]] // -uint64_t test_svcntp_c16_vlx2(svcount_t pnn) { +uint64_t test_svcntp_c16_vlx2(svcount_t pnn) ATTR { return svcntp_c16(pnn, 2); } @@ -59,7 +68,7 @@ uint64_t test_svcntp_c16_vlx2(svcount_t pnn) { // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c16(target("aarch64.svcount") [[PNN:%.*]], i32 4) // CPP-CHECK-NEXT: ret i64 [[TMP0]] // -uint64_t test_svcntp_c16_vlx4(svcount_t pnn) { +uint64_t test_svcntp_c16_vlx4(svcount_t pnn) ATTR { return svcntp_c16(pnn, 4); } @@ -73,7 +82,7 @@ uint64_t test_svcntp_c16_vlx4(svcount_t pnn) { // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c32(target("aarch64.svcount") [[PNN:%.*]], i32 2) // CPP-CHECK-NEXT: ret i64 [[TMP0]] // -uint64_t test_svcntp_c32_vlx2(svcount_t pnn) { +uint64_t test_svcntp_c32_vlx2(svcount_t pnn) ATTR { return svcntp_c32(pnn, 2); } @@ -87,7 +96,7 @@ uint64_t test_svcntp_c32_vlx2(svcount_t pnn) { // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c32(target("aarch64.svcount") [[PNN:%.*]], i32 4) // CPP-CHECK-NEXT: ret i64 [[TMP0]] // -uint64_t test_svcntp_c32_vlx4(svcount_t pnn) { +uint64_t test_svcntp_c32_vlx4(svcount_t pnn) ATTR { return svcntp_c32(pnn, 4); } @@ -101,7 +110,7 @@ uint64_t test_svcntp_c32_vlx4(svcount_t pnn) { // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c64(target("aarch64.svcount") [[PNN:%.*]], i32 2) // CPP-CHECK-NEXT: ret i64 [[TMP0]] // -uint64_t test_svcntp_c64_vlx2(svcount_t pnn) { +uint64_t test_svcntp_c64_vlx2(svcount_t pnn) ATTR { return svcntp_c64(pnn, 2); } @@ -115,6 +124,6 @@ uint64_t test_svcntp_c64_vlx2(svcount_t pnn) { // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c64(target("aarch64.svcount") [[PNN:%.*]], i32 4) // CPP-CHECK-NEXT: ret i64 [[TMP0]] // -uint64_t test_svcntp_c64_vlx4(svcount_t pnn) { +uint64_t test_svcntp_c64_vlx4(svcount_t pnn) ATTR { return svcntp_c64(pnn, 4); } diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_fclamp.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_fclamp.c index 5d8c5b7b8a18c..7687257701a6e 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_fclamp.c +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_fclamp.c @@ -11,10 +11,16 @@ // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 \ // RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve \ -// RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s #include +#ifndef TEST_SME2 +#define ATTR +#else +#define ATTR __arm_streaming +#endif + #ifdef SVE_OVERLOADED_FORMS // A simple used,unused... macro, long enough to represent any SVE builtin. 
#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 @@ -32,7 +38,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fclamp.nxv8f16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat16_t test_svclamp_f16(svfloat16_t op1, svfloat16_t op2, svfloat16_t op3) { +svfloat16_t test_svclamp_f16(svfloat16_t op1, svfloat16_t op2, svfloat16_t op3) ATTR { return SVE_ACLE_FUNC(svclamp, _f16, , )(op1, op2, op3); } @@ -46,7 +52,7 @@ svfloat16_t test_svclamp_f16(svfloat16_t op1, svfloat16_t op2, svfloat16_t op3) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fclamp.nxv4f32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat32_t test_svclamp_f32(svfloat32_t op1, svfloat32_t op2, svfloat32_t op3) { +svfloat32_t test_svclamp_f32(svfloat32_t op1, svfloat32_t op2, svfloat32_t op3) ATTR { return SVE_ACLE_FUNC(svclamp, _f32, , )(op1, op2, op3); } @@ -60,7 +66,7 @@ svfloat32_t test_svclamp_f32(svfloat32_t op1, svfloat32_t op2, svfloat32_t op3) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fclamp.nxv2f64( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat64_t test_svclamp_f64(svfloat64_t op1, svfloat64_t op2, svfloat64_t op3) { +svfloat64_t test_svclamp_f64(svfloat64_t op1, svfloat64_t op2, svfloat64_t op3) ATTR { return SVE_ACLE_FUNC(svclamp, _f64, , )(op1, op2, op3); } diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c index 6f1231e776aa3..7657165d8b3f6 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c @@ -1,6 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wno-unknown-attributes -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pext.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pext.c index a3206029019c3..8f08b32618b05 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pext.c +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pext.c @@ -1,8 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -DTEST_SME2 -target-feature +sve -target-feature +sme2 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -DTEST_SME2 -O1 -Werror -emit-llvm -o - %s | FileCheck 
%s // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -S -DTEST_SME2 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pfalse.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pfalse.c index 19993e5418128..afdb038fb9312 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pfalse.c +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pfalse.c @@ -1,14 +1,20 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s #include +#ifndef TEST_SME2 +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // CHECK-LABEL: @test_svpfalse_c( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( zeroinitializer) @@ -19,7 +25,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( zeroinitializer) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svpfalse_c(void) __arm_streaming_compatible +svcount_t test_svpfalse_c(void) ATTR { return svpfalse_c(); } diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_psel.c 
b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_psel.c index 73b7b0347dd97..de3f6a9a57bfe 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_psel.c +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_psel.c @@ -10,9 +10,19 @@ // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \ // RUN: -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme2 -S -DTEST_SME2 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include +#ifndef TEST_SME2 +#define ATTR +#else +#define ATTR __arm_streaming_compatible +#endif + // CHECK-LABEL: @test_svpsel_lane_b8( // CHECK-NEXT: entry: // CHECK-NEXT: [[ADD:%.*]] = add i32 [[IDX:%.*]], 15 @@ -25,7 +35,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.psel.nxv16i1( [[P1:%.*]], [[P2:%.*]], i32 [[ADD]]) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbool_t test_svpsel_lane_b8(svbool_t p1, svbool_t p2, uint32_t idx) __arm_streaming_compatible { +svbool_t test_svpsel_lane_b8(svbool_t p1, svbool_t p2, uint32_t idx) ATTR { return svpsel_lane_b8(p1, p2, idx + 15); } @@ -43,7 +53,7 @@ svbool_t test_svpsel_lane_b8(svbool_t p1, svbool_t p2, uint32_t idx) __arm_strea // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.psel.nxv8i1( [[P1:%.*]], [[TMP0]], i32 [[ADD]]) // CPP-CHECK-NEXT: ret [[TMP1]] // -svbool_t test_svpsel_lane_b16(svbool_t p1, svbool_t p2, uint32_t idx) __arm_streaming_compatible { +svbool_t test_svpsel_lane_b16(svbool_t p1, svbool_t p2, uint32_t idx) ATTR { return svpsel_lane_b16(p1, p2, idx + 7); } @@ -61,7 +71,7 @@ svbool_t test_svpsel_lane_b16(svbool_t p1, svbool_t p2, uint32_t idx) __arm_stre // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.psel.nxv4i1( [[P1:%.*]], [[TMP0]], i32 [[ADD]]) // CPP-CHECK-NEXT: ret [[TMP1]] // -svbool_t test_svpsel_lane_b32(svbool_t p1, svbool_t p2, uint32_t idx) __arm_streaming_compatible { +svbool_t test_svpsel_lane_b32(svbool_t p1, svbool_t p2, uint32_t idx) ATTR { return svpsel_lane_b32(p1, p2, idx + 3); } @@ -79,7 +89,7 @@ svbool_t test_svpsel_lane_b32(svbool_t p1, svbool_t p2, uint32_t idx) __arm_stre // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.psel.nxv2i1( [[P1:%.*]], [[TMP0]], i32 [[ADD]]) // CPP-CHECK-NEXT: ret [[TMP1]] // -svbool_t test_svpsel_lane_b64(svbool_t p1, svbool_t p2, uint32_t idx) __arm_streaming_compatible { +svbool_t test_svpsel_lane_b64(svbool_t p1, svbool_t p2, uint32_t idx) ATTR { return svpsel_lane_b64(p1, p2, idx + 1); } @@ -99,7 +109,7 @@ svbool_t test_svpsel_lane_b64(svbool_t p1, svbool_t p2, uint32_t idx) __arm_stre // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP1]]) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP2]] // -svcount_t test_svpsel_lane_c8(svcount_t p1, svbool_t p2, uint32_t idx) __arm_streaming_compatible { +svcount_t test_svpsel_lane_c8(svcount_t p1, svbool_t p2, uint32_t idx) ATTR { return svpsel_lane_c8(p1, p2, 
idx + 15); } @@ -121,7 +131,7 @@ svcount_t test_svpsel_lane_c8(svcount_t p1, svbool_t p2, uint32_t idx) __arm_str // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP2]]) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP3]] // -svcount_t test_svpsel_lane_c16(svcount_t p1, svbool_t p2, uint32_t idx) __arm_streaming_compatible { +svcount_t test_svpsel_lane_c16(svcount_t p1, svbool_t p2, uint32_t idx) ATTR { return svpsel_lane_c16(p1, p2, idx + 7); } @@ -143,7 +153,7 @@ svcount_t test_svpsel_lane_c16(svcount_t p1, svbool_t p2, uint32_t idx) __arm_st // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP2]]) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP3]] // -svcount_t test_svpsel_lane_c32(svcount_t p1, svbool_t p2, uint32_t idx) __arm_streaming_compatible { +svcount_t test_svpsel_lane_c32(svcount_t p1, svbool_t p2, uint32_t idx) ATTR { return svpsel_lane_c32(p1, p2, idx + 3); } @@ -165,6 +175,6 @@ svcount_t test_svpsel_lane_c32(svcount_t p1, svbool_t p2, uint32_t idx) __arm_st // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP2]]) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP3]] // -svcount_t test_svpsel_lane_c64(svcount_t p1, svbool_t p2, uint32_t idx) __arm_streaming_compatible { +svcount_t test_svpsel_lane_c64(svcount_t p1, svbool_t p2, uint32_t idx) ATTR { return svpsel_lane_c64(p1, p2, idx + 1); } diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_sclamp.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_sclamp.c index 8c63a7455c79f..04869fd550ec1 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_sclamp.c +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_sclamp.c @@ -10,9 +10,21 @@ // RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 \ // RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -DTEST_SME2 -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -DTEST_SME2 -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s #include +#ifndef TEST_SME2 +#define ATTR +#else +#define ATTR __arm_streaming +#endif + #ifdef SVE_OVERLOADED_FORMS // A simple used,unused... macro, long enough to represent any SVE builtin. 
#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 @@ -30,7 +42,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sclamp.nxv16i8( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint8_t test_svclamp_s8(svint8_t op1, svint8_t op2, svint8_t op3) { +svint8_t test_svclamp_s8(svint8_t op1, svint8_t op2, svint8_t op3) ATTR { return SVE_ACLE_FUNC(svclamp, _s8, , )(op1, op2, op3); } @@ -44,7 +56,7 @@ svint8_t test_svclamp_s8(svint8_t op1, svint8_t op2, svint8_t op3) { // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sclamp.nxv8i16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint16_t test_svclamp_s16(svint16_t op1, svint16_t op2, svint16_t op3) { +svint16_t test_svclamp_s16(svint16_t op1, svint16_t op2, svint16_t op3) ATTR { return SVE_ACLE_FUNC(svclamp, _s16, , )(op1, op2, op3); } @@ -58,7 +70,7 @@ svint16_t test_svclamp_s16(svint16_t op1, svint16_t op2, svint16_t op3) { // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sclamp.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint32_t test_svclamp_s32(svint32_t op1, svint32_t op2, svint32_t op3) { +svint32_t test_svclamp_s32(svint32_t op1, svint32_t op2, svint32_t op3) ATTR { return SVE_ACLE_FUNC(svclamp, _s32, , )(op1, op2, op3); } @@ -72,7 +84,7 @@ svint32_t test_svclamp_s32(svint32_t op1, svint32_t op2, svint32_t op3) { // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sclamp.nxv2i64( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint64_t test_svclamp_s64(svint64_t op1, svint64_t op2, svint64_t op3) { +svint64_t test_svclamp_s64(svint64_t op1, svint64_t op2, svint64_t op3) ATTR { return SVE_ACLE_FUNC(svclamp, _s64, , )(op1, op2, op3); } diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_stnt1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_stnt1.c index 0d8696a7634a7..b1ca27b7b68a1 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_stnt1.c +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_stnt1.c @@ -505,11 +505,9 @@ void test_svstnt1_f64_x4(svcount_t pn, float64_t *base, svfloat64x4_t v) ATTR return SVE_ACLE_FUNC(svstnt1,_f64_x4,,)(pn, base, v); } - // == VNUM variants == - // CHECK-LABEL: @test_svstnt1_vnum_u8_x2( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], i64 0) diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uclamp.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uclamp.c index b878986248877..37bfd4265a43a 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uclamp.c +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uclamp.c @@ -10,9 +10,21 @@ // RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 \ // RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -DTEST_SME2 -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -DTEST_SME2 -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p 
mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s #include +#ifndef TEST_SME2 +#define ATTR +#else +#define ATTR __arm_streaming +#endif + #ifdef SVE_OVERLOADED_FORMS // A simple used,unused... macro, long enough to represent any SVE builtin. #define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 @@ -30,7 +42,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uclamp.nxv16i8( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint8_t test_svclamp_u8(svuint8_t op1, svuint8_t op2, svuint8_t op3) { +svuint8_t test_svclamp_u8(svuint8_t op1, svuint8_t op2, svuint8_t op3) ATTR { return SVE_ACLE_FUNC(svclamp, _u8, , )(op1, op2, op3); } @@ -44,7 +56,7 @@ svuint8_t test_svclamp_u8(svuint8_t op1, svuint8_t op2, svuint8_t op3) { // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uclamp.nxv8i16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint16_t test_svclamp_u16(svuint16_t op1, svuint16_t op2, svuint16_t op3) { +svuint16_t test_svclamp_u16(svuint16_t op1, svuint16_t op2, svuint16_t op3) ATTR { return SVE_ACLE_FUNC(svclamp, _u16, , )(op1, op2, op3); } @@ -58,7 +70,7 @@ svuint16_t test_svclamp_u16(svuint16_t op1, svuint16_t op2, svuint16_t op3) { // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uclamp.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint32_t test_svclamp_u32(svuint32_t op1, svuint32_t op2, svuint32_t op3) { +svuint32_t test_svclamp_u32(svuint32_t op1, svuint32_t op2, svuint32_t op3) ATTR { return SVE_ACLE_FUNC(svclamp, _u32, , )(op1, op2, op3); } @@ -72,7 +84,7 @@ svuint32_t test_svclamp_u32(svuint32_t op1, svuint32_t op2, svuint32_t op3) { // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uclamp.nxv2i64( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint64_t test_svclamp_u64(svuint64_t op1, svuint64_t op2, svuint64_t op3) { +svuint64_t test_svclamp_u64(svuint64_t op1, svuint64_t op2, svuint64_t op3) ATTR { return SVE_ACLE_FUNC(svclamp, _u64, , )(op1, op2, op3); } diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_pn.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_pn.c index 143a43b4a9219..11ebec9e7cbf1 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_pn.c +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_pn.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 
-S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target @@ -16,6 +16,11 @@ #define SVE_ACLE_FUNC(A1, A2) A1##A2 #endif +#ifdef TEST_SME2 +#define ATTR __arm_streaming +#else +#define ATTR +#endif // WHILEGE @@ -29,7 +34,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilege_c8_vl2(int64_t op1, int64_t op2) +svcount_t test_svwhilege_c8_vl2(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilege_c8,_s64)(op1, op2, 2); } @@ -44,7 +49,7 @@ svcount_t test_svwhilege_c8_vl2(int64_t op1, int64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilege_c8_vl4(int64_t op1, int64_t op2) +svcount_t test_svwhilege_c8_vl4(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilege_c8,_s64)(op1, op2, 4); } @@ -59,7 +64,7 @@ svcount_t test_svwhilege_c8_vl4(int64_t op1, int64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilege_c16_vl2(int64_t op1, int64_t op2) +svcount_t test_svwhilege_c16_vl2(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilege_c16,_s64)(op1, op2, 2); } @@ -74,7 +79,7 @@ svcount_t test_svwhilege_c16_vl2(int64_t op1, int64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilege_c16_vl4(int64_t op1, int64_t op2) +svcount_t test_svwhilege_c16_vl4(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilege_c16,_s64)(op1, op2, 4); } @@ -89,7 +94,7 @@ svcount_t test_svwhilege_c16_vl4(int64_t op1, int64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilege_c32_vl2(int64_t op1, int64_t op2) +svcount_t test_svwhilege_c32_vl2(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilege_c32,_s64)(op1, op2, 2); } @@ -104,7 +109,7 @@ svcount_t test_svwhilege_c32_vl2(int64_t op1, int64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilege_c32_vl4(int64_t op1, int64_t op2) +svcount_t test_svwhilege_c32_vl4(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilege_c32,_s64)(op1, op2, 4); } @@ -119,7 +124,7 @@ svcount_t test_svwhilege_c32_vl4(int64_t op1, int64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilege_c64_vl2(int64_t op1, int64_t 
op2) +svcount_t test_svwhilege_c64_vl2(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilege_c64,_s64)(op1, op2, 2); } @@ -134,7 +139,7 @@ svcount_t test_svwhilege_c64_vl2(int64_t op1, int64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilege_c64_vl4(int64_t op1, int64_t op2) +svcount_t test_svwhilege_c64_vl4(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilege_c64,_s64)(op1, op2, 4); } @@ -152,7 +157,7 @@ svcount_t test_svwhilege_c64_vl4(int64_t op1, int64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilegt_c8_vl2(int64_t op1, int64_t op2) +svcount_t test_svwhilegt_c8_vl2(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilegt_c8,_s64)(op1, op2, 2); } @@ -167,7 +172,7 @@ svcount_t test_svwhilegt_c8_vl2(int64_t op1, int64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilegt_c8_vl4(int64_t op1, int64_t op2) +svcount_t test_svwhilegt_c8_vl4(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilegt_c8,_s64)(op1, op2, 4); } @@ -182,7 +187,7 @@ svcount_t test_svwhilegt_c8_vl4(int64_t op1, int64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilegt_c16_vl2(int64_t op1, int64_t op2) +svcount_t test_svwhilegt_c16_vl2(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilegt_c16,_s64)(op1, op2, 2); } @@ -197,7 +202,7 @@ svcount_t test_svwhilegt_c16_vl2(int64_t op1, int64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilegt_c16_vl4(int64_t op1, int64_t op2) +svcount_t test_svwhilegt_c16_vl4(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilegt_c16,_s64)(op1, op2, 4); } @@ -212,7 +217,7 @@ svcount_t test_svwhilegt_c16_vl4(int64_t op1, int64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilegt_c32_vl2(int64_t op1, int64_t op2) +svcount_t test_svwhilegt_c32_vl2(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilegt_c32,_s64)(op1, op2, 2); } @@ -227,7 +232,7 @@ svcount_t test_svwhilegt_c32_vl2(int64_t op1, int64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilegt_c32_vl4(int64_t op1, int64_t op2) +svcount_t test_svwhilegt_c32_vl4(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilegt_c32,_s64)(op1, op2, 4); } @@ -242,7 +247,7 @@ svcount_t test_svwhilegt_c32_vl4(int64_t op1, int64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") 
@llvm.aarch64.sve.whilegt.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilegt_c64_vl2(int64_t op1, int64_t op2) +svcount_t test_svwhilegt_c64_vl2(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilegt_c64,_s64)(op1, op2, 2); } @@ -257,7 +262,7 @@ svcount_t test_svwhilegt_c64_vl2(int64_t op1, int64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilegt_c64_vl4(int64_t op1, int64_t op2) +svcount_t test_svwhilegt_c64_vl4(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilegt_c64,_s64)(op1, op2, 4); } @@ -275,7 +280,7 @@ svcount_t test_svwhilegt_c64_vl4(int64_t op1, int64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilehi_c8_vl2(uint64_t op1, uint64_t op2) +svcount_t test_svwhilehi_c8_vl2(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilegt_c8,_u64)(op1, op2, 2); } @@ -290,7 +295,7 @@ svcount_t test_svwhilehi_c8_vl2(uint64_t op1, uint64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilehi_c8_vl4(uint64_t op1, uint64_t op2) +svcount_t test_svwhilehi_c8_vl4(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilegt_c8,_u64)(op1, op2, 4); } @@ -305,7 +310,7 @@ svcount_t test_svwhilehi_c8_vl4(uint64_t op1, uint64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilehi_c16_vl2(uint64_t op1, uint64_t op2) +svcount_t test_svwhilehi_c16_vl2(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilegt_c16,_u64)(op1, op2, 2); } @@ -320,7 +325,7 @@ svcount_t test_svwhilehi_c16_vl2(uint64_t op1, uint64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilehi_c16_vl4(uint64_t op1, uint64_t op2) +svcount_t test_svwhilehi_c16_vl4(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilegt_c16,_u64)(op1, op2, 4); } @@ -335,7 +340,7 @@ svcount_t test_svwhilehi_c16_vl4(uint64_t op1, uint64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilehi_c32_vl2(uint64_t op1, uint64_t op2) +svcount_t test_svwhilehi_c32_vl2(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilegt_c32,_u64)(op1, op2, 2); } @@ -350,7 +355,7 @@ svcount_t test_svwhilehi_c32_vl2(uint64_t op1, uint64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilehi_c32_vl4(uint64_t op1, uint64_t op2) +svcount_t test_svwhilehi_c32_vl4(uint64_t op1, uint64_t op2) ATTR { return 
SVE_ACLE_FUNC(svwhilegt_c32,_u64)(op1, op2, 4); } @@ -365,7 +370,7 @@ svcount_t test_svwhilehi_c32_vl4(uint64_t op1, uint64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilehi_c64_vl2(uint64_t op1, uint64_t op2) +svcount_t test_svwhilehi_c64_vl2(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilegt_c64,_u64)(op1, op2, 2); } @@ -380,7 +385,7 @@ svcount_t test_svwhilehi_c64_vl2(uint64_t op1, uint64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilehi_c64_vl4(uint64_t op1, uint64_t op2) +svcount_t test_svwhilehi_c64_vl4(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilegt_c64,_u64)(op1, op2, 4); } @@ -398,7 +403,7 @@ svcount_t test_svwhilehi_c64_vl4(uint64_t op1, uint64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilehs_c8_vl2(uint64_t op1, uint64_t op2) +svcount_t test_svwhilehs_c8_vl2(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilege_c8,_u64)(op1, op2, 2); } @@ -413,7 +418,7 @@ svcount_t test_svwhilehs_c8_vl2(uint64_t op1, uint64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilehs_c8_vl4(uint64_t op1, uint64_t op2) +svcount_t test_svwhilehs_c8_vl4(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilege_c8,_u64)(op1, op2, 4); } @@ -428,7 +433,7 @@ svcount_t test_svwhilehs_c8_vl4(uint64_t op1, uint64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilehs_c16_vl2(uint64_t op1, uint64_t op2) +svcount_t test_svwhilehs_c16_vl2(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilege_c16,_u64)(op1, op2, 2); } @@ -443,7 +448,7 @@ svcount_t test_svwhilehs_c16_vl2(uint64_t op1, uint64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilehs_c16_vl4(uint64_t op1, uint64_t op2) +svcount_t test_svwhilehs_c16_vl4(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilege_c16,_u64)(op1, op2, 4); } @@ -458,7 +463,7 @@ svcount_t test_svwhilehs_c16_vl4(uint64_t op1, uint64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilehs_c32_vl2(uint64_t op1, uint64_t op2) +svcount_t test_svwhilehs_c32_vl2(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilege_c32,_u64)(op1, op2, 2); } @@ -473,7 +478,7 @@ svcount_t test_svwhilehs_c32_vl2(uint64_t op1, uint64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], 
i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilehs_c32_vl4(uint64_t op1, uint64_t op2) +svcount_t test_svwhilehs_c32_vl4(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilege_c32,_u64)(op1, op2, 4); } @@ -488,7 +493,7 @@ svcount_t test_svwhilehs_c32_vl4(uint64_t op1, uint64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilehs_c64_vl2(uint64_t op1, uint64_t op2) +svcount_t test_svwhilehs_c64_vl2(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilege_c64,_u64)(op1, op2, 2); } @@ -503,7 +508,7 @@ svcount_t test_svwhilehs_c64_vl2(uint64_t op1, uint64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilehs_c64_vl4(uint64_t op1, uint64_t op2) +svcount_t test_svwhilehs_c64_vl4(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilege_c64,_u64)(op1, op2, 4); } @@ -521,7 +526,7 @@ svcount_t test_svwhilehs_c64_vl4(uint64_t op1, uint64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilele_c8_vl2(int64_t op1, int64_t op2) +svcount_t test_svwhilele_c8_vl2(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilele_c8,_s64)(op1, op2, 2); } @@ -536,7 +541,7 @@ svcount_t test_svwhilele_c8_vl2(int64_t op1, int64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilele_c8_vl4(int64_t op1, int64_t op2) +svcount_t test_svwhilele_c8_vl4(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilele_c8,_s64)(op1, op2, 4); } @@ -551,7 +556,7 @@ svcount_t test_svwhilele_c8_vl4(int64_t op1, int64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilele_c16_vl2(int64_t op1, int64_t op2) +svcount_t test_svwhilele_c16_vl2(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilele_c16,_s64)(op1, op2, 2); } @@ -566,7 +571,7 @@ svcount_t test_svwhilele_c16_vl2(int64_t op1, int64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilele_c16_vl4(int64_t op1, int64_t op2) +svcount_t test_svwhilele_c16_vl4(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilele_c16,_s64)(op1, op2, 4); } @@ -581,7 +586,7 @@ svcount_t test_svwhilele_c16_vl4(int64_t op1, int64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilele_c32_vl2(int64_t op1, int64_t op2) +svcount_t test_svwhilele_c32_vl2(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilele_c32,_s64)(op1, op2, 2); } @@ -596,7 +601,7 @@ svcount_t 
test_svwhilele_c32_vl2(int64_t op1, int64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilele_c32_vl4(int64_t op1, int64_t op2) +svcount_t test_svwhilele_c32_vl4(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilele_c32,_s64)(op1, op2, 4); } @@ -611,7 +616,7 @@ svcount_t test_svwhilele_c32_vl4(int64_t op1, int64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilele_c64_vl2(int64_t op1, int64_t op2) +svcount_t test_svwhilele_c64_vl2(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilele_c64,_s64)(op1, op2, 2); } @@ -626,7 +631,7 @@ svcount_t test_svwhilele_c64_vl2(int64_t op1, int64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilele_c64_vl4(int64_t op1, int64_t op2) +svcount_t test_svwhilele_c64_vl4(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilele_c64,_s64)(op1, op2, 4); } @@ -644,7 +649,7 @@ svcount_t test_svwhilele_c64_vl4(int64_t op1, int64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilelo_c8_vl2(uint64_t op1, uint64_t op2) +svcount_t test_svwhilelo_c8_vl2(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilelt_c8,_u64)(op1, op2, 2); } @@ -659,7 +664,7 @@ svcount_t test_svwhilelo_c8_vl2(uint64_t op1, uint64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilelo_c8_vl4(uint64_t op1, uint64_t op2) +svcount_t test_svwhilelo_c8_vl4(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilelt_c8,_u64)(op1, op2, 4); } @@ -674,7 +679,7 @@ svcount_t test_svwhilelo_c8_vl4(uint64_t op1, uint64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilelo_c16_vl2(uint64_t op1, uint64_t op2) +svcount_t test_svwhilelo_c16_vl2(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilelt_c16,_u64)(op1, op2, 2); } @@ -689,7 +694,7 @@ svcount_t test_svwhilelo_c16_vl2(uint64_t op1, uint64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilelo_c16_vl4(uint64_t op1, uint64_t op2) +svcount_t test_svwhilelo_c16_vl4(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilelt_c16,_u64)(op1, op2, 4); } @@ -704,7 +709,7 @@ svcount_t test_svwhilelo_c16_vl4(uint64_t op1, uint64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t 
test_svwhilelo_c32_vl2(uint64_t op1, uint64_t op2) +svcount_t test_svwhilelo_c32_vl2(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilelt_c32,_u64)(op1, op2, 2); } @@ -719,7 +724,7 @@ svcount_t test_svwhilelo_c32_vl2(uint64_t op1, uint64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilelo_c32_vl4(uint64_t op1, uint64_t op2) +svcount_t test_svwhilelo_c32_vl4(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilelt_c32,_u64)(op1, op2, 4); } @@ -734,7 +739,7 @@ svcount_t test_svwhilelo_c32_vl4(uint64_t op1, uint64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilelo_c64_vl2(uint64_t op1, uint64_t op2) +svcount_t test_svwhilelo_c64_vl2(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilelt_c64,_u64)(op1, op2, 2); } @@ -749,7 +754,7 @@ svcount_t test_svwhilelo_c64_vl2(uint64_t op1, uint64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilelo_c64_vl4(uint64_t op1, uint64_t op2) +svcount_t test_svwhilelo_c64_vl4(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilelt_c64,_u64)(op1, op2, 4); } @@ -767,7 +772,7 @@ svcount_t test_svwhilelo_c64_vl4(uint64_t op1, uint64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilels_c8_vl2(uint64_t op1, uint64_t op2) +svcount_t test_svwhilels_c8_vl2(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilele_c8,_u64)(op1, op2, 2); } @@ -782,7 +787,7 @@ svcount_t test_svwhilels_c8_vl2(uint64_t op1, uint64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilels_c8_vl4(uint64_t op1, uint64_t op2) +svcount_t test_svwhilels_c8_vl4(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilele_c8,_u64)(op1, op2, 4); } @@ -797,7 +802,7 @@ svcount_t test_svwhilels_c8_vl4(uint64_t op1, uint64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilels_c16_vl2(uint64_t op1, uint64_t op2) +svcount_t test_svwhilels_c16_vl2(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilele_c16,_u64)(op1, op2, 2); } @@ -812,7 +817,7 @@ svcount_t test_svwhilels_c16_vl2(uint64_t op1, uint64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilels_c16_vl4(uint64_t op1, uint64_t op2) +svcount_t test_svwhilels_c16_vl4(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilele_c16,_u64)(op1, op2, 4); } @@ -827,7 +832,7 @@ svcount_t test_svwhilels_c16_vl4(uint64_t op1, uint64_t op2) // 
CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilels_c32_vl2(uint64_t op1, uint64_t op2) +svcount_t test_svwhilels_c32_vl2(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilele_c32,_u64)(op1, op2, 2); } @@ -842,7 +847,7 @@ svcount_t test_svwhilels_c32_vl2(uint64_t op1, uint64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilels_c32_vl4(uint64_t op1, uint64_t op2) +svcount_t test_svwhilels_c32_vl4(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilele_c32,_u64)(op1, op2, 4); } @@ -857,7 +862,7 @@ svcount_t test_svwhilels_c32_vl4(uint64_t op1, uint64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilels_c64_vl2(uint64_t op1, uint64_t op2) +svcount_t test_svwhilels_c64_vl2(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilele_c64,_u64)(op1, op2, 2); } @@ -872,7 +877,7 @@ svcount_t test_svwhilels_c64_vl2(uint64_t op1, uint64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilels_c64_vl4(uint64_t op1, uint64_t op2) +svcount_t test_svwhilels_c64_vl4(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilele_c64,_u64)(op1, op2, 4); } @@ -890,7 +895,7 @@ svcount_t test_svwhilels_c64_vl4(uint64_t op1, uint64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilelt_c8_vl2(int64_t op1, int64_t op2) +svcount_t test_svwhilelt_c8_vl2(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilelt_c8,_s64)(op1, op2, 2); } @@ -905,7 +910,7 @@ svcount_t test_svwhilelt_c8_vl2(int64_t op1, int64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilelt_c8_vl4(int64_t op1, int64_t op2) +svcount_t test_svwhilelt_c8_vl4(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilelt_c8,_s64)(op1, op2, 4); } @@ -920,7 +925,7 @@ svcount_t test_svwhilelt_c8_vl4(int64_t op1, int64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilelt_c16_vl2(int64_t op1, int64_t op2) +svcount_t test_svwhilelt_c16_vl2(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilelt_c16,_s64)(op1, op2, 2); } @@ -935,7 +940,7 @@ svcount_t test_svwhilelt_c16_vl2(int64_t op1, int64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilelt_c16_vl4(int64_t op1, int64_t op2) +svcount_t 
test_svwhilelt_c16_vl4(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilelt_c16,_s64)(op1, op2, 4); } @@ -950,7 +955,7 @@ svcount_t test_svwhilelt_c16_vl4(int64_t op1, int64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilelt_c32_vl2(int64_t op1, int64_t op2) +svcount_t test_svwhilelt_c32_vl2(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilelt_c32,_s64)(op1, op2, 2); } @@ -965,7 +970,7 @@ svcount_t test_svwhilelt_c32_vl2(int64_t op1, int64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilelt_c32_vl4(int64_t op1, int64_t op2) +svcount_t test_svwhilelt_c32_vl4(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilelt_c32,_s64)(op1, op2, 4); } @@ -980,7 +985,7 @@ svcount_t test_svwhilelt_c32_vl4(int64_t op1, int64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilelt_c64_vl2(int64_t op1, int64_t op2) +svcount_t test_svwhilelt_c64_vl2(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilelt_c64,_s64)(op1, op2, 2); } @@ -995,7 +1000,7 @@ svcount_t test_svwhilelt_c64_vl2(int64_t op1, int64_t op2) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilelt_c64_vl4(int64_t op1, int64_t op2) +svcount_t test_svwhilelt_c64_vl4(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilelt_c64,_s64)(op1, op2, 4); } diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp index 6a6370bf99b10..5118f743174c2 100644 --- a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp +++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp @@ -237,7 +237,7 @@ void test_svluti4_lane_zt_x2(svuint8_t zn_u8) __arm_streaming __arm_shared_za __ svluti4_lane_zt_f32_x2(0, zn_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} } -void test_bfmlslb_bad_lane(svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming_compatible { +void test_bfmlslb_bad_lane(svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming { svbfmlslb_lane_f32(zda, zn, zm, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} svbfmlslt_lane_f32(zda, zn, zm, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} } diff --git a/clang/test/Sema/aarch64-sme2-sve2p1-diagnostics.c b/clang/test/Sema/aarch64-sme2-sve2p1-diagnostics.c new file mode 100644 index 0000000000000..4debc14190aa8 --- /dev/null +++ b/clang/test/Sema/aarch64-sme2-sve2p1-diagnostics.c @@ -0,0 +1,37 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -fsyntax-only -verify %s + +// REQUIRES: aarch64-registered-target +#include "arm_sve.h" + +//svldnt1: + +__attribute__((target("+sme2"))) +svuint8x2_t sme2_or_sve2p1_intrinsic_test_sme2_invalid(svcount_t png, const uint8_t *rn) { + 
// expected-warning@+1 {{builtin call has undefined behaviour when called from a non-streaming function}}
+  return svldnt1_u8_x2(png, rn);
+}
+
+__attribute__((target("+sme2")))
+svint16x4_t sme2_or_sve2p1_intrinsic_test_sme2(svcount_t png, const int16_t *rn) __arm_streaming {
+  // expected-no-warning
+  return svldnt1_s16_x4(png, rn);
+}
+
+__attribute__((target("+sve2p1")))
+svuint32x2_t sme2_or_sve2p1_intrinsic_test_sve2p1(svcount_t png, const uint32_t *rn) {
+  // expected-no-warning
+  return svldnt1_u32_x2(png, rn);
+}
+
+__attribute__((target("+sme2,+sve2p1")))
+svint64x4_t sme2_or_sve2p1_intrinsic_test_both_arm_streaming(svcount_t png, const int64_t *rn) __arm_streaming {
+  // expected-no-warning
+  return svldnt1_s64_x4(png, rn);
+}
+
+__attribute__((target("+sme2,+sve2p1")))
+svint64x4_t sme2_or_sve2p1_intrinsic_test_both_no_arm_streaming(svcount_t png, const int64_t *rn) {
+  // expected-no-warning
+  return svldnt1_s64_x4(png, rn);
+}
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
index 6c302da106a2c..5de2223e71b04 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -1773,11 +1773,14 @@ void SVEEmitter::createStreamingAttrs(raw_ostream &OS, ACLEKind Kind) {
   llvm::StringMap> StreamingMap;
 
   uint64_t IsStreamingFlag = getEnumValueForFlag("IsStreaming");
+  uint64_t IsStreamingOrSVE2p1Flag = getEnumValueForFlag("IsStreamingOrSVE2p1");
   uint64_t IsStreamingCompatibleFlag = getEnumValueForFlag("IsStreamingCompatible");
 
   for (auto &Def : Defs) {
     if (Def->isFlagSet(IsStreamingFlag))
       StreamingMap["ArmStreaming"].insert(Def->getMangledName());
+    else if (Def->isFlagSet(IsStreamingOrSVE2p1Flag))
+      StreamingMap["ArmStreamingOrSVE2p1"].insert(Def->getMangledName());
     else if (Def->isFlagSet(IsStreamingCompatibleFlag))
       StreamingMap["ArmStreamingCompatible"].insert(Def->getMangledName());
     else

From a7a78fd427569a7ad8a27e682a66fe414f004a35 Mon Sep 17 00:00:00 2001
From: Sam Tebbs
Date: Thu, 4 Jan 2024 16:53:14 +0000
Subject: [PATCH 249/313] Revert "[Clang][SME] Add IsStreamingOrSVE2p1" (#76973)

Reverts llvm/llvm-project#75958

I mistakenly included a commit from my local main after rebasing.
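For reference, the change being reverted gated a set of SVE2p1 builtins so that they are accepted either from streaming functions (built with +sme2) or from non-streaming functions built with +sve2p1, and it switched the affected tests to a shared ATTR macro driven by -DTEST_SME2. The outline below is an editorial sketch assembled from the hunks above, not a file belonging to this patch: TEST_SME2, ATTR, __arm_streaming, arm_sve.h and svwhilelt_c8_s64 are taken from the tests themselves, while the condensed RUN lines and the function name sketch_whilelt_c8 are illustrative assumptions only.

// RUN (sketch): %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -fsyntax-only %s
// RUN (sketch): %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -fsyntax-only %s
#include "arm_sve.h"

#ifndef TEST_SME2
#define ATTR                  // +sve2p1 build: callable without a streaming attribute
#else
#define ATTR __arm_streaming  // +sme2 build: only valid in streaming functions
#endif

svcount_t sketch_whilelt_c8(int64_t op1, int64_t op2) ATTR {
  return svwhilelt_c8_s64(op1, op2, 2); // one of the gated builtins exercised in the tests above
}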
--- clang/include/clang/Basic/arm_sve.td | 123 ++++++++-------- clang/include/clang/Basic/arm_sve_sme_incl.td | 1 - clang/lib/Basic/Targets/AArch64.h | 1 - clang/lib/Sema/SemaChecking.cpp | 17 +-- .../aarch64-sve2-intrinsics/acle_sve2_revd.c | 4 +- .../acle_sve2p1_bfmlsl.c | 8 +- .../acle_sve2p1_cntp.c | 27 ++-- .../acle_sve2p1_fclamp.c | 14 +- .../acle_sve2p1_ld1.c | 2 - .../acle_sve2p1_pext.c | 5 +- .../acle_sve2p1_pfalse.c | 14 +- .../acle_sve2p1_psel.c | 26 +--- .../acle_sve2p1_sclamp.c | 20 +-- .../acle_sve2p1_stnt1.c | 2 + .../acle_sve2p1_uclamp.c | 20 +-- .../acle_sve2p1_while_pn.c | 137 +++++++++--------- .../aarch64-sme2-intrinsics/acle_sme2_imm.cpp | 2 +- .../Sema/aarch64-sme2-sve2p1-diagnostics.c | 37 ----- clang/utils/TableGen/SveEmitter.cpp | 3 - 19 files changed, 174 insertions(+), 289 deletions(-) delete mode 100644 clang/test/Sema/aarch64-sme2-sve2p1-diagnostics.c diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index 17ee82d9304ff..91f62c4c76339 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -1296,9 +1296,9 @@ def SVCREATE_3_BF16 : SInst<"svcreate3[_{d}]", "3ddd", "b", MergeNone, "", [IsT def SVCREATE_4_BF16 : SInst<"svcreate4[_{d}]", "4dddd", "b", MergeNone, "", [IsTupleCreate]>; } -let TargetGuard = "sve2p1|sme2" in { - def SVCREATE_2_B : SInst<"svcreate2[_{d}]", "2dd", "Pc", MergeNone, "", [IsTupleCreate, IsStreamingCompatible]>; - def SVCREATE_4_B : SInst<"svcreate4[_{d}]", "4dddd", "Pc", MergeNone, "", [IsTupleCreate, IsStreamingCompatible]>; +let TargetGuard = "sve2p1" in { + def SVCREATE_2_B : SInst<"svcreate2[_{d}]", "2dd", "Pc", MergeNone, "", [IsTupleCreate]>; + def SVCREATE_4_B : SInst<"svcreate4[_{d}]", "4dddd", "Pc", MergeNone, "", [IsTupleCreate]>; } //////////////////////////////////////////////////////////////////////////////// @@ -1321,7 +1321,7 @@ def SVSET_3_BF16 : SInst<"svset3[_{d}]", "33id", "b", MergeNone, "", [IsTupleSet def SVSET_4_BF16 : SInst<"svset4[_{d}]", "44id", "b", MergeNone, "", [IsTupleSet], [ImmCheck<1, ImmCheck0_3>]>; } -let TargetGuard = "sve2p1|sme2" in { +let TargetGuard = "sve2p1" in { def SVGET_2_B : SInst<"svget2[_{d}]", "d2i", "Pc", MergeNone, "", [IsTupleGet], [ImmCheck<1, ImmCheck0_1>]>; def SVGET_4_B : SInst<"svget4[_{d}]", "d4i", "Pc", MergeNone, "", [IsTupleGet], [ImmCheck<1, ImmCheck0_3>]>; @@ -1976,37 +1976,39 @@ def SVFMINQV: SInst<"svminqv[_{d}]", "{Pd", "hfd", MergeNone, "aarch64_sve_fminq } let TargetGuard = "sve2p1|sme2" in { -def SVPEXT_SINGLE : SInst<"svpext_lane_{d}", "P}i", "QcQsQiQl", MergeNone, "aarch64_sve_pext", [IsStreamingOrSVE2p1], [ImmCheck<1, ImmCheck0_3>]>; -def SVPEXT_X2 : SInst<"svpext_lane_{d}_x2", "2.P}i", "QcQsQiQl", MergeNone, "aarch64_sve_pext_x2", [IsStreamingOrSVE2p1], [ImmCheck<1, ImmCheck0_1>]>; +//FIXME: Replace IsStreamingCompatible with IsStreamingOrHasSVE2p1 when available +def SVPEXT_SINGLE : SInst<"svpext_lane_{d}", "P}i", "QcQsQiQl", MergeNone, "aarch64_sve_pext", [IsStreamingCompatible], [ImmCheck<1, ImmCheck0_3>]>; +def SVPEXT_X2 : SInst<"svpext_lane_{d}_x2", "2.P}i", "QcQsQiQl", MergeNone, "aarch64_sve_pext_x2", [IsStreamingCompatible], [ImmCheck<1, ImmCheck0_1>]>; -def SVWHILEGE_COUNT : SInst<"svwhilege_{d}[_{1}]", "}lli", "QcQsQiQl", MergeNone, "aarch64_sve_whilege_{d}", [IsOverloadNone, IsStreamingOrSVE2p1], [ImmCheck<2, ImmCheck2_4_Mul2>]>; -def SVWHILEGT_COUNT : SInst<"svwhilegt_{d}[_{1}]", "}lli", "QcQsQiQl", MergeNone, "aarch64_sve_whilegt_{d}", [IsOverloadNone, IsStreamingOrSVE2p1], 
[ImmCheck<2, ImmCheck2_4_Mul2>]>; -def SVWHILELE_COUNT : SInst<"svwhilele_{d}[_{1}]", "}lli", "QcQsQiQl", MergeNone, "aarch64_sve_whilele_{d}", [IsOverloadNone, IsStreamingOrSVE2p1], [ImmCheck<2, ImmCheck2_4_Mul2>]>; -def SVWHILELT_COUNT : SInst<"svwhilelt_{d}[_{1}]", "}lli", "QcQsQiQl", MergeNone, "aarch64_sve_whilelt_{d}", [IsOverloadNone, IsStreamingOrSVE2p1], [ImmCheck<2, ImmCheck2_4_Mul2>]>; -def SVWHILELO_COUNT : SInst<"svwhilelt_{d}[_{1}]", "}nni", "QcQsQiQl", MergeNone, "aarch64_sve_whilelo_{d}", [IsOverloadNone, IsStreamingOrSVE2p1], [ImmCheck<2, ImmCheck2_4_Mul2>]>; -def SVWHILELS_COUNT : SInst<"svwhilele_{d}[_{1}]", "}nni", "QcQsQiQl", MergeNone, "aarch64_sve_whilels_{d}", [IsOverloadNone, IsStreamingOrSVE2p1], [ImmCheck<2, ImmCheck2_4_Mul2>]>; -def SVWHILEHI_COUNT : SInst<"svwhilegt_{d}[_{1}]", "}nni", "QcQsQiQl", MergeNone, "aarch64_sve_whilehi_{d}", [IsOverloadNone, IsStreamingOrSVE2p1], [ImmCheck<2, ImmCheck2_4_Mul2>]>; -def SVWHILEHS_COUNT : SInst<"svwhilege_{d}[_{1}]", "}nni", "QcQsQiQl", MergeNone, "aarch64_sve_whilehs_{d}", [IsOverloadNone, IsStreamingOrSVE2p1], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILEGE_COUNT : SInst<"svwhilege_{d}[_{1}]", "}lli", "QcQsQiQl", MergeNone, "aarch64_sve_whilege_{d}", [IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILEGT_COUNT : SInst<"svwhilegt_{d}[_{1}]", "}lli", "QcQsQiQl", MergeNone, "aarch64_sve_whilegt_{d}", [IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILELE_COUNT : SInst<"svwhilele_{d}[_{1}]", "}lli", "QcQsQiQl", MergeNone, "aarch64_sve_whilele_{d}", [IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILELT_COUNT : SInst<"svwhilelt_{d}[_{1}]", "}lli", "QcQsQiQl", MergeNone, "aarch64_sve_whilelt_{d}", [IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILELO_COUNT : SInst<"svwhilelt_{d}[_{1}]", "}nni", "QcQsQiQl", MergeNone, "aarch64_sve_whilelo_{d}", [IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILELS_COUNT : SInst<"svwhilele_{d}[_{1}]", "}nni", "QcQsQiQl", MergeNone, "aarch64_sve_whilels_{d}", [IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILEHI_COUNT : SInst<"svwhilegt_{d}[_{1}]", "}nni", "QcQsQiQl", MergeNone, "aarch64_sve_whilehi_{d}", [IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILEHS_COUNT : SInst<"svwhilege_{d}[_{1}]", "}nni", "QcQsQiQl", MergeNone, "aarch64_sve_whilehs_{d}", [IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; } multiclass MultiVecLoad { - def SV # NAME # B_X2 : MInst<"sv" # i # "[_{2}]_x2", "2}c", "cUc", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # H_X2 : MInst<"sv" # i # "[_{2}]_x2", "2}c", "sUshb", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # W_X2 : MInst<"sv" # i # "[_{2}]_x2", "2}c", "iUif", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # D_X2 : MInst<"sv" # i # "[_{2}]_x2", "2}c", "lUld", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # B_X4 : MInst<"sv" # i # "[_{2}]_x4", "4}c", "cUc", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; - def SV # NAME # H_X4 : MInst<"sv" # i # "[_{2}]_x4", "4}c", "sUshb", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; - def SV # NAME # W_X4 : MInst<"sv" # i # "[_{2}]_x4", "4}c", "iUif", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; - def 
SV # NAME # D_X4 : MInst<"sv" # i # "[_{2}]_x4", "4}c", "lUld", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; - - def SV # NAME # B_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}]_x2", "2}cl", "cUc", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # H_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}]_x2", "2}cl", "sUshb", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # W_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}]_x2", "2}cl", "iUif", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # D_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}]_x2", "2}cl", "lUld", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # B_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}]_x4", "4}cl", "cUc", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; - def SV # NAME # H_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}]_x4", "4}cl", "sUshb", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; - def SV # NAME # W_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}]_x4", "4}cl", "iUif", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; - def SV # NAME # D_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}]_x4", "4}cl", "lUld", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + // FIXME: Replace IsStreamingCompatible with IsStreamingOrHasSVE2p1 when available (SME2 requires __arm_streaming) + def SV # NAME # B_X2 : MInst<"sv" # i # "[_{2}]_x2", "2}c", "cUc", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # H_X2 : MInst<"sv" # i # "[_{2}]_x2", "2}c", "sUshb", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # W_X2 : MInst<"sv" # i # "[_{2}]_x2", "2}c", "iUif", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # D_X2 : MInst<"sv" # i # "[_{2}]_x2", "2}c", "lUld", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # B_X4 : MInst<"sv" # i # "[_{2}]_x4", "4}c", "cUc", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # H_X4 : MInst<"sv" # i # "[_{2}]_x4", "4}c", "sUshb", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # W_X4 : MInst<"sv" # i # "[_{2}]_x4", "4}c", "iUif", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # D_X4 : MInst<"sv" # i # "[_{2}]_x4", "4}c", "lUld", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + + def SV # NAME # B_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}]_x2", "2}cl", "cUc", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # H_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}]_x2", "2}cl", "sUshb", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # W_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}]_x2", "2}cl", "iUif", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # D_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}]_x2", "2}cl", "lUld", [IsStructLoad, 
IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # B_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}]_x4", "4}cl", "cUc", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # H_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}]_x4", "4}cl", "sUshb", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # W_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}]_x4", "4}cl", "iUif", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # D_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}]_x4", "4}cl", "lUld", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; } let TargetGuard = "sve2p1|sme2" in { @@ -2015,23 +2017,24 @@ let TargetGuard = "sve2p1|sme2" in { } multiclass MultiVecStore { - def SV # NAME # B_X2 : MInst<"sv" # i # "[_{2}_x2]", "v}p2", "cUc", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # H_X2 : MInst<"sv" # i # "[_{2}_x2]", "v}p2", "sUshb", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # W_X2 : MInst<"sv" # i # "[_{2}_x2]", "v}p2", "iUif", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # D_X2 : MInst<"sv" # i # "[_{2}_x2]", "v}p2", "lUld", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # B_X4 : MInst<"sv" # i # "[_{2}_x4]", "v}p4", "cUc", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; - def SV # NAME # H_X4 : MInst<"sv" # i # "[_{2}_x4]", "v}p4", "sUshb", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; - def SV # NAME # W_X4 : MInst<"sv" # i # "[_{2}_x4]", "v}p4", "iUif", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; - def SV # NAME # D_X4 : MInst<"sv" # i # "[_{2}_x4]", "v}p4", "lUld", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; - - def SV # NAME # B_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}_x2]", "v}pl2", "cUc", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # H_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}_x2]", "v}pl2", "sUshb", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # W_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}_x2]", "v}pl2", "iUif", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # D_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}_x2]", "v}pl2", "lUld", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; - def SV # NAME # B_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}_x4]", "v}pl4", "cUc", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; - def SV # NAME # H_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}_x4]", "v}pl4", "sUshb", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; - def SV # NAME # W_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}_x4]", "v}pl4", "iUif", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; - def SV # NAME # D_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}_x4]", "v}pl4", "lUld", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_" # i # 
"_pn_x4">; + // FIXME: Replace IsStreamingCompatible with IsStreamingOrHasSVE2p1 when available (SME2 requires __arm_streaming) + def SV # NAME # B_X2 : MInst<"sv" # i # "[_{2}_x2]", "v}p2", "cUc", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # H_X2 : MInst<"sv" # i # "[_{2}_x2]", "v}p2", "sUshb", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # W_X2 : MInst<"sv" # i # "[_{2}_x2]", "v}p2", "iUif", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # D_X2 : MInst<"sv" # i # "[_{2}_x2]", "v}p2", "lUld", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # B_X4 : MInst<"sv" # i # "[_{2}_x4]", "v}p4", "cUc", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # H_X4 : MInst<"sv" # i # "[_{2}_x4]", "v}p4", "sUshb", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # W_X4 : MInst<"sv" # i # "[_{2}_x4]", "v}p4", "iUif", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # D_X4 : MInst<"sv" # i # "[_{2}_x4]", "v}p4", "lUld", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + + def SV # NAME # B_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}_x2]", "v}pl2", "cUc", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # H_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}_x2]", "v}pl2", "sUshb", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # W_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}_x2]", "v}pl2", "iUif", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # D_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}_x2]", "v}pl2", "lUld", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # B_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}_x4]", "v}pl4", "cUc", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # H_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}_x4]", "v}pl4", "sUshb", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # W_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}_x4]", "v}pl4", "iUif", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # D_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}_x4]", "v}pl4", "lUld", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; } let TargetGuard = "sve2p1|sme2" in { @@ -2048,20 +2051,21 @@ def SVDOT_LANE_X2_U : SInst<"svdot_lane[_{d}_{2}_{3}]", "ddhhi", "Ui", MergeNone def SVDOT_LANE_X2_F : SInst<"svdot_lane[_{d}_{2}_{3}]", "ddhhi", "f", MergeNone, "aarch64_sve_fdot_lane_x2", [], [ImmCheck<3, ImmCheck0_3>]>; } -let TargetGuard = "sve2p1|sme2" in { -def SVSCLAMP : SInst<"svclamp[_{d}]", "dddd", "csil", MergeNone, "aarch64_sve_sclamp", [IsStreamingOrSVE2p1], []>; -def SVUCLAMP : SInst<"svclamp[_{d}]", "dddd", "UcUsUiUl", MergeNone, "aarch64_sve_uclamp", [IsStreamingOrSVE2p1], []>; +let TargetGuard = "sve2p1|sme" in { +def SVSCLAMP : SInst<"svclamp[_{d}]", "dddd", "csil", MergeNone, "aarch64_sve_sclamp", [], []>; +def SVUCLAMP : SInst<"svclamp[_{d}]", 
"dddd", "UcUsUiUl", MergeNone, "aarch64_sve_uclamp", [], []>; defm SVREVD : SInstZPZ<"svrevd", "csilUcUsUiUlbhfd", "aarch64_sve_revd">; } let TargetGuard = "sve2p1|sme2" in { - def SVPTRUE_COUNT : SInst<"svptrue_{d}", "}v", "QcQsQiQl", MergeNone, "aarch64_sve_ptrue_{d}", [IsOverloadNone, IsStreamingOrSVE2p1], []>; + //FIXME: Replace IsStreamingCompatible with IsStreamingOrHasSVE2p1 when available + def SVPTRUE_COUNT : SInst<"svptrue_{d}", "}v", "QcQsQiQl", MergeNone, "aarch64_sve_ptrue_{d}", [IsOverloadNone, IsStreamingCompatible], []>; - def SVPFALSE_COUNT_ALIAS : SInst<"svpfalse_c", "}v", "", MergeNone, "", [IsOverloadNone, IsStreamingOrSVE2p1]>; + def SVPFALSE_COUNT_ALIAS : SInst<"svpfalse_c", "}v", "", MergeNone, "", [IsOverloadNone, IsStreamingCompatible]>; - def SVFCLAMP : SInst<"svclamp[_{d}]", "dddd", "hfd", MergeNone, "aarch64_sve_fclamp", [IsStreamingOrSVE2p1], []>; - def SVCNTP_COUNT : SInst<"svcntp_{d}", "n}i", "QcQsQiQl", MergeNone, "aarch64_sve_cntp_{d}", [IsOverloadNone, IsStreamingOrSVE2p1], [ImmCheck<1, ImmCheck2_4_Mul2>]>; + def SVFCLAMP : SInst<"svclamp[_{d}]", "dddd", "hfd", MergeNone, "aarch64_sve_fclamp", [IsStreamingCompatible], []>; + def SVCNTP_COUNT : SInst<"svcntp_{d}", "n}i", "QcQsQiQl", MergeNone, "aarch64_sve_cntp_{d}", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<1, ImmCheck2_4_Mul2>]>; } let TargetGuard = "(sve2|sme2),b16b16" in { @@ -2322,9 +2326,10 @@ let TargetGuard = "sme2" in { let TargetGuard = "sve2p1|sme2" in { // == BFloat16 multiply-subtract == - def SVBFMLSLB : SInst<"svbfmlslb[_{d}]", "dd$$", "f", MergeNone, "aarch64_sve_bfmlslb", [IsOverloadNone, IsStreamingOrSVE2p1], []>; - def SVBFMLSLT : SInst<"svbfmlslt[_{d}]", "dd$$", "f", MergeNone, "aarch64_sve_bfmlslt", [IsOverloadNone, IsStreamingOrSVE2p1], []>; +// FIXME: Make all of these IsStreamingOrSVE2p1 once that is added + def SVBFMLSLB : SInst<"svbfmlslb[_{d}]", "dd$$", "f", MergeNone, "aarch64_sve_bfmlslb", [IsOverloadNone, IsStreamingCompatible], []>; + def SVBFMLSLT : SInst<"svbfmlslt[_{d}]", "dd$$", "f", MergeNone, "aarch64_sve_bfmlslt", [IsOverloadNone, IsStreamingCompatible], []>; - def SVBFMLSLB_LANE : SInst<"svbfmlslb_lane[_{d}]", "dd$$i", "f", MergeNone, "aarch64_sve_bfmlslb_lane", [IsOverloadNone, IsStreamingOrSVE2p1], [ImmCheck<3, ImmCheck0_7>]>; - def SVBFMLSLT_LANE : SInst<"svbfmlslt_lane[_{d}]", "dd$$i", "f", MergeNone, "aarch64_sve_bfmlslt_lane", [IsOverloadNone, IsStreamingOrSVE2p1], [ImmCheck<3, ImmCheck0_7>]>; + def SVBFMLSLB_LANE : SInst<"svbfmlslb_lane[_{d}]", "dd$$i", "f", MergeNone, "aarch64_sve_bfmlslb_lane", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<3, ImmCheck0_7>]>; + def SVBFMLSLT_LANE : SInst<"svbfmlslt_lane[_{d}]", "dd$$i", "f", MergeNone, "aarch64_sve_bfmlslt_lane", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<3, ImmCheck0_7>]>; } diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td index ad29864440c96..0dba8493bad2d 100644 --- a/clang/include/clang/Basic/arm_sve_sme_incl.td +++ b/clang/include/clang/Basic/arm_sve_sme_incl.td @@ -227,7 +227,6 @@ def IsPreservesZA : FlagType<0x10000000000>; def IsReadZA : FlagType<0x20000000000>; def IsWriteZA : FlagType<0x40000000000>; def IsReductionQV : FlagType<0x80000000000>; -def IsStreamingOrSVE2p1 : FlagType<0x80000000000>; // Use for intrinsics that are common between sme/sme2 and sve2p1. 
// These must be kept in sync with the flags in include/clang/Basic/TargetBuiltins.h
 class ImmCheckType<int val> {
diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h
index e38fa5af56598..f0e0782e7abe9 100644
--- a/clang/lib/Basic/Targets/AArch64.h
+++ b/clang/lib/Basic/Targets/AArch64.h
@@ -50,7 +50,6 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo {
   bool HasMatMul = false;
   bool HasBFloat16 = false;
   bool HasSVE2 = false;
-  bool HasSVE2p1 = false;
   bool HasSVE2AES = false;
   bool HasSVE2SHA3 = false;
   bool HasSVE2SM4 = false;
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index f13164dc06386..3168d38dd66c3 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -2998,12 +2998,7 @@ static QualType getNeonEltType(NeonTypeFlags Flags, ASTContext &Context,
   llvm_unreachable("Invalid NeonTypeFlag!");
 }
 
-enum ArmStreamingType {
-  ArmNonStreaming,
-  ArmStreaming,
-  ArmStreamingCompatible,
-  ArmStreamingOrSVE2p1
-};
+enum ArmStreamingType { ArmNonStreaming, ArmStreaming, ArmStreamingCompatible };
 
 bool Sema::ParseSVEImmChecks(
     CallExpr *TheCall, SmallVector<std::tuple<int, int, int>, 3> &ImmChecks) {
@@ -3161,16 +3156,6 @@ static void checkArmStreamingBuiltin(Sema &S, CallExpr *TheCall,
                                      const FunctionDecl *FD,
                                      ArmStreamingType BuiltinType) {
   ArmStreamingType FnType = getArmStreamingFnType(FD);
-  if (BuiltinType == ArmStreamingOrSVE2p1) {
-    // Check intrinsics that are available in [sve2p1 or sme/sme2].
-    llvm::StringMap<bool> CallerFeatureMap;
-    S.Context.getFunctionFeatureMap(CallerFeatureMap, FD);
-    if (Builtin::evaluateRequiredTargetFeatures("sve2p1", CallerFeatureMap))
-      BuiltinType = ArmStreamingCompatible;
-    else
-      BuiltinType = ArmStreaming;
-  }
-
   if (FnType == ArmStreaming && BuiltinType == ArmNonStreaming) {
     S.Diag(TheCall->getBeginLoc(), diag::warn_attribute_arm_sm_incompat_builtin)
         << TheCall->getSourceRange() << "streaming";
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_revd.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_revd.c
index d82d69442b8ff..74a90583a173a 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_revd.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_revd.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \
-// RUN: -target-feature +sme2 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu \
 // RUN: -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \
@@ -9,7 +9,7 @@
 // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu \
 // RUN: -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -o /dev/null %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -o /dev/null %s
 
 #include <arm_sve.h>
 #ifdef 
SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmlsl.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmlsl.c index 22d951c069bc8..c1d14e16ad17b 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmlsl.c +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmlsl.c @@ -2,20 +2,20 @@ // REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -target-feature -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifndef TEST_SME2 #define ATTR #else -#define ATTR __arm_streaming +#define ATTR __arm_streaming_compatible #endif #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_cntp.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_cntp.c index 9bf55eaa6a08b..56b1d99262214 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_cntp.c +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_cntp.c @@ -3,19 +3,10 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - -x c++ %s | 
FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -O1 -Werror -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include -#ifndef TEST_SME2 -#define ATTR -#else -#define ATTR __arm_streaming -#endif - // CHECK-LABEL: @test_svcntp_c8_vlx2( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c8(target("aarch64.svcount") [[PNN:%.*]], i32 2) @@ -26,7 +17,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c8(target("aarch64.svcount") [[PNN:%.*]], i32 2) // CPP-CHECK-NEXT: ret i64 [[TMP0]] // -uint64_t test_svcntp_c8_vlx2(svcount_t pnn) ATTR { +uint64_t test_svcntp_c8_vlx2(svcount_t pnn) { return svcntp_c8(pnn, 2); } @@ -40,7 +31,7 @@ uint64_t test_svcntp_c8_vlx2(svcount_t pnn) ATTR { // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c8(target("aarch64.svcount") [[PNN:%.*]], i32 4) // CPP-CHECK-NEXT: ret i64 [[TMP0]] // -uint64_t test_svcntp_c8_vlx4(svcount_t pnn) ATTR { +uint64_t test_svcntp_c8_vlx4(svcount_t pnn) { return svcntp_c8(pnn, 4); } @@ -54,7 +45,7 @@ uint64_t test_svcntp_c8_vlx4(svcount_t pnn) ATTR { // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c16(target("aarch64.svcount") [[PNN:%.*]], i32 2) // CPP-CHECK-NEXT: ret i64 [[TMP0]] // -uint64_t test_svcntp_c16_vlx2(svcount_t pnn) ATTR { +uint64_t test_svcntp_c16_vlx2(svcount_t pnn) { return svcntp_c16(pnn, 2); } @@ -68,7 +59,7 @@ uint64_t test_svcntp_c16_vlx2(svcount_t pnn) ATTR { // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c16(target("aarch64.svcount") [[PNN:%.*]], i32 4) // CPP-CHECK-NEXT: ret i64 [[TMP0]] // -uint64_t test_svcntp_c16_vlx4(svcount_t pnn) ATTR { +uint64_t test_svcntp_c16_vlx4(svcount_t pnn) { return svcntp_c16(pnn, 4); } @@ -82,7 +73,7 @@ uint64_t test_svcntp_c16_vlx4(svcount_t pnn) ATTR { // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c32(target("aarch64.svcount") [[PNN:%.*]], i32 2) // CPP-CHECK-NEXT: ret i64 [[TMP0]] // -uint64_t test_svcntp_c32_vlx2(svcount_t pnn) ATTR { +uint64_t test_svcntp_c32_vlx2(svcount_t pnn) { return svcntp_c32(pnn, 2); } @@ -96,7 +87,7 @@ uint64_t test_svcntp_c32_vlx2(svcount_t pnn) ATTR { // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c32(target("aarch64.svcount") [[PNN:%.*]], i32 4) // CPP-CHECK-NEXT: ret i64 [[TMP0]] // -uint64_t test_svcntp_c32_vlx4(svcount_t pnn) ATTR { +uint64_t test_svcntp_c32_vlx4(svcount_t pnn) { return svcntp_c32(pnn, 4); } @@ -110,7 +101,7 @@ uint64_t test_svcntp_c32_vlx4(svcount_t pnn) ATTR { // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c64(target("aarch64.svcount") [[PNN:%.*]], i32 2) // CPP-CHECK-NEXT: ret i64 [[TMP0]] // -uint64_t test_svcntp_c64_vlx2(svcount_t pnn) ATTR { 
+uint64_t test_svcntp_c64_vlx2(svcount_t pnn) { return svcntp_c64(pnn, 2); } @@ -124,6 +115,6 @@ uint64_t test_svcntp_c64_vlx2(svcount_t pnn) ATTR { // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c64(target("aarch64.svcount") [[PNN:%.*]], i32 4) // CPP-CHECK-NEXT: ret i64 [[TMP0]] // -uint64_t test_svcntp_c64_vlx4(svcount_t pnn) ATTR { +uint64_t test_svcntp_c64_vlx4(svcount_t pnn) { return svcntp_c64(pnn, 4); } diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_fclamp.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_fclamp.c index 7687257701a6e..5d8c5b7b8a18c 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_fclamp.c +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_fclamp.c @@ -11,16 +11,10 @@ // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 \ // RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve \ -// RUN: -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include -#ifndef TEST_SME2 -#define ATTR -#else -#define ATTR __arm_streaming -#endif - #ifdef SVE_OVERLOADED_FORMS // A simple used,unused... macro, long enough to represent any SVE builtin. #define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 @@ -38,7 +32,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fclamp.nxv8f16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat16_t test_svclamp_f16(svfloat16_t op1, svfloat16_t op2, svfloat16_t op3) ATTR { +svfloat16_t test_svclamp_f16(svfloat16_t op1, svfloat16_t op2, svfloat16_t op3) { return SVE_ACLE_FUNC(svclamp, _f16, , )(op1, op2, op3); } @@ -52,7 +46,7 @@ svfloat16_t test_svclamp_f16(svfloat16_t op1, svfloat16_t op2, svfloat16_t op3) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fclamp.nxv4f32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat32_t test_svclamp_f32(svfloat32_t op1, svfloat32_t op2, svfloat32_t op3) ATTR { +svfloat32_t test_svclamp_f32(svfloat32_t op1, svfloat32_t op2, svfloat32_t op3) { return SVE_ACLE_FUNC(svclamp, _f32, , )(op1, op2, op3); } @@ -66,7 +60,7 @@ svfloat32_t test_svclamp_f32(svfloat32_t op1, svfloat32_t op2, svfloat32_t op3) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fclamp.nxv2f64( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat64_t test_svclamp_f64(svfloat64_t op1, svfloat64_t op2, svfloat64_t op3) ATTR { +svfloat64_t test_svclamp_f64(svfloat64_t op1, svfloat64_t op2, svfloat64_t op3) { return SVE_ACLE_FUNC(svclamp, _f64, , )(op1, op2, op3); } diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c index 7657165d8b3f6..6f1231e776aa3 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c @@ -1,8 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: 
%clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wno-unknown-attributes -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pext.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pext.c index 8f08b32618b05..a3206029019c3 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pext.c +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pext.c @@ -1,11 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -DTEST_SME2 -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -DTEST_SME2 -target-feature +sve -target-feature +sme2 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -S -DTEST_SME2 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pfalse.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pfalse.c index afdb038fb9312..19993e5418128 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pfalse.c +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pfalse.c @@ -1,20 +1,14 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S 
-passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include -#ifndef TEST_SME2 -#define ATTR -#else -#define ATTR __arm_streaming -#endif - // CHECK-LABEL: @test_svpfalse_c( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( zeroinitializer) @@ -25,7 +19,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( zeroinitializer) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svpfalse_c(void) ATTR +svcount_t test_svpfalse_c(void) __arm_streaming_compatible { return svpfalse_c(); } diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_psel.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_psel.c index de3f6a9a57bfe..73b7b0347dd97 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_psel.c +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_psel.c @@ -10,19 +10,9 @@ // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \ // RUN: -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu \ -// RUN: -target-feature +sme2 -S -DTEST_SME2 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include -#ifndef TEST_SME2 -#define ATTR -#else -#define ATTR __arm_streaming_compatible -#endif - // CHECK-LABEL: @test_svpsel_lane_b8( // CHECK-NEXT: entry: // CHECK-NEXT: [[ADD:%.*]] = add i32 [[IDX:%.*]], 15 @@ -35,7 +25,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.psel.nxv16i1( [[P1:%.*]], [[P2:%.*]], i32 [[ADD]]) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbool_t test_svpsel_lane_b8(svbool_t p1, svbool_t p2, uint32_t idx) ATTR { +svbool_t test_svpsel_lane_b8(svbool_t p1, svbool_t p2, uint32_t idx) __arm_streaming_compatible { return svpsel_lane_b8(p1, p2, idx + 15); } @@ -53,7 +43,7 @@ svbool_t test_svpsel_lane_b8(svbool_t p1, svbool_t p2, uint32_t idx) ATTR { // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.psel.nxv8i1( [[P1:%.*]], [[TMP0]], i32 [[ADD]]) // CPP-CHECK-NEXT: ret [[TMP1]] // -svbool_t test_svpsel_lane_b16(svbool_t p1, svbool_t p2, uint32_t idx) ATTR { +svbool_t test_svpsel_lane_b16(svbool_t p1, svbool_t p2, uint32_t idx) __arm_streaming_compatible { return svpsel_lane_b16(p1, p2, idx + 7); } @@ -71,7 +61,7 @@ svbool_t test_svpsel_lane_b16(svbool_t p1, 
svbool_t p2, uint32_t idx) ATTR { // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.psel.nxv4i1( [[P1:%.*]], [[TMP0]], i32 [[ADD]]) // CPP-CHECK-NEXT: ret [[TMP1]] // -svbool_t test_svpsel_lane_b32(svbool_t p1, svbool_t p2, uint32_t idx) ATTR { +svbool_t test_svpsel_lane_b32(svbool_t p1, svbool_t p2, uint32_t idx) __arm_streaming_compatible { return svpsel_lane_b32(p1, p2, idx + 3); } @@ -89,7 +79,7 @@ svbool_t test_svpsel_lane_b32(svbool_t p1, svbool_t p2, uint32_t idx) ATTR { // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.psel.nxv2i1( [[P1:%.*]], [[TMP0]], i32 [[ADD]]) // CPP-CHECK-NEXT: ret [[TMP1]] // -svbool_t test_svpsel_lane_b64(svbool_t p1, svbool_t p2, uint32_t idx) ATTR { +svbool_t test_svpsel_lane_b64(svbool_t p1, svbool_t p2, uint32_t idx) __arm_streaming_compatible { return svpsel_lane_b64(p1, p2, idx + 1); } @@ -109,7 +99,7 @@ svbool_t test_svpsel_lane_b64(svbool_t p1, svbool_t p2, uint32_t idx) ATTR { // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP1]]) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP2]] // -svcount_t test_svpsel_lane_c8(svcount_t p1, svbool_t p2, uint32_t idx) ATTR { +svcount_t test_svpsel_lane_c8(svcount_t p1, svbool_t p2, uint32_t idx) __arm_streaming_compatible { return svpsel_lane_c8(p1, p2, idx + 15); } @@ -131,7 +121,7 @@ svcount_t test_svpsel_lane_c8(svcount_t p1, svbool_t p2, uint32_t idx) ATTR { // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP2]]) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP3]] // -svcount_t test_svpsel_lane_c16(svcount_t p1, svbool_t p2, uint32_t idx) ATTR { +svcount_t test_svpsel_lane_c16(svcount_t p1, svbool_t p2, uint32_t idx) __arm_streaming_compatible { return svpsel_lane_c16(p1, p2, idx + 7); } @@ -153,7 +143,7 @@ svcount_t test_svpsel_lane_c16(svcount_t p1, svbool_t p2, uint32_t idx) ATTR { // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP2]]) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP3]] // -svcount_t test_svpsel_lane_c32(svcount_t p1, svbool_t p2, uint32_t idx) ATTR { +svcount_t test_svpsel_lane_c32(svcount_t p1, svbool_t p2, uint32_t idx) __arm_streaming_compatible { return svpsel_lane_c32(p1, p2, idx + 3); } @@ -175,6 +165,6 @@ svcount_t test_svpsel_lane_c32(svcount_t p1, svbool_t p2, uint32_t idx) ATTR { // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP2]]) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP3]] // -svcount_t test_svpsel_lane_c64(svcount_t p1, svbool_t p2, uint32_t idx) ATTR { +svcount_t test_svpsel_lane_c64(svcount_t p1, svbool_t p2, uint32_t idx) __arm_streaming_compatible { return svpsel_lane_c64(p1, p2, idx + 1); } diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_sclamp.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_sclamp.c index 04869fd550ec1..8c63a7455c79f 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_sclamp.c +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_sclamp.c @@ -10,21 +10,9 @@ // RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 \ // RUN: 
-S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 \ -// RUN: -S -DTEST_SME2 -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 \ -// RUN: -S -DTEST_SME2 -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 \ -// RUN: -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s #include -#ifndef TEST_SME2 -#define ATTR -#else -#define ATTR __arm_streaming -#endif - #ifdef SVE_OVERLOADED_FORMS // A simple used,unused... macro, long enough to represent any SVE builtin. #define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 @@ -42,7 +30,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sclamp.nxv16i8( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint8_t test_svclamp_s8(svint8_t op1, svint8_t op2, svint8_t op3) ATTR { +svint8_t test_svclamp_s8(svint8_t op1, svint8_t op2, svint8_t op3) { return SVE_ACLE_FUNC(svclamp, _s8, , )(op1, op2, op3); } @@ -56,7 +44,7 @@ svint8_t test_svclamp_s8(svint8_t op1, svint8_t op2, svint8_t op3) ATTR { // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sclamp.nxv8i16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint16_t test_svclamp_s16(svint16_t op1, svint16_t op2, svint16_t op3) ATTR { +svint16_t test_svclamp_s16(svint16_t op1, svint16_t op2, svint16_t op3) { return SVE_ACLE_FUNC(svclamp, _s16, , )(op1, op2, op3); } @@ -70,7 +58,7 @@ svint16_t test_svclamp_s16(svint16_t op1, svint16_t op2, svint16_t op3) ATTR { // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sclamp.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint32_t test_svclamp_s32(svint32_t op1, svint32_t op2, svint32_t op3) ATTR { +svint32_t test_svclamp_s32(svint32_t op1, svint32_t op2, svint32_t op3) { return SVE_ACLE_FUNC(svclamp, _s32, , )(op1, op2, op3); } @@ -84,7 +72,7 @@ svint32_t test_svclamp_s32(svint32_t op1, svint32_t op2, svint32_t op3) ATTR { // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sclamp.nxv2i64( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint64_t test_svclamp_s64(svint64_t op1, svint64_t op2, svint64_t op3) ATTR { +svint64_t test_svclamp_s64(svint64_t op1, svint64_t op2, svint64_t op3) { return SVE_ACLE_FUNC(svclamp, _s64, , )(op1, op2, op3); } diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_stnt1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_stnt1.c index b1ca27b7b68a1..0d8696a7634a7 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_stnt1.c +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_stnt1.c @@ -505,9 +505,11 @@ void test_svstnt1_f64_x4(svcount_t pn, float64_t *base, svfloat64x4_t v) ATTR return SVE_ACLE_FUNC(svstnt1,_f64_x4,,)(pn, base, v); } + // == VNUM variants == + // CHECK-LABEL: @test_svstnt1_vnum_u8_x2( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], i64 0) diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uclamp.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uclamp.c index 
37bfd4265a43a..b878986248877 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uclamp.c +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uclamp.c @@ -10,21 +10,9 @@ // RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 \ // RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 \ -// RUN: -S -DTEST_SME2 -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 \ -// RUN: -S -DTEST_SME2 -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 \ -// RUN: -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s #include -#ifndef TEST_SME2 -#define ATTR -#else -#define ATTR __arm_streaming -#endif - #ifdef SVE_OVERLOADED_FORMS // A simple used,unused... macro, long enough to represent any SVE builtin. #define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 @@ -42,7 +30,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uclamp.nxv16i8( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint8_t test_svclamp_u8(svuint8_t op1, svuint8_t op2, svuint8_t op3) ATTR { +svuint8_t test_svclamp_u8(svuint8_t op1, svuint8_t op2, svuint8_t op3) { return SVE_ACLE_FUNC(svclamp, _u8, , )(op1, op2, op3); } @@ -56,7 +44,7 @@ svuint8_t test_svclamp_u8(svuint8_t op1, svuint8_t op2, svuint8_t op3) ATTR { // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uclamp.nxv8i16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint16_t test_svclamp_u16(svuint16_t op1, svuint16_t op2, svuint16_t op3) ATTR { +svuint16_t test_svclamp_u16(svuint16_t op1, svuint16_t op2, svuint16_t op3) { return SVE_ACLE_FUNC(svclamp, _u16, , )(op1, op2, op3); } @@ -70,7 +58,7 @@ svuint16_t test_svclamp_u16(svuint16_t op1, svuint16_t op2, svuint16_t op3) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uclamp.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint32_t test_svclamp_u32(svuint32_t op1, svuint32_t op2, svuint32_t op3) ATTR { +svuint32_t test_svclamp_u32(svuint32_t op1, svuint32_t op2, svuint32_t op3) { return SVE_ACLE_FUNC(svclamp, _u32, , )(op1, op2, op3); } @@ -84,7 +72,7 @@ svuint32_t test_svclamp_u32(svuint32_t op1, svuint32_t op2, svuint32_t op3) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uclamp.nxv2i64( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint64_t test_svclamp_u64(svuint64_t op1, svuint64_t op2, svuint64_t op3) ATTR { +svuint64_t test_svclamp_u64(svuint64_t op1, svuint64_t op2, svuint64_t op3) { return SVE_ACLE_FUNC(svclamp, _u64, , )(op1, op2, op3); } diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_pn.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_pn.c index 11ebec9e7cbf1..143a43b4a9219 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_pn.c +++ 
b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_pn.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target @@ -16,11 +16,6 @@ #define SVE_ACLE_FUNC(A1, A2) A1##A2 #endif -#ifdef TEST_SME2 -#define ATTR __arm_streaming -#else -#define ATTR -#endif // WHILEGE @@ -34,7 +29,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilege_c8_vl2(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilege_c8_vl2(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilege_c8,_s64)(op1, op2, 2); } @@ -49,7 +44,7 @@ svcount_t test_svwhilege_c8_vl2(int64_t op1, int64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilege_c8_vl4(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilege_c8_vl4(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilege_c8,_s64)(op1, op2, 4); } @@ -64,7 +59,7 @@ svcount_t test_svwhilege_c8_vl4(int64_t op1, int64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilege_c16_vl2(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilege_c16_vl2(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilege_c16,_s64)(op1, op2, 2); } @@ -79,7 +74,7 @@ svcount_t test_svwhilege_c16_vl2(int64_t op1, int64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilege_c16_vl4(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilege_c16_vl4(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilege_c16,_s64)(op1, op2, 4); } @@ -94,7 +89,7 @@ svcount_t test_svwhilege_c16_vl4(int64_t op1, int64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret 
target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilege_c32_vl2(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilege_c32_vl2(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilege_c32,_s64)(op1, op2, 2); } @@ -109,7 +104,7 @@ svcount_t test_svwhilege_c32_vl2(int64_t op1, int64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilege_c32_vl4(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilege_c32_vl4(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilege_c32,_s64)(op1, op2, 4); } @@ -124,7 +119,7 @@ svcount_t test_svwhilege_c32_vl4(int64_t op1, int64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilege_c64_vl2(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilege_c64_vl2(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilege_c64,_s64)(op1, op2, 2); } @@ -139,7 +134,7 @@ svcount_t test_svwhilege_c64_vl2(int64_t op1, int64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilege_c64_vl4(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilege_c64_vl4(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilege_c64,_s64)(op1, op2, 4); } @@ -157,7 +152,7 @@ svcount_t test_svwhilege_c64_vl4(int64_t op1, int64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilegt_c8_vl2(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilegt_c8_vl2(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilegt_c8,_s64)(op1, op2, 2); } @@ -172,7 +167,7 @@ svcount_t test_svwhilegt_c8_vl2(int64_t op1, int64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilegt_c8_vl4(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilegt_c8_vl4(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilegt_c8,_s64)(op1, op2, 4); } @@ -187,7 +182,7 @@ svcount_t test_svwhilegt_c8_vl4(int64_t op1, int64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilegt_c16_vl2(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilegt_c16_vl2(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilegt_c16,_s64)(op1, op2, 2); } @@ -202,7 +197,7 @@ svcount_t test_svwhilegt_c16_vl2(int64_t op1, int64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilegt_c16_vl4(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilegt_c16_vl4(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilegt_c16,_s64)(op1, op2, 4); } @@ -217,7 +212,7 @@ svcount_t 
test_svwhilegt_c16_vl4(int64_t op1, int64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilegt_c32_vl2(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilegt_c32_vl2(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilegt_c32,_s64)(op1, op2, 2); } @@ -232,7 +227,7 @@ svcount_t test_svwhilegt_c32_vl2(int64_t op1, int64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilegt_c32_vl4(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilegt_c32_vl4(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilegt_c32,_s64)(op1, op2, 4); } @@ -247,7 +242,7 @@ svcount_t test_svwhilegt_c32_vl4(int64_t op1, int64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilegt_c64_vl2(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilegt_c64_vl2(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilegt_c64,_s64)(op1, op2, 2); } @@ -262,7 +257,7 @@ svcount_t test_svwhilegt_c64_vl2(int64_t op1, int64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilegt_c64_vl4(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilegt_c64_vl4(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilegt_c64,_s64)(op1, op2, 4); } @@ -280,7 +275,7 @@ svcount_t test_svwhilegt_c64_vl4(int64_t op1, int64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilehi_c8_vl2(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilehi_c8_vl2(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilegt_c8,_u64)(op1, op2, 2); } @@ -295,7 +290,7 @@ svcount_t test_svwhilehi_c8_vl2(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilehi_c8_vl4(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilehi_c8_vl4(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilegt_c8,_u64)(op1, op2, 4); } @@ -310,7 +305,7 @@ svcount_t test_svwhilehi_c8_vl4(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilehi_c16_vl2(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilehi_c16_vl2(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilegt_c16,_u64)(op1, op2, 2); } @@ -325,7 +320,7 @@ svcount_t test_svwhilehi_c16_vl2(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // 
-svcount_t test_svwhilehi_c16_vl4(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilehi_c16_vl4(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilegt_c16,_u64)(op1, op2, 4); } @@ -340,7 +335,7 @@ svcount_t test_svwhilehi_c16_vl4(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilehi_c32_vl2(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilehi_c32_vl2(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilegt_c32,_u64)(op1, op2, 2); } @@ -355,7 +350,7 @@ svcount_t test_svwhilehi_c32_vl2(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilehi_c32_vl4(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilehi_c32_vl4(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilegt_c32,_u64)(op1, op2, 4); } @@ -370,7 +365,7 @@ svcount_t test_svwhilehi_c32_vl4(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilehi_c64_vl2(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilehi_c64_vl2(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilegt_c64,_u64)(op1, op2, 2); } @@ -385,7 +380,7 @@ svcount_t test_svwhilehi_c64_vl2(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilehi_c64_vl4(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilehi_c64_vl4(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilegt_c64,_u64)(op1, op2, 4); } @@ -403,7 +398,7 @@ svcount_t test_svwhilehi_c64_vl4(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilehs_c8_vl2(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilehs_c8_vl2(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilege_c8,_u64)(op1, op2, 2); } @@ -418,7 +413,7 @@ svcount_t test_svwhilehs_c8_vl2(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilehs_c8_vl4(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilehs_c8_vl4(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilege_c8,_u64)(op1, op2, 4); } @@ -433,7 +428,7 @@ svcount_t test_svwhilehs_c8_vl4(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilehs_c16_vl2(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilehs_c16_vl2(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilege_c16,_u64)(op1, op2, 2); } @@ -448,7 +443,7 @@ svcount_t 
test_svwhilehs_c16_vl2(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilehs_c16_vl4(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilehs_c16_vl4(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilege_c16,_u64)(op1, op2, 4); } @@ -463,7 +458,7 @@ svcount_t test_svwhilehs_c16_vl4(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilehs_c32_vl2(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilehs_c32_vl2(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilege_c32,_u64)(op1, op2, 2); } @@ -478,7 +473,7 @@ svcount_t test_svwhilehs_c32_vl2(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilehs_c32_vl4(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilehs_c32_vl4(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilege_c32,_u64)(op1, op2, 4); } @@ -493,7 +488,7 @@ svcount_t test_svwhilehs_c32_vl4(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilehs_c64_vl2(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilehs_c64_vl2(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilege_c64,_u64)(op1, op2, 2); } @@ -508,7 +503,7 @@ svcount_t test_svwhilehs_c64_vl2(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilehs_c64_vl4(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilehs_c64_vl4(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilege_c64,_u64)(op1, op2, 4); } @@ -526,7 +521,7 @@ svcount_t test_svwhilehs_c64_vl4(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilele_c8_vl2(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilele_c8_vl2(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilele_c8,_s64)(op1, op2, 2); } @@ -541,7 +536,7 @@ svcount_t test_svwhilele_c8_vl2(int64_t op1, int64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilele_c8_vl4(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilele_c8_vl4(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilele_c8,_s64)(op1, op2, 4); } @@ -556,7 +551,7 @@ svcount_t test_svwhilele_c8_vl4(int64_t op1, int64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret 
target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilele_c16_vl2(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilele_c16_vl2(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilele_c16,_s64)(op1, op2, 2); } @@ -571,7 +566,7 @@ svcount_t test_svwhilele_c16_vl2(int64_t op1, int64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilele_c16_vl4(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilele_c16_vl4(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilele_c16,_s64)(op1, op2, 4); } @@ -586,7 +581,7 @@ svcount_t test_svwhilele_c16_vl4(int64_t op1, int64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilele_c32_vl2(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilele_c32_vl2(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilele_c32,_s64)(op1, op2, 2); } @@ -601,7 +596,7 @@ svcount_t test_svwhilele_c32_vl2(int64_t op1, int64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilele_c32_vl4(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilele_c32_vl4(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilele_c32,_s64)(op1, op2, 4); } @@ -616,7 +611,7 @@ svcount_t test_svwhilele_c32_vl4(int64_t op1, int64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilele_c64_vl2(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilele_c64_vl2(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilele_c64,_s64)(op1, op2, 2); } @@ -631,7 +626,7 @@ svcount_t test_svwhilele_c64_vl2(int64_t op1, int64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilele_c64_vl4(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilele_c64_vl4(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilele_c64,_s64)(op1, op2, 4); } @@ -649,7 +644,7 @@ svcount_t test_svwhilele_c64_vl4(int64_t op1, int64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilelo_c8_vl2(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilelo_c8_vl2(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilelt_c8,_u64)(op1, op2, 2); } @@ -664,7 +659,7 @@ svcount_t test_svwhilelo_c8_vl2(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilelo_c8_vl4(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilelo_c8_vl4(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilelt_c8,_u64)(op1, op2, 4); } @@ -679,7 +674,7 @@ svcount_t 
test_svwhilelo_c8_vl4(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilelo_c16_vl2(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilelo_c16_vl2(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilelt_c16,_u64)(op1, op2, 2); } @@ -694,7 +689,7 @@ svcount_t test_svwhilelo_c16_vl2(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilelo_c16_vl4(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilelo_c16_vl4(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilelt_c16,_u64)(op1, op2, 4); } @@ -709,7 +704,7 @@ svcount_t test_svwhilelo_c16_vl4(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilelo_c32_vl2(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilelo_c32_vl2(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilelt_c32,_u64)(op1, op2, 2); } @@ -724,7 +719,7 @@ svcount_t test_svwhilelo_c32_vl2(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilelo_c32_vl4(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilelo_c32_vl4(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilelt_c32,_u64)(op1, op2, 4); } @@ -739,7 +734,7 @@ svcount_t test_svwhilelo_c32_vl4(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilelo_c64_vl2(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilelo_c64_vl2(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilelt_c64,_u64)(op1, op2, 2); } @@ -754,7 +749,7 @@ svcount_t test_svwhilelo_c64_vl2(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilelo_c64_vl4(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilelo_c64_vl4(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilelt_c64,_u64)(op1, op2, 4); } @@ -772,7 +767,7 @@ svcount_t test_svwhilelo_c64_vl4(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilels_c8_vl2(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilels_c8_vl2(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilele_c8,_u64)(op1, op2, 2); } @@ -787,7 +782,7 @@ svcount_t test_svwhilels_c8_vl2(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret 
target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilels_c8_vl4(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilels_c8_vl4(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilele_c8,_u64)(op1, op2, 4); } @@ -802,7 +797,7 @@ svcount_t test_svwhilels_c8_vl4(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilels_c16_vl2(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilels_c16_vl2(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilele_c16,_u64)(op1, op2, 2); } @@ -817,7 +812,7 @@ svcount_t test_svwhilels_c16_vl2(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilels_c16_vl4(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilels_c16_vl4(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilele_c16,_u64)(op1, op2, 4); } @@ -832,7 +827,7 @@ svcount_t test_svwhilels_c16_vl4(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilels_c32_vl2(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilels_c32_vl2(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilele_c32,_u64)(op1, op2, 2); } @@ -847,7 +842,7 @@ svcount_t test_svwhilels_c32_vl2(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilels_c32_vl4(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilels_c32_vl4(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilele_c32,_u64)(op1, op2, 4); } @@ -862,7 +857,7 @@ svcount_t test_svwhilels_c32_vl4(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilels_c64_vl2(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilels_c64_vl2(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilele_c64,_u64)(op1, op2, 2); } @@ -877,7 +872,7 @@ svcount_t test_svwhilels_c64_vl2(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilels_c64_vl4(uint64_t op1, uint64_t op2) ATTR +svcount_t test_svwhilels_c64_vl4(uint64_t op1, uint64_t op2) { return SVE_ACLE_FUNC(svwhilele_c64,_u64)(op1, op2, 4); } @@ -895,7 +890,7 @@ svcount_t test_svwhilels_c64_vl4(uint64_t op1, uint64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilelt_c8_vl2(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilelt_c8_vl2(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilelt_c8,_s64)(op1, op2, 2); } @@ 
-910,7 +905,7 @@ svcount_t test_svwhilelt_c8_vl2(int64_t op1, int64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilelt_c8_vl4(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilelt_c8_vl4(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilelt_c8,_s64)(op1, op2, 4); } @@ -925,7 +920,7 @@ svcount_t test_svwhilelt_c8_vl4(int64_t op1, int64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilelt_c16_vl2(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilelt_c16_vl2(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilelt_c16,_s64)(op1, op2, 2); } @@ -940,7 +935,7 @@ svcount_t test_svwhilelt_c16_vl2(int64_t op1, int64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilelt_c16_vl4(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilelt_c16_vl4(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilelt_c16,_s64)(op1, op2, 4); } @@ -955,7 +950,7 @@ svcount_t test_svwhilelt_c16_vl4(int64_t op1, int64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilelt_c32_vl2(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilelt_c32_vl2(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilelt_c32,_s64)(op1, op2, 2); } @@ -970,7 +965,7 @@ svcount_t test_svwhilelt_c32_vl2(int64_t op1, int64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilelt_c32_vl4(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilelt_c32_vl4(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilelt_c32,_s64)(op1, op2, 4); } @@ -985,7 +980,7 @@ svcount_t test_svwhilelt_c32_vl4(int64_t op1, int64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilelt_c64_vl2(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilelt_c64_vl2(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilelt_c64,_s64)(op1, op2, 2); } @@ -1000,7 +995,7 @@ svcount_t test_svwhilelt_c64_vl2(int64_t op1, int64_t op2) ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) // CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] // -svcount_t test_svwhilelt_c64_vl4(int64_t op1, int64_t op2) ATTR +svcount_t test_svwhilelt_c64_vl4(int64_t op1, int64_t op2) { return SVE_ACLE_FUNC(svwhilelt_c64,_s64)(op1, op2, 4); } diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp index 5118f743174c2..6a6370bf99b10 100644 --- a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp +++ 
b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp @@ -237,7 +237,7 @@ void test_svluti4_lane_zt_x2(svuint8_t zn_u8) __arm_streaming __arm_shared_za __ svluti4_lane_zt_f32_x2(0, zn_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} } -void test_bfmlslb_bad_lane(svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming { +void test_bfmlslb_bad_lane(svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming_compatible { svbfmlslb_lane_f32(zda, zn, zm, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} svbfmlslt_lane_f32(zda, zn, zm, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} } diff --git a/clang/test/Sema/aarch64-sme2-sve2p1-diagnostics.c b/clang/test/Sema/aarch64-sme2-sve2p1-diagnostics.c deleted file mode 100644 index 4debc14190aa8..0000000000000 --- a/clang/test/Sema/aarch64-sme2-sve2p1-diagnostics.c +++ /dev/null @@ -1,37 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -fsyntax-only -verify %s - -// REQUIRES: aarch64-registered-target -#include "arm_sve.h" - -//svldnt1: - -__attribute__((target("+sme2"))) -svuint8x2_t sme2_or_sve2p1_intrinsic_test_sme2_invalid(svcount_t png, const uint8_t *rn) { - // expected-warning@+1 {{builtin call has undefined behaviour when called from a non-streaming function}} - return svldnt1_u8_x2(png, rn); -} - -__attribute__((target("+sme2"))) -svint16x4_t sme2_or_sve2p1_intrinsic_test_sme2(svcount_t png, const int16_t *rn) __arm_streaming { - // expected-no-warning - return svldnt1_s16_x4(png, rn); -} - -__attribute__((target("+sve2p1"))) -svuint32x2_t sme2_or_sve2p1_intrinsic_test_sve2p1(svcount_t png, const uint32_t *rn) { - // expected-no-warning - return svldnt1_u32_x2(png, rn); -} - -__attribute__((target("+sme2,+sve2p1"))) -svint64x4_t sme2_or_sve2p1_intrinsic_test_both_arm_streaming(svcount_t png, const int64_t *rn) __arm_streaming { - // expected-no-warning - return svldnt1_s64_x4(png, rn); -} - -__attribute__((target("+sme2,+sve2p1"))) -svint64x4_t sme2_or_sve2p1_intrinsic_test_both_no_arm_streaming(svcount_t png, const int64_t *rn) { - // expected-no-warning - return svldnt1_s64_x4(png, rn); -} diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp index 5de2223e71b04..6c302da106a2c 100644 --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -1773,14 +1773,11 @@ void SVEEmitter::createStreamingAttrs(raw_ostream &OS, ACLEKind Kind) { llvm::StringMap> StreamingMap; uint64_t IsStreamingFlag = getEnumValueForFlag("IsStreaming"); - uint64_t IsStreamingOrSVE2p1Flag = getEnumValueForFlag("IsStreamingOrSVE2p1"); uint64_t IsStreamingCompatibleFlag = getEnumValueForFlag("IsStreamingCompatible"); for (auto &Def : Defs) { if (Def->isFlagSet(IsStreamingFlag)) StreamingMap["ArmStreaming"].insert(Def->getMangledName()); - else if (Def->isFlagSet(IsStreamingOrSVE2p1Flag)) - StreamingMap["ArmStreamingOrSVE2p1"].insert(Def->getMangledName()); else if (Def->isFlagSet(IsStreamingCompatibleFlag)) StreamingMap["ArmStreamingCompatible"].insert(Def->getMangledName()); else From 917b404e2ccdcc31d2d64971ad094b80967a240b Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Thu, 4 Jan 2024 09:04:05 -0800 Subject: [PATCH 250/313] Add support for inline DWARF source files. (#75880) LLVM supports DWARF 5 linetable extension to store source files inline in DWARF. 
This is particularly useful for compiler-generated source code. This implementation tries to materialize them as temporary files lazily, so SBAPI clients don't need to be aware of them. rdar://110926168 --- lldb/include/lldb/Symbol/CompileUnit.h | 23 ++-- lldb/include/lldb/Symbol/SymbolFile.h | 2 +- lldb/include/lldb/Symbol/SymbolFileOnDemand.h | 2 +- lldb/include/lldb/Utility/FileSpecList.h | 106 ++++++++++++----- lldb/source/API/SBCompileUnit.cpp | 2 +- lldb/source/Commands/CommandObjectSource.cpp | 2 +- lldb/source/Core/ModuleList.cpp | 2 +- .../Clang/ClangUserExpression.cpp | 12 +- .../Clang/CppModuleConfiguration.cpp | 6 +- .../Breakpad/SymbolFileBreakpad.cpp | 5 +- .../SymbolFile/Breakpad/SymbolFileBreakpad.h | 3 +- .../Plugins/SymbolFile/CTF/SymbolFileCTF.h | 2 +- .../SymbolFile/DWARF/SymbolFileDWARF.cpp | 107 ++++++++++++------ .../SymbolFile/DWARF/SymbolFileDWARF.h | 9 +- .../DWARF/SymbolFileDWARFDebugMap.cpp | 4 +- .../DWARF/SymbolFileDWARFDebugMap.h | 2 +- .../Plugins/SymbolFile/JSON/SymbolFileJSON.h | 2 +- .../NativePDB/SymbolFileNativePDB.cpp | 4 +- .../NativePDB/SymbolFileNativePDB.h | 2 +- .../Plugins/SymbolFile/PDB/SymbolFilePDB.cpp | 2 +- .../Plugins/SymbolFile/PDB/SymbolFilePDB.h | 2 +- .../SymbolFile/Symtab/SymbolFileSymtab.cpp | 2 +- .../SymbolFile/Symtab/SymbolFileSymtab.h | 2 +- lldb/source/Symbol/CompileUnit.cpp | 13 +-- lldb/source/Symbol/SymbolFileOnDemand.cpp | 2 +- lldb/source/Utility/FileSpecList.cpp | 58 ++++++++-- .../inline-sourcefile/Makefile | 11 ++ .../TestInlineSourceFiles.py | 15 +++ .../inline-sourcefile/inline.ll | 39 +++++++ .../functionalities/inline-sourcefile/main.c | 7 ++ lldb/unittests/Core/FileSpecListTest.cpp | 8 +- 31 files changed, 335 insertions(+), 123 deletions(-) create mode 100644 lldb/test/API/functionalities/inline-sourcefile/Makefile create mode 100644 lldb/test/API/functionalities/inline-sourcefile/TestInlineSourceFiles.py create mode 100644 lldb/test/API/functionalities/inline-sourcefile/inline.ll create mode 100644 lldb/test/API/functionalities/inline-sourcefile/main.c diff --git a/lldb/include/lldb/Symbol/CompileUnit.h b/lldb/include/lldb/Symbol/CompileUnit.h index 93f191b499858..89e853ab599d0 100644 --- a/lldb/include/lldb/Symbol/CompileUnit.h +++ b/lldb/include/lldb/Symbol/CompileUnit.h @@ -112,10 +112,13 @@ class CompileUnit : public std::enable_shared_from_this, /// the compile unit is optimized will be made when /// CompileUnit::GetIsOptimized() is called. /// + /// \param[in] support_files + /// An rvalue list of already parsed support files. /// \see lldb::LanguageType CompileUnit(const lldb::ModuleSP &module_sp, void *user_data, const FileSpec &file_spec, lldb::user_id_t uid, - lldb::LanguageType language, lldb_private::LazyBool is_optimized); + lldb::LanguageType language, lldb_private::LazyBool is_optimized, + SupportFileList &&support_files = {}); /// Add a function to this compile unit. /// @@ -226,6 +229,9 @@ class CompileUnit : public std::enable_shared_from_this, /// Return the primary source file associated with this compile unit. const FileSpec &GetPrimaryFile() const { return m_file_spec; } + /// Return the primary source file associated with this compile unit. + void SetPrimaryFile(const FileSpec &fs) { m_file_spec = fs; } + /// Get the line table for the compile unit. /// /// Called by clients and the SymbolFile plug-in. The SymbolFile plug-ins @@ -265,7 +271,13 @@ class CompileUnit : public std::enable_shared_from_this, /// /// \return /// A support file list object. 
- const FileSpecList &GetSupportFiles(); + const SupportFileList &GetSupportFiles(); + + /// Used by plugins that parse the support file list. + SupportFileList &GetSupportFileList() { + m_flags.Set(flagsParsedSupportFiles); + return m_support_files; + } /// Get the compile unit's imported module list. /// @@ -331,8 +343,6 @@ class CompileUnit : public std::enable_shared_from_this, /// A line table object pointer that this object now owns. void SetLineTable(LineTable *line_table); - void SetSupportFiles(FileSpecList support_files); - void SetDebugMacros(const DebugMacrosSP &debug_macros); /// Set accessor for the variable list. @@ -410,9 +420,8 @@ class CompileUnit : public std::enable_shared_from_this, std::vector m_imported_modules; /// The primary file associated with this compile unit. FileSpec m_file_spec; - /// Files associated with this compile unit's line table and - /// declarations. - FileSpecList m_support_files; + /// Files associated with this compile unit's line table and declarations. + SupportFileList m_support_files; /// Line table that will get parsed on demand. std::unique_ptr m_line_table_up; /// Debug macros that will get parsed on demand. diff --git a/lldb/include/lldb/Symbol/SymbolFile.h b/lldb/include/lldb/Symbol/SymbolFile.h index c9a2a647a039d..f356f7b789fa3 100644 --- a/lldb/include/lldb/Symbol/SymbolFile.h +++ b/lldb/include/lldb/Symbol/SymbolFile.h @@ -197,7 +197,7 @@ class SymbolFile : public PluginInterface { return false; } virtual bool ParseSupportFiles(CompileUnit &comp_unit, - FileSpecList &support_files) = 0; + SupportFileList &support_files) = 0; virtual size_t ParseTypes(CompileUnit &comp_unit) = 0; virtual bool ParseIsOptimized(CompileUnit &comp_unit) { return false; } diff --git a/lldb/include/lldb/Symbol/SymbolFileOnDemand.h b/lldb/include/lldb/Symbol/SymbolFileOnDemand.h index cde9f3c3b8ce1..4e3009941aa7d 100644 --- a/lldb/include/lldb/Symbol/SymbolFileOnDemand.h +++ b/lldb/include/lldb/Symbol/SymbolFileOnDemand.h @@ -81,7 +81,7 @@ class SymbolFileOnDemand : public lldb_private::SymbolFile { llvm::function_ref) override; bool ParseSupportFiles(lldb_private::CompileUnit &comp_unit, - lldb_private::FileSpecList &support_files) override; + lldb_private::SupportFileList &support_files) override; bool ParseIsOptimized(lldb_private::CompileUnit &comp_unit) override; diff --git a/lldb/include/lldb/Utility/FileSpecList.h b/lldb/include/lldb/Utility/FileSpecList.h index 77587aa917916..8cccb19499991 100644 --- a/lldb/include/lldb/Utility/FileSpecList.h +++ b/lldb/include/lldb/Utility/FileSpecList.h @@ -17,6 +17,86 @@ namespace lldb_private { class Stream; +/// Wraps either a FileSpec that represents a local file or a source +/// file whose contents is known (for example because it can be +/// reconstructed from debug info), but that hasn't been written to a +/// file yet. +class SupportFile { +protected: + FileSpec m_file_spec; + +public: + SupportFile(const FileSpec &spec) : m_file_spec(spec) {} + SupportFile(const SupportFile &other) = delete; + SupportFile(SupportFile &&other) = default; + virtual ~SupportFile() = default; + bool operator==(const SupportFile &other) { + return m_file_spec == other.m_file_spec; + } + /// Return the file name only. Useful for resolving breakpoints by file name. + const FileSpec &GetSpecOnly() const { return m_file_spec; }; + /// Materialize the file to disk and return the path to that temporary file. + virtual const FileSpec &Materialize() { return m_file_spec; } +}; + +/// A list of support files for a CompileUnit. 
+class SupportFileList { +public: + SupportFileList(){}; + SupportFileList(const SupportFileList &) = delete; + SupportFileList(SupportFileList &&other) = default; + + typedef std::vector> collection; + typedef collection::const_iterator const_iterator; + const_iterator begin() const { return m_files.begin(); } + const_iterator end() const { return m_files.end(); } + + void Append(const FileSpec &file) { + return Append(std::make_unique(file)); + } + void Append(std::unique_ptr &&file) { + m_files.push_back(std::move(file)); + } + // FIXME: Only used by SymbolFilePDB. Replace with a DenseSet at call site. + bool AppendIfUnique(const FileSpec &file); + size_t GetSize() const { return m_files.size(); } + const FileSpec &GetFileSpecAtIndex(size_t idx) const; + size_t FindFileIndex(size_t idx, const FileSpec &file, bool full) const; + /// Find a compatible file index. + /// + /// Find the index of a compatible file in the file spec list that matches \a + /// file starting \a idx entries into the file spec list. A file is considered + /// compatible if: + /// - The file matches exactly (only filename if \a file has no directory) + /// - If \a file is relative and any file in the list has this same suffix + /// - If any file in the list is relative and the relative path is a suffix + /// of \a file + /// + /// This is used to implement better matching for setting breakpoints in + /// source files where an IDE might specify a full path when setting the + /// breakpoint and debug info contains relative paths, if a user specifies + /// a relative path when setting a breakpoint. + /// + /// \param[in] idx + /// An index into the file list. + /// + /// \param[in] file + /// The file specification to search for. + /// + /// \return + /// The index of the file that matches \a file if it is found, + /// else UINT32_MAX is returned. + size_t FindCompatibleIndex(size_t idx, const FileSpec &file) const; + + template void EmplaceBack(Args &&...args) { + m_files.push_back( + std::make_unique(FileSpec(std::forward(args)...))); + } + +protected: + collection m_files; ///< A collection of FileSpec objects. +}; + /// \class FileSpecList FileSpecList.h "lldb/Utility/FileSpecList.h" /// A file collection class. /// @@ -114,32 +194,6 @@ class FileSpecList { /// else UINT32_MAX is returned. size_t FindFileIndex(size_t idx, const FileSpec &file, bool full) const; - /// Find a compatible file index. - /// - /// Find the index of a compatible file in the file spec list that matches \a - /// file starting \a idx entries into the file spec list. A file is considered - /// compatible if: - /// - The file matches exactly (only filename if \a file has no directory) - /// - If \a file is relative and any file in the list has this same suffix - /// - If any file in the list is relative and the relative path is a suffix - /// of \a file - /// - /// This is used to implement better matching for setting breakpoints in - /// source files where an IDE might specify a full path when setting the - /// breakpoint and debug info contains relative paths, if a user specifies - /// a relative path when setting a breakpoint. - /// - /// \param[in] idx - /// An index into the file list. - /// - /// \param[in] file - /// The file specification to search for. - /// - /// \return - /// The index of the file that matches \a file if it is found, - /// else UINT32_MAX is returned. - size_t FindCompatibleIndex(size_t idx, const FileSpec &file) const; - /// Get file at index. /// /// Gets a file from the file list. 
If \a idx is not a valid index, an empty diff --git a/lldb/source/API/SBCompileUnit.cpp b/lldb/source/API/SBCompileUnit.cpp index 3aa65e225d7ab..65fdb11032b9c 100644 --- a/lldb/source/API/SBCompileUnit.cpp +++ b/lldb/source/API/SBCompileUnit.cpp @@ -171,7 +171,7 @@ uint32_t SBCompileUnit::FindSupportFileIndex(uint32_t start_idx, LLDB_INSTRUMENT_VA(this, start_idx, sb_file, full); if (m_opaque_ptr) { - const FileSpecList &support_files = m_opaque_ptr->GetSupportFiles(); + const SupportFileList &support_files = m_opaque_ptr->GetSupportFiles(); return support_files.FindFileIndex(start_idx, sb_file.ref(), full); } return 0; diff --git a/lldb/source/Commands/CommandObjectSource.cpp b/lldb/source/Commands/CommandObjectSource.cpp index db158a7f52630..cabf6f0436f17 100644 --- a/lldb/source/Commands/CommandObjectSource.cpp +++ b/lldb/source/Commands/CommandObjectSource.cpp @@ -204,7 +204,7 @@ class CommandObjectSourceInfo : public CommandObjectParsed { if (cu) { assert(file_spec.GetFilename().AsCString()); bool has_path = (file_spec.GetDirectory().AsCString() != nullptr); - const FileSpecList &cu_file_list = cu->GetSupportFiles(); + const SupportFileList &cu_file_list = cu->GetSupportFiles(); size_t file_idx = cu_file_list.FindFileIndex(0, file_spec, has_path); if (file_idx != UINT32_MAX) { // Update the file to how it appears in the CU. diff --git a/lldb/source/Core/ModuleList.cpp b/lldb/source/Core/ModuleList.cpp index aa89c93c8d052..2180f29f36942 100644 --- a/lldb/source/Core/ModuleList.cpp +++ b/lldb/source/Core/ModuleList.cpp @@ -164,7 +164,7 @@ void ModuleListProperties::UpdateSymlinkMappings() { llvm::sys::ScopedWriter lock(m_symlink_paths_mutex); const bool notify = false; m_symlink_paths.Clear(notify); - for (FileSpec symlink : list) { + for (auto symlink : list) { FileSpec resolved; Status status = FileSystem::Instance().Readlink(symlink, resolved); if (status.Success()) diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp index 68bdd96e8adb0..30bc81c9ed8c1 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp @@ -488,18 +488,18 @@ CppModuleConfiguration GetModuleConfig(lldb::LanguageType language, // Build a list of files we need to analyze to build the configuration. FileSpecList files; - for (const FileSpec &f : sc.comp_unit->GetSupportFiles()) - files.AppendIfUnique(f); + for (auto &f : sc.comp_unit->GetSupportFiles()) + files.AppendIfUnique(f->Materialize()); // We also need to look at external modules in the case of -gmodules as they // contain the support files for libc++ and the C library. 
llvm::DenseSet visited_symbol_files; sc.comp_unit->ForEachExternalModule( visited_symbol_files, [&files](Module &module) { for (std::size_t i = 0; i < module.GetNumCompileUnits(); ++i) { - const FileSpecList &support_files = + const SupportFileList &support_files = module.GetCompileUnitAtIndex(i)->GetSupportFiles(); - for (const FileSpec &f : support_files) { - files.AppendIfUnique(f); + for (auto &f : support_files) { + files.AppendIfUnique(f->Materialize()); } } return false; @@ -508,7 +508,7 @@ CppModuleConfiguration GetModuleConfig(lldb::LanguageType language, LLDB_LOG(log, "[C++ module config] Found {0} support files to analyze", files.GetSize()); if (log && log->GetVerbose()) { - for (const FileSpec &f : files) + for (auto &f : files) LLDB_LOGV(log, "[C++ module config] Analyzing support file: {0}", f.GetPath()); } diff --git a/lldb/source/Plugins/ExpressionParser/Clang/CppModuleConfiguration.cpp b/lldb/source/Plugins/ExpressionParser/Clang/CppModuleConfiguration.cpp index 62443d1290dc7..f43a04488230f 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/CppModuleConfiguration.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/CppModuleConfiguration.cpp @@ -134,9 +134,9 @@ bool CppModuleConfiguration::hasValidConfig() { CppModuleConfiguration::CppModuleConfiguration( const FileSpecList &support_files, const llvm::Triple &triple) { // Analyze all files we were given to build the configuration. - bool error = !llvm::all_of(support_files, - std::bind(&CppModuleConfiguration::analyzeFile, - this, std::placeholders::_1, triple)); + bool error = !llvm::all_of(support_files, [&](auto &file) { + return CppModuleConfiguration::analyzeFile(file, triple); + }); // If we have a valid configuration at this point, set the // include directories and module list that should be used. 
if (!error && hasValidConfig()) { diff --git a/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.cpp b/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.cpp index 729d6af02402d..47c8074adc5b7 100644 --- a/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.cpp +++ b/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.cpp @@ -278,13 +278,14 @@ bool SymbolFileBreakpad::ParseLineTable(CompileUnit &comp_unit) { } bool SymbolFileBreakpad::ParseSupportFiles(CompileUnit &comp_unit, - FileSpecList &support_files) { + SupportFileList &support_files) { std::lock_guard guard(GetModuleMutex()); CompUnitData &data = m_cu_data->GetEntryRef(comp_unit.GetID()).data; if (!data.support_files) ParseLineTableAndSupportFiles(comp_unit, data); - support_files = std::move(*data.support_files); + for (auto &fs : *data.support_files) + support_files.Append(fs); return true; } diff --git a/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.h b/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.h index 214fbdd3ff3aa..41e4e3b258014 100644 --- a/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.h +++ b/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.h @@ -73,7 +73,7 @@ class SymbolFileBreakpad : public SymbolFileCommon { bool ParseDebugMacros(CompileUnit &comp_unit) override { return false; } bool ParseSupportFiles(CompileUnit &comp_unit, - FileSpecList &support_files) override; + SupportFileList &support_files) override; size_t ParseTypes(CompileUnit &cu) override { return 0; } bool ParseImportedModules( @@ -195,7 +195,6 @@ class SymbolFileBreakpad : public SymbolFileCommon { Bookmark bookmark; std::optional support_files; std::unique_ptr line_table_up; - }; uint32_t CalculateNumCompileUnits() override; diff --git a/lldb/source/Plugins/SymbolFile/CTF/SymbolFileCTF.h b/lldb/source/Plugins/SymbolFile/CTF/SymbolFileCTF.h index f111937dbd6ef..3a80138fffbc3 100644 --- a/lldb/source/Plugins/SymbolFile/CTF/SymbolFileCTF.h +++ b/lldb/source/Plugins/SymbolFile/CTF/SymbolFileCTF.h @@ -66,7 +66,7 @@ class SymbolFileCTF : public lldb_private::SymbolFileCommon { bool ParseDebugMacros(CompileUnit &comp_unit) override { return false; } bool ParseSupportFiles(CompileUnit &comp_unit, - FileSpecList &support_files) override { + SupportFileList &support_files) override { return false; } diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index 505ea29ca4d4f..447930ffe07b3 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -10,6 +10,7 @@ #include "llvm/DebugInfo/DWARF/DWARFDebugLoc.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/FileUtilities.h" #include "llvm/Support/Format.h" #include "llvm/Support/Threading.h" @@ -209,17 +210,14 @@ GetFileByIndex(const llvm::DWARFDebugLine::Prologue &prologue, size_t idx, return std::move(rel_path); } -static FileSpecList -ParseSupportFilesFromPrologue(const lldb::ModuleSP &module, - const llvm::DWARFDebugLine::Prologue &prologue, - FileSpec::Style style, - llvm::StringRef compile_dir = {}) { - FileSpecList support_files; - +static void ParseSupportFilesFromPrologue( + SupportFileList &support_files, const lldb::ModuleSP &module, + const llvm::DWARFDebugLine::Prologue &prologue, FileSpec::Style style, + llvm::StringRef compile_dir = {}) { // Handle the case where there are no files first to avoid having to special // case this later. 
if (prologue.FileNames.empty()) - return support_files; + return; // Before DWARF v5, the line table indexes were one based. const bool is_one_based = prologue.getVersion() < 5; @@ -235,6 +233,53 @@ ParseSupportFilesFromPrologue(const lldb::ModuleSP &module, for (size_t idx = first_file_idx; idx <= last_file_idx; ++idx) { std::string remapped_file; if (auto file_path = GetFileByIndex(prologue, idx, compile_dir, style)) { + auto entry = prologue.getFileNameEntry(idx); + auto source = entry.Source.getAsCString(); + if (!source) + consumeError(source.takeError()); + else { + llvm::StringRef source_ref(*source); + if (!source_ref.empty()) { + /// Wrap a path for an in-DWARF source file. Lazily write it + /// to disk when Materialize() is called. + struct LazyDWARFSourceFile : public SupportFile { + LazyDWARFSourceFile(const FileSpec &fs, llvm::StringRef source, + FileSpec::Style style) + : SupportFile(fs), source(source), style(style) {} + FileSpec tmp_file; + /// The file contents buffer. + llvm::StringRef source; + /// Deletes the temporary file at the end. + std::unique_ptr remover; + FileSpec::Style style; + + /// Write the file contents to a temporary file. + const FileSpec &Materialize() override { + if (tmp_file) + return tmp_file; + llvm::SmallString<0> name; + int fd; + auto orig_name = m_file_spec.GetFilename().GetStringRef(); + auto ec = llvm::sys::fs::createTemporaryFile( + "", llvm::sys::path::filename(orig_name, style), fd, name); + if (ec || fd <= 0) { + LLDB_LOG(GetLog(DWARFLog::DebugInfo), + "Could not create temporary file"); + return tmp_file; + } + remover = std::make_unique(name); + NativeFile file(fd, File::eOpenOptionWriteOnly, true); + size_t num_bytes = source.size(); + file.Write(source.data(), num_bytes); + tmp_file.SetPath(name); + return tmp_file; + } + }; + support_files.Append(std::make_unique( + FileSpec(*file_path), *source, style)); + continue; + } + } if (auto remapped = module->RemapSourceFile(llvm::StringRef(*file_path))) remapped_file = *remapped; else @@ -251,8 +296,6 @@ ParseSupportFilesFromPrologue(const lldb::ModuleSP &module, // Unconditionally add an entry, so the indices match up. support_files.EmplaceBack(remapped_file, style, checksum); } - - return support_files; } void SymbolFileDWARF::Initialize() { @@ -744,12 +787,13 @@ lldb::CompUnitSP SymbolFileDWARF::ParseCompileUnit(DWARFCompileUnit &dwarf_cu) { ModuleSP module_sp(m_objfile_sp->GetModule()); if (module_sp) { auto initialize_cu = [&](const FileSpec &file_spec, - LanguageType cu_language) { + LanguageType cu_language, + SupportFileList &&support_files = {}) { BuildCuTranslationTable(); cu_sp = std::make_shared( module_sp, &dwarf_cu, file_spec, *GetDWARFUnitIndex(dwarf_cu.GetID()), cu_language, - eLazyBoolCalculate); + eLazyBoolCalculate, std::move(support_files)); dwarf_cu.SetUserData(cu_sp.get()); @@ -775,15 +819,13 @@ lldb::CompUnitSP SymbolFileDWARF::ParseCompileUnit(DWARFCompileUnit &dwarf_cu) { // file is also the name of the compile unit. This // allows us to avoid loading the non-skeleton unit, // which may be in a separate DWO file. 
- FileSpecList support_files; + SupportFileList support_files; if (!ParseSupportFiles(dwarf_cu, module_sp, support_files)) return false; if (support_files.GetSize() == 0) return false; - initialize_cu(support_files.GetFileSpecAtIndex(0), - eLanguageTypeUnknown); - cu_sp->SetSupportFiles(std::move(support_files)); + eLanguageTypeUnknown, std::move(support_files)); return true; }; @@ -1029,7 +1071,7 @@ bool SymbolFileDWARF::ForEachExternalModule( } bool SymbolFileDWARF::ParseSupportFiles(CompileUnit &comp_unit, - FileSpecList &support_files) { + SupportFileList &support_files) { std::lock_guard guard(GetModuleMutex()); DWARFUnit *dwarf_cu = GetDWARFCompileUnit(&comp_unit); if (!dwarf_cu) @@ -1038,13 +1080,12 @@ bool SymbolFileDWARF::ParseSupportFiles(CompileUnit &comp_unit, if (!ParseSupportFiles(*dwarf_cu, comp_unit.GetModule(), support_files)) return false; - comp_unit.SetSupportFiles(support_files); return true; } bool SymbolFileDWARF::ParseSupportFiles(DWARFUnit &dwarf_cu, const ModuleSP &module, - FileSpecList &support_files) { + SupportFileList &support_files) { dw_offset_t offset = dwarf_cu.GetLineTableOffset(); if (offset == DW_INVALID_OFFSET) @@ -1057,8 +1098,8 @@ bool SymbolFileDWARF::ParseSupportFiles(DWARFUnit &dwarf_cu, return false; std::string comp_dir = dwarf_cu.GetCompilationDirectory().GetPath(); - support_files = ParseSupportFilesFromPrologue( - module, prologue, dwarf_cu.GetPathStyle(), comp_dir); + ParseSupportFilesFromPrologue(support_files, module, prologue, + dwarf_cu.GetPathStyle(), comp_dir); return true; } @@ -1070,24 +1111,27 @@ FileSpec SymbolFileDWARF::GetFile(DWARFUnit &unit, size_t file_idx) { } auto &tu = llvm::cast(unit); - return GetTypeUnitSupportFiles(tu).GetFileSpecAtIndex(file_idx); + if (const SupportFileList *support_files = GetTypeUnitSupportFiles(tu)) + return support_files->GetFileSpecAtIndex(file_idx); + return {}; } -const FileSpecList & +const SupportFileList * SymbolFileDWARF::GetTypeUnitSupportFiles(DWARFTypeUnit &tu) { - static FileSpecList empty_list; + static SupportFileList empty_list; dw_offset_t offset = tu.GetLineTableOffset(); if (offset == DW_INVALID_OFFSET || offset == llvm::DenseMapInfo::getEmptyKey() || offset == llvm::DenseMapInfo::getTombstoneKey()) - return empty_list; + return nullptr; // Many type units can share a line table, so parse the support file list // once, and cache it based on the offset field. 
auto iter_bool = m_type_unit_support_files.try_emplace(offset); - FileSpecList &list = iter_bool.first->second; + std::unique_ptr &list = iter_bool.first->second; if (iter_bool.second) { + list = std::make_unique(); uint64_t line_table_offset = offset; llvm::DWARFDataExtractor data = m_context.getOrLoadLineData().GetAsLLVMDWARF(); @@ -1101,14 +1145,13 @@ SymbolFileDWARF::GetTypeUnitSupportFiles(DWARFTypeUnit &tu) { }; ElapsedTime elapsed(m_parse_time); llvm::Error error = prologue.parse(data, &line_table_offset, report, ctx); - if (error) { + if (error) report(std::move(error)); - } else { - list = ParseSupportFilesFromPrologue(GetObjectFile()->GetModule(), - prologue, tu.GetPathStyle()); - } + else + ParseSupportFilesFromPrologue(*list, GetObjectFile()->GetModule(), + prologue, tu.GetPathStyle()); } - return list; + return list.get(); } bool SymbolFileDWARF::ParseIsOptimized(CompileUnit &comp_unit) { diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h index 78819edd0062b..26a9502f90aa0 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h @@ -123,7 +123,7 @@ class SymbolFileDWARF : public SymbolFileCommon { llvm::function_ref) override; bool ParseSupportFiles(CompileUnit &comp_unit, - FileSpecList &support_files) override; + SupportFileList &support_files) override; bool ParseIsOptimized(CompileUnit &comp_unit) override; @@ -396,7 +396,7 @@ class SymbolFileDWARF : public SymbolFileCommon { bool *type_is_new); bool ParseSupportFiles(DWARFUnit &dwarf_cu, const lldb::ModuleSP &module, - FileSpecList &support_files); + SupportFileList &support_files); lldb::VariableSP ParseVariableDIE(const SymbolContext &sc, const DWARFDIE &die, @@ -489,7 +489,7 @@ class SymbolFileDWARF : public SymbolFileCommon { void FindDwpSymbolFile(); - const FileSpecList &GetTypeUnitSupportFiles(DWARFTypeUnit &tu); + const SupportFileList *GetTypeUnitSupportFiles(DWARFTypeUnit &tu); void InitializeFirstCodeAddressRecursive(const SectionList §ion_list); @@ -529,7 +529,8 @@ class SymbolFileDWARF : public SymbolFileCommon { DIEToVariableSP m_die_to_variable_sp; DIEToCompilerType m_forward_decl_die_to_compiler_type; CompilerTypeToDIE m_forward_decl_compiler_type_to_die; - llvm::DenseMap m_type_unit_support_files; + llvm::DenseMap> + m_type_unit_support_files; std::vector m_lldb_cu_to_dwarf_unit; /// DWARF does not provide a good way for traditional (concatenating) linkers /// to invalidate debug info describing dead-stripped code. 
These linkers will diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp index e5b59460cb85d..9094a5e21e690 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp @@ -725,8 +725,8 @@ bool SymbolFileDWARFDebugMap::ForEachExternalModule( return false; } -bool SymbolFileDWARFDebugMap::ParseSupportFiles(CompileUnit &comp_unit, - FileSpecList &support_files) { +bool SymbolFileDWARFDebugMap::ParseSupportFiles( + CompileUnit &comp_unit, SupportFileList &support_files) { std::lock_guard guard(GetModuleMutex()); SymbolFileDWARF *oso_dwarf = GetSymbolFile(comp_unit); if (oso_dwarf) diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.h index cd0a4bb6e41c2..d639ee500080d 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.h @@ -74,7 +74,7 @@ class SymbolFileDWARFDebugMap : public SymbolFileCommon { llvm::function_ref) override; bool ParseSupportFiles(CompileUnit &comp_unit, - FileSpecList &support_files) override; + SupportFileList &support_files) override; bool ParseIsOptimized(CompileUnit &comp_unit) override; diff --git a/lldb/source/Plugins/SymbolFile/JSON/SymbolFileJSON.h b/lldb/source/Plugins/SymbolFile/JSON/SymbolFileJSON.h index 4dd0d65da4658..3dd33b3dc82fb 100644 --- a/lldb/source/Plugins/SymbolFile/JSON/SymbolFileJSON.h +++ b/lldb/source/Plugins/SymbolFile/JSON/SymbolFileJSON.h @@ -59,7 +59,7 @@ class SymbolFileJSON : public lldb_private::SymbolFileCommon { bool ParseDebugMacros(CompileUnit &comp_unit) override { return false; } bool ParseSupportFiles(CompileUnit &comp_unit, - FileSpecList &support_files) override { + SupportFileList &support_files) override { return false; } diff --git a/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp b/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp index ad08013399369..8375010ae3ded 100644 --- a/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp +++ b/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp @@ -1369,7 +1369,7 @@ SymbolFileNativePDB::GetFileIndex(const CompilandIndexItem &cii, } bool SymbolFileNativePDB::ParseSupportFiles(CompileUnit &comp_unit, - FileSpecList &support_files) { + SupportFileList &support_files) { std::lock_guard guard(GetModuleMutex()); PdbSymUid cu_id(comp_unit.GetID()); lldbassert(cu_id.kind() == PdbSymUidKind::Compiland); @@ -1416,7 +1416,7 @@ void SymbolFileNativePDB::ParseInlineSite(PdbCompilandSymId id, return; InlineeSourceLine inlinee_line = iter->second; - const FileSpecList &files = comp_unit->GetSupportFiles(); + const SupportFileList &files = comp_unit->GetSupportFiles(); FileSpec decl_file; llvm::Expected file_index_or_err = GetFileIndex(*cii, inlinee_line.Header->FileID); diff --git a/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.h b/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.h index 9d0458cf7ebfe..82577771f355c 100644 --- a/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.h +++ b/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.h @@ -94,7 +94,7 @@ class SymbolFileNativePDB : public SymbolFileCommon { bool ParseDebugMacros(lldb_private::CompileUnit &comp_unit) override; bool ParseSupportFiles(lldb_private::CompileUnit &comp_unit, - 
FileSpecList &support_files) override; + SupportFileList &support_files) override; size_t ParseTypes(lldb_private::CompileUnit &comp_unit) override; bool ParseImportedModules( diff --git a/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp b/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp index 9e1cd83606602..b26beecc6d126 100644 --- a/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp +++ b/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp @@ -365,7 +365,7 @@ bool SymbolFilePDB::ParseDebugMacros(CompileUnit &comp_unit) { } bool SymbolFilePDB::ParseSupportFiles( - CompileUnit &comp_unit, lldb_private::FileSpecList &support_files) { + CompileUnit &comp_unit, lldb_private::SupportFileList &support_files) { // In theory this is unnecessary work for us, because all of this information // is easily (and quickly) accessible from DebugInfoPDB, so caching it a diff --git a/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.h b/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.h index 01851f1418f3a..ea495c575f1f1 100644 --- a/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.h +++ b/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.h @@ -70,7 +70,7 @@ class SymbolFilePDB : public lldb_private::SymbolFileCommon { bool ParseDebugMacros(lldb_private::CompileUnit &comp_unit) override; bool ParseSupportFiles(lldb_private::CompileUnit &comp_unit, - lldb_private::FileSpecList &support_files) override; + lldb_private::SupportFileList &support_files) override; size_t ParseTypes(lldb_private::CompileUnit &comp_unit) override; diff --git a/lldb/source/Plugins/SymbolFile/Symtab/SymbolFileSymtab.cpp b/lldb/source/Plugins/SymbolFile/Symtab/SymbolFileSymtab.cpp index 6e4c6439974e9..8c17017442b1f 100644 --- a/lldb/source/Plugins/SymbolFile/Symtab/SymbolFileSymtab.cpp +++ b/lldb/source/Plugins/SymbolFile/Symtab/SymbolFileSymtab.cpp @@ -211,7 +211,7 @@ bool SymbolFileSymtab::ParseDebugMacros(CompileUnit &comp_unit) { } bool SymbolFileSymtab::ParseSupportFiles(CompileUnit &comp_unit, - FileSpecList &support_files) { + SupportFileList &support_files) { return false; } diff --git a/lldb/source/Plugins/SymbolFile/Symtab/SymbolFileSymtab.h b/lldb/source/Plugins/SymbolFile/Symtab/SymbolFileSymtab.h index 1bbc4de9c9425..a36311525334e 100644 --- a/lldb/source/Plugins/SymbolFile/Symtab/SymbolFileSymtab.h +++ b/lldb/source/Plugins/SymbolFile/Symtab/SymbolFileSymtab.h @@ -57,7 +57,7 @@ class SymbolFileSymtab : public lldb_private::SymbolFileCommon { bool ParseDebugMacros(lldb_private::CompileUnit &comp_unit) override; bool ParseSupportFiles(lldb_private::CompileUnit &comp_unit, - lldb_private::FileSpecList &support_files) override; + lldb_private::SupportFileList &support_files) override; size_t ParseTypes(lldb_private::CompileUnit &comp_unit) override; diff --git a/lldb/source/Symbol/CompileUnit.cpp b/lldb/source/Symbol/CompileUnit.cpp index c9796973940a2..a6b6c8e57eec0 100644 --- a/lldb/source/Symbol/CompileUnit.cpp +++ b/lldb/source/Symbol/CompileUnit.cpp @@ -28,10 +28,11 @@ CompileUnit::CompileUnit(const lldb::ModuleSP &module_sp, void *user_data, CompileUnit::CompileUnit(const lldb::ModuleSP &module_sp, void *user_data, const FileSpec &fspec, const lldb::user_id_t cu_sym_id, lldb::LanguageType language, - lldb_private::LazyBool is_optimized) + lldb_private::LazyBool is_optimized, + SupportFileList &&support_files) : ModuleChild(module_sp), UserID(cu_sym_id), m_user_data(user_data), m_language(language), m_flags(0), m_file_spec(fspec), - m_is_optimized(is_optimized) { + m_support_files(std::move(support_files)), 
m_is_optimized(is_optimized) { if (language != eLanguageTypeUnknown) m_flags.Set(flagsParsedLanguage); assert(module_sp); @@ -178,10 +179,6 @@ void CompileUnit::SetLineTable(LineTable *line_table) { m_line_table_up.reset(line_table); } -void CompileUnit::SetSupportFiles(FileSpecList support_files) { - m_support_files = std::move(support_files); -} - DebugMacros *CompileUnit::GetDebugMacros() { if (m_debug_macros_sp.get() == nullptr) { if (m_flags.IsClear(flagsParsedDebugMacros)) { @@ -213,7 +210,7 @@ VariableListSP CompileUnit::GetVariableList(bool can_create) { return m_variables; } -std::vector FindFileIndexes(const FileSpecList &files, +std::vector FindFileIndexes(const SupportFileList &files, const FileSpec &file) { std::vector result; uint32_t idx = -1; @@ -411,7 +408,7 @@ bool CompileUnit::ForEachExternalModule( return false; } -const FileSpecList &CompileUnit::GetSupportFiles() { +const SupportFileList &CompileUnit::GetSupportFiles() { if (m_support_files.GetSize() == 0) { if (m_flags.IsClear(flagsParsedSupportFiles)) { m_flags.Set(flagsParsedSupportFiles); diff --git a/lldb/source/Symbol/SymbolFileOnDemand.cpp b/lldb/source/Symbol/SymbolFileOnDemand.cpp index 33995252bfe2c..bdb1951d51259 100644 --- a/lldb/source/Symbol/SymbolFileOnDemand.cpp +++ b/lldb/source/Symbol/SymbolFileOnDemand.cpp @@ -115,7 +115,7 @@ bool SymbolFileOnDemand::ForEachExternalModule( } bool SymbolFileOnDemand::ParseSupportFiles(CompileUnit &comp_unit, - FileSpecList &support_files) { + SupportFileList &support_files) { LLDB_LOG(GetLog(), "[{0}] {1} is not skipped: explicitly allowed to support breakpoint", GetSymbolFileName(), __FUNCTION__); diff --git a/lldb/source/Utility/FileSpecList.cpp b/lldb/source/Utility/FileSpecList.cpp index e3d8ea650c75d..8d2cf81efe5b1 100644 --- a/lldb/source/Utility/FileSpecList.cpp +++ b/lldb/source/Utility/FileSpecList.cpp @@ -37,6 +37,19 @@ bool FileSpecList::AppendIfUnique(const FileSpec &file_spec) { return false; } +// FIXME: Replace this with a DenseSet at the call site. It is inefficient. +bool SupportFileList::AppendIfUnique(const FileSpec &file_spec) { + collection::iterator end = m_files.end(); + if (find_if(m_files.begin(), end, + [&](const std::unique_ptr &support_file) { + return support_file->GetSpecOnly() == file_spec; + }) == end) { + Append(file_spec); + return true; + } + return false; +} + // Clears the file list. void FileSpecList::Clear() { m_files.clear(); } @@ -55,22 +68,22 @@ void FileSpecList::Dump(Stream *s, const char *separator_cstr) const { // // Returns the valid index of the file that matches "file_spec" if it is found, // else std::numeric_limits::max() is returned. 
-size_t FileSpecList::FindFileIndex(size_t start_idx, const FileSpec &file_spec, - bool full) const { - const size_t num_files = m_files.size(); - +static size_t FindFileIndex(size_t start_idx, const FileSpec &file_spec, + bool full, size_t num_files, + std::function get_ith) { // When looking for files, we will compare only the filename if the FILE_SPEC // argument is empty bool compare_filename_only = file_spec.GetDirectory().IsEmpty(); for (size_t idx = start_idx; idx < num_files; ++idx) { + const FileSpec &ith = get_ith(idx); if (compare_filename_only) { - if (ConstString::Equals( - m_files[idx].GetFilename(), file_spec.GetFilename(), - file_spec.IsCaseSensitive() || m_files[idx].IsCaseSensitive())) + if (ConstString::Equals(ith.GetFilename(), file_spec.GetFilename(), + file_spec.IsCaseSensitive() || + ith.IsCaseSensitive())) return idx; } else { - if (FileSpec::Equal(m_files[idx], file_spec, full)) + if (FileSpec::Equal(ith, file_spec, full)) return idx; } } @@ -79,8 +92,24 @@ size_t FileSpecList::FindFileIndex(size_t start_idx, const FileSpec &file_spec, return UINT32_MAX; } -size_t FileSpecList::FindCompatibleIndex(size_t start_idx, - const FileSpec &file_spec) const { +size_t FileSpecList::FindFileIndex(size_t start_idx, const FileSpec &file_spec, + bool full) const { + return ::FindFileIndex( + start_idx, file_spec, full, m_files.size(), + [&](size_t idx) -> const FileSpec & { return m_files[idx]; }); +} + +size_t SupportFileList::FindFileIndex(size_t start_idx, + const FileSpec &file_spec, + bool full) const { + return ::FindFileIndex(start_idx, file_spec, full, m_files.size(), + [&](size_t idx) -> const FileSpec & { + return m_files[idx]->GetSpecOnly(); + }); +} + +size_t SupportFileList::FindCompatibleIndex(size_t start_idx, + const FileSpec &file_spec) const { const size_t num_files = m_files.size(); if (start_idx >= num_files) return UINT32_MAX; @@ -92,7 +121,7 @@ size_t FileSpecList::FindCompatibleIndex(size_t start_idx, const bool full = !file_spec.GetDirectory().IsEmpty(); for (size_t idx = start_idx; idx < num_files; ++idx) { - const FileSpec &curr_file = m_files[idx]; + const FileSpec &curr_file = m_files[idx]->GetSpecOnly(); // Always start by matching the filename first if (!curr_file.FileEquals(file_spec)) @@ -140,6 +169,13 @@ const FileSpec &FileSpecList::GetFileSpecAtIndex(size_t idx) const { return g_empty_file_spec; } +const FileSpec &SupportFileList::GetFileSpecAtIndex(size_t idx) const { + if (idx < m_files.size()) + return m_files[idx]->Materialize(); + static FileSpec g_empty_file_spec; + return g_empty_file_spec; +} + // Return the size in bytes that this object takes in memory. 
This returns the // size in bytes of this object's member variables and any FileSpec objects its // member variables contain, the result doesn't not include the string values diff --git a/lldb/test/API/functionalities/inline-sourcefile/Makefile b/lldb/test/API/functionalities/inline-sourcefile/Makefile new file mode 100644 index 0000000000000..adb29d3a88e26 --- /dev/null +++ b/lldb/test/API/functionalities/inline-sourcefile/Makefile @@ -0,0 +1,11 @@ +C_SOURCES := main.c +CFLAGS_EXTRAS := -gdwarf-5 + +include Makefile.rules + +OBJECTS += inline.o + +$(EXE): main.c inline.o + +%.o: %.ll + $(CC) $< -c -o $@ diff --git a/lldb/test/API/functionalities/inline-sourcefile/TestInlineSourceFiles.py b/lldb/test/API/functionalities/inline-sourcefile/TestInlineSourceFiles.py new file mode 100644 index 0000000000000..20ed0ce00661f --- /dev/null +++ b/lldb/test/API/functionalities/inline-sourcefile/TestInlineSourceFiles.py @@ -0,0 +1,15 @@ +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbplatform +from lldbsuite.test import lldbutil + + +class InlineSourceFilesTestCase(TestBase): + @skipIf(compiler="gcc") + @skipIf(compiler="clang", compiler_version=["<", "18.0"]) + def test(self): + """Test DWARF inline source files.""" + self.build() + lldbutil.run_to_name_breakpoint(self, 'f') + self.expect("list f", substrs=["This is inline source code"]) diff --git a/lldb/test/API/functionalities/inline-sourcefile/inline.ll b/lldb/test/API/functionalities/inline-sourcefile/inline.ll new file mode 100644 index 0000000000000..56194e45b8138 --- /dev/null +++ b/lldb/test/API/functionalities/inline-sourcefile/inline.ll @@ -0,0 +1,39 @@ +; ModuleID = '/tmp/t.c' +source_filename = "/tmp/t.c" +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + +; Function Attrs: noinline nounwind optnone ssp uwtable(sync) +define void @f() #0 !dbg !9 { +entry: + call void @stop(), !dbg !13 + ret void, !dbg !14 +} + +declare void @stop(...) #1 + +attributes #0 = { noinline nounwind optnone ssp uwtable(sync) } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5, !6, !7} +!llvm.ident = !{!8} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 18.0.0git (git@github.com:llvm/llvm-project.git 29ee66f4a0967e43a035f147c960743c7b640f2f)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: Apple, sysroot: "/") +!1 = !DIFile(filename: "/INLINE/inlined.c", directory: "/Volumes/Data/llvm-project", checksumkind: CSK_MD5, checksum: "3183154a5cb31debe9a8e27ca500bc3c") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 8, !"PIC Level", i32 2} +!6 = !{i32 7, !"uwtable", i32 1} +!7 = !{i32 7, !"frame-pointer", i32 1} +!8 = !{!"clang version 18.0.0git (git@github.com:llvm/llvm-project.git 29ee66f4a0967e43a035f147c960743c7b640f2f)"} +!9 = distinct !DISubprogram(name: "f", scope: !10, file: !10, line: 2, type: !11, scopeLine: 2, spFlags: DISPFlagDefinition, unit: !0) +!10 = !DIFile(filename: "/INLINE/inlined.c", directory: "", source: "void stop(); +void f() { + // This is inline source code. 
+ stop(); // break here +} +") +!11 = !DISubroutineType(types: !12) +!12 = !{null} +!13 = !DILocation(line: 4, column: 3, scope: !9) +!14 = !DILocation(line: 5, column: 1, scope: !9) diff --git a/lldb/test/API/functionalities/inline-sourcefile/main.c b/lldb/test/API/functionalities/inline-sourcefile/main.c new file mode 100644 index 0000000000000..c030d7773fa70 --- /dev/null +++ b/lldb/test/API/functionalities/inline-sourcefile/main.c @@ -0,0 +1,7 @@ +void f(); +void stop() {} + +int main(int argc, char const *argv[]) { + f(); + return 0; +} diff --git a/lldb/unittests/Core/FileSpecListTest.cpp b/lldb/unittests/Core/FileSpecListTest.cpp index d65e7cd2d0586..e63f4a00bc3a9 100644 --- a/lldb/unittests/Core/FileSpecListTest.cpp +++ b/lldb/unittests/Core/FileSpecListTest.cpp @@ -20,7 +20,7 @@ static FileSpec WindowsSpec(llvm::StringRef path) { return FileSpec(path, FileSpec::Style::windows); } -TEST(FileSpecListTest, RelativePathMatchesPosix) { +TEST(SupportFileListTest, RelativePathMatchesPosix) { const FileSpec fullpath = PosixSpec("/build/src/main.cpp"); const FileSpec relative = PosixSpec("./src/main.cpp"); @@ -32,7 +32,7 @@ TEST(FileSpecListTest, RelativePathMatchesPosix) { const FileSpec rel2_wrong = PosixSpec("asrc/main.cpp"); const FileSpec rel3_wrong = PosixSpec("rc/main.cpp"); - FileSpecList files; + SupportFileList files; files.Append(fullpath); files.Append(relative); files.Append(basename); @@ -72,7 +72,7 @@ TEST(FileSpecListTest, RelativePathMatchesPosix) { EXPECT_EQ((size_t)6, files.FindCompatibleIndex(3, rel3_wrong)); } -TEST(FileSpecListTest, RelativePathMatchesWindows) { +TEST(SupportFileListTest, RelativePathMatchesWindows) { const FileSpec fullpath = WindowsSpec(R"(C:\build\src\main.cpp)"); const FileSpec relative = WindowsSpec(R"(.\src\main.cpp)"); @@ -84,7 +84,7 @@ TEST(FileSpecListTest, RelativePathMatchesWindows) { const FileSpec rel2_wrong = WindowsSpec(R"(asrc\main.cpp)"); const FileSpec rel3_wrong = WindowsSpec(R"("rc\main.cpp)"); - FileSpecList files; + SupportFileList files; files.Append(fullpath); files.Append(relative); files.Append(basename); From c12a9fc2798cc74a6ea6a8c71826047f47c815ea Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 4 Jan 2024 11:14:46 -0600 Subject: [PATCH 251/313] [ELF] Correctly set the `nvptx` triple from `makeTriple()` (#76970) Summary: The ELFObject file should be able to handle `nvptx` objects but we currently list them as unknown. This patch should now make it return `nvptx64--` correctly. --- llvm/include/llvm/Object/ELFObjectFile.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/llvm/include/llvm/Object/ELFObjectFile.h b/llvm/include/llvm/Object/ELFObjectFile.h index 99477644de4de..da78e11b678d9 100644 --- a/llvm/include/llvm/Object/ELFObjectFile.h +++ b/llvm/include/llvm/Object/ELFObjectFile.h @@ -1349,6 +1349,13 @@ template Triple::ArchType ELFObjectFile::getArch() const { return Triple::UnknownArch; } + case ELF::EM_CUDA: { + if (EF.getHeader().e_ident[ELF::EI_CLASS] == ELF::ELFCLASS32) + return Triple::nvptx; + else + return Triple::nvptx64; + } + case ELF::EM_BPF: return IsLittleEndian ? Triple::bpfel : Triple::bpfeb; From 5cd3cf107286d56cf162346d1bbbbfcc20439320 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 4 Jan 2024 16:43:35 +0000 Subject: [PATCH 252/313] [X86] cvtv2f32.ll - replace X32 checks with X86. NFC. We try to use X32 for gnux32 triples only. 
--- llvm/test/CodeGen/X86/cvtv2f32.ll | 68 +++++++++++++++---------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/llvm/test/CodeGen/X86/cvtv2f32.ll b/llvm/test/CodeGen/X86/cvtv2f32.ll index a44f16509712a..b4b63caf4e91c 100644 --- a/llvm/test/CodeGen/X86/cvtv2f32.ll +++ b/llvm/test/CodeGen/X86/cvtv2f32.ll @@ -1,26 +1,26 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-linux-pc -mcpu=corei7 | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -mtriple=i686-linux-pc -mcpu=corei7 | FileCheck %s --check-prefix=X86 ; RUN: llc < %s -mtriple=x86_64-linux-pc -mcpu=corei7 | FileCheck %s --check-prefix=X64 ; uitofp <2 x i32> codegen from buildvector or legalization is different but gives the same results ; across the full 0 - 0xFFFFFFFF u32 range. define <2 x float> @uitofp_2i32_cvt_buildvector(i32 %x, i32 %y, <2 x float> %v) { -; X32-LABEL: uitofp_2i32_cvt_buildvector: -; X32: # %bb.0: -; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; X32-NEXT: orpd %xmm2, %xmm1 -; X32-NEXT: subsd %xmm2, %xmm1 -; X32-NEXT: cvtsd2ss %xmm1, %xmm1 -; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X32-NEXT: orpd %xmm2, %xmm3 -; X32-NEXT: subsd %xmm2, %xmm3 -; X32-NEXT: xorps %xmm2, %xmm2 -; X32-NEXT: cvtsd2ss %xmm3, %xmm2 -; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; X32-NEXT: mulps %xmm1, %xmm0 -; X32-NEXT: retl +; X86-LABEL: uitofp_2i32_cvt_buildvector: +; X86: # %bb.0: +; X86-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; X86-NEXT: orpd %xmm2, %xmm1 +; X86-NEXT: subsd %xmm2, %xmm1 +; X86-NEXT: cvtsd2ss %xmm1, %xmm1 +; X86-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X86-NEXT: orpd %xmm2, %xmm3 +; X86-NEXT: subsd %xmm2, %xmm3 +; X86-NEXT: xorps %xmm2, %xmm2 +; X86-NEXT: cvtsd2ss %xmm3, %xmm2 +; X86-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; X86-NEXT: mulps %xmm1, %xmm0 +; X86-NEXT: retl ; ; X64-LABEL: uitofp_2i32_cvt_buildvector: ; X64: # %bb.0: @@ -40,15 +40,15 @@ define <2 x float> @uitofp_2i32_cvt_buildvector(i32 %x, i32 %y, <2 x float> %v) } define <2 x float> @uitofp_2i32_buildvector_cvt(i32 %x, i32 %y, <2 x float> %v) { -; X32-LABEL: uitofp_2i32_buildvector_cvt: -; X32: # %bb.0: -; X32-NEXT: movdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] -; X32-NEXT: pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero -; X32-NEXT: por %xmm1, %xmm2 -; X32-NEXT: subpd %xmm1, %xmm2 -; X32-NEXT: cvtpd2ps %xmm2, %xmm1 -; X32-NEXT: mulps %xmm1, %xmm0 -; X32-NEXT: retl +; X86-LABEL: uitofp_2i32_buildvector_cvt: +; X86: # %bb.0: +; X86-NEXT: movdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; X86-NEXT: pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero +; X86-NEXT: por %xmm1, %xmm2 +; X86-NEXT: subpd %xmm1, %xmm2 +; X86-NEXT: cvtpd2ps %xmm2, %xmm1 +; X86-NEXT: mulps %xmm1, %xmm0 +; X86-NEXT: retl ; ; X64-LABEL: uitofp_2i32_buildvector_cvt: ; X64: # %bb.0: @@ -69,15 +69,15 @@ define <2 x float> @uitofp_2i32_buildvector_cvt(i32 %x, i32 %y, <2 x float> %v) } define <2 x float> @uitofp_2i32_legalized(<2 x i32> %in, <2 x float> %v) { -; X32-LABEL: uitofp_2i32_legalized: -; X32: # %bb.0: -; X32-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; X32-NEXT: movdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15] -; X32-NEXT: por %xmm2, %xmm0 -; X32-NEXT: subpd %xmm2, %xmm0 -; X32-NEXT: cvtpd2ps %xmm0, %xmm0 -; X32-NEXT: mulps %xmm1, %xmm0 -; 
X32-NEXT: retl +; X86-LABEL: uitofp_2i32_legalized: +; X86: # %bb.0: +; X86-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; X86-NEXT: movdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15] +; X86-NEXT: por %xmm2, %xmm0 +; X86-NEXT: subpd %xmm2, %xmm0 +; X86-NEXT: cvtpd2ps %xmm0, %xmm0 +; X86-NEXT: mulps %xmm1, %xmm0 +; X86-NEXT: retl ; ; X64-LABEL: uitofp_2i32_legalized: ; X64: # %bb.0: From 076dbc02724681c7d3664959d5ae742099b7edb6 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 4 Jan 2024 16:54:38 +0000 Subject: [PATCH 253/313] [X86] SimplifyDemandedVectorEltsForTargetNode - add X86ISD::VZEXT_LOAD handling. Simplify to a scalar_to_vector(load()) if we don't demand any of the upper vector elements. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 14 ++++++++++++++ llvm/test/CodeGen/X86/buildvec-insertvec.ll | 6 ++---- llvm/test/CodeGen/X86/fminimum-fmaximum.ll | 5 ++--- 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index e0679f5f27d8c..fe3ba2ae29179 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -41348,6 +41348,20 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( return TLO.CombineTo(Op, Src); break; } + case X86ISD::VZEXT_LOAD: { + // If upper demanded elements are not demanded then simplify to a + // scalar_to_vector(load()). + MVT SVT = VT.getSimpleVT().getVectorElementType(); + if (DemandedElts == 1 && Op.getValue(1).use_empty() && isTypeLegal(SVT)) { + SDLoc DL(Op); + auto *Mem = cast(Op); + SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(), + Mem->getMemOperand()); + SDValue Vec = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Elt); + return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Vec)); + } + break; + } case X86ISD::VBROADCAST: { SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll index a3568716edd9e..3fdfde8576f77 100644 --- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll +++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll @@ -799,9 +799,8 @@ define i32 @PR46586(ptr %p, <4 x i32> %v) { ; ; SSE41-LABEL: PR46586: ; SSE41: # %bb.0: -; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: movzbl 3(%rdi), %eax ; SSE41-NEXT: extractps $3, %xmm0, %ecx -; SSE41-NEXT: pextrb $3, %xmm1, %eax ; SSE41-NEXT: xorl %edx, %edx ; SSE41-NEXT: divl %ecx ; SSE41-NEXT: movl %edx, %eax @@ -809,9 +808,8 @@ define i32 @PR46586(ptr %p, <4 x i32> %v) { ; ; AVX-LABEL: PR46586: ; AVX: # %bb.0: -; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: movzbl 3(%rdi), %eax ; AVX-NEXT: vextractps $3, %xmm0, %ecx -; AVX-NEXT: vpextrb $3, %xmm1, %eax ; AVX-NEXT: xorl %edx, %edx ; AVX-NEXT: divl %ecx ; AVX-NEXT: movl %edx, %eax diff --git a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll index 5bb5d1e9c17ec..8905d2bce5e92 100644 --- a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll +++ b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll @@ -699,10 +699,9 @@ define double @test_fminimum_nnan(double %x, double %y) "no-nans-fp-math"="true" ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp ; X86-NEXT: subl $8, %esp -; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; X86-NEXT: vextractps $1, %xmm2, %eax +; X86-NEXT: vmovsd 
{{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: vextractps $1, %xmm0, %eax ; X86-NEXT: testl %eax, %eax ; X86-NEXT: js .LBB14_1 ; X86-NEXT: # %bb.2: From ce4459d5903fe53065f4c198cd71be6e514475c2 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 4 Jan 2024 17:11:35 +0000 Subject: [PATCH 254/313] [X86] 64-bit-shift-by-32-minus-y.ll - replace X32 checks with X86. NFC. We try to use X32 for gnux32 triples only. --- .../CodeGen/X86/64-bit-shift-by-32-minus-y.ll | 550 +++++++++--------- 1 file changed, 275 insertions(+), 275 deletions(-) diff --git a/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll b/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll index 705a1cc861e02..4c92adb25d0bd 100644 --- a/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll +++ b/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64-NOBMI2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi2 | FileCheck %s --check-prefixes=X64-BMI2 -; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefixes=X32-NOBMI2 -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+bmi2 | FileCheck %s --check-prefixes=X32-BMI2 +; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefixes=X86-NOBMI2 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+bmi2 | FileCheck %s --check-prefixes=X86-BMI2 define i64 @t0(i64 %val, i64 %shamt) nounwind { ; X64-NOBMI2-LABEL: t0: @@ -21,40 +21,40 @@ define i64 @t0(i64 %val, i64 %shamt) nounwind { ; X64-BMI2-NEXT: shlxq %rsi, %rdi, %rax ; X64-BMI2-NEXT: retq ; -; X32-NOBMI2-LABEL: t0: -; X32-NOBMI2: # %bb.0: -; X32-NOBMI2-NEXT: pushl %esi -; X32-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NOBMI2-NEXT: movb $32, %cl -; X32-NOBMI2-NEXT: subb {{[0-9]+}}(%esp), %cl -; X32-NOBMI2-NEXT: movl %esi, %eax -; X32-NOBMI2-NEXT: shll %cl, %eax -; X32-NOBMI2-NEXT: shldl %cl, %esi, %edx -; X32-NOBMI2-NEXT: testb $32, %cl -; X32-NOBMI2-NEXT: je .LBB0_2 -; X32-NOBMI2-NEXT: # %bb.1: -; X32-NOBMI2-NEXT: movl %eax, %edx -; X32-NOBMI2-NEXT: xorl %eax, %eax -; X32-NOBMI2-NEXT: .LBB0_2: -; X32-NOBMI2-NEXT: popl %esi -; X32-NOBMI2-NEXT: retl +; X86-NOBMI2-LABEL: t0: +; X86-NOBMI2: # %bb.0: +; X86-NOBMI2-NEXT: pushl %esi +; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOBMI2-NEXT: movb $32, %cl +; X86-NOBMI2-NEXT: subb {{[0-9]+}}(%esp), %cl +; X86-NOBMI2-NEXT: movl %esi, %eax +; X86-NOBMI2-NEXT: shll %cl, %eax +; X86-NOBMI2-NEXT: shldl %cl, %esi, %edx +; X86-NOBMI2-NEXT: testb $32, %cl +; X86-NOBMI2-NEXT: je .LBB0_2 +; X86-NOBMI2-NEXT: # %bb.1: +; X86-NOBMI2-NEXT: movl %eax, %edx +; X86-NOBMI2-NEXT: xorl %eax, %eax +; X86-NOBMI2-NEXT: .LBB0_2: +; X86-NOBMI2-NEXT: popl %esi +; X86-NOBMI2-NEXT: retl ; -; X32-BMI2-LABEL: t0: -; X32-BMI2: # %bb.0: -; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-BMI2-NEXT: movb $32, %cl -; X32-BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl -; X32-BMI2-NEXT: shldl %cl, %eax, %edx -; X32-BMI2-NEXT: shlxl %ecx, %eax, %eax -; X32-BMI2-NEXT: testb $32, %cl -; X32-BMI2-NEXT: je .LBB0_2 -; X32-BMI2-NEXT: # %bb.1: -; X32-BMI2-NEXT: movl %eax, %edx -; X32-BMI2-NEXT: xorl %eax, %eax -; X32-BMI2-NEXT: .LBB0_2: -; X32-BMI2-NEXT: retl +; X86-BMI2-LABEL: t0: +; X86-BMI2: # %bb.0: +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: movl 
{{[0-9]+}}(%esp), %edx +; X86-BMI2-NEXT: movb $32, %cl +; X86-BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl +; X86-BMI2-NEXT: shldl %cl, %eax, %edx +; X86-BMI2-NEXT: shlxl %ecx, %eax, %eax +; X86-BMI2-NEXT: testb $32, %cl +; X86-BMI2-NEXT: je .LBB0_2 +; X86-BMI2-NEXT: # %bb.1: +; X86-BMI2-NEXT: movl %eax, %edx +; X86-BMI2-NEXT: xorl %eax, %eax +; X86-BMI2-NEXT: .LBB0_2: +; X86-BMI2-NEXT: retl %negshamt = sub i64 32, %shamt %shifted = shl i64 %val, %negshamt ret i64 %shifted @@ -77,40 +77,40 @@ define i64 @n1(i64 %val, i64 %shamt) nounwind { ; X64-BMI2-NEXT: shlxq %rax, %rdi, %rax ; X64-BMI2-NEXT: retq ; -; X32-NOBMI2-LABEL: n1: -; X32-NOBMI2: # %bb.0: -; X32-NOBMI2-NEXT: pushl %esi -; X32-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NOBMI2-NEXT: movb $33, %cl -; X32-NOBMI2-NEXT: subb {{[0-9]+}}(%esp), %cl -; X32-NOBMI2-NEXT: movl %esi, %eax -; X32-NOBMI2-NEXT: shll %cl, %eax -; X32-NOBMI2-NEXT: shldl %cl, %esi, %edx -; X32-NOBMI2-NEXT: testb $32, %cl -; X32-NOBMI2-NEXT: je .LBB1_2 -; X32-NOBMI2-NEXT: # %bb.1: -; X32-NOBMI2-NEXT: movl %eax, %edx -; X32-NOBMI2-NEXT: xorl %eax, %eax -; X32-NOBMI2-NEXT: .LBB1_2: -; X32-NOBMI2-NEXT: popl %esi -; X32-NOBMI2-NEXT: retl +; X86-NOBMI2-LABEL: n1: +; X86-NOBMI2: # %bb.0: +; X86-NOBMI2-NEXT: pushl %esi +; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOBMI2-NEXT: movb $33, %cl +; X86-NOBMI2-NEXT: subb {{[0-9]+}}(%esp), %cl +; X86-NOBMI2-NEXT: movl %esi, %eax +; X86-NOBMI2-NEXT: shll %cl, %eax +; X86-NOBMI2-NEXT: shldl %cl, %esi, %edx +; X86-NOBMI2-NEXT: testb $32, %cl +; X86-NOBMI2-NEXT: je .LBB1_2 +; X86-NOBMI2-NEXT: # %bb.1: +; X86-NOBMI2-NEXT: movl %eax, %edx +; X86-NOBMI2-NEXT: xorl %eax, %eax +; X86-NOBMI2-NEXT: .LBB1_2: +; X86-NOBMI2-NEXT: popl %esi +; X86-NOBMI2-NEXT: retl ; -; X32-BMI2-LABEL: n1: -; X32-BMI2: # %bb.0: -; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-BMI2-NEXT: movb $33, %cl -; X32-BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl -; X32-BMI2-NEXT: shldl %cl, %eax, %edx -; X32-BMI2-NEXT: shlxl %ecx, %eax, %eax -; X32-BMI2-NEXT: testb $32, %cl -; X32-BMI2-NEXT: je .LBB1_2 -; X32-BMI2-NEXT: # %bb.1: -; X32-BMI2-NEXT: movl %eax, %edx -; X32-BMI2-NEXT: xorl %eax, %eax -; X32-BMI2-NEXT: .LBB1_2: -; X32-BMI2-NEXT: retl +; X86-BMI2-LABEL: n1: +; X86-BMI2: # %bb.0: +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BMI2-NEXT: movb $33, %cl +; X86-BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl +; X86-BMI2-NEXT: shldl %cl, %eax, %edx +; X86-BMI2-NEXT: shlxl %ecx, %eax, %eax +; X86-BMI2-NEXT: testb $32, %cl +; X86-BMI2-NEXT: je .LBB1_2 +; X86-BMI2-NEXT: # %bb.1: +; X86-BMI2-NEXT: movl %eax, %edx +; X86-BMI2-NEXT: xorl %eax, %eax +; X86-BMI2-NEXT: .LBB1_2: +; X86-BMI2-NEXT: retl %negshamt = sub i64 33, %shamt %shifted = shl i64 %val, %negshamt ret i64 %shifted @@ -131,40 +131,40 @@ define i64 @n2(i64 %val, i64 %shamt) nounwind { ; X64-BMI2-NEXT: shlxq %rax, %rdi, %rax ; X64-BMI2-NEXT: retq ; -; X32-NOBMI2-LABEL: n2: -; X32-NOBMI2: # %bb.0: -; X32-NOBMI2-NEXT: pushl %esi -; X32-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NOBMI2-NEXT: movb $31, %cl -; X32-NOBMI2-NEXT: subb {{[0-9]+}}(%esp), %cl -; X32-NOBMI2-NEXT: movl %esi, %eax -; X32-NOBMI2-NEXT: shll %cl, %eax -; X32-NOBMI2-NEXT: shldl %cl, %esi, %edx -; X32-NOBMI2-NEXT: testb $32, %cl -; X32-NOBMI2-NEXT: je .LBB2_2 -; X32-NOBMI2-NEXT: # %bb.1: -; X32-NOBMI2-NEXT: 
movl %eax, %edx -; X32-NOBMI2-NEXT: xorl %eax, %eax -; X32-NOBMI2-NEXT: .LBB2_2: -; X32-NOBMI2-NEXT: popl %esi -; X32-NOBMI2-NEXT: retl +; X86-NOBMI2-LABEL: n2: +; X86-NOBMI2: # %bb.0: +; X86-NOBMI2-NEXT: pushl %esi +; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOBMI2-NEXT: movb $31, %cl +; X86-NOBMI2-NEXT: subb {{[0-9]+}}(%esp), %cl +; X86-NOBMI2-NEXT: movl %esi, %eax +; X86-NOBMI2-NEXT: shll %cl, %eax +; X86-NOBMI2-NEXT: shldl %cl, %esi, %edx +; X86-NOBMI2-NEXT: testb $32, %cl +; X86-NOBMI2-NEXT: je .LBB2_2 +; X86-NOBMI2-NEXT: # %bb.1: +; X86-NOBMI2-NEXT: movl %eax, %edx +; X86-NOBMI2-NEXT: xorl %eax, %eax +; X86-NOBMI2-NEXT: .LBB2_2: +; X86-NOBMI2-NEXT: popl %esi +; X86-NOBMI2-NEXT: retl ; -; X32-BMI2-LABEL: n2: -; X32-BMI2: # %bb.0: -; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-BMI2-NEXT: movb $31, %cl -; X32-BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl -; X32-BMI2-NEXT: shldl %cl, %eax, %edx -; X32-BMI2-NEXT: shlxl %ecx, %eax, %eax -; X32-BMI2-NEXT: testb $32, %cl -; X32-BMI2-NEXT: je .LBB2_2 -; X32-BMI2-NEXT: # %bb.1: -; X32-BMI2-NEXT: movl %eax, %edx -; X32-BMI2-NEXT: xorl %eax, %eax -; X32-BMI2-NEXT: .LBB2_2: -; X32-BMI2-NEXT: retl +; X86-BMI2-LABEL: n2: +; X86-BMI2: # %bb.0: +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BMI2-NEXT: movb $31, %cl +; X86-BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl +; X86-BMI2-NEXT: shldl %cl, %eax, %edx +; X86-BMI2-NEXT: shlxl %ecx, %eax, %eax +; X86-BMI2-NEXT: testb $32, %cl +; X86-BMI2-NEXT: je .LBB2_2 +; X86-BMI2-NEXT: # %bb.1: +; X86-BMI2-NEXT: movl %eax, %edx +; X86-BMI2-NEXT: xorl %eax, %eax +; X86-BMI2-NEXT: .LBB2_2: +; X86-BMI2-NEXT: retl %negshamt = sub i64 31, %shamt %shifted = shl i64 %val, %negshamt ret i64 %shifted @@ -186,40 +186,40 @@ define i64 @t3(i64 %val, i64 %shamt) nounwind { ; X64-BMI2-NEXT: shlxq %rsi, %rdi, %rax ; X64-BMI2-NEXT: retq ; -; X32-NOBMI2-LABEL: t3: -; X32-NOBMI2: # %bb.0: -; X32-NOBMI2-NEXT: pushl %esi -; X32-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NOBMI2-NEXT: movb $64, %cl -; X32-NOBMI2-NEXT: subb {{[0-9]+}}(%esp), %cl -; X32-NOBMI2-NEXT: movl %esi, %eax -; X32-NOBMI2-NEXT: shll %cl, %eax -; X32-NOBMI2-NEXT: shldl %cl, %esi, %edx -; X32-NOBMI2-NEXT: testb $32, %cl -; X32-NOBMI2-NEXT: je .LBB3_2 -; X32-NOBMI2-NEXT: # %bb.1: -; X32-NOBMI2-NEXT: movl %eax, %edx -; X32-NOBMI2-NEXT: xorl %eax, %eax -; X32-NOBMI2-NEXT: .LBB3_2: -; X32-NOBMI2-NEXT: popl %esi -; X32-NOBMI2-NEXT: retl +; X86-NOBMI2-LABEL: t3: +; X86-NOBMI2: # %bb.0: +; X86-NOBMI2-NEXT: pushl %esi +; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOBMI2-NEXT: movb $64, %cl +; X86-NOBMI2-NEXT: subb {{[0-9]+}}(%esp), %cl +; X86-NOBMI2-NEXT: movl %esi, %eax +; X86-NOBMI2-NEXT: shll %cl, %eax +; X86-NOBMI2-NEXT: shldl %cl, %esi, %edx +; X86-NOBMI2-NEXT: testb $32, %cl +; X86-NOBMI2-NEXT: je .LBB3_2 +; X86-NOBMI2-NEXT: # %bb.1: +; X86-NOBMI2-NEXT: movl %eax, %edx +; X86-NOBMI2-NEXT: xorl %eax, %eax +; X86-NOBMI2-NEXT: .LBB3_2: +; X86-NOBMI2-NEXT: popl %esi +; X86-NOBMI2-NEXT: retl ; -; X32-BMI2-LABEL: t3: -; X32-BMI2: # %bb.0: -; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-BMI2-NEXT: movb $64, %cl -; X32-BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl -; X32-BMI2-NEXT: shldl %cl, %eax, %edx -; X32-BMI2-NEXT: shlxl %ecx, %eax, %eax -; X32-BMI2-NEXT: testb $32, %cl -; 
X32-BMI2-NEXT: je .LBB3_2 -; X32-BMI2-NEXT: # %bb.1: -; X32-BMI2-NEXT: movl %eax, %edx -; X32-BMI2-NEXT: xorl %eax, %eax -; X32-BMI2-NEXT: .LBB3_2: -; X32-BMI2-NEXT: retl +; X86-BMI2-LABEL: t3: +; X86-BMI2: # %bb.0: +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BMI2-NEXT: movb $64, %cl +; X86-BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl +; X86-BMI2-NEXT: shldl %cl, %eax, %edx +; X86-BMI2-NEXT: shlxl %ecx, %eax, %eax +; X86-BMI2-NEXT: testb $32, %cl +; X86-BMI2-NEXT: je .LBB3_2 +; X86-BMI2-NEXT: # %bb.1: +; X86-BMI2-NEXT: movl %eax, %edx +; X86-BMI2-NEXT: xorl %eax, %eax +; X86-BMI2-NEXT: .LBB3_2: +; X86-BMI2-NEXT: retl %negshamt = sub i64 64, %shamt %shifted = shl i64 %val, %negshamt ret i64 %shifted @@ -242,40 +242,40 @@ define i64 @t4(i64 %val, i64 %shamt) nounwind { ; X64-BMI2-NEXT: shlxq %rsi, %rdi, %rax ; X64-BMI2-NEXT: retq ; -; X32-NOBMI2-LABEL: t4: -; X32-NOBMI2: # %bb.0: -; X32-NOBMI2-NEXT: pushl %esi -; X32-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NOBMI2-NEXT: movb $96, %cl -; X32-NOBMI2-NEXT: subb {{[0-9]+}}(%esp), %cl -; X32-NOBMI2-NEXT: movl %esi, %eax -; X32-NOBMI2-NEXT: shll %cl, %eax -; X32-NOBMI2-NEXT: shldl %cl, %esi, %edx -; X32-NOBMI2-NEXT: testb $32, %cl -; X32-NOBMI2-NEXT: je .LBB4_2 -; X32-NOBMI2-NEXT: # %bb.1: -; X32-NOBMI2-NEXT: movl %eax, %edx -; X32-NOBMI2-NEXT: xorl %eax, %eax -; X32-NOBMI2-NEXT: .LBB4_2: -; X32-NOBMI2-NEXT: popl %esi -; X32-NOBMI2-NEXT: retl +; X86-NOBMI2-LABEL: t4: +; X86-NOBMI2: # %bb.0: +; X86-NOBMI2-NEXT: pushl %esi +; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOBMI2-NEXT: movb $96, %cl +; X86-NOBMI2-NEXT: subb {{[0-9]+}}(%esp), %cl +; X86-NOBMI2-NEXT: movl %esi, %eax +; X86-NOBMI2-NEXT: shll %cl, %eax +; X86-NOBMI2-NEXT: shldl %cl, %esi, %edx +; X86-NOBMI2-NEXT: testb $32, %cl +; X86-NOBMI2-NEXT: je .LBB4_2 +; X86-NOBMI2-NEXT: # %bb.1: +; X86-NOBMI2-NEXT: movl %eax, %edx +; X86-NOBMI2-NEXT: xorl %eax, %eax +; X86-NOBMI2-NEXT: .LBB4_2: +; X86-NOBMI2-NEXT: popl %esi +; X86-NOBMI2-NEXT: retl ; -; X32-BMI2-LABEL: t4: -; X32-BMI2: # %bb.0: -; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-BMI2-NEXT: movb $96, %cl -; X32-BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl -; X32-BMI2-NEXT: shldl %cl, %eax, %edx -; X32-BMI2-NEXT: shlxl %ecx, %eax, %eax -; X32-BMI2-NEXT: testb $32, %cl -; X32-BMI2-NEXT: je .LBB4_2 -; X32-BMI2-NEXT: # %bb.1: -; X32-BMI2-NEXT: movl %eax, %edx -; X32-BMI2-NEXT: xorl %eax, %eax -; X32-BMI2-NEXT: .LBB4_2: -; X32-BMI2-NEXT: retl +; X86-BMI2-LABEL: t4: +; X86-BMI2: # %bb.0: +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BMI2-NEXT: movb $96, %cl +; X86-BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl +; X86-BMI2-NEXT: shldl %cl, %eax, %edx +; X86-BMI2-NEXT: shlxl %ecx, %eax, %eax +; X86-BMI2-NEXT: testb $32, %cl +; X86-BMI2-NEXT: je .LBB4_2 +; X86-BMI2-NEXT: # %bb.1: +; X86-BMI2-NEXT: movl %eax, %edx +; X86-BMI2-NEXT: xorl %eax, %eax +; X86-BMI2-NEXT: .LBB4_2: +; X86-BMI2-NEXT: retl %negshamt = sub i64 96, %shamt %shifted = shl i64 %val, %negshamt ret i64 %shifted @@ -300,66 +300,66 @@ define i64 @t5_cse(i64 %val, i64 %shamt, ptr%dst) nounwind { ; X64-BMI2-NEXT: shlxq %rsi, %rdi, %rax ; X64-BMI2-NEXT: retq ; -; X32-NOBMI2-LABEL: t5_cse: -; X32-NOBMI2: # %bb.0: -; X32-NOBMI2-NEXT: pushl %ebx -; X32-NOBMI2-NEXT: pushl %edi -; X32-NOBMI2-NEXT: pushl %esi -; X32-NOBMI2-NEXT: movl 
{{[0-9]+}}(%esp), %esi -; X32-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NOBMI2-NEXT: movl %eax, %ebx -; X32-NOBMI2-NEXT: addl $32, %ebx -; X32-NOBMI2-NEXT: adcl $0, %edi -; X32-NOBMI2-NEXT: movl %ebx, (%ecx) -; X32-NOBMI2-NEXT: movl %edi, 4(%ecx) -; X32-NOBMI2-NEXT: movb $32, %cl -; X32-NOBMI2-NEXT: subb %al, %cl -; X32-NOBMI2-NEXT: movl %esi, %eax -; X32-NOBMI2-NEXT: shll %cl, %eax -; X32-NOBMI2-NEXT: shldl %cl, %esi, %edx -; X32-NOBMI2-NEXT: testb $32, %cl -; X32-NOBMI2-NEXT: je .LBB5_2 -; X32-NOBMI2-NEXT: # %bb.1: -; X32-NOBMI2-NEXT: movl %eax, %edx -; X32-NOBMI2-NEXT: xorl %eax, %eax -; X32-NOBMI2-NEXT: .LBB5_2: -; X32-NOBMI2-NEXT: popl %esi -; X32-NOBMI2-NEXT: popl %edi -; X32-NOBMI2-NEXT: popl %ebx -; X32-NOBMI2-NEXT: retl +; X86-NOBMI2-LABEL: t5_cse: +; X86-NOBMI2: # %bb.0: +; X86-NOBMI2-NEXT: pushl %ebx +; X86-NOBMI2-NEXT: pushl %edi +; X86-NOBMI2-NEXT: pushl %esi +; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NOBMI2-NEXT: movl %eax, %ebx +; X86-NOBMI2-NEXT: addl $32, %ebx +; X86-NOBMI2-NEXT: adcl $0, %edi +; X86-NOBMI2-NEXT: movl %ebx, (%ecx) +; X86-NOBMI2-NEXT: movl %edi, 4(%ecx) +; X86-NOBMI2-NEXT: movb $32, %cl +; X86-NOBMI2-NEXT: subb %al, %cl +; X86-NOBMI2-NEXT: movl %esi, %eax +; X86-NOBMI2-NEXT: shll %cl, %eax +; X86-NOBMI2-NEXT: shldl %cl, %esi, %edx +; X86-NOBMI2-NEXT: testb $32, %cl +; X86-NOBMI2-NEXT: je .LBB5_2 +; X86-NOBMI2-NEXT: # %bb.1: +; X86-NOBMI2-NEXT: movl %eax, %edx +; X86-NOBMI2-NEXT: xorl %eax, %eax +; X86-NOBMI2-NEXT: .LBB5_2: +; X86-NOBMI2-NEXT: popl %esi +; X86-NOBMI2-NEXT: popl %edi +; X86-NOBMI2-NEXT: popl %ebx +; X86-NOBMI2-NEXT: retl ; -; X32-BMI2-LABEL: t5_cse: -; X32-BMI2: # %bb.0: -; X32-BMI2-NEXT: pushl %ebx -; X32-BMI2-NEXT: pushl %edi -; X32-BMI2-NEXT: pushl %esi -; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-BMI2-NEXT: movl %ebx, %edi -; X32-BMI2-NEXT: addl $32, %edi -; X32-BMI2-NEXT: adcl $0, %esi -; X32-BMI2-NEXT: movl %edi, (%ecx) -; X32-BMI2-NEXT: movl %esi, 4(%ecx) -; X32-BMI2-NEXT: movb $32, %cl -; X32-BMI2-NEXT: subb %bl, %cl -; X32-BMI2-NEXT: shldl %cl, %eax, %edx -; X32-BMI2-NEXT: shlxl %ecx, %eax, %eax -; X32-BMI2-NEXT: testb $32, %cl -; X32-BMI2-NEXT: je .LBB5_2 -; X32-BMI2-NEXT: # %bb.1: -; X32-BMI2-NEXT: movl %eax, %edx -; X32-BMI2-NEXT: xorl %eax, %eax -; X32-BMI2-NEXT: .LBB5_2: -; X32-BMI2-NEXT: popl %esi -; X32-BMI2-NEXT: popl %edi -; X32-BMI2-NEXT: popl %ebx -; X32-BMI2-NEXT: retl +; X86-BMI2-LABEL: t5_cse: +; X86-BMI2: # %bb.0: +; X86-BMI2-NEXT: pushl %ebx +; X86-BMI2-NEXT: pushl %edi +; X86-BMI2-NEXT: pushl %esi +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-BMI2-NEXT: movl %ebx, %edi +; X86-BMI2-NEXT: addl $32, %edi +; X86-BMI2-NEXT: adcl $0, %esi +; X86-BMI2-NEXT: movl %edi, (%ecx) +; X86-BMI2-NEXT: movl %esi, 4(%ecx) +; X86-BMI2-NEXT: movb $32, %cl +; X86-BMI2-NEXT: subb %bl, %cl +; X86-BMI2-NEXT: shldl 
%cl, %eax, %edx +; X86-BMI2-NEXT: shlxl %ecx, %eax, %eax +; X86-BMI2-NEXT: testb $32, %cl +; X86-BMI2-NEXT: je .LBB5_2 +; X86-BMI2-NEXT: # %bb.1: +; X86-BMI2-NEXT: movl %eax, %edx +; X86-BMI2-NEXT: xorl %eax, %eax +; X86-BMI2-NEXT: .LBB5_2: +; X86-BMI2-NEXT: popl %esi +; X86-BMI2-NEXT: popl %edi +; X86-BMI2-NEXT: popl %ebx +; X86-BMI2-NEXT: retl %incshamt = add i64 %shamt, 32 store i64 %incshamt, ptr %dst %negshamt = sub i64 32, %shamt @@ -386,56 +386,56 @@ define i64 @t6_cse2(i64 %val, i64 %shamt, ptr%dst) nounwind { ; X64-BMI2-NEXT: shlxq %rax, %rdi, %rax ; X64-BMI2-NEXT: retq ; -; X32-NOBMI2-LABEL: t6_cse2: -; X32-NOBMI2: # %bb.0: -; X32-NOBMI2-NEXT: pushl %edi -; X32-NOBMI2-NEXT: pushl %esi -; X32-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NOBMI2-NEXT: xorl %edi, %edi -; X32-NOBMI2-NEXT: movl $32, %ecx -; X32-NOBMI2-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X32-NOBMI2-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X32-NOBMI2-NEXT: movl %ecx, (%eax) -; X32-NOBMI2-NEXT: movl %edi, 4(%eax) -; X32-NOBMI2-NEXT: movl %esi, %eax -; X32-NOBMI2-NEXT: shll %cl, %eax -; X32-NOBMI2-NEXT: shldl %cl, %esi, %edx -; X32-NOBMI2-NEXT: testb $32, %cl -; X32-NOBMI2-NEXT: je .LBB6_2 -; X32-NOBMI2-NEXT: # %bb.1: -; X32-NOBMI2-NEXT: movl %eax, %edx -; X32-NOBMI2-NEXT: xorl %eax, %eax -; X32-NOBMI2-NEXT: .LBB6_2: -; X32-NOBMI2-NEXT: popl %esi -; X32-NOBMI2-NEXT: popl %edi -; X32-NOBMI2-NEXT: retl +; X86-NOBMI2-LABEL: t6_cse2: +; X86-NOBMI2: # %bb.0: +; X86-NOBMI2-NEXT: pushl %edi +; X86-NOBMI2-NEXT: pushl %esi +; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOBMI2-NEXT: xorl %edi, %edi +; X86-NOBMI2-NEXT: movl $32, %ecx +; X86-NOBMI2-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI2-NEXT: sbbl {{[0-9]+}}(%esp), %edi +; X86-NOBMI2-NEXT: movl %ecx, (%eax) +; X86-NOBMI2-NEXT: movl %edi, 4(%eax) +; X86-NOBMI2-NEXT: movl %esi, %eax +; X86-NOBMI2-NEXT: shll %cl, %eax +; X86-NOBMI2-NEXT: shldl %cl, %esi, %edx +; X86-NOBMI2-NEXT: testb $32, %cl +; X86-NOBMI2-NEXT: je .LBB6_2 +; X86-NOBMI2-NEXT: # %bb.1: +; X86-NOBMI2-NEXT: movl %eax, %edx +; X86-NOBMI2-NEXT: xorl %eax, %eax +; X86-NOBMI2-NEXT: .LBB6_2: +; X86-NOBMI2-NEXT: popl %esi +; X86-NOBMI2-NEXT: popl %edi +; X86-NOBMI2-NEXT: retl ; -; X32-BMI2-LABEL: t6_cse2: -; X32-BMI2: # %bb.0: -; X32-BMI2-NEXT: pushl %edi -; X32-BMI2-NEXT: pushl %esi -; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-BMI2-NEXT: xorl %edi, %edi -; X32-BMI2-NEXT: movl $32, %ecx -; X32-BMI2-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X32-BMI2-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X32-BMI2-NEXT: movl %ecx, (%esi) -; X32-BMI2-NEXT: movl %edi, 4(%esi) -; X32-BMI2-NEXT: shldl %cl, %eax, %edx -; X32-BMI2-NEXT: shlxl %ecx, %eax, %eax -; X32-BMI2-NEXT: testb $32, %cl -; X32-BMI2-NEXT: je .LBB6_2 -; X32-BMI2-NEXT: # %bb.1: -; X32-BMI2-NEXT: movl %eax, %edx -; X32-BMI2-NEXT: xorl %eax, %eax -; X32-BMI2-NEXT: .LBB6_2: -; X32-BMI2-NEXT: popl %esi -; X32-BMI2-NEXT: popl %edi -; X32-BMI2-NEXT: retl +; X86-BMI2-LABEL: t6_cse2: +; X86-BMI2: # %bb.0: +; X86-BMI2-NEXT: pushl %edi +; X86-BMI2-NEXT: pushl %esi +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-BMI2-NEXT: xorl %edi, %edi +; X86-BMI2-NEXT: movl $32, %ecx +; X86-BMI2-NEXT: subl 
{{[0-9]+}}(%esp), %ecx +; X86-BMI2-NEXT: sbbl {{[0-9]+}}(%esp), %edi +; X86-BMI2-NEXT: movl %ecx, (%esi) +; X86-BMI2-NEXT: movl %edi, 4(%esi) +; X86-BMI2-NEXT: shldl %cl, %eax, %edx +; X86-BMI2-NEXT: shlxl %ecx, %eax, %eax +; X86-BMI2-NEXT: testb $32, %cl +; X86-BMI2-NEXT: je .LBB6_2 +; X86-BMI2-NEXT: # %bb.1: +; X86-BMI2-NEXT: movl %eax, %edx +; X86-BMI2-NEXT: xorl %eax, %eax +; X86-BMI2-NEXT: .LBB6_2: +; X86-BMI2-NEXT: popl %esi +; X86-BMI2-NEXT: popl %edi +; X86-BMI2-NEXT: retl %negshamt = sub i64 32, %shamt store i64 %negshamt, ptr %dst %shifted = shl i64 %val, %negshamt From 63e30747815a20e9976c5bcedb81e8b44cbec582 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 4 Jan 2024 17:12:12 +0000 Subject: [PATCH 255/313] [X86] aligned-variadic.ll - replace X32 checks with X86. NFC. We try to use X32 for gnux32 triples only. --- llvm/test/CodeGen/X86/aligned-variadic.ll | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/test/CodeGen/X86/aligned-variadic.ll b/llvm/test/CodeGen/X86/aligned-variadic.ll index 28cfff9d85a68..bc60662def2bf 100644 --- a/llvm/test/CodeGen/X86/aligned-variadic.ll +++ b/llvm/test/CodeGen/X86/aligned-variadic.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -stack-symbol-ordering=0 | FileCheck %s -check-prefix=X64 -; RUN: llc < %s -mtriple=i686-apple-darwin -stack-symbol-ordering=0 | FileCheck %s -check-prefix=X32 +; RUN: llc < %s -mtriple=i686-apple-darwin -stack-symbol-ordering=0 | FileCheck %s -check-prefix=X86 %struct.Baz = type { [17 x i8] } %struct.__va_list_tag = type { i32, i32, ptr, ptr } @@ -13,8 +13,8 @@ entry: %overflow_arg_area = load ptr, ptr %overflow_arg_area_p, align 8 %overflow_arg_area.next = getelementptr i8, ptr %overflow_arg_area, i64 24 store ptr %overflow_arg_area.next, ptr %overflow_arg_area_p, align 8 -; X32: leal 68(%esp), [[REG:%.*]] -; X32: movl [[REG]], 16(%esp) +; X86: leal 68(%esp), [[REG:%.*]] +; X86: movl [[REG]], 16(%esp) ; X64: leaq 256(%rsp), [[REG:%.*]] ; X64: movq [[REG]], 184(%rsp) ; X64: leaq 176(%rsp), %rdi From 2cbf6526157958531e5765e7aa6faa53bfac5c5a Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 4 Jan 2024 17:13:20 +0000 Subject: [PATCH 256/313] [X86] avx512-pmovxrm.ll - replace X32 checks with X86. NFC. We try to use X32 for gnux32 triples only. 
--- llvm/test/CodeGen/X86/avx512-pmovxrm.ll | 122 ++++++++++++------------ 1 file changed, 61 insertions(+), 61 deletions(-) diff --git a/llvm/test/CodeGen/X86/avx512-pmovxrm.ll b/llvm/test/CodeGen/X86/avx512-pmovxrm.ll index 0b3190be62a53..237ae7a5c64ad 100644 --- a/llvm/test/CodeGen/X86/avx512-pmovxrm.ll +++ b/llvm/test/CodeGen/X86/avx512-pmovxrm.ll @@ -1,13 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin -mattr=+avx512f,avx512bw | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin -mattr=+avx512f,avx512bw | FileCheck %s --check-prefix=X86 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx512f,avx512bw | FileCheck %s --check-prefix=X64 define <32 x i16> @test_llvm_x86_avx512_pmovsxbw(ptr %a) { -; X32-LABEL: test_llvm_x86_avx512_pmovsxbw: -; X32: ## %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vpmovsxbw (%eax), %zmm0 -; X32-NEXT: retl +; X86-LABEL: test_llvm_x86_avx512_pmovsxbw: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpmovsxbw (%eax), %zmm0 +; X86-NEXT: retl ; ; X64-LABEL: test_llvm_x86_avx512_pmovsxbw: ; X64: ## %bb.0: @@ -19,11 +19,11 @@ define <32 x i16> @test_llvm_x86_avx512_pmovsxbw(ptr %a) { } define <16 x i32> @test_llvm_x86_avx512_pmovsxbd(ptr %a) { -; X32-LABEL: test_llvm_x86_avx512_pmovsxbd: -; X32: ## %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vpmovsxbd (%eax), %zmm0 -; X32-NEXT: retl +; X86-LABEL: test_llvm_x86_avx512_pmovsxbd: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpmovsxbd (%eax), %zmm0 +; X86-NEXT: retl ; ; X64-LABEL: test_llvm_x86_avx512_pmovsxbd: ; X64: ## %bb.0: @@ -35,11 +35,11 @@ define <16 x i32> @test_llvm_x86_avx512_pmovsxbd(ptr %a) { } define <8 x i64> @test_llvm_x86_avx512_pmovsxbq(ptr %a) { -; X32-LABEL: test_llvm_x86_avx512_pmovsxbq: -; X32: ## %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vpmovsxbq (%eax), %zmm0 -; X32-NEXT: retl +; X86-LABEL: test_llvm_x86_avx512_pmovsxbq: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpmovsxbq (%eax), %zmm0 +; X86-NEXT: retl ; ; X64-LABEL: test_llvm_x86_avx512_pmovsxbq: ; X64: ## %bb.0: @@ -52,11 +52,11 @@ define <8 x i64> @test_llvm_x86_avx512_pmovsxbq(ptr %a) { } define <16 x i32> @test_llvm_x86_avx512_pmovsxwd(ptr %a) { -; X32-LABEL: test_llvm_x86_avx512_pmovsxwd: -; X32: ## %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vpmovsxwd (%eax), %zmm0 -; X32-NEXT: retl +; X86-LABEL: test_llvm_x86_avx512_pmovsxwd: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpmovsxwd (%eax), %zmm0 +; X86-NEXT: retl ; ; X64-LABEL: test_llvm_x86_avx512_pmovsxwd: ; X64: ## %bb.0: @@ -68,11 +68,11 @@ define <16 x i32> @test_llvm_x86_avx512_pmovsxwd(ptr %a) { } define <8 x i64> @test_llvm_x86_avx512_pmovsxwq(ptr %a) { -; X32-LABEL: test_llvm_x86_avx512_pmovsxwq: -; X32: ## %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vpmovsxwq (%eax), %zmm0 -; X32-NEXT: retl +; X86-LABEL: test_llvm_x86_avx512_pmovsxwq: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpmovsxwq (%eax), %zmm0 +; X86-NEXT: retl ; ; X64-LABEL: test_llvm_x86_avx512_pmovsxwq: ; X64: ## %bb.0: @@ -84,11 +84,11 @@ define <8 x i64> @test_llvm_x86_avx512_pmovsxwq(ptr %a) { } define <8 x i64> @test_llvm_x86_avx512_pmovsxdq(ptr %a) { -; X32-LABEL: test_llvm_x86_avx512_pmovsxdq: -; X32: ## %bb.0: -; X32-NEXT: movl 
{{[0-9]+}}(%esp), %eax -; X32-NEXT: vpmovsxdq (%eax), %zmm0 -; X32-NEXT: retl +; X86-LABEL: test_llvm_x86_avx512_pmovsxdq: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpmovsxdq (%eax), %zmm0 +; X86-NEXT: retl ; ; X64-LABEL: test_llvm_x86_avx512_pmovsxdq: ; X64: ## %bb.0: @@ -100,11 +100,11 @@ define <8 x i64> @test_llvm_x86_avx512_pmovsxdq(ptr %a) { } define <32 x i16> @test_llvm_x86_avx512_pmovzxbw(ptr %a) { -; X32-LABEL: test_llvm_x86_avx512_pmovzxbw: -; X32: ## %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vpmovzxbw {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero,mem[16],zero,mem[17],zero,mem[18],zero,mem[19],zero,mem[20],zero,mem[21],zero,mem[22],zero,mem[23],zero,mem[24],zero,mem[25],zero,mem[26],zero,mem[27],zero,mem[28],zero,mem[29],zero,mem[30],zero,mem[31],zero -; X32-NEXT: retl +; X86-LABEL: test_llvm_x86_avx512_pmovzxbw: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpmovzxbw {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero,mem[16],zero,mem[17],zero,mem[18],zero,mem[19],zero,mem[20],zero,mem[21],zero,mem[22],zero,mem[23],zero,mem[24],zero,mem[25],zero,mem[26],zero,mem[27],zero,mem[28],zero,mem[29],zero,mem[30],zero,mem[31],zero +; X86-NEXT: retl ; ; X64-LABEL: test_llvm_x86_avx512_pmovzxbw: ; X64: ## %bb.0: @@ -116,11 +116,11 @@ define <32 x i16> @test_llvm_x86_avx512_pmovzxbw(ptr %a) { } define <16 x i32> @test_llvm_x86_avx512_pmovzxbd(ptr %a) { -; X32-LABEL: test_llvm_x86_avx512_pmovzxbd: -; X32: ## %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; X32-NEXT: retl +; X86-LABEL: test_llvm_x86_avx512_pmovzxbd: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; X86-NEXT: retl ; ; X64-LABEL: test_llvm_x86_avx512_pmovzxbd: ; X64: ## %bb.0: @@ -132,11 +132,11 @@ define <16 x i32> @test_llvm_x86_avx512_pmovzxbd(ptr %a) { } define <8 x i64> @test_llvm_x86_avx512_pmovzxbq(ptr %a) { -; X32-LABEL: test_llvm_x86_avx512_pmovzxbq: -; X32: ## %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero -; X32-NEXT: retl +; X86-LABEL: 
test_llvm_x86_avx512_pmovzxbq: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: retl ; ; X64-LABEL: test_llvm_x86_avx512_pmovzxbq: ; X64: ## %bb.0: @@ -149,11 +149,11 @@ define <8 x i64> @test_llvm_x86_avx512_pmovzxbq(ptr %a) { } define <16 x i32> @test_llvm_x86_avx512_pmovzxwd(ptr %a) { -; X32-LABEL: test_llvm_x86_avx512_pmovzxwd: -; X32: ## %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; X32-NEXT: retl +; X86-LABEL: test_llvm_x86_avx512_pmovzxwd: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; X86-NEXT: retl ; ; X64-LABEL: test_llvm_x86_avx512_pmovzxwd: ; X64: ## %bb.0: @@ -165,11 +165,11 @@ define <16 x i32> @test_llvm_x86_avx512_pmovzxwd(ptr %a) { } define <8 x i64> @test_llvm_x86_avx512_pmovzxwq(ptr %a) { -; X32-LABEL: test_llvm_x86_avx512_pmovzxwq: -; X32: ## %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vpmovzxwq {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; X32-NEXT: retl +; X86-LABEL: test_llvm_x86_avx512_pmovzxwq: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpmovzxwq {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X86-NEXT: retl ; ; X64-LABEL: test_llvm_x86_avx512_pmovzxwq: ; X64: ## %bb.0: @@ -181,11 +181,11 @@ define <8 x i64> @test_llvm_x86_avx512_pmovzxwq(ptr %a) { } define <8 x i64> @test_llvm_x86_avx512_pmovzxdq(ptr %a) { -; X32-LABEL: test_llvm_x86_avx512_pmovzxdq: -; X32: ## %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vpmovzxdq {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; X32-NEXT: retl +; X86-LABEL: test_llvm_x86_avx512_pmovzxdq: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpmovzxdq {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-NEXT: retl ; ; X64-LABEL: test_llvm_x86_avx512_pmovzxdq: ; X64: ## %bb.0: From db9a16eaedd67f5b7c2dad68f1e3f2799d86d590 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Thu, 4 Jan 2024 17:24:22 +0000 Subject: [PATCH 257/313] [mlir][nfc] Update comments in the Linalg vectoriser (#76797) --- .../Linalg/Transforms/Vectorization.cpp | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index 
be813df8e782f..5d99951ef09a9 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -894,10 +894,16 @@ static bool isContiguousLoadIdx(LinalgOp &linalgOp, Value &val, return result; } -/// Check whether \p extractOp would be a gather or a contiguous load Op after -/// vectorising \p linalgOp. Note that it is always safe to use gather load -/// operations for contiguous loads (albeit slow), but not vice-versa. When in -/// doubt, bail out and assume that \p extractOp is a gather load. +/// Infer the memory access pattern for the input ExtractOp +/// +/// Based on the operation shapes and indices (usually based on the iteration +/// space of the parent `linalgOp` operation), decides whether the input +/// ExtractOp is a contiguous load (including a broadcast of a scalar) or a +/// gather load. +/// +/// Note that it is always safe to use gather load operations for contiguous +/// loads (albeit slow), but not vice-versa. When in doubt, bail out and assume +/// that `extractOp` is a gather load. static VectorMemoryAccessKind getTensorExtractMemoryAccessPattern(tensor::ExtractOp extractOp, LinalgOp &linalgOp) { @@ -916,8 +922,8 @@ getTensorExtractMemoryAccessPattern(tensor::ExtractOp extractOp, return VectorMemoryAccessKind::Gather; // 1. Assume that it's a gather load when reading _into_: - // * an n-D vector, like`tensor<1x2x4xi32` or`tensor<2x1x4xi32>`, or - // * a 1-D vector with the trailing dim equal 1, e.g. `tensor<1x4x1xi32`. + // * an n-D "vector", like `tensor<1x2x4xi32` or `tensor<2x1x4xi32>`, or + // * a 1-D "vector" with the trailing dim equal 1, e.g. `tensor<1x4x1xi32`. // TODO: Relax these conditions. // FIXME: This condition assumes non-dynamic sizes. if ((llvm::count_if(targetShape, From 79a2e2b9e8d0dffa602375c2386371666b412272 Mon Sep 17 00:00:00 2001 From: Nishant Mittal Date: Thu, 4 Jan 2024 22:54:54 +0530 Subject: [PATCH 258/313] [libc][math] Fix `is_quiet_nan` function in FPBits (#76931) --- libc/src/__support/FPUtil/FPBits.h | 8 ++++---- libc/src/__support/FPUtil/x86_64/LongDoubleBits.h | 4 ++++ libc/test/src/__support/FPUtil/fpbits_test.cpp | 15 +++++++++++++++ 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/libc/src/__support/FPUtil/FPBits.h b/libc/src/__support/FPUtil/FPBits.h index 63eeba85f15e5..93e32ba7cc941 100644 --- a/libc/src/__support/FPUtil/FPBits.h +++ b/libc/src/__support/FPUtil/FPBits.h @@ -162,9 +162,9 @@ struct FPRepBase : public internal::FPLayout { ? bit_at(SIG_LEN - 1) | bit_at(SIG_LEN - 2) // 0b1100... : bit_at(SIG_LEN - 1); // 0b1000... - // If a number x is a NAN, then it is a signalling NAN if: - // SIGNALING_NAN_MASK & bits(x) != 0 - LIBC_INLINE_VAR static constexpr StorageType SIGNALING_NAN_MASK = + // Mask to generate a default signaling NAN. Any NAN that is not + // a quiet NAN is considered a signaling NAN. + LIBC_INLINE_VAR static constexpr StorageType DEFAULT_SIGNALING_NAN = fp_type == FPType::X86_Binary80 ? bit_at(SIG_LEN - 1) | bit_at(SIG_LEN - 3) // 0b1010... : bit_at(SIG_LEN - 2); // 0b0100... 
@@ -356,7 +356,7 @@ template struct FPBits : public internal::FPRep()> { } LIBC_INLINE constexpr bool is_quiet_nan() const { - return (bits & EXP_SIG_MASK) == (EXP_MASK | QUIET_NAN_MASK); + return (bits & EXP_SIG_MASK) >= (EXP_MASK | QUIET_NAN_MASK); } LIBC_INLINE constexpr bool is_inf_or_nan() const { diff --git a/libc/src/__support/FPUtil/x86_64/LongDoubleBits.h b/libc/src/__support/FPUtil/x86_64/LongDoubleBits.h index 8abc0c87af0d2..c18abcee77ea5 100644 --- a/libc/src/__support/FPUtil/x86_64/LongDoubleBits.h +++ b/libc/src/__support/FPUtil/x86_64/LongDoubleBits.h @@ -114,6 +114,10 @@ struct FPBits : public internal::FPRep { (get_biased_exponent() != 0 && get_implicit_bit() == 0); } + LIBC_INLINE constexpr bool is_quiet_nan() const { + return (bits & EXP_SIG_MASK) >= (EXP_MASK | QUIET_NAN_MASK); + } + // Methods below this are used by tests. LIBC_INLINE static constexpr long double zero(bool sign = false) { diff --git a/libc/test/src/__support/FPUtil/fpbits_test.cpp b/libc/test/src/__support/FPUtil/fpbits_test.cpp index fa743855c4861..e2dbe248ef213 100644 --- a/libc/test/src/__support/FPUtil/fpbits_test.cpp +++ b/libc/test/src/__support/FPUtil/fpbits_test.cpp @@ -69,6 +69,9 @@ TEST(LlvmLibcFPBitsTest, FloatType) { EXPECT_EQ(negnum.uintval(), static_cast(0xBF900000)); EXPECT_STREQ(LIBC_NAMESPACE::str(negnum).c_str(), "0xBF900000 = (S: 1, E: 0x007F, M: 0x00100000)"); + + FloatBits quiet_nan = FloatBits(FloatBits::build_quiet_nan(1)); + EXPECT_EQ(quiet_nan.is_quiet_nan(), true); } TEST(LlvmLibcFPBitsTest, DoubleType) { @@ -129,6 +132,9 @@ TEST(LlvmLibcFPBitsTest, DoubleType) { EXPECT_EQ(negnum.uintval(), static_cast(0xBFF2000000000000)); EXPECT_STREQ(LIBC_NAMESPACE::str(negnum).c_str(), "0xBFF2000000000000 = (S: 1, E: 0x03FF, M: 0x0002000000000000)"); + + DoubleBits quiet_nan = DoubleBits(DoubleBits::build_quiet_nan(1)); + EXPECT_EQ(quiet_nan.is_quiet_nan(), true); } #ifdef LIBC_TARGET_ARCH_IS_X86 @@ -210,6 +216,9 @@ TEST(LlvmLibcFPBitsTest, X86LongDoubleType) { LIBC_NAMESPACE::str(negnum).c_str(), "0x000000000000BFFF9000000000000000 = " "(S: 1, E: 0x3FFF, I: 1, M: 0x00000000000000001000000000000000)"); + + LongDoubleBits quiet_nan = LongDoubleBits(LongDoubleBits::build_quiet_nan(1)); + EXPECT_EQ(quiet_nan.is_quiet_nan(), true); } #else TEST(LlvmLibcFPBitsTest, LongDoubleType) { @@ -284,6 +293,9 @@ TEST(LlvmLibcFPBitsTest, LongDoubleType) { EXPECT_STREQ(LIBC_NAMESPACE::str(negnum).c_str(), "0xBFFF2000000000000000000000000000 = " "(S: 1, E: 0x3FFF, M: 0x00002000000000000000000000000000)"); + + LongDoubleBits quiet_nan = LongDoubleBits(LongDoubleBits::build_quiet_nan(1)); + EXPECT_EQ(quiet_nan.is_quiet_nan(), true); #endif } #endif @@ -357,5 +369,8 @@ TEST(LlvmLibcFPBitsTest, Float128Type) { EXPECT_STREQ(LIBC_NAMESPACE::str(negnum).c_str(), "0xBFFF2000000000000000000000000000 = " "(S: 1, E: 0x3FFF, M: 0x00002000000000000000000000000000)"); + + Float128Bits quiet_nan = Float128Bits(Float128Bits::build_quiet_nan(1)); + EXPECT_EQ(quiet_nan.is_quiet_nan(), true); } #endif // LIBC_COMPILER_HAS_FLOAT128 From 9f9dd6be0d21a156dcfee01ebbd571eca79b08bd Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Thu, 4 Jan 2024 09:28:45 -0800 Subject: [PATCH 259/313] Wrap local type declarations in anonymous namespace to fix modules build. 
--- lldb/source/Plugins/JITLoader/GDB/JITLoaderGDB.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/source/Plugins/JITLoader/GDB/JITLoaderGDB.cpp b/lldb/source/Plugins/JITLoader/GDB/JITLoaderGDB.cpp index 5326a73166e72..1688fb27430a7 100644 --- a/lldb/source/Plugins/JITLoader/GDB/JITLoaderGDB.cpp +++ b/lldb/source/Plugins/JITLoader/GDB/JITLoaderGDB.cpp @@ -35,6 +35,7 @@ using namespace lldb_private; LLDB_PLUGIN_DEFINE(JITLoaderGDB) +namespace { // Debug Interface Structures enum jit_actions_t { JIT_NOACTION = 0, JIT_REGISTER_FN, JIT_UNREGISTER_FN }; @@ -52,7 +53,6 @@ template struct jit_descriptor { ptr_t first_entry; // pointer }; -namespace { enum EnableJITLoaderGDB { eEnableJITLoaderGDBDefault, eEnableJITLoaderGDBOn, From e78a1f491cbc0a57de7bf86058359dd0bd282540 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E9=9B=A8=E5=9F=B9?= Date: Fri, 5 Jan 2024 01:32:10 +0800 Subject: [PATCH 260/313] [Clang] Fix the instantiation of return type requirements in lambda bodies (#76967) Currently, due to the incomplete implementation of p0588r1, the instantiation of lambda expressions leads to the instantiation of the body. And `EvaluateConstraints` is false during the instantiation of the body, which causes crashes during the instantiation of the return type requirement: ```cpp template concept doesnt_matter = true; template concept test = []{ return requires(T t) { { t } -> doesnt_matter; // crash }; }(); static_assert(test); ``` Although a complete implementation of p0588r1 can solve these crashes, it will take some time. Therefore, this pull request aims to fix these crashes first. Fixes https://github.com/llvm/llvm-project/issues/63808 Fixes https://github.com/llvm/llvm-project/issues/64607 Fixes https://github.com/llvm/llvm-project/issues/64086 --- clang/docs/ReleaseNotes.rst | 5 ++++ clang/lib/Sema/SemaTemplateInstantiate.cpp | 17 +++++++++++ clang/test/SemaTemplate/concepts-lambda.cpp | 33 +++++++++++++++++++++ 3 files changed, 55 insertions(+) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index c7bf162426a68..e5de042cebd4c 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -844,6 +844,11 @@ Bug Fixes to C++ Support - Fix crash when parsing nested requirement. Fixes: (`#73112 `_) +- Fixed a crash caused by using return type requirement in a lambda. Fixes: + (`#63808 `_) + (`#64607 `_) + (`#64086 `_) + Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ - Fixed an import failure of recursive friend class template. diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index 37e5b9cad08bc..e1cbdcd72eac1 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -1479,6 +1479,23 @@ namespace { return Result; } + StmtResult TransformLambdaBody(LambdaExpr *E, Stmt *Body) { + // Currently, we instantiate the body when instantiating the lambda + // expression. However, `EvaluateConstraints` is disabled during the + // instantiation of the lambda expression, causing the instantiation + // failure of the return type requirement in the body. If p0588r1 is fully + // implemented, the body will be lazily instantiated, and this problem + // will not occur. Here, `EvaluateConstraints` is temporarily set to + // `true` to temporarily fix this issue. + // FIXME: This temporary fix can be removed after fully implementing + // p0588r1. 
+ bool Prev = EvaluateConstraints; + EvaluateConstraints = true; + StmtResult Stmt = inherited::TransformLambdaBody(E, Body); + EvaluateConstraints = Prev; + return Stmt; + } + ExprResult TransformRequiresExpr(RequiresExpr *E) { LocalInstantiationScope Scope(SemaRef, /*CombineWithOuterScope=*/true); ExprResult TransReq = inherited::TransformRequiresExpr(E); diff --git a/clang/test/SemaTemplate/concepts-lambda.cpp b/clang/test/SemaTemplate/concepts-lambda.cpp index 8a184cbf4e9bc..7e431529427df 100644 --- a/clang/test/SemaTemplate/concepts-lambda.cpp +++ b/clang/test/SemaTemplate/concepts-lambda.cpp @@ -116,3 +116,36 @@ static_assert(E); // expected-note@-11{{because 'Q.template operator()()' would be invalid: no matching member function for call to 'operator()'}} } } + +namespace ReturnTypeRequirementInLambda { +template +concept C1 = true; + +template +concept test = [] { + return requires(T t) { + { t } -> C1; + }; +}(); + +static_assert(test); + +template +concept C2 = true; +struct S1 { + int f1() { return 1; } +}; + +void foo() { + auto make_caller = [] { + return [](S1 *ps) { + if constexpr (requires { + { (ps->*member)() } -> C2; + }) + ; + }; + }; + + auto caller = make_caller.operator()<&S1::f1>(); +} +} // namespace ReturnTypeRequirementInLambda From 4004f655ceb9623608ba0471aa7037c142956e31 Mon Sep 17 00:00:00 2001 From: Zequan Wu Date: Thu, 4 Jan 2024 12:27:03 -0500 Subject: [PATCH 261/313] [LLDB][NativePDB] Fix use-after-free error detected by asan. --- .../Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp b/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp index 8375010ae3ded..9234768323e71 100644 --- a/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp +++ b/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp @@ -2265,7 +2265,8 @@ void SymbolFileNativePDB::BuildParentMap() { } for (TypeIndex fwd : fwd_keys) { TypeIndex full = forward_to_full[fwd]; - m_parent_types[full] = m_parent_types[fwd]; + TypeIndex parent_idx = m_parent_types[fwd]; + m_parent_types[full] = parent_idx; } for (TypeIndex full : full_keys) { TypeIndex fwd = full_to_forward[full]; From a960703466e937d99ab7a7a29f7448e1bc926e35 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 4 Jan 2024 09:48:40 -0800 Subject: [PATCH 262/313] [RISCV] Remove incomplete PRE_DEC/POST_DEC code for XTHeadMemIdx. (#76922) As far as I can tell if getIndexedAddressParts received an ISD::SUB, the constant would be negated. So `IsInc` should be set to true since the SUB was effectively converted to ADD. This means we should never use PRE_DEC/POST_DEC. No tests are affected because DAGCombine aggressively turns SUB with constant into ADD so no lit test has a SUB reach getIndexedAddressParts. 
--- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 10 ++++------ llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 15 +++++---------- llvm/lib/Target/RISCV/RISCVISelLowering.h | 3 +-- 3 files changed, 10 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index bfa3bf3cc74e2..7257c2e8fe1f6 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -763,14 +763,12 @@ bool RISCVDAGToDAGISel::tryIndexedLoad(SDNode *Node) { return false; EVT LoadVT = Ld->getMemoryVT(); - bool IsPre = (AM == ISD::PRE_INC || AM == ISD::PRE_DEC); - bool IsPost = (AM == ISD::POST_INC || AM == ISD::POST_DEC); + assert(AM == ISD::PRE_INC || + AM == ISD::POST_INC && "Unexpected addressing mode"); + bool IsPre = AM == ISD::PRE_INC; + bool IsPost = AM == ISD::POST_INC; int64_t Offset = C->getSExtValue(); - // Convert decrements to increments by a negative quantity. - if (AM == ISD::PRE_DEC || AM == ISD::POST_DEC) - Offset = -Offset; - // The constants that can be encoded in the THeadMemIdx instructions // are of the form (sign_extend(imm5) << imm2). int64_t Shift; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index ae0e6ef101329..bc4b2b022c0ae 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1350,8 +1350,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, } if (Subtarget.hasVendorXTHeadMemIdx()) { - for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::POST_DEC; - ++im) { + for (unsigned im : {ISD::PRE_INC, ISD::POST_INC}) { setIndexedLoadAction(im, MVT::i8, Legal); setIndexedStoreAction(im, MVT::i8, Legal); setIndexedLoadAction(im, MVT::i16, Legal); @@ -19269,7 +19268,6 @@ bool RISCVTargetLowering::isVScaleKnownToBeAPowerOfTwo() const { bool RISCVTargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, - bool &IsInc, SelectionDAG &DAG) const { // Target does not support indexed loads. if (!Subtarget.hasVendorXTHeadMemIdx()) @@ -19296,7 +19294,6 @@ bool RISCVTargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base, if (!isLegalIndexedOffset) return false; - IsInc = (Op->getOpcode() == ISD::ADD); Offset = Op->getOperand(1); return true; } @@ -19319,11 +19316,10 @@ bool RISCVTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, } else return false; - bool IsInc; - if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG)) + if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, DAG)) return false; - AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC; + AM = ISD::PRE_INC; return true; } @@ -19343,15 +19339,14 @@ bool RISCVTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, } else return false; - bool IsInc; - if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG)) + if (!getIndexedAddressParts(Op, Base, Offset, AM, DAG)) return false; // Post-indexing updates the base, so it's not a valid transform // if that's not the same as the load's pointer. if (Ptr != Base) return false; - AM = IsInc ? 
ISD::POST_INC : ISD::POST_DEC; + AM = ISD::POST_INC; return true; } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 226b51ddda6e4..18f5805755816 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -774,8 +774,7 @@ class RISCVTargetLowering : public TargetLowering { bool isVScaleKnownToBeAPowerOfTwo() const override; bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset, - ISD::MemIndexedMode &AM, bool &IsInc, - SelectionDAG &DAG) const; + ISD::MemIndexedMode &AM, SelectionDAG &DAG) const; bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override; From 4f59a38821a2175408a3189327223b85ddba636f Mon Sep 17 00:00:00 2001 From: Pete Steinfeld <47540744+psteinfeld@users.noreply.github.com> Date: Thu, 4 Jan 2024 10:19:50 -0800 Subject: [PATCH 263/313] Revert #76194 (#76987) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [Flang] Revert "Allow Intrinsic simpification with min/maxloc dim and…scalar result (#76194)" This reverts commit 9b7cf5bfb08b6e506216ef354dfd61adb15acbff. See merge request #76194. This change was causing several failures in our internal tests. I'm reverting now and will work on creating a test that David Green can use to reproduce the problem. --- .../Transforms/SimplifyIntrinsics.cpp | 15 ++-- flang/test/Transforms/simplifyintrinsics.fir | 68 ++----------------- 2 files changed, 13 insertions(+), 70 deletions(-) diff --git a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp index f5ddcbbaecd21..c89ee6d5e2039 100644 --- a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp +++ b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp @@ -1162,14 +1162,11 @@ void SimplifyIntrinsicsPass::simplifyMinMaxlocReduction( mlir::Operation::operand_range args = call.getArgs(); - mlir::SymbolRefAttr callee = call.getCalleeAttr(); - mlir::StringRef funcNameBase = callee.getLeafReference().getValue(); - bool isDim = funcNameBase.ends_with("Dim"); - mlir::Value back = args[isDim ? 7 : 6]; + mlir::Value back = args[6]; if (isTrueOrNotConstant(back)) return; - mlir::Value mask = args[isDim ? 6 : 5]; + mlir::Value mask = args[5]; mlir::Value maskDef = findMaskDef(mask); // maskDef is set to NULL when the defining op is not one we accept. 
@@ -1178,8 +1175,10 @@ void SimplifyIntrinsicsPass::simplifyMinMaxlocReduction( if (maskDef == NULL) return; + mlir::SymbolRefAttr callee = call.getCalleeAttr(); + mlir::StringRef funcNameBase = callee.getLeafReference().getValue(); unsigned rank = getDimCount(args[1]); - if ((isDim && rank != 1) || !(rank > 0)) + if (funcNameBase.ends_with("Dim") || !(rank > 0)) return; fir::FirOpBuilder builder{getSimplificationBuilder(call, kindMap)}; @@ -1220,8 +1219,6 @@ void SimplifyIntrinsicsPass::simplifyMinMaxlocReduction( llvm::raw_string_ostream nameOS(funcName); outType.print(nameOS); - if (isDim) - nameOS << '_' << inputType; nameOS << '_' << fmfString; auto typeGenerator = [rank](fir::FirOpBuilder &builder) { @@ -1237,7 +1234,7 @@ void SimplifyIntrinsicsPass::simplifyMinMaxlocReduction( mlir::func::FuncOp newFunc = getOrCreateFunction(builder, funcName, typeGenerator, bodyGenerator); builder.create(loc, newFunc, - mlir::ValueRange{args[0], args[1], mask}); + mlir::ValueRange{args[0], args[1], args[5]}); call->dropAllReferences(); call->erase(); } diff --git a/flang/test/Transforms/simplifyintrinsics.fir b/flang/test/Transforms/simplifyintrinsics.fir index 61cddd4f48df8..0bd6ac7c436ff 100644 --- a/flang/test/Transforms/simplifyintrinsics.fir +++ b/flang/test/Transforms/simplifyintrinsics.fir @@ -2115,13 +2115,13 @@ func.func @_QPtestminloc_doesntwork1d_back(%arg0: !fir.ref> { // CHECK-NOT: fir.call @_FortranAMinlocInteger4x1_i32_contract_simplified({{.*}}) fastmath : (!fir.ref>, !fir.box, !fir.box) -> () // ----- -// Check Minloc is simplified when DIM arg is set so long as the result is scalar +// Check Minloc is not simplified when DIM arg is set -func.func @_QPtestminloc_1d_dim(%arg0: !fir.ref> {fir.bindc_name = "a"}) -> !fir.array<1xi32> { +func.func @_QPtestminloc_doesntwork1d_dim(%arg0: !fir.ref> {fir.bindc_name = "a"}) -> !fir.array<1xi32> { %0 = fir.alloca !fir.box> %c10 = arith.constant 10 : index %c1 = arith.constant 1 : index - %1 = fir.alloca !fir.array<1xi32> {bindc_name = "testminloc_1d_dim", uniq_name = "_QFtestminloc_1d_dimEtestminloc_1d_dim"} + %1 = fir.alloca !fir.array<1xi32> {bindc_name = "testminloc_doesntwork1d_dim", uniq_name = "_QFtestminloc_doesntwork1d_dimEtestminloc_doesntwork1d_dim"} %2 = fir.shape %c1 : (index) -> !fir.shape<1> %3 = fir.array_load %1(%2) : (!fir.ref>, !fir.shape<1>) -> !fir.array<1xi32> %4 = fir.shape %c10 : (index) -> !fir.shape<1> @@ -2156,65 +2156,11 @@ func.func @_QPtestminloc_1d_dim(%arg0: !fir.ref> {fir.bindc_n %21 = fir.load %1 : !fir.ref> return %21 : !fir.array<1xi32> } -// CHECK-LABEL: func.func @_QPtestminloc_1d_dim( +// CHECK-LABEL: func.func @_QPtestminloc_doesntwork1d_dim( // CHECK-SAME: %[[ARR:.*]]: !fir.ref> {fir.bindc_name = "a"}) -> !fir.array<1xi32> { -// CHECK: fir.call @_FortranAMinlocDimx1_i32_i32_contract_simplified({{.*}}) fastmath : (!fir.ref>, !fir.box, !fir.box) -> () - -// CHECK-LABEL: func.func private @_FortranAMinlocDimx1_i32_i32_contract_simplified(%arg0: !fir.ref>, %arg1: !fir.box, %arg2: !fir.box) attributes {llvm.linkage = #llvm.linkage} { -// CHECK-NEXT: %[[V0:.*]] = fir.alloca i32 -// CHECK-NEXT: %c0_i32 = arith.constant 0 : i32 -// CHECK-NEXT: %c1 = arith.constant 1 : index -// CHECK-NEXT: %[[V1:.*]] = fir.allocmem !fir.array<1xi32> -// CHECK-NEXT: %[[V2:.*]] = fir.shape %c1 : (index) -> !fir.shape<1> -// CHECK-NEXT: %[[V3:.*]] = fir.embox %[[V1]](%[[V2]]) : (!fir.heap>, !fir.shape<1>) -> !fir.box>> -// CHECK-NEXT: %c0 = arith.constant 0 : index -// CHECK-NEXT: %[[V4:.*]] = fir.coordinate_of %[[V3]], %c0 : 
(!fir.box>>, index) -> !fir.ref -// CHECK-NEXT: fir.store %c0_i32 to %[[V4]] : !fir.ref -// CHECK-NEXT: %c0_0 = arith.constant 0 : index -// CHECK-NEXT: %[[V5:.*]] = fir.convert %arg1 : (!fir.box) -> !fir.box> -// CHECK-NEXT: %c1_i32 = arith.constant 1 : i32 -// CHECK-NEXT: %c0_i32_1 = arith.constant 0 : i32 -// CHECK-NEXT: fir.store %c0_i32_1 to %[[V0]] : !fir.ref -// CHECK-NEXT: %c2147483647_i32 = arith.constant 2147483647 : i32 -// CHECK-NEXT: %c1_2 = arith.constant 1 : index -// CHECK-NEXT: %c0_3 = arith.constant 0 : index -// CHECK-NEXT: %[[V6:.*]]:3 = fir.box_dims %[[V5]], %c0_3 : (!fir.box>, index) -> (index, index, index) -// CHECK-NEXT: %[[V7:.*]] = arith.subi %[[V6]]#1, %c1_2 : index -// CHECK-NEXT: %[[V8:.*]] = fir.do_loop %arg3 = %c0_0 to %[[V7]] step %c1_2 iter_args(%arg4 = %c2147483647_i32) -> (i32) { -// CHECK-NEXT: fir.store %c1_i32 to %[[V0]] : !fir.ref -// CHECK-NEXT: %[[V12:.*]] = fir.coordinate_of %[[V5]], %arg3 : (!fir.box>, index) -> !fir.ref -// CHECK-NEXT: %[[V13:.*]] = fir.load %[[V12]] : !fir.ref -// CHECK-NEXT: %[[V14:.*]] = arith.cmpi slt, %[[V13]], %arg4 : i32 -// CHECK-NEXT: %[[V15:.*]] = fir.if %[[V14]] -> (i32) { -// CHECK-NEXT: %c1_i32_4 = arith.constant 1 : i32 -// CHECK-NEXT: %c0_5 = arith.constant 0 : index -// CHECK-NEXT: %[[V16:.*]] = fir.coordinate_of %[[V3]], %c0_5 : (!fir.box>>, index) -> !fir.ref -// CHECK-NEXT: %[[V17:.*]] = fir.convert %arg3 : (index) -> i32 -// CHECK-NEXT: %[[V18:.*]] = arith.addi %[[V17]], %c1_i32_4 : i32 -// CHECK-NEXT: fir.store %[[V18]] to %[[V16]] : !fir.ref -// CHECK-NEXT: fir.result %[[V13]] : i32 -// CHECK-NEXT: } else { -// CHECK-NEXT: fir.result %arg4 : i32 -// CHECK-NEXT: } -// CHECK-NEXT: fir.result %[[V15]] : i32 -// CHECK-NEXT: } -// CHECK-NEXT: %[[V9:.*]] = fir.load %[[V0]] : !fir.ref -// CHECK-NEXT: %[[V10:.*]] = arith.cmpi eq, %[[V9]], %c1_i32 : i32 -// CHECK-NEXT: fir.if %[[V10]] { -// CHECK-NEXT: %c2147483647_i32_4 = arith.constant 2147483647 : i32 -// CHECK-NEXT: %[[V12]] = arith.cmpi eq, %c2147483647_i32_4, %[[V8]] : i32 -// CHECK-NEXT: fir.if %[[V12]] { -// CHECK-NEXT: %c0_5 = arith.constant 0 : index -// CHECK-NEXT: %[[V13]] = fir.coordinate_of %[[V3]], %c0_5 : (!fir.box>>, index) -> !fir.ref -// CHECK-NEXT: fir.store %c1_i32 to %[[V13]] : !fir.ref -// CHECK-NEXT: } -// CHECK-NEXT: } -// CHECK-NEXT: %[[V11:.*]] = fir.convert %arg0 : (!fir.ref>) -> !fir.ref>>> -// CHECK-NEXT: fir.store %[[V3]] to %[[V11]] : !fir.ref>>> -// CHECK-NEXT: return -// CHECK-NEXT: } - - +// CHECK-NOT: fir.call @_FortranAMinlocDimx1_i32_contract_simplified({{.*}}) fastmath : (!fir.ref>, !fir.box, !fir.box) -> () +// CHECK: fir.call @_FortranAMinlocDim({{.*}}) fastmath : (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32, !fir.box, i1) -> none +// CHECK-NOT: fir.call @_FortranAMinlocDimx1_i32_contract_simplified({{.*}}) fastmath : (!fir.ref>, !fir.box, !fir.box) -> () // ----- // Check Minloc is not simplified when dimension of inputArr is unknown From c041fa1093c3ad7be040fb362a10ca3900c698a4 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Thu, 4 Jan 2024 10:25:27 -0800 Subject: [PATCH 264/313] XFAIL test with dsymutil --- .../functionalities/inline-sourcefile/TestInlineSourceFiles.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lldb/test/API/functionalities/inline-sourcefile/TestInlineSourceFiles.py b/lldb/test/API/functionalities/inline-sourcefile/TestInlineSourceFiles.py index 20ed0ce00661f..ce7ac6fc503ed 100644 --- a/lldb/test/API/functionalities/inline-sourcefile/TestInlineSourceFiles.py +++ 
b/lldb/test/API/functionalities/inline-sourcefile/TestInlineSourceFiles.py @@ -8,6 +8,8 @@ class InlineSourceFilesTestCase(TestBase): @skipIf(compiler="gcc") @skipIf(compiler="clang", compiler_version=["<", "18.0"]) + # dsymutil doesn't yet copy the sources + @expectedFailureDarwin(debug_info=["dsym"]) def test(self): """Test DWARF inline source files.""" self.build() From 85939e5e248213dfdf66fc8305ed502fc2f3f1f0 Mon Sep 17 00:00:00 2001 From: Valentin Clement Date: Thu, 4 Jan 2024 10:23:38 -0800 Subject: [PATCH 265/313] [mlir][openacc][NFC] Rename custom parser from WaitOperands to DeviceTypeOperandsWithSegment --- mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td | 9 +++++---- mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp | 11 +++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td index 234c1076e14e3..4312bd4de1bd4 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td @@ -976,7 +976,7 @@ def OpenACC_ParallelOp : OpenACC_Op<"parallel", `)` | `vector_length` `(` custom($vectorLength, type($vectorLength), $vectorLengthDeviceType) `)` - | `wait` `(` custom($waitOperands, + | `wait` `(` custom($waitOperands, type($waitOperands), $waitOperandsDeviceType, $waitOperandsSegments) `)` | `self` `(` $selfCond `)` | `if` `(` $ifCond `)` @@ -1075,7 +1075,7 @@ def OpenACC_SerialOp : OpenACC_Op<"serial", | `private` `(` custom( $gangPrivateOperands, type($gangPrivateOperands), $privatizations) `)` - | `wait` `(` custom($waitOperands, + | `wait` `(` custom($waitOperands, type($waitOperands), $waitOperandsDeviceType, $waitOperandsSegments) `)` | `self` `(` $selfCond `)` | `if` `(` $ifCond `)` @@ -1196,8 +1196,9 @@ def OpenACC_KernelsOp : OpenACC_Op<"kernels", type($numWorkers), $numWorkersDeviceType) `)` | `vector_length` `(` custom($vectorLength, type($vectorLength), $vectorLengthDeviceType) `)` - | `wait` `(` custom($waitOperands, - type($waitOperands), $waitOperandsDeviceType, $waitOperandsSegments) `)` + | `wait` `(` custom($waitOperands, + type($waitOperands), $waitOperandsDeviceType, + $waitOperandsSegments) `)` | `self` `(` $selfCond `)` | `if` `(` $ifCond `)` ) diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index f484eda3268db..e299b67b10a9c 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -874,7 +874,7 @@ static void printNumGangs(mlir::OpAsmPrinter &p, mlir::Operation *op, } } -static ParseResult parseWaitOperands( +static ParseResult parseDeviceTypeOperandsWithSegment( mlir::OpAsmParser &parser, llvm::SmallVectorImpl &operands, llvm::SmallVectorImpl &types, mlir::ArrayAttr &deviceTypes, @@ -918,11 +918,10 @@ static ParseResult parseWaitOperands( return success(); } -static void printWaitOperands(mlir::OpAsmPrinter &p, mlir::Operation *op, - mlir::OperandRange operands, - mlir::TypeRange types, - std::optional deviceTypes, - std::optional segments) { +static void printDeviceTypeOperandsWithSegment( + mlir::OpAsmPrinter &p, mlir::Operation *op, mlir::OperandRange operands, + mlir::TypeRange types, std::optional deviceTypes, + std::optional segments) { unsigned opIdx = 0; for (unsigned i = 0; i < deviceTypes->size(); ++i) { if (i != 0) From 8b2bdfbca7c1db272e4e703445f5626b4bc4b9d3 Mon Sep 17 00:00:00 2001 From: Alan Phipps Date: Mon, 1 Jan 2024 11:40:55 -0600 Subject: [PATCH 266/313] [Coverage][clang] Enable MC/DC Support in LLVM 
Source-based Code Coverage (3/3) Part 3 of 3. This includes the MC/DC clang front-end components. Differential Revision: https://reviews.llvm.org/D138849 --- clang/include/clang/Basic/CodeGenOptions.def | 1 + clang/include/clang/Driver/Options.td | 6 + clang/lib/CodeGen/CGClass.cpp | 6 +- clang/lib/CodeGen/CGExprScalar.cpp | 57 +++ clang/lib/CodeGen/CGStmt.cpp | 14 +- clang/lib/CodeGen/CodeGenFunction.cpp | 98 +++-- clang/lib/CodeGen/CodeGenFunction.h | 58 ++- clang/lib/CodeGen/CodeGenPGO.cpp | 263 +++++++++++- clang/lib/CodeGen/CodeGenPGO.h | 14 +- clang/lib/CodeGen/CoverageMappingGen.cpp | 405 ++++++++++++++++-- clang/lib/CodeGen/CoverageMappingGen.h | 12 +- clang/lib/Driver/ToolChains/Clang.cpp | 11 + .../CoverageMapping/branch-constfolded.cpp | 42 +- .../CoverageMapping/branch-mincounters.cpp | 1 + .../test/CoverageMapping/branch-templates.cpp | 1 + clang/test/CoverageMapping/if.cpp | 1 + clang/test/CoverageMapping/logical.cpp | 10 +- clang/test/CoverageMapping/mcdc-class.cpp | 31 ++ .../CoverageMapping/mcdc-error-conditions.cpp | 7 + .../test/CoverageMapping/mcdc-error-nests.cpp | 10 + .../mcdc-logical-scalar-ids.cpp | 110 +++++ .../mcdc-logical-stmt-ids-all.cpp | 131 ++++++ .../CoverageMapping/mcdc-logical-stmt-ids.cpp | 111 +++++ .../Profile/c-linkage-available_externally.c | 2 + clang/test/Profile/c-mcdc-class.cpp | 104 +++++ clang/test/Profile/c-mcdc-nested-ternary.c | 68 +++ clang/test/Profile/c-mcdc-not.c | 88 ++++ clang/test/Profile/c-mcdc.c | 102 +++++ .../ContinuousSyncMode/image-with-mcdc.c | 26 ++ 29 files changed, 1700 insertions(+), 90 deletions(-) create mode 100644 clang/test/CoverageMapping/mcdc-class.cpp create mode 100644 clang/test/CoverageMapping/mcdc-error-conditions.cpp create mode 100644 clang/test/CoverageMapping/mcdc-error-nests.cpp create mode 100644 clang/test/CoverageMapping/mcdc-logical-scalar-ids.cpp create mode 100644 clang/test/CoverageMapping/mcdc-logical-stmt-ids-all.cpp create mode 100644 clang/test/CoverageMapping/mcdc-logical-stmt-ids.cpp create mode 100644 clang/test/Profile/c-mcdc-class.cpp create mode 100644 clang/test/Profile/c-mcdc-nested-ternary.c create mode 100644 clang/test/Profile/c-mcdc-not.c create mode 100644 clang/test/Profile/c-mcdc.c create mode 100644 compiler-rt/test/profile/ContinuousSyncMode/image-with-mcdc.c diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index 0acb5ae134ea2..2c4fb6745bc17 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -209,6 +209,7 @@ CODEGENOPT(CoverageMapping , 1, 0) ///< Generate coverage mapping regions to ///< enable code coverage analysis. CODEGENOPT(DumpCoverageMapping , 1, 0) ///< Dump the generated coverage mapping ///< regions. +CODEGENOPT(MCDCCoverage , 1, 0) ///< Enable MC/DC code coverage criteria. /// If -fpcc-struct-return or -freg-struct-return is specified. 
ENUM_CODEGENOPT(StructReturnConvention, StructReturnConventionKind, 2, SRCK_Default) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 2b93ddf033499..6aff37f133687 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1695,6 +1695,12 @@ defm coverage_mapping : BoolFOption<"coverage-mapping", "Generate coverage mapping to enable code coverage analysis">, NegFlag, BothFlags< [], [ClangOption, CLOption]>>; +defm mcdc_coverage : BoolFOption<"coverage-mcdc", + CodeGenOpts<"MCDCCoverage">, DefaultFalse, + PosFlag, + NegFlag, + BothFlags<[], [ClangOption, CLOption]>>; def fprofile_generate : Flag<["-"], "fprofile-generate">, Group, Visibility<[ClangOption, CLOption]>, HelpText<"Generate instrumented code to collect execution counts into default.profraw (overridden by LLVM_PROFILE_FILE env var)">; diff --git a/clang/lib/CodeGen/CGClass.cpp b/clang/lib/CodeGen/CGClass.cpp index d18f186ce5b41..34319381901af 100644 --- a/clang/lib/CodeGen/CGClass.cpp +++ b/clang/lib/CodeGen/CGClass.cpp @@ -856,6 +856,7 @@ void CodeGenFunction::EmitConstructorBody(FunctionArgList &Args) { EnterCXXTryStmt(*cast(Body), true); incrementProfileCounter(Body); + maybeCreateMCDCCondBitmap(); RunCleanupsScope RunCleanups(*this); @@ -1444,8 +1445,10 @@ void CodeGenFunction::EmitDestructorBody(FunctionArgList &Args) { } Stmt *Body = Dtor->getBody(); - if (Body) + if (Body) { incrementProfileCounter(Body); + maybeCreateMCDCCondBitmap(); + } // The call to operator delete in a deleting destructor happens // outside of the function-try-block, which means it's always @@ -1548,6 +1551,7 @@ void CodeGenFunction::emitImplicitAssignmentOperatorBody(FunctionArgList &Args) LexicalScope Scope(*this, RootCS->getSourceRange()); incrementProfileCounter(RootCS); + maybeCreateMCDCCondBitmap(); AssignmentMemcpyizer AM(*this, AssignOp, Args); for (auto *I : RootCS->body()) AM.emitAssignment(I); diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index d2c4c7ee50bc8..9ec185153d12b 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -4564,6 +4564,12 @@ Value *ScalarExprEmitter::VisitBinLAnd(const BinaryOperator *E) { if (LHSCondVal) { // If we have 1 && X, just emit X. CGF.incrementProfileCounter(E); + // If the top of the logical operator nest, reset the MCDC temp to 0. + if (CGF.MCDCLogOpStack.empty()) + CGF.maybeResetMCDCCondBitmap(E); + + CGF.MCDCLogOpStack.push_back(E); + Value *RHSCond = CGF.EvaluateExprAsBool(E->getRHS()); // If we're generating for profiling or coverage, generate a branch to a @@ -4572,6 +4578,7 @@ Value *ScalarExprEmitter::VisitBinLAnd(const BinaryOperator *E) { // "FalseBlock" after the increment is done. if (InstrumentRegions && CodeGenFunction::isInstrumentedCondition(E->getRHS())) { + CGF.maybeUpdateMCDCCondBitmap(E->getRHS(), RHSCond); llvm::BasicBlock *FBlock = CGF.createBasicBlock("land.end"); llvm::BasicBlock *RHSBlockCnt = CGF.createBasicBlock("land.rhscnt"); Builder.CreateCondBr(RHSCond, RHSBlockCnt, FBlock); @@ -4581,6 +4588,11 @@ Value *ScalarExprEmitter::VisitBinLAnd(const BinaryOperator *E) { CGF.EmitBlock(FBlock); } + CGF.MCDCLogOpStack.pop_back(); + // If the top of the logical operator nest, update the MCDC bitmap. + if (CGF.MCDCLogOpStack.empty()) + CGF.maybeUpdateMCDCTestVectorBitmap(E); + // ZExt result to int or bool. 
return Builder.CreateZExtOrBitCast(RHSCond, ResTy, "land.ext"); } @@ -4590,6 +4602,12 @@ Value *ScalarExprEmitter::VisitBinLAnd(const BinaryOperator *E) { return llvm::Constant::getNullValue(ResTy); } + // If the top of the logical operator nest, reset the MCDC temp to 0. + if (CGF.MCDCLogOpStack.empty()) + CGF.maybeResetMCDCCondBitmap(E); + + CGF.MCDCLogOpStack.push_back(E); + llvm::BasicBlock *ContBlock = CGF.createBasicBlock("land.end"); llvm::BasicBlock *RHSBlock = CGF.createBasicBlock("land.rhs"); @@ -4622,6 +4640,7 @@ Value *ScalarExprEmitter::VisitBinLAnd(const BinaryOperator *E) { // condition coverage. if (InstrumentRegions && CodeGenFunction::isInstrumentedCondition(E->getRHS())) { + CGF.maybeUpdateMCDCCondBitmap(E->getRHS(), RHSCond); llvm::BasicBlock *RHSBlockCnt = CGF.createBasicBlock("land.rhscnt"); Builder.CreateCondBr(RHSCond, RHSBlockCnt, ContBlock); CGF.EmitBlock(RHSBlockCnt); @@ -4639,6 +4658,11 @@ Value *ScalarExprEmitter::VisitBinLAnd(const BinaryOperator *E) { // Insert an entry into the phi node for the edge with the value of RHSCond. PN->addIncoming(RHSCond, RHSBlock); + CGF.MCDCLogOpStack.pop_back(); + // If the top of the logical operator nest, update the MCDC bitmap. + if (CGF.MCDCLogOpStack.empty()) + CGF.maybeUpdateMCDCTestVectorBitmap(E); + // Artificial location to preserve the scope information { auto NL = ApplyDebugLocation::CreateArtificial(CGF); @@ -4680,6 +4704,12 @@ Value *ScalarExprEmitter::VisitBinLOr(const BinaryOperator *E) { if (!LHSCondVal) { // If we have 0 || X, just emit X. CGF.incrementProfileCounter(E); + // If the top of the logical operator nest, reset the MCDC temp to 0. + if (CGF.MCDCLogOpStack.empty()) + CGF.maybeResetMCDCCondBitmap(E); + + CGF.MCDCLogOpStack.push_back(E); + Value *RHSCond = CGF.EvaluateExprAsBool(E->getRHS()); // If we're generating for profiling or coverage, generate a branch to a @@ -4688,6 +4718,7 @@ Value *ScalarExprEmitter::VisitBinLOr(const BinaryOperator *E) { // "FalseBlock" after the increment is done. if (InstrumentRegions && CodeGenFunction::isInstrumentedCondition(E->getRHS())) { + CGF.maybeUpdateMCDCCondBitmap(E->getRHS(), RHSCond); llvm::BasicBlock *FBlock = CGF.createBasicBlock("lor.end"); llvm::BasicBlock *RHSBlockCnt = CGF.createBasicBlock("lor.rhscnt"); Builder.CreateCondBr(RHSCond, FBlock, RHSBlockCnt); @@ -4697,6 +4728,11 @@ Value *ScalarExprEmitter::VisitBinLOr(const BinaryOperator *E) { CGF.EmitBlock(FBlock); } + CGF.MCDCLogOpStack.pop_back(); + // If the top of the logical operator nest, update the MCDC bitmap. + if (CGF.MCDCLogOpStack.empty()) + CGF.maybeUpdateMCDCTestVectorBitmap(E); + // ZExt result to int or bool. return Builder.CreateZExtOrBitCast(RHSCond, ResTy, "lor.ext"); } @@ -4706,6 +4742,12 @@ Value *ScalarExprEmitter::VisitBinLOr(const BinaryOperator *E) { return llvm::ConstantInt::get(ResTy, 1); } + // If the top of the logical operator nest, reset the MCDC temp to 0. + if (CGF.MCDCLogOpStack.empty()) + CGF.maybeResetMCDCCondBitmap(E); + + CGF.MCDCLogOpStack.push_back(E); + llvm::BasicBlock *ContBlock = CGF.createBasicBlock("lor.end"); llvm::BasicBlock *RHSBlock = CGF.createBasicBlock("lor.rhs"); @@ -4742,6 +4784,7 @@ Value *ScalarExprEmitter::VisitBinLOr(const BinaryOperator *E) { // condition coverage. 
if (InstrumentRegions && CodeGenFunction::isInstrumentedCondition(E->getRHS())) { + CGF.maybeUpdateMCDCCondBitmap(E->getRHS(), RHSCond); llvm::BasicBlock *RHSBlockCnt = CGF.createBasicBlock("lor.rhscnt"); Builder.CreateCondBr(RHSCond, ContBlock, RHSBlockCnt); CGF.EmitBlock(RHSBlockCnt); @@ -4755,6 +4798,11 @@ Value *ScalarExprEmitter::VisitBinLOr(const BinaryOperator *E) { CGF.EmitBlock(ContBlock); PN->addIncoming(RHSCond, RHSBlock); + CGF.MCDCLogOpStack.pop_back(); + // If the top of the logical operator nest, update the MCDC bitmap. + if (CGF.MCDCLogOpStack.empty()) + CGF.maybeUpdateMCDCTestVectorBitmap(E); + // ZExt result to int. return Builder.CreateZExtOrBitCast(PN, ResTy, "lor.ext"); } @@ -4899,6 +4947,10 @@ VisitAbstractConditionalOperator(const AbstractConditionalOperator *E) { return Builder.CreateSelect(CondV, LHS, RHS, "cond"); } + // If the top of the logical operator nest, reset the MCDC temp to 0. + if (CGF.MCDCLogOpStack.empty()) + CGF.maybeResetMCDCCondBitmap(condExpr); + llvm::BasicBlock *LHSBlock = CGF.createBasicBlock("cond.true"); llvm::BasicBlock *RHSBlock = CGF.createBasicBlock("cond.false"); llvm::BasicBlock *ContBlock = CGF.createBasicBlock("cond.end"); @@ -4934,6 +4986,11 @@ VisitAbstractConditionalOperator(const AbstractConditionalOperator *E) { llvm::PHINode *PN = Builder.CreatePHI(LHS->getType(), 2, "cond"); PN->addIncoming(LHS, LHSBlock); PN->addIncoming(RHS, RHSBlock); + + // If the top of the logical operator nest, update the MCDC bitmap. + if (CGF.MCDCLogOpStack.empty()) + CGF.maybeUpdateMCDCTestVectorBitmap(condExpr); + return PN; } diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index 0f79a2e861d22..b89017de0bcf1 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -837,7 +837,19 @@ void CodeGenFunction::EmitIfStmt(const IfStmt &S) { if (!ThenCount && !getCurrentProfileCount() && CGM.getCodeGenOpts().OptimizationLevel) LH = Stmt::getLikelihood(S.getThen(), S.getElse()); - EmitBranchOnBoolExpr(S.getCond(), ThenBlock, ElseBlock, ThenCount, LH); + + // When measuring MC/DC, always fully evaluate the condition up front using + // EvaluateExprAsBool() so that the test vector bitmap can be updated prior to + // executing the body of the if.then or if.else. This is useful for when + // there is a 'return' within the body, but this is particularly beneficial + // when one if-stmt is nested within another if-stmt so that all of the MC/DC + // updates are kept linear and consistent. + if (!CGM.getCodeGenOpts().MCDCCoverage) + EmitBranchOnBoolExpr(S.getCond(), ThenBlock, ElseBlock, ThenCount, LH); + else { + llvm::Value *BoolCondVal = EvaluateExprAsBool(S.getCond()); + Builder.CreateCondBr(BoolCondVal, ThenBlock, ElseBlock); + } // Emit the 'then' code. 
EmitBlock(ThenBlock); diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index 2199d7b58fb96..2673e4a5cee7b 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -1256,6 +1256,7 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy, void CodeGenFunction::EmitFunctionBody(const Stmt *Body) { incrementProfileCounter(Body); + maybeCreateMCDCCondBitmap(); if (const CompoundStmt *S = dyn_cast(Body)) EmitCompoundStmtWithoutScope(*S); else @@ -1601,6 +1602,13 @@ bool CodeGenFunction::mightAddDeclToScope(const Stmt *S) { bool CodeGenFunction::ConstantFoldsToSimpleInteger(const Expr *Cond, bool &ResultBool, bool AllowLabels) { + // If MC/DC is enabled, disable folding so that we can instrument all + // conditions to yield complete test vectors. We still keep track of + // folded conditions during region mapping and visualization. + if (!AllowLabels && CGM.getCodeGenOpts().hasProfileClangInstr() && + CGM.getCodeGenOpts().MCDCCoverage) + return false; + llvm::APSInt ResultInt; if (!ConstantFoldsToSimpleInteger(Cond, ResultInt, AllowLabels)) return false; @@ -1629,16 +1637,20 @@ bool CodeGenFunction::ConstantFoldsToSimpleInteger(const Expr *Cond, return true; } +/// Strip parentheses and simplistic logical-NOT operators. +const Expr *CodeGenFunction::stripCond(const Expr *C) { + while (const UnaryOperator *Op = dyn_cast(C->IgnoreParens())) { + if (Op->getOpcode() != UO_LNot) + break; + C = Op->getSubExpr(); + } + return C->IgnoreParens(); +} + /// Determine whether the given condition is an instrumentable condition /// (i.e. no "&&" or "||"). bool CodeGenFunction::isInstrumentedCondition(const Expr *C) { - // Bypass simplistic logical-NOT operator before determining whether the - // condition contains any other logical operator. - if (const UnaryOperator *UnOp = dyn_cast(C->IgnoreParens())) - if (UnOp->getOpcode() == UO_LNot) - C = UnOp->getSubExpr(); - - const BinaryOperator *BOp = dyn_cast(C->IgnoreParens()); + const BinaryOperator *BOp = dyn_cast(stripCond(C)); return (!BOp || !BOp->isLogicalOp()); } @@ -1717,17 +1729,19 @@ void CodeGenFunction::EmitBranchToCounterBlock( /// statement) to the specified blocks. Based on the condition, this might try /// to simplify the codegen of the conditional based on the branch. /// \param LH The value of the likelihood attribute on the True branch. -void CodeGenFunction::EmitBranchOnBoolExpr(const Expr *Cond, - llvm::BasicBlock *TrueBlock, - llvm::BasicBlock *FalseBlock, - uint64_t TrueCount, - Stmt::Likelihood LH) { +/// \param ConditionalOp Used by MC/DC code coverage to track the result of the +/// ConditionalOperator (ternary) through a recursive call for the operator's +/// LHS and RHS nodes. +void CodeGenFunction::EmitBranchOnBoolExpr( + const Expr *Cond, llvm::BasicBlock *TrueBlock, llvm::BasicBlock *FalseBlock, + uint64_t TrueCount, Stmt::Likelihood LH, const Expr *ConditionalOp) { Cond = Cond->IgnoreParens(); if (const BinaryOperator *CondBOp = dyn_cast(Cond)) { - // Handle X && Y in a condition. if (CondBOp->getOpcode() == BO_LAnd) { + MCDCLogOpStack.push_back(CondBOp); + // If we have "1 && X", simplify the code. "0 && X" would have constant // folded if the case was simple enough. bool ConstantBool = false; @@ -1735,8 +1749,10 @@ void CodeGenFunction::EmitBranchOnBoolExpr(const Expr *Cond, ConstantBool) { // br(1 && X) -> br(X). 
incrementProfileCounter(CondBOp); - return EmitBranchToCounterBlock(CondBOp->getRHS(), BO_LAnd, TrueBlock, - FalseBlock, TrueCount, LH); + EmitBranchToCounterBlock(CondBOp->getRHS(), BO_LAnd, TrueBlock, + FalseBlock, TrueCount, LH); + MCDCLogOpStack.pop_back(); + return; } // If we have "X && 1", simplify the code to use an uncond branch. @@ -1744,8 +1760,10 @@ void CodeGenFunction::EmitBranchOnBoolExpr(const Expr *Cond, if (ConstantFoldsToSimpleInteger(CondBOp->getRHS(), ConstantBool) && ConstantBool) { // br(X && 1) -> br(X). - return EmitBranchToCounterBlock(CondBOp->getLHS(), BO_LAnd, TrueBlock, - FalseBlock, TrueCount, LH, CondBOp); + EmitBranchToCounterBlock(CondBOp->getLHS(), BO_LAnd, TrueBlock, + FalseBlock, TrueCount, LH, CondBOp); + MCDCLogOpStack.pop_back(); + return; } // Emit the LHS as a conditional. If the LHS conditional is false, we @@ -1774,11 +1792,13 @@ void CodeGenFunction::EmitBranchOnBoolExpr(const Expr *Cond, EmitBranchToCounterBlock(CondBOp->getRHS(), BO_LAnd, TrueBlock, FalseBlock, TrueCount, LH); eval.end(*this); - + MCDCLogOpStack.pop_back(); return; } if (CondBOp->getOpcode() == BO_LOr) { + MCDCLogOpStack.push_back(CondBOp); + // If we have "0 || X", simplify the code. "1 || X" would have constant // folded if the case was simple enough. bool ConstantBool = false; @@ -1786,8 +1806,10 @@ void CodeGenFunction::EmitBranchOnBoolExpr(const Expr *Cond, !ConstantBool) { // br(0 || X) -> br(X). incrementProfileCounter(CondBOp); - return EmitBranchToCounterBlock(CondBOp->getRHS(), BO_LOr, TrueBlock, - FalseBlock, TrueCount, LH); + EmitBranchToCounterBlock(CondBOp->getRHS(), BO_LOr, TrueBlock, + FalseBlock, TrueCount, LH); + MCDCLogOpStack.pop_back(); + return; } // If we have "X || 0", simplify the code to use an uncond branch. @@ -1795,10 +1817,11 @@ void CodeGenFunction::EmitBranchOnBoolExpr(const Expr *Cond, if (ConstantFoldsToSimpleInteger(CondBOp->getRHS(), ConstantBool) && !ConstantBool) { // br(X || 0) -> br(X). - return EmitBranchToCounterBlock(CondBOp->getLHS(), BO_LOr, TrueBlock, - FalseBlock, TrueCount, LH, CondBOp); + EmitBranchToCounterBlock(CondBOp->getLHS(), BO_LOr, TrueBlock, + FalseBlock, TrueCount, LH, CondBOp); + MCDCLogOpStack.pop_back(); + return; } - // Emit the LHS as a conditional. If the LHS conditional is true, we // want to jump to the TrueBlock. llvm::BasicBlock *LHSFalse = createBasicBlock("lor.lhs.false"); @@ -1829,14 +1852,20 @@ void CodeGenFunction::EmitBranchOnBoolExpr(const Expr *Cond, RHSCount, LH); eval.end(*this); - + MCDCLogOpStack.pop_back(); return; } } if (const UnaryOperator *CondUOp = dyn_cast(Cond)) { // br(!x, t, f) -> br(x, f, t) - if (CondUOp->getOpcode() == UO_LNot) { + // Avoid doing this optimization when instrumenting a condition for MC/DC. + // LNot is taken as part of the condition for simplicity, and changing its + // sense negatively impacts test vector tracking. + bool MCDCCondition = CGM.getCodeGenOpts().hasProfileClangInstr() && + CGM.getCodeGenOpts().MCDCCoverage && + isInstrumentedCondition(Cond); + if (CondUOp->getOpcode() == UO_LNot && !MCDCCondition) { // Negate the count. uint64_t FalseCount = getCurrentProfileCount() - TrueCount; // The values of the enum are chosen to make this negation possible. 
@@ -1876,14 +1905,14 @@ void CodeGenFunction::EmitBranchOnBoolExpr(const Expr *Cond, { ApplyDebugLocation DL(*this, Cond); EmitBranchOnBoolExpr(CondOp->getLHS(), TrueBlock, FalseBlock, - LHSScaledTrueCount, LH); + LHSScaledTrueCount, LH, CondOp); } cond.end(*this); cond.begin(*this); EmitBlock(RHSBlock); EmitBranchOnBoolExpr(CondOp->getRHS(), TrueBlock, FalseBlock, - TrueCount - LHSScaledTrueCount, LH); + TrueCount - LHSScaledTrueCount, LH, CondOp); cond.end(*this); return; @@ -1906,6 +1935,21 @@ void CodeGenFunction::EmitBranchOnBoolExpr(const Expr *Cond, CondV = EvaluateExprAsBool(Cond); } + // If not at the top of the logical operator nest, update MCDC temp with the + // boolean result of the evaluated condition. + if (!MCDCLogOpStack.empty()) { + const Expr *MCDCBaseExpr = Cond; + // When a nested ConditionalOperator (ternary) is encountered in a boolean + // expression, MC/DC tracks the result of the ternary, and this is tied to + // the ConditionalOperator expression and not the ternary's LHS or RHS. If + // this is the case, the ConditionalOperator expression is passed through + // the ConditionalOp parameter and then used as the MCDC base expression. + if (ConditionalOp) + MCDCBaseExpr = ConditionalOp; + + maybeUpdateMCDCCondBitmap(MCDCBaseExpr, CondV); + } + llvm::MDNode *Weights = nullptr; llvm::MDNode *Unpredictable = nullptr; diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 751d8110b13df..07c7678df87eb 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -287,6 +287,9 @@ class CodeGenFunction : public CodeGenTypeCache { /// nest would extend. SmallVector OMPLoopNestStack; + /// Stack to track the Logical Operator recursion nest for MC/DC. + SmallVector MCDCLogOpStack; + /// Number of nested loop to be consumed by the last surrounding /// loop-associated directive. int ExpectedOMPLoopDepth = 0; @@ -1521,6 +1524,9 @@ class CodeGenFunction : public CodeGenTypeCache { CodeGenPGO PGO; + /// Bitmap used by MC/DC to track condition outcomes of a boolean expression. + Address MCDCCondBitmapAddr = Address::invalid(); + /// Calculate branch weights appropriate for PGO data llvm::MDNode *createProfileWeights(uint64_t TrueCount, uint64_t FalseCount) const; @@ -1539,6 +1545,52 @@ class CodeGenFunction : public CodeGenTypeCache { PGO.setCurrentStmt(S); } + bool isMCDCCoverageEnabled() const { + return (CGM.getCodeGenOpts().hasProfileClangInstr() && + CGM.getCodeGenOpts().MCDCCoverage && + !CurFn->hasFnAttribute(llvm::Attribute::NoProfile)); + } + + /// Allocate a temp value on the stack that MCDC can use to track condition + /// results. + void maybeCreateMCDCCondBitmap() { + if (isMCDCCoverageEnabled()) { + PGO.emitMCDCParameters(Builder); + MCDCCondBitmapAddr = + CreateIRTemp(getContext().UnsignedIntTy, "mcdc.addr"); + } + } + + bool isBinaryLogicalOp(const Expr *E) const { + const BinaryOperator *BOp = dyn_cast(E->IgnoreParens()); + return (BOp && BOp->isLogicalOp()); + } + + /// Zero-init the MCDC temp value. + void maybeResetMCDCCondBitmap(const Expr *E) { + if (isMCDCCoverageEnabled() && isBinaryLogicalOp(E)) { + PGO.emitMCDCCondBitmapReset(Builder, E, MCDCCondBitmapAddr); + PGO.setCurrentStmt(E); + } + } + + /// Increment the profiler's counter for the given expression by \p StepV. + /// If \p StepV is null, the default increment is 1. 
+ void maybeUpdateMCDCTestVectorBitmap(const Expr *E) { + if (isMCDCCoverageEnabled() && isBinaryLogicalOp(E)) { + PGO.emitMCDCTestVectorBitmapUpdate(Builder, E, MCDCCondBitmapAddr); + PGO.setCurrentStmt(E); + } + } + + /// Update the MCDC temp value with the condition's evaluated result. + void maybeUpdateMCDCCondBitmap(const Expr *E, llvm::Value *Val) { + if (isMCDCCoverageEnabled()) { + PGO.emitMCDCCondBitmapUpdate(Builder, E, MCDCCondBitmapAddr, Val); + PGO.setCurrentStmt(E); + } + } + /// Get the profiler's count for the given statement. uint64_t getProfileCount(const Stmt *S) { return PGO.getStmtCount(S).value_or(0); @@ -4626,6 +4678,9 @@ class CodeGenFunction : public CodeGenTypeCache { bool ConstantFoldsToSimpleInteger(const Expr *Cond, llvm::APSInt &Result, bool AllowLabels = false); + /// Ignore parentheses and logical-NOT to track conditions consistently. + static const Expr *stripCond(const Expr *C); + /// isInstrumentedCondition - Determine whether the given condition is an /// instrumentable condition (i.e. no "&&" or "||"). static bool isInstrumentedCondition(const Expr *C); @@ -4648,7 +4703,8 @@ class CodeGenFunction : public CodeGenTypeCache { /// evaluate to true based on PGO data. void EmitBranchOnBoolExpr(const Expr *Cond, llvm::BasicBlock *TrueBlock, llvm::BasicBlock *FalseBlock, uint64_t TrueCount, - Stmt::Likelihood LH = Stmt::LH_None); + Stmt::Likelihood LH = Stmt::LH_None, + const Expr *ConditionalOp = nullptr); /// Given an assignment `*LHS = RHS`, emit a test that checks if \p RHS is /// nonnull, if \p LHS is marked _Nonnull. diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp index 81bf8ea696b16..d68844d476eb4 100644 --- a/clang/lib/CodeGen/CodeGenPGO.cpp +++ b/clang/lib/CodeGen/CodeGenPGO.cpp @@ -161,13 +161,24 @@ struct MapRegionCounters : public RecursiveASTVisitor { PGOHash Hash; /// The map of statements to counters. llvm::DenseMap &CounterMap; + /// The next bitmap byte index to assign. + unsigned NextMCDCBitmapIdx; + /// The map of statements to MC/DC bitmap coverage objects. + llvm::DenseMap &MCDCBitmapMap; + /// Maximum number of supported MC/DC conditions in a boolean expression. + unsigned MCDCMaxCond; /// The profile version. uint64_t ProfileVersion; + /// Diagnostics Engine used to report warnings. + DiagnosticsEngine &Diag; MapRegionCounters(PGOHashVersion HashVersion, uint64_t ProfileVersion, - llvm::DenseMap &CounterMap) + llvm::DenseMap &CounterMap, + llvm::DenseMap &MCDCBitmapMap, + unsigned MCDCMaxCond, DiagnosticsEngine &Diag) : NextCounter(0), Hash(HashVersion), CounterMap(CounterMap), - ProfileVersion(ProfileVersion) {} + NextMCDCBitmapIdx(0), MCDCBitmapMap(MCDCBitmapMap), + MCDCMaxCond(MCDCMaxCond), ProfileVersion(ProfileVersion), Diag(Diag) {} // Blocks and lambdas are handled as separate functions, so we need not // traverse them in the parent context. @@ -207,15 +218,126 @@ struct MapRegionCounters : public RecursiveASTVisitor { return Type; } + /// The following stacks are used with dataTraverseStmtPre() and + /// dataTraverseStmtPost() to track the depth of nested logical operators in a + /// boolean expression in a function. The ultimate purpose is to keep track + /// of the number of leaf-level conditions in the boolean expression so that a + /// profile bitmap can be allocated based on that number. + /// + /// The stacks are also used to find error cases and notify the user. 
A + /// standard logical operator nest for a boolean expression could be in a form + /// similar to this: "x = a && b && c && (d || f)" + unsigned NumCond = 0; + bool SplitNestedLogicalOp = false; + SmallVector NonLogOpStack; + SmallVector LogOpStack; + + // Hook: dataTraverseStmtPre() is invoked prior to visiting an AST Stmt node. + bool dataTraverseStmtPre(Stmt *S) { + /// If MC/DC is not enabled, MCDCMaxCond will be set to 0. Do nothing. + if (MCDCMaxCond == 0) + return true; + + /// At the top of the logical operator nest, reset the number of conditions. + if (LogOpStack.empty()) + NumCond = 0; + + if (const Expr *E = dyn_cast(S)) { + const BinaryOperator *BinOp = dyn_cast(E->IgnoreParens()); + if (BinOp && BinOp->isLogicalOp()) { + /// Check for "split-nested" logical operators. This happens when a new + /// boolean expression logical-op nest is encountered within an existing + /// boolean expression, separated by a non-logical operator. For + /// example, in "x = (a && b && c && foo(d && f))", the "d && f" case + /// starts a new boolean expression that is separated from the other + /// conditions by the operator foo(). Split-nested cases are not + /// supported by MC/DC. + SplitNestedLogicalOp = SplitNestedLogicalOp || !NonLogOpStack.empty(); + + LogOpStack.push_back(BinOp); + return true; + } + } + + /// Keep track of non-logical operators. These are OK as long as we don't + /// encounter a new logical operator after seeing one. + if (!LogOpStack.empty()) + NonLogOpStack.push_back(S); + + return true; + } + + // Hook: dataTraverseStmtPost() is invoked by the AST visitor after visiting + // an AST Stmt node. MC/DC will use it to to signal when the top of a + // logical operation (boolean expression) nest is encountered. + bool dataTraverseStmtPost(Stmt *S) { + /// If MC/DC is not enabled, MCDCMaxCond will be set to 0. Do nothing. + if (MCDCMaxCond == 0) + return true; + + if (const Expr *E = dyn_cast(S)) { + const BinaryOperator *BinOp = dyn_cast(E->IgnoreParens()); + if (BinOp && BinOp->isLogicalOp()) { + assert(LogOpStack.back() == BinOp); + LogOpStack.pop_back(); + + /// At the top of logical operator nest: + if (LogOpStack.empty()) { + /// Was the "split-nested" logical operator case encountered? + if (SplitNestedLogicalOp) { + unsigned DiagID = Diag.getCustomDiagID( + DiagnosticsEngine::Warning, + "unsupported MC/DC boolean expression; " + "contains an operation with a nested boolean expression. " + "Expression will not be covered"); + Diag.Report(S->getBeginLoc(), DiagID); + return false; + } + + /// Was the maximum number of conditions encountered? + if (NumCond > MCDCMaxCond) { + unsigned DiagID = Diag.getCustomDiagID( + DiagnosticsEngine::Warning, + "unsupported MC/DC boolean expression; " + "number of conditions (%0) exceeds max (%1). " + "Expression will not be covered"); + Diag.Report(S->getBeginLoc(), DiagID) << NumCond << MCDCMaxCond; + return false; + } + + // Otherwise, allocate the number of bytes required for the bitmap + // based on the number of conditions. Must be at least 1-byte long. + MCDCBitmapMap[BinOp] = NextMCDCBitmapIdx; + unsigned SizeInBits = std::max(1L << NumCond, CHAR_BIT); + NextMCDCBitmapIdx += SizeInBits / CHAR_BIT; + } + return true; + } + } + + if (!LogOpStack.empty()) + NonLogOpStack.pop_back(); + + return true; + } + /// The RHS of all logical operators gets a fresh counter in order to count /// how many times the RHS evaluates to true or false, depending on the /// semantics of the operator. 
This is only valid for ">= v7" of the profile - /// version so that we facilitate backward compatibility. + /// version so that we facilitate backward compatibility. In addition, in + /// order to use MC/DC, count the number of total LHS and RHS conditions. bool VisitBinaryOperator(BinaryOperator *S) { - if (ProfileVersion >= llvm::IndexedInstrProf::Version7) - if (S->isLogicalOp() && - CodeGenFunction::isInstrumentedCondition(S->getRHS())) - CounterMap[S->getRHS()] = NextCounter++; + if (S->isLogicalOp()) { + if (CodeGenFunction::isInstrumentedCondition(S->getLHS())) + NumCond++; + + if (CodeGenFunction::isInstrumentedCondition(S->getRHS())) { + if (ProfileVersion >= llvm::IndexedInstrProf::Version7) + CounterMap[S->getRHS()] = NextCounter++; + + NumCond++; + } + } return Base::VisitBinaryOperator(S); } @@ -851,8 +973,22 @@ void CodeGenPGO::mapRegionCounters(const Decl *D) { ProfileVersion = PGOReader->getVersion(); } + // If MC/DC is enabled, set the MaxConditions to a preset value. Otherwise, + // set it to zero. This value impacts the number of conditions accepted in a + // given boolean expression, which impacts the size of the bitmap used to + // track test vector execution for that boolean expression. Because the + // bitmap scales exponentially (2^n) based on the number of conditions seen, + // the maximum value is hard-coded at 6 conditions, which is more than enough + // for most embedded applications. Setting a maximum value prevents the + // bitmap footprint from growing too large without the user's knowledge. In + // the future, this value could be adjusted with a command-line option. + unsigned MCDCMaxConditions = (CGM.getCodeGenOpts().MCDCCoverage) ? 6 : 0; + RegionCounterMap.reset(new llvm::DenseMap); - MapRegionCounters Walker(HashVersion, ProfileVersion, *RegionCounterMap); + RegionMCDCBitmapMap.reset(new llvm::DenseMap); + MapRegionCounters Walker(HashVersion, ProfileVersion, *RegionCounterMap, + *RegionMCDCBitmapMap, MCDCMaxConditions, + CGM.getDiags()); if (const FunctionDecl *FD = dyn_cast_or_null(D)) Walker.TraverseDecl(const_cast(FD)); else if (const ObjCMethodDecl *MD = dyn_cast_or_null(D)) @@ -863,6 +999,7 @@ void CodeGenPGO::mapRegionCounters(const Decl *D) { Walker.TraverseDecl(const_cast(CD)); assert(Walker.NextCounter > 0 && "no entry counter mapped for decl"); NumRegionCounters = Walker.NextCounter; + MCDCBitmapBytes = Walker.NextMCDCBitmapIdx; FunctionHash = Walker.Hash.finalize(); } @@ -894,9 +1031,11 @@ void CodeGenPGO::emitCounterRegionMapping(const Decl *D) { std::string CoverageMapping; llvm::raw_string_ostream OS(CoverageMapping); - CoverageMappingGen MappingGen(*CGM.getCoverageMapping(), - CGM.getContext().getSourceManager(), - CGM.getLangOpts(), RegionCounterMap.get()); + RegionCondIDMap.reset(new llvm::DenseMap); + CoverageMappingGen MappingGen( + *CGM.getCoverageMapping(), CGM.getContext().getSourceManager(), + CGM.getLangOpts(), RegionCounterMap.get(), RegionMCDCBitmapMap.get(), + RegionCondIDMap.get()); MappingGen.emitCounterMapping(D, OS); OS.flush(); @@ -972,6 +1111,108 @@ void CodeGenPGO::emitCounterIncrement(CGBuilderTy &Builder, const Stmt *S, ArrayRef(Args)); } +bool CodeGenPGO::canEmitMCDCCoverage(const CGBuilderTy &Builder) { + return (CGM.getCodeGenOpts().hasProfileClangInstr() && + CGM.getCodeGenOpts().MCDCCoverage && Builder.GetInsertBlock()); +} + +void CodeGenPGO::emitMCDCParameters(CGBuilderTy &Builder) { + if (!canEmitMCDCCoverage(Builder) || !RegionMCDCBitmapMap) + return; + + auto *I8PtrTy = 
llvm::PointerType::getUnqual(CGM.getLLVMContext()); + + // Emit intrinsic representing MCDC bitmap parameters at function entry. + // This is used by the instrumentation pass, but it isn't actually lowered to + // anything. + llvm::Value *Args[3] = {llvm::ConstantExpr::getBitCast(FuncNameVar, I8PtrTy), + Builder.getInt64(FunctionHash), + Builder.getInt32(MCDCBitmapBytes)}; + Builder.CreateCall( + CGM.getIntrinsic(llvm::Intrinsic::instrprof_mcdc_parameters), Args); +} + +void CodeGenPGO::emitMCDCTestVectorBitmapUpdate(CGBuilderTy &Builder, + const Expr *S, + Address MCDCCondBitmapAddr) { + if (!canEmitMCDCCoverage(Builder) || !RegionMCDCBitmapMap) + return; + + S = S->IgnoreParens(); + + auto ExprMCDCBitmapMapIterator = RegionMCDCBitmapMap->find(S); + if (ExprMCDCBitmapMapIterator == RegionMCDCBitmapMap->end()) + return; + + // Extract the ID of the global bitmap associated with this expression. + unsigned MCDCTestVectorBitmapID = ExprMCDCBitmapMapIterator->second; + auto *I8PtrTy = llvm::PointerType::getUnqual(CGM.getLLVMContext()); + + // Emit intrinsic responsible for updating the global bitmap corresponding to + // a boolean expression. The index being set is based on the value loaded + // from a pointer to a dedicated temporary value on the stack that is itself + // updated via emitMCDCCondBitmapReset() and emitMCDCCondBitmapUpdate(). The + // index represents an executed test vector. + llvm::Value *Args[5] = {llvm::ConstantExpr::getBitCast(FuncNameVar, I8PtrTy), + Builder.getInt64(FunctionHash), + Builder.getInt32(MCDCBitmapBytes), + Builder.getInt32(MCDCTestVectorBitmapID), + MCDCCondBitmapAddr.getPointer()}; + Builder.CreateCall( + CGM.getIntrinsic(llvm::Intrinsic::instrprof_mcdc_tvbitmap_update), Args); +} + +void CodeGenPGO::emitMCDCCondBitmapReset(CGBuilderTy &Builder, const Expr *S, + Address MCDCCondBitmapAddr) { + if (!canEmitMCDCCoverage(Builder) || !RegionMCDCBitmapMap) + return; + + S = S->IgnoreParens(); + + if (RegionMCDCBitmapMap->find(S) == RegionMCDCBitmapMap->end()) + return; + + // Emit intrinsic that resets a dedicated temporary value on the stack to 0. + Builder.CreateStore(Builder.getInt32(0), MCDCCondBitmapAddr); +} + +void CodeGenPGO::emitMCDCCondBitmapUpdate(CGBuilderTy &Builder, const Expr *S, + Address MCDCCondBitmapAddr, + llvm::Value *Val) { + if (!canEmitMCDCCoverage(Builder) || !RegionCondIDMap) + return; + + // Even though, for simplicity, parentheses and unary logical-NOT operators + // are considered part of their underlying condition for both MC/DC and + // branch coverage, the condition IDs themselves are assigned and tracked + // using the underlying condition itself. This is done solely for + // consistency since parentheses and logical-NOTs are ignored when checking + // whether the condition is actually an instrumentable condition. This can + // also make debugging a bit easier. + S = CodeGenFunction::stripCond(S); + + auto ExprMCDCConditionIDMapIterator = RegionCondIDMap->find(S); + if (ExprMCDCConditionIDMapIterator == RegionCondIDMap->end()) + return; + + // Extract the ID of the condition we are setting in the bitmap. + unsigned CondID = ExprMCDCConditionIDMapIterator->second; + assert(CondID > 0 && "Condition has no ID!"); + + auto *I8PtrTy = llvm::PointerType::getUnqual(CGM.getLLVMContext()); + + // Emit intrinsic that updates a dedicated temporary value on the stack after + // a condition is evaluated. After the set of conditions has been updated, + // the resulting value is used to update the boolean expression's bitmap. 
+ llvm::Value *Args[5] = {llvm::ConstantExpr::getBitCast(FuncNameVar, I8PtrTy), + Builder.getInt64(FunctionHash), + Builder.getInt32(CondID - 1), + MCDCCondBitmapAddr.getPointer(), Val}; + Builder.CreateCall( + CGM.getIntrinsic(llvm::Intrinsic::instrprof_mcdc_condbitmap_update), + Args); +} + void CodeGenPGO::setValueProfilingFlag(llvm::Module &M) { if (CGM.getCodeGenOpts().hasProfileClangInstr()) M.addModuleFlag(llvm::Module::Warning, "EnableValueProfiling", diff --git a/clang/lib/CodeGen/CodeGenPGO.h b/clang/lib/CodeGen/CodeGenPGO.h index 392ec5a144fee..6596b6c352776 100644 --- a/clang/lib/CodeGen/CodeGenPGO.h +++ b/clang/lib/CodeGen/CodeGenPGO.h @@ -33,8 +33,11 @@ class CodeGenPGO { std::array NumValueSites; unsigned NumRegionCounters; + unsigned MCDCBitmapBytes; uint64_t FunctionHash; std::unique_ptr> RegionCounterMap; + std::unique_ptr> RegionMCDCBitmapMap; + std::unique_ptr> RegionCondIDMap; std::unique_ptr> StmtCountMap; std::unique_ptr ProfRecord; std::vector RegionCounts; @@ -43,7 +46,8 @@ class CodeGenPGO { public: CodeGenPGO(CodeGenModule &CGModule) : CGM(CGModule), FuncNameVar(nullptr), NumValueSites({{0}}), - NumRegionCounters(0), FunctionHash(0), CurrentRegionCount(0) {} + NumRegionCounters(0), MCDCBitmapBytes(0), FunctionHash(0), + CurrentRegionCount(0) {} /// Whether or not we have PGO region data for the current function. This is /// false both when we have no data at all and when our data has been @@ -103,10 +107,18 @@ class CodeGenPGO { bool IsInMainFile); bool skipRegionMappingForDecl(const Decl *D); void emitCounterRegionMapping(const Decl *D); + bool canEmitMCDCCoverage(const CGBuilderTy &Builder); public: void emitCounterIncrement(CGBuilderTy &Builder, const Stmt *S, llvm::Value *StepV); + void emitMCDCTestVectorBitmapUpdate(CGBuilderTy &Builder, const Expr *S, + Address MCDCCondBitmapAddr); + void emitMCDCParameters(CGBuilderTy &Builder); + void emitMCDCCondBitmapReset(CGBuilderTy &Builder, const Expr *S, + Address MCDCCondBitmapAddr); + void emitMCDCCondBitmapUpdate(CGBuilderTy &Builder, const Expr *S, + Address MCDCCondBitmapAddr, llvm::Value *Val); /// Return the region count for the counter at the given index. uint64_t getRegionCount(const Stmt *S) { diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp index 56411e2240e50..bf227386a71b7 100644 --- a/clang/lib/CodeGen/CoverageMappingGen.cpp +++ b/clang/lib/CodeGen/CoverageMappingGen.cpp @@ -95,6 +95,8 @@ void CoverageSourceInfo::updateNextTokLoc(SourceLocation Loc) { } namespace { +using MCDCConditionID = CounterMappingRegion::MCDCConditionID; +using MCDCParameters = CounterMappingRegion::MCDCParameters; /// A region of source code that can be mapped to a counter. class SourceMappingRegion { @@ -104,6 +106,9 @@ class SourceMappingRegion { /// Secondary Counter used for Branch Regions for "False" branches. std::optional FalseCount; + /// Parameters used for Modified Condition/Decision Coverage + MCDCParameters MCDCParams; + /// The region's starting location. 
std::optional LocStart; @@ -122,11 +127,18 @@ class SourceMappingRegion { } SourceMappingRegion(Counter Count, std::optional FalseCount, + MCDCParameters MCDCParams, std::optional LocStart, std::optional LocEnd, bool GapRegion = false) - : Count(Count), FalseCount(FalseCount), LocStart(LocStart), - LocEnd(LocEnd), GapRegion(GapRegion) {} + : Count(Count), FalseCount(FalseCount), MCDCParams(MCDCParams), + LocStart(LocStart), LocEnd(LocEnd), GapRegion(GapRegion) {} + + SourceMappingRegion(MCDCParameters MCDCParams, + std::optional LocStart, + std::optional LocEnd) + : MCDCParams(MCDCParams), LocStart(LocStart), LocEnd(LocEnd), + GapRegion(false) {} const Counter &getCounter() const { return Count; } @@ -163,6 +175,10 @@ class SourceMappingRegion { void setGap(bool Gap) { GapRegion = Gap; } bool isBranch() const { return FalseCount.has_value(); } + + bool isMCDCDecision() const { return MCDCParams.NumConditions != 0; } + + const MCDCParameters &getMCDCParams() const { return MCDCParams; } }; /// Spelling locations for the start and end of a source region. @@ -454,8 +470,13 @@ class CoverageMappingBuilder { SR.LineEnd, SR.ColumnEnd)); } else if (Region.isBranch()) { MappingRegions.push_back(CounterMappingRegion::makeBranchRegion( - Region.getCounter(), Region.getFalseCounter(), *CovFileID, - SR.LineStart, SR.ColumnStart, SR.LineEnd, SR.ColumnEnd)); + Region.getCounter(), Region.getFalseCounter(), + Region.getMCDCParams(), *CovFileID, SR.LineStart, SR.ColumnStart, + SR.LineEnd, SR.ColumnEnd)); + } else if (Region.isMCDCDecision()) { + MappingRegions.push_back(CounterMappingRegion::makeDecisionRegion( + Region.getMCDCParams(), *CovFileID, SR.LineStart, SR.ColumnStart, + SR.LineEnd, SR.ColumnEnd)); } else { MappingRegions.push_back(CounterMappingRegion::makeRegion( Region.getCounter(), *CovFileID, SR.LineStart, SR.ColumnStart, @@ -542,6 +563,239 @@ struct EmptyCoverageMappingBuilder : public CoverageMappingBuilder { } }; +/// A wrapper object for maintaining stacks to track the resursive AST visitor +/// walks for the purpose of assigning IDs to leaf-level conditions measured by +/// MC/DC. The object is created with a reference to the MCDCBitmapMap that was +/// created during the initial AST walk. The presence of a bitmap associated +/// with a boolean expression (top-level logical operator nest) indicates that +/// the boolean expression qualified for MC/DC. The resulting condition IDs +/// are preserved in a map reference that is also provided during object +/// creation. +struct MCDCCoverageBuilder { + + /// The AST walk recursively visits nested logical-AND or logical-OR binary + /// operator nodes and then visits their LHS and RHS children nodes. As this + /// happens, the algorithm will assign IDs to each operator's LHS and RHS side + /// as the walk moves deeper into the nest. At each level of the recursive + /// nest, the LHS and RHS may actually correspond to larger subtrees (not + /// leaf-conditions). If this is the case, when that node is visited, the ID + /// assigned to the subtree is re-assigned to its LHS, and a new ID is given + /// to its RHS. At the end of the walk, all leaf-level conditions will have a + /// unique ID -- keep in mind that the final set of IDs may not be in + /// numerical order from left to right. 
+ /// + /// Example: "x = (A && B) || (C && D) || (D && F)" + /// + /// Visit Depth1: + /// (A && B) || (C && D) || (D && F) + /// ^-------LHS--------^ ^-RHS--^ + /// ID=1 ID=2 + /// + /// Visit LHS-Depth2: + /// (A && B) || (C && D) + /// ^-LHS--^ ^-RHS--^ + /// ID=1 ID=3 + /// + /// Visit LHS-Depth3: + /// (A && B) + /// LHS RHS + /// ID=1 ID=4 + /// + /// Visit RHS-Depth3: + /// (C && D) + /// LHS RHS + /// ID=3 ID=5 + /// + /// Visit RHS-Depth2: (D && F) + /// LHS RHS + /// ID=2 ID=6 + /// + /// Visit Depth1: + /// (A && B) || (C && D) || (D && F) + /// ID=1 ID=4 ID=3 ID=5 ID=2 ID=6 + /// + /// A node ID of '0' always means MC/DC isn't being tracked. + /// + /// As the AST walk proceeds recursively, the algorithm will also use stacks + /// to track the IDs of logical-AND and logical-OR operations on the RHS so + /// that it can be determined which nodes are executed next, depending on how + /// a LHS or RHS of a logical-AND or logical-OR is evaluated. This + /// information relies on the assigned IDs and are embedded within the + /// coverage region IDs of each branch region associated with a leaf-level + /// condition. This information helps the visualization tool reconstruct all + /// possible test vectors for the purposes of MC/DC analysis. if a "next" node + /// ID is '0', it means it's the end of the test vector. The following rules + /// are used: + /// + /// For logical-AND ("LHS && RHS"): + /// - If LHS is TRUE, execution goes to the RHS node. + /// - If LHS is FALSE, execution goes to the LHS node of the next logical-OR. + /// If that does not exist, execution exits (ID == 0). + /// + /// - If RHS is TRUE, execution goes to LHS node of the next logical-AND. + /// If that does not exist, execution exits (ID == 0). + /// - If RHS is FALSE, execution goes to the LHS node of the next logical-OR. + /// If that does not exist, execution exits (ID == 0). + /// + /// For logical-OR ("LHS || RHS"): + /// - If LHS is TRUE, execution goes to the LHS node of the next logical-AND. + /// If that does not exist, execution exits (ID == 0). + /// - If LHS is FALSE, execution goes to the RHS node. + /// + /// - If RHS is TRUE, execution goes to LHS node of the next logical-AND. + /// If that does not exist, execution exits (ID == 0). + /// - If RHS is FALSE, execution goes to the LHS node of the next logical-OR. + /// If that does not exist, execution exits (ID == 0). + /// + /// Finally, the condition IDs are also used when instrumenting the code to + /// indicate a unique offset into a temporary bitmap that represents the true + /// or false evaluation of that particular condition. + /// + /// NOTE regarding the use of CodeGenFunction::stripCond(). Even though, for + /// simplicity, parentheses and unary logical-NOT operators are considered + /// part of their underlying condition for both MC/DC and branch coverage, the + /// condition IDs themselves are assigned and tracked using the underlying + /// condition itself. This is done solely for consistency since parentheses + /// and logical-NOTs are ignored when checking whether the condition is + /// actually an instrumentable condition. This can also make debugging a bit + /// easier. + +private: + CodeGenModule &CGM; + + llvm::SmallVector AndRHS; + llvm::SmallVector OrRHS; + llvm::SmallVector NestLevel; + llvm::DenseMap &CondIDs; + llvm::DenseMap &MCDCBitmapMap; + MCDCConditionID NextID = 1; + bool NotMapped = false; + + /// Is this a logical-AND operation? 
+ bool isLAnd(const BinaryOperator *E) const { + return E->getOpcode() == BO_LAnd; + } + + /// Push an ID onto the corresponding RHS stack. + void pushRHS(const BinaryOperator *E) { + llvm::SmallVector &rhs = isLAnd(E) ? AndRHS : OrRHS; + rhs.push_back(CondIDs[CodeGenFunction::stripCond(E->getRHS())]); + } + + /// Pop an ID from the corresponding RHS stack. + void popRHS(const BinaryOperator *E) { + llvm::SmallVector &rhs = isLAnd(E) ? AndRHS : OrRHS; + if (!rhs.empty()) + rhs.pop_back(); + } + + /// If the expected ID is on top, pop it off the corresponding RHS stack. + void popRHSifTop(const BinaryOperator *E) { + if (!OrRHS.empty() && CondIDs[E] == OrRHS.back()) + OrRHS.pop_back(); + else if (!AndRHS.empty() && CondIDs[E] == AndRHS.back()) + AndRHS.pop_back(); + } + +public: + MCDCCoverageBuilder(CodeGenModule &CGM, + llvm::DenseMap &CondIDMap, + llvm::DenseMap &MCDCBitmapMap) + : CGM(CGM), CondIDs(CondIDMap), MCDCBitmapMap(MCDCBitmapMap) {} + + /// Return the ID of the RHS of the next, upper nest-level logical-OR. + MCDCConditionID getNextLOrCondID() const { + return OrRHS.empty() ? 0 : OrRHS.back(); + } + + /// Return the ID of the RHS of the next, upper nest-level logical-AND. + MCDCConditionID getNextLAndCondID() const { + return AndRHS.empty() ? 0 : AndRHS.back(); + } + + /// Return the ID of a given condition. + MCDCConditionID getCondID(const Expr *Cond) const { + auto I = CondIDs.find(CodeGenFunction::stripCond(Cond)); + if (I == CondIDs.end()) + return 0; + else + return I->second; + } + + /// Push the binary operator statement to track the nest level and assign IDs + /// to the operator's LHS and RHS. The RHS may be a larger subtree that is + /// broken up on successive levels. + void pushAndAssignIDs(const BinaryOperator *E) { + if (!CGM.getCodeGenOpts().MCDCCoverage) + return; + + // If binary expression is disqualified, don't do mapping. + if (NestLevel.empty() && MCDCBitmapMap.find(CodeGenFunction::stripCond( + E)) == MCDCBitmapMap.end()) + NotMapped = true; + + // Push Stmt on 'NestLevel' stack to keep track of nest location. + NestLevel.push_back(E); + + // Don't go any further if we don't need to map condition IDs. + if (NotMapped) + return; + + // If the operator itself has an assigned ID, this means it represents a + // larger subtree. In this case, pop its ID out of the RHS stack and + // assign that ID to its LHS node. Its RHS will receive a new ID. + if (CondIDs.find(CodeGenFunction::stripCond(E)) != CondIDs.end()) { + // If Stmt has an ID, assign its ID to LHS + CondIDs[CodeGenFunction::stripCond(E->getLHS())] = CondIDs[E]; + + // Since the operator's LHS assumes the operator's same ID, pop the + // operator from the RHS stack so that if LHS short-circuits, it won't be + // incorrectly re-used as the node executed next. + popRHSifTop(E); + } else { + // Otherwise, assign ID+1 to LHS. + CondIDs[CodeGenFunction::stripCond(E->getLHS())] = NextID++; + } + + // Assign ID+1 to RHS. + CondIDs[CodeGenFunction::stripCond(E->getRHS())] = NextID++; + + // Push ID of Stmt's RHS so that LHS nodes know about it + pushRHS(E); + } + + /// Pop the binary operator from the next level. If the walk is at the top of + /// the next, assign the total number of conditions. + unsigned popAndReturnCondCount(const BinaryOperator *E) { + if (!CGM.getCodeGenOpts().MCDCCoverage) + return 0; + + unsigned TotalConds = 0; + + // Pop Stmt from 'NestLevel' stack. + assert(NestLevel.back() == E); + NestLevel.pop_back(); + + // Reset state if not doing mapping. 
+ if (NestLevel.empty() && NotMapped) { + NotMapped = false; + return 0; + } + + // Pop RHS ID. + popRHS(E); + + // If at the parent (NestLevel=0), set conds and reset. + if (NestLevel.empty()) { + TotalConds = NextID - 1; + + // Reset ID back to beginning. + NextID = 1; + } + return TotalConds; + } +}; + /// A StmtVisitor that creates coverage mapping regions which map /// from the source code locations to the PGO counters. struct CounterCoverageMappingBuilder @@ -550,8 +804,14 @@ struct CounterCoverageMappingBuilder /// The map of statements to count values. llvm::DenseMap &CounterMap; + /// The map of statements to bitmap coverage object values. + llvm::DenseMap &MCDCBitmapMap; + /// A stack of currently live regions. - std::vector RegionStack; + llvm::SmallVector RegionStack; + + /// An object to manage MCDC regions. + MCDCCoverageBuilder MCDCBuilder; CounterExpressionBuilder Builder; @@ -589,6 +849,8 @@ struct CounterCoverageMappingBuilder return Counter::getCounter(CounterMap[S]); } + unsigned getRegionBitmap(const Stmt *S) { return MCDCBitmapMap[S]; } + /// Push a region onto the stack. /// /// Returns the index on the stack where the region was pushed. This can be @@ -596,7 +858,9 @@ struct CounterCoverageMappingBuilder size_t pushRegion(Counter Count, std::optional StartLoc = std::nullopt, std::optional EndLoc = std::nullopt, - std::optional FalseCount = std::nullopt) { + std::optional FalseCount = std::nullopt, + MCDCConditionID ID = 0, MCDCConditionID TrueID = 0, + MCDCConditionID FalseID = 0) { if (StartLoc && !FalseCount) { MostRecentLocation = *StartLoc; @@ -615,7 +879,19 @@ struct CounterCoverageMappingBuilder StartLoc = std::nullopt; if (EndLoc && EndLoc->isInvalid()) EndLoc = std::nullopt; - RegionStack.emplace_back(Count, FalseCount, StartLoc, EndLoc); + RegionStack.emplace_back(Count, FalseCount, + MCDCParameters{0, 0, ID, TrueID, FalseID}, + StartLoc, EndLoc); + + return RegionStack.size() - 1; + } + + size_t pushRegion(unsigned BitmapIdx, unsigned Conditions, + std::optional StartLoc = std::nullopt, + std::optional EndLoc = std::nullopt) { + + RegionStack.emplace_back(MCDCParameters{BitmapIdx, Conditions}, StartLoc, + EndLoc); return RegionStack.size() - 1; } @@ -746,7 +1022,9 @@ struct CounterCoverageMappingBuilder /// and add it to the function's SourceRegions. A branch region tracks a /// "True" counter and a "False" counter for boolean expressions that /// result in the generation of a branch. - void createBranchRegion(const Expr *C, Counter TrueCnt, Counter FalseCnt) { + void createBranchRegion(const Expr *C, Counter TrueCnt, Counter FalseCnt, + MCDCConditionID ID = 0, MCDCConditionID TrueID = 0, + MCDCConditionID FalseID = 0) { // Check for NULL conditions. if (!C) return; @@ -764,13 +1042,21 @@ struct CounterCoverageMappingBuilder // CodeGenFunction.c always returns false, but that is very heavy-handed. if (ConditionFoldsToBool(C)) popRegions(pushRegion(Counter::getZero(), getStart(C), getEnd(C), - Counter::getZero())); + Counter::getZero(), ID, TrueID, FalseID)); else // Otherwise, create a region with the True counter and False counter. - popRegions(pushRegion(TrueCnt, getStart(C), getEnd(C), FalseCnt)); + popRegions(pushRegion(TrueCnt, getStart(C), getEnd(C), FalseCnt, ID, + TrueID, FalseID)); } } + /// Create a Decision Region with a BitmapIdx and number of Conditions. This + /// type of region "contains" branch regions, one for each of the conditions. + /// The visualization tool will group everything together. 
+ void createDecisionRegion(const Expr *C, unsigned BitmapIdx, unsigned Conds) { + popRegions(pushRegion(BitmapIdx, Conds, getStart(C), getEnd(C))); + } + /// Create a Branch Region around a SwitchCase for code coverage /// and add it to the function's SourceRegions. void createSwitchCaseRegion(const SwitchCase *SC, Counter TrueCnt, @@ -851,8 +1137,12 @@ struct CounterCoverageMappingBuilder // we've seen this region. if (StartLocs.insert(Loc).second) { if (I.isBranch()) - SourceRegions.emplace_back(I.getCounter(), I.getFalseCounter(), Loc, - getEndOfFileOrMacro(Loc), I.isBranch()); + SourceRegions.emplace_back( + I.getCounter(), I.getFalseCounter(), + MCDCParameters{0, 0, I.getMCDCParams().ID, + I.getMCDCParams().TrueID, + I.getMCDCParams().FalseID}, + Loc, getEndOfFileOrMacro(Loc), I.isBranch()); else SourceRegions.emplace_back(I.getCounter(), Loc, getEndOfFileOrMacro(Loc)); @@ -971,9 +1261,13 @@ struct CounterCoverageMappingBuilder CounterCoverageMappingBuilder( CoverageMappingModuleGen &CVM, - llvm::DenseMap &CounterMap, SourceManager &SM, - const LangOptions &LangOpts) - : CoverageMappingBuilder(CVM, SM, LangOpts), CounterMap(CounterMap) {} + llvm::DenseMap &CounterMap, + llvm::DenseMap &MCDCBitmapMap, + llvm::DenseMap &CondIDMap, + SourceManager &SM, const LangOptions &LangOpts) + : CoverageMappingBuilder(CVM, SM, LangOpts), CounterMap(CounterMap), + MCDCBitmapMap(MCDCBitmapMap), + MCDCBuilder(CVM.getCodeGenModule(), CondIDMap, MCDCBitmapMap) {} /// Write the mapping data to the output stream void write(llvm::raw_ostream &OS) { @@ -1519,6 +1813,9 @@ struct CounterCoverageMappingBuilder } void VisitBinLAnd(const BinaryOperator *E) { + // Keep track of Binary Operator and assign MCDC condition IDs + MCDCBuilder.pushAndAssignIDs(E); + extendRegion(E->getLHS()); propagateCounts(getRegion().getCounter(), E->getLHS()); handleFileExit(getEnd(E->getLHS())); @@ -1527,6 +1824,11 @@ struct CounterCoverageMappingBuilder extendRegion(E->getRHS()); propagateCounts(getRegionCounter(E), E->getRHS()); + // Process Binary Operator and create MCDC Decision Region if top-level + unsigned NumConds = 0; + if ((NumConds = MCDCBuilder.popAndReturnCondCount(E))) + createDecisionRegion(E, getRegionBitmap(E), NumConds); + // Extract the RHS's Execution Counter. Counter RHSExecCnt = getRegionCounter(E); @@ -1536,13 +1838,30 @@ struct CounterCoverageMappingBuilder // Extract the Parent Region Counter. Counter ParentCnt = getRegion().getCounter(); + // Extract the MCDC condition IDs (returns 0 if not needed). + MCDCConditionID NextOrID = MCDCBuilder.getNextLOrCondID(); + MCDCConditionID NextAndID = MCDCBuilder.getNextLAndCondID(); + MCDCConditionID LHSid = MCDCBuilder.getCondID(E->getLHS()); + MCDCConditionID RHSid = MCDCBuilder.getCondID(E->getRHS()); + // Create Branch Region around LHS condition. + // MC/DC: For "LHS && RHS" + // - If LHS is TRUE, execution goes to the RHS. + // - If LHS is FALSE, execution goes to the LHS of the next logical-OR. + // If that does not exist, execution exits (ID == 0). createBranchRegion(E->getLHS(), RHSExecCnt, - subtractCounters(ParentCnt, RHSExecCnt)); + subtractCounters(ParentCnt, RHSExecCnt), LHSid, RHSid, + NextOrID); // Create Branch Region around RHS condition. + // MC/DC: For "LHS && RHS" + // - If RHS is TRUE, execution goes to LHS of the next logical-AND. + // If that does not exist, execution exits (ID == 0). + // - If RHS is FALSE, execution goes to the LHS of the next logical-OR. + // If that does not exist, execution exits (ID == 0). 
createBranchRegion(E->getRHS(), RHSTrueCnt, - subtractCounters(RHSExecCnt, RHSTrueCnt)); + subtractCounters(RHSExecCnt, RHSTrueCnt), RHSid, + NextAndID, NextOrID); } // Determine whether the right side of OR operation need to be visited. @@ -1556,6 +1875,9 @@ struct CounterCoverageMappingBuilder } void VisitBinLOr(const BinaryOperator *E) { + // Keep track of Binary Operator and assign MCDC condition IDs + MCDCBuilder.pushAndAssignIDs(E); + extendRegion(E->getLHS()); Counter OutCount = propagateCounts(getRegion().getCounter(), E->getLHS()); handleFileExit(getEnd(E->getLHS())); @@ -1564,6 +1886,11 @@ struct CounterCoverageMappingBuilder extendRegion(E->getRHS()); propagateCounts(getRegionCounter(E), E->getRHS()); + // Process Binary Operator and create MCDC Decision Region if top-level + unsigned NumConds = 0; + if ((NumConds = MCDCBuilder.popAndReturnCondCount(E))) + createDecisionRegion(E, getRegionBitmap(E), NumConds); + // Extract the RHS's Execution Counter. Counter RHSExecCnt = getRegionCounter(E); @@ -1577,13 +1904,28 @@ struct CounterCoverageMappingBuilder // Extract the Parent Region Counter. Counter ParentCnt = getRegion().getCounter(); + // Extract the MCDC condition IDs (returns 0 if not needed). + MCDCConditionID NextOrID = MCDCBuilder.getNextLOrCondID(); + MCDCConditionID NextAndID = MCDCBuilder.getNextLAndCondID(); + MCDCConditionID LHSid = MCDCBuilder.getCondID(E->getLHS()); + MCDCConditionID RHSid = MCDCBuilder.getCondID(E->getRHS()); + // Create Branch Region around LHS condition. + // MC/DC: For "LHS || RHS" + // - If LHS is TRUE, execution goes to the LHS of the next logical-AND. + // If that does not exist, execution exits (ID == 0). + // - If LHS is FALSE, execution goes to the RHS. createBranchRegion(E->getLHS(), subtractCounters(ParentCnt, RHSExecCnt), - RHSExecCnt); + RHSExecCnt, LHSid, NextAndID, RHSid); // Create Branch Region around RHS condition. + // MC/DC: For "LHS || RHS" + // - If RHS is TRUE, execution goes to LHS of the next logical-AND. + // If that does not exist, execution exits (ID == 0). + // - If RHS is FALSE, execution goes to the LHS of the next logical-OR. + // If that does not exist, execution exits (ID == 0). 
createBranchRegion(E->getRHS(), subtractCounters(RHSExecCnt, RHSFalseCnt), - RHSFalseCnt); + RHSFalseCnt, RHSid, NextAndID, NextOrID); } void VisitLambdaExpr(const LambdaExpr *LE) { @@ -1633,11 +1975,23 @@ static void dump(llvm::raw_ostream &OS, StringRef FunctionName, OS << "File " << R.FileID << ", " << R.LineStart << ":" << R.ColumnStart << " -> " << R.LineEnd << ":" << R.ColumnEnd << " = "; - Ctx.dump(R.Count, OS); - if (R.Kind == CounterMappingRegion::BranchRegion) { - OS << ", "; - Ctx.dump(R.FalseCount, OS); + if (R.Kind == CounterMappingRegion::MCDCDecisionRegion) { + OS << "M:" << R.MCDCParams.BitmapIdx; + OS << ", C:" << R.MCDCParams.NumConditions; + } else { + Ctx.dump(R.Count, OS); + + if (R.Kind == CounterMappingRegion::BranchRegion || + R.Kind == CounterMappingRegion::MCDCBranchRegion) { + OS << ", "; + Ctx.dump(R.FalseCount, OS); + } + } + + if (R.Kind == CounterMappingRegion::MCDCBranchRegion) { + OS << " [" << R.MCDCParams.ID << "," << R.MCDCParams.TrueID; + OS << "," << R.MCDCParams.FalseID << "] "; } if (R.Kind == CounterMappingRegion::ExpansionRegion) @@ -1846,8 +2200,9 @@ unsigned CoverageMappingModuleGen::getFileID(FileEntryRef File) { void CoverageMappingGen::emitCounterMapping(const Decl *D, llvm::raw_ostream &OS) { - assert(CounterMap); - CounterCoverageMappingBuilder Walker(CVM, *CounterMap, SM, LangOpts); + assert(CounterMap && MCDCBitmapMap); + CounterCoverageMappingBuilder Walker(CVM, *CounterMap, *MCDCBitmapMap, + *CondIDMap, SM, LangOpts); Walker.VisitDecl(D); Walker.write(OS); } diff --git a/clang/lib/CodeGen/CoverageMappingGen.h b/clang/lib/CodeGen/CoverageMappingGen.h index 77d7c6cd87cfb..62cea173c9fc9 100644 --- a/clang/lib/CodeGen/CoverageMappingGen.h +++ b/clang/lib/CodeGen/CoverageMappingGen.h @@ -150,16 +150,22 @@ class CoverageMappingGen { SourceManager &SM; const LangOptions &LangOpts; llvm::DenseMap *CounterMap; + llvm::DenseMap *MCDCBitmapMap; + llvm::DenseMap *CondIDMap; public: CoverageMappingGen(CoverageMappingModuleGen &CVM, SourceManager &SM, const LangOptions &LangOpts) - : CVM(CVM), SM(SM), LangOpts(LangOpts), CounterMap(nullptr) {} + : CVM(CVM), SM(SM), LangOpts(LangOpts), CounterMap(nullptr), + MCDCBitmapMap(nullptr), CondIDMap(nullptr) {} CoverageMappingGen(CoverageMappingModuleGen &CVM, SourceManager &SM, const LangOptions &LangOpts, - llvm::DenseMap *CounterMap) - : CVM(CVM), SM(SM), LangOpts(LangOpts), CounterMap(CounterMap) {} + llvm::DenseMap *CounterMap, + llvm::DenseMap *MCDCBitmapMap, + llvm::DenseMap *CondIDMap) + : CVM(CVM), SM(SM), LangOpts(LangOpts), CounterMap(CounterMap), + MCDCBitmapMap(MCDCBitmapMap), CondIDMap(CondIDMap) {} /// Emit the coverage mapping data which maps the regions of /// code to counters that will be used to find the execution diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index acfa119805068..2d8ef841d4f6b 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -698,6 +698,17 @@ static void addPGOAndCoverageFlags(const ToolChain &TC, Compilation &C, CmdArgs.push_back("-fcoverage-mapping"); } + if (Args.hasFlag(options::OPT_fmcdc_coverage, options::OPT_fno_mcdc_coverage, + false)) { + if (!Args.hasFlag(options::OPT_fcoverage_mapping, + options::OPT_fno_coverage_mapping, false)) + D.Diag(clang::diag::err_drv_argument_only_allowed_with) + << "-fcoverage-mcdc" + << "-fcoverage-mapping"; + + CmdArgs.push_back("-fcoverage-mcdc"); + } + if (Arg *A = Args.getLastArg(options::OPT_ffile_compilation_dir_EQ, 
options::OPT_fcoverage_compilation_dir_EQ)) { if (A->getOption().matches(options::OPT_ffile_compilation_dir_EQ)) diff --git a/clang/test/CoverageMapping/branch-constfolded.cpp b/clang/test/CoverageMapping/branch-constfolded.cpp index 5173286addbb4..4fdb640506c9d 100644 --- a/clang/test/CoverageMapping/branch-constfolded.cpp +++ b/clang/test/CoverageMapping/branch-constfolded.cpp @@ -1,90 +1,100 @@ // Test that branch regions are not generated for constant-folded conditions. // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++11 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name branch-constfolded.cpp %s | FileCheck %s +// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++11 -fcoverage-mcdc -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name branch-constfolded.cpp %s | FileCheck %s -check-prefix=MCDC // CHECK-LABEL: _Z6fand_0b: -bool fand_0(bool a) { +bool fand_0(bool a) { // MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:20 = M:0, C:2 return false && a; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:15 = 0, 0 } // CHECK: Branch,File 0, [[@LINE-1]]:19 -> [[@LINE-1]]:20 = #2, (#1 - #2) // CHECK-LABEL: _Z6fand_1b: -bool fand_1(bool a) { +bool fand_1(bool a) { // MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:19 = M:0, C:2 return a && true; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:11 = #1, (#0 - #1) } // CHECK: Branch,File 0, [[@LINE-1]]:15 -> [[@LINE-1]]:19 = 0, 0 // CHECK-LABEL: _Z6fand_2bb: -bool fand_2(bool a, bool b) { +bool fand_2(bool a, bool b) {// MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:25 = M:0, C:3 return false && a && b; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:15 = 0, 0 } // CHECK: Branch,File 0, [[@LINE-1]]:19 -> [[@LINE-1]]:20 = #4, (#3 - #4) // CHECK: Branch,File 0, [[@LINE-2]]:24 -> [[@LINE-2]]:25 = #2, (#1 - #2) // CHECK-LABEL: _Z6fand_3bb: -bool fand_3(bool a, bool b) { +bool fand_3(bool a, bool b) {// MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:24 = M:0, C:3 return a && true && b; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:11 = #3, (#0 - #3) } // CHECK: Branch,File 0, [[@LINE-1]]:15 -> [[@LINE-1]]:19 = 0, 0 // CHECK: Branch,File 0, [[@LINE-2]]:23 -> [[@LINE-2]]:24 = #2, (#1 - #2) // CHECK-LABEL: _Z6fand_4bb: -bool fand_4(bool a, bool b) { +bool fand_4(bool a, bool b) {// MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:25 = M:0, C:3 return a && b && false; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:11 = #3, (#0 - #3) } // CHECK: Branch,File 0, [[@LINE-1]]:15 -> [[@LINE-1]]:16 = #4, (#3 - #4) // CHECK: Branch,File 0, [[@LINE-2]]:20 -> [[@LINE-2]]:25 = 0, 0 // CHECK-LABEL: _Z6fand_5b: -bool fand_5(bool a) { +bool fand_5(bool a) { // MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:23 = M:0, C:2 return false && true; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:15 = 0, 0 } // CHECK: Branch,File 0, [[@LINE-1]]:19 -> [[@LINE-1]]:23 = 0, 0 // CHECK-LABEL: _Z6fand_6b: -bool fand_6(bool a) { +bool fand_6(bool a) { // MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:19 = M:0, C:2 return true && a; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:14 = 0, 0 } // CHECK: Branch,File 0, [[@LINE-1]]:18 -> [[@LINE-1]]:19 = #2, (#1 - #2) // CHECK-LABEL: _Z6fand_7b: -bool fand_7(bool a) { +bool fand_7(bool a) { // MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:20 = M:0, C:2 return a && false; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:11 = #1, (#0 - #1) } // CHECK: Branch,File 0, 
[[@LINE-1]]:15 -> [[@LINE-1]]:20 = 0, 0 // CHECK-LABEL: _Z5for_0b: -bool for_0(bool a) { +bool for_0(bool a) { // MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:19 = M:0, C:2 return true || a; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:14 = 0, 0 } // CHECK: Branch,File 0, [[@LINE-1]]:18 -> [[@LINE-1]]:19 = (#1 - #2), #2 // CHECK-LABEL: _Z5for_1b: -bool for_1(bool a) { +bool for_1(bool a) { // MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:20 = M:0, C:2 return a || false; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:11 = (#0 - #1), #1 } // CHECK: Branch,File 0, [[@LINE-1]]:15 -> [[@LINE-1]]:20 = 0, 0 // CHECK-LABEL: _Z5for_2bb: -bool for_2(bool a, bool b) { +bool for_2(bool a, bool b) {// MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:24 = M:0, C:3 return true || a || b; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:14 = 0, 0 } // CHECK: Branch,File 0, [[@LINE-1]]:18 -> [[@LINE-1]]:19 = (#3 - #4), #4 // CHECK: Branch,File 0, [[@LINE-2]]:23 -> [[@LINE-2]]:24 = (#1 - #2), #2 // CHECK-LABEL: _Z5for_3bb: -bool for_3(bool a, bool b) { +bool for_3(bool a, bool b) {// MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:25 = M:0, C:3 return a || false || b; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:11 = (#0 - #3), #3 } // CHECK: Branch,File 0, [[@LINE-1]]:15 -> [[@LINE-1]]:20 = 0, 0 // CHECK: Branch,File 0, [[@LINE-2]]:24 -> [[@LINE-2]]:25 = (#1 - #2), #2 // CHECK-LABEL: _Z5for_4bb: -bool for_4(bool a, bool b) { +bool for_4(bool a, bool b) {// MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:24 = M:0, C:3 return a || b || true; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:11 = (#0 - #3), #3 } // CHECK: Branch,File 0, [[@LINE-1]]:15 -> [[@LINE-1]]:16 = (#3 - #4), #4 // CHECK: Branch,File 0, [[@LINE-2]]:20 -> [[@LINE-2]]:24 = 0, 0 // CHECK-LABEL: _Z5for_5b: -bool for_5(bool a) { +bool for_5(bool a) { // MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:23 = M:0, C:2 return true || false; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:14 = 0, 0 } // CHECK: Branch,File 0, [[@LINE-1]]:18 -> [[@LINE-1]]:23 = 0, 0 // CHECK-LABEL: _Z5for_6b: -bool for_6(bool a) { +bool for_6(bool a) { // MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:20 = M:0, C:2 return false || a; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:15 = 0, 0 } // CHECK: Branch,File 0, [[@LINE-1]]:19 -> [[@LINE-1]]:20 = (#1 - #2), #2 // CHECK-LABEL: _Z5for_7b: -bool for_7(bool a) { +bool for_7(bool a) { // MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:19 = M:0, C:2 return a || true; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:11 = (#0 - #1), #1 } // CHECK: Branch,File 0, [[@LINE-1]]:15 -> [[@LINE-1]]:19 = 0, 0 +// CHECK-LABEL: _Z5for_8b: +bool for_8(bool a) { // MCDC: Decision,File 0, [[@LINE+3]]:17 -> [[@LINE+3]]:30 = M:0, C:2 + // CHECK: Branch,File 0, [[@LINE+2]]:17 -> [[@LINE+2]]:21 = 0, 0 + // CHECK: Branch,File 0, [[@LINE+1]]:25 -> [[@LINE+1]]:30 = 0, 0 + if constexpr (true && false) + return true; + else + return false; +} diff --git a/clang/test/CoverageMapping/branch-mincounters.cpp b/clang/test/CoverageMapping/branch-mincounters.cpp index 6d4341cbeafd6..aa1d3928aae25 100644 --- a/clang/test/CoverageMapping/branch-mincounters.cpp +++ b/clang/test/CoverageMapping/branch-mincounters.cpp @@ -2,6 +2,7 @@ // logical operators on branch conditions for branch coverage. 
// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++11 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name branch-logical-mixed.cpp %s | FileCheck %s +// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++11 -fcoverage-mcdc -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name branch-logical-mixed.cpp %s | FileCheck %s // CHECK-LABEL: _Z5func1ii: diff --git a/clang/test/CoverageMapping/branch-templates.cpp b/clang/test/CoverageMapping/branch-templates.cpp index 1fd01218c9038..4e43626b8192d 100644 --- a/clang/test/CoverageMapping/branch-templates.cpp +++ b/clang/test/CoverageMapping/branch-templates.cpp @@ -2,6 +2,7 @@ // instantiations. // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++11 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name branch-templates.cpp %s | FileCheck %s +// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++11 -fcoverage-mcdc -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name branch-templates.cpp %s | FileCheck %s template void unused(T x) { diff --git a/clang/test/CoverageMapping/if.cpp b/clang/test/CoverageMapping/if.cpp index 4326add3b3443..65e3d62df79db 100644 --- a/clang/test/CoverageMapping/if.cpp +++ b/clang/test/CoverageMapping/if.cpp @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -fms-extensions -mllvm -emptyline-comment-coverage=false -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -std=c++23 -triple %itanium_abi_triple -main-file-name if.cpp %s | FileCheck %s +// RUN: %clang_cc1 -fms-extensions -mllvm -emptyline-comment-coverage=false -fcoverage-mcdc -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -std=c++2b -triple %itanium_abi_triple -main-file-name if.cpp %s | FileCheck %s int nop() { return 0; } struct S { diff --git a/clang/test/CoverageMapping/logical.cpp b/clang/test/CoverageMapping/logical.cpp index cd1a388ec4c29..7de59e1429808 100644 --- a/clang/test/CoverageMapping/logical.cpp +++ b/clang/test/CoverageMapping/logical.cpp @@ -1,22 +1,24 @@ // RUN: %clang_cc1 -mllvm -emptyline-comment-coverage=false -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name logical.cpp %s | FileCheck %s +// RUN: %clang_cc1 -mllvm -emptyline-comment-coverage=false -fcoverage-mcdc -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name logical.cpp %s | FileCheck %s -check-prefix=MCDC -int main() { // CHECK: File 0, [[@LINE]]:12 -> [[@LINE+22]]:2 = #0 +int main() { // CHECK: File 0, [[@LINE]]:12 -> [[@LINE+23]]:2 = #0 bool bt = true; - bool bf = false; + bool bf = false; // MCDC: Decision,File 0, [[@LINE+1]]:12 -> [[@LINE+1]]:20 = M:0, C:2 bool a = bt && bf; // CHECK-NEXT: File 0, [[@LINE]]:12 -> [[@LINE]]:14 = #0 // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:12 -> [[@LINE-1]]:14 = #1, (#0 - #1) // CHECK-NEXT: File 0, [[@LINE-2]]:18 -> [[@LINE-2]]:20 = #1 // CHECK-NEXT: Branch,File 0, [[@LINE-3]]:18 -> [[@LINE-3]]:20 = #2, (#1 - #2) - + // MCDC: Decision,File 0, [[@LINE+1]]:7 -> [[@LINE+2]]:9 = M:1, C:2 a = bt && // CHECK-NEXT: File 0, [[@LINE]]:7 -> [[@LINE]]:9 = #0 bf; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:7 -> [[@LINE-1]]:9 = #3, (#0 - #3) // CHECK-NEXT: File 0, [[@LINE-1]]:7 -> [[@LINE-1]]:9 = #3 // CHECK-NEXT: Branch,File 0, [[@LINE-2]]:7 -> [[@LINE-2]]:9 = #4, (#3 - #4) + // MCDC: Decision,File 0, [[@LINE+1]]:7 
-> [[@LINE+1]]:15 = M:2, C:2 a = bf || bt; // CHECK-NEXT: File 0, [[@LINE]]:7 -> [[@LINE]]:9 = #0 // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:7 -> [[@LINE-1]]:9 = (#0 - #5), #5 // CHECK-NEXT: File 0, [[@LINE-2]]:13 -> [[@LINE-2]]:15 = #5 // CHECK-NEXT: Branch,File 0, [[@LINE-3]]:13 -> [[@LINE-3]]:15 = (#5 - #6), #6 - + // MCDC: Decision,File 0, [[@LINE+1]]:7 -> [[@LINE+2]]:9 = M:3, C:2 a = bf || // CHECK-NEXT: File 0, [[@LINE]]:7 -> [[@LINE]]:9 = #0 bt; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:7 -> [[@LINE-1]]:9 = (#0 - #7), #7 // CHECK-NEXT: File 0, [[@LINE-1]]:7 -> [[@LINE-1]]:9 = #7 diff --git a/clang/test/CoverageMapping/mcdc-class.cpp b/clang/test/CoverageMapping/mcdc-class.cpp new file mode 100644 index 0000000000000..dcf6123ee0fc7 --- /dev/null +++ b/clang/test/CoverageMapping/mcdc-class.cpp @@ -0,0 +1,31 @@ +// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++11 -fcoverage-mcdc -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s | FileCheck %s + +extern void foo(); +extern void bar(); +class Value { + public: + void setValue( int len ); + int getValue( void ); + Value(); // This is the constructor declaration + ~Value(); // This is the destructor declaration + + private: + int value; +}; + +// Member functions definitions including constructor +Value::Value(void) { + if (value == 2 || value == 6) + foo(); +} +Value::~Value(void) { + if (value == 2 || value == 3) + bar(); +} + +// CHECK-LABEL: Decision,File 0, 18:7 -> 18:31 = M:0, C:2 +// CHECK-NEXT: Branch,File 0, 18:7 -> 18:17 = (#0 - #2), #2 [1,0,2] +// CHECK: Branch,File 0, 18:21 -> 18:31 = (#2 - #3), #3 [2,0,0] +// CHECK-LABEL: Decision,File 0, 22:7 -> 22:31 = M:0, C:2 +// CHECK-NEXT: Branch,File 0, 22:7 -> 22:17 = (#0 - #2), #2 [1,0,2] +// CHECK: Branch,File 0, 22:21 -> 22:31 = (#2 - #3), #3 [2,0,0] diff --git a/clang/test/CoverageMapping/mcdc-error-conditions.cpp b/clang/test/CoverageMapping/mcdc-error-conditions.cpp new file mode 100644 index 0000000000000..d34ed69343479 --- /dev/null +++ b/clang/test/CoverageMapping/mcdc-error-conditions.cpp @@ -0,0 +1,7 @@ +// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++11 -fcoverage-mcdc -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s 2>&1| FileCheck %s + +bool func_conditions(bool a, bool b, bool c, bool d, bool e, bool f, bool g) { + return a && b && c && d && e && f && g; +} + +// CHECK: warning: unsupported MC/DC boolean expression; number of conditions{{.*}} exceeds max diff --git a/clang/test/CoverageMapping/mcdc-error-nests.cpp b/clang/test/CoverageMapping/mcdc-error-nests.cpp new file mode 100644 index 0000000000000..3add2b9ccd3fb --- /dev/null +++ b/clang/test/CoverageMapping/mcdc-error-nests.cpp @@ -0,0 +1,10 @@ +// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++11 -fcoverage-mcdc -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s 2>&1| FileCheck %s + +// "Split-nest" -- boolean expressions within boolean expressions. +extern bool bar(bool); +bool func_split_nest(bool a, bool b, bool c, bool d, bool e, bool f, bool g) { + bool res = a && b && c && bar(d && e) && f && g; + return bar(res); +} + +// CHECK: warning: unsupported MC/DC boolean expression; contains an operation with a nested boolean expression. 
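A quick key to the notation these new tests check, as produced by the dump() changes above: a Decision region prints "M:<bitmap index>, C:<number of conditions>" in place of a counter, and each MC/DC branch region appends "[ID, TrueID, FalseID]", i.e. the condition's own ID followed by the condition evaluated next when it is true or false, with 0 meaning the decision is finished. A minimal sketch, traced by hand against the ID-assignment rules documented in MCDCCoverageBuilder (the function name and exact source columns are illustrative, and the branch counters are elided):

  bool qualifies(bool a, bool b, bool c) {
    return a && (b || c);
  }
  // Expected shape of the -dump-coverage-mapping output:
  //   Decision,File 0, 2:10 -> 2:23 = M:0, C:3
  //   Branch,File 0, 2:10 -> 2:11 = ..., ... [1,2,0]   a: true -> cond 2, false -> done
  //   Branch,File 0, 2:16 -> 2:17 = ..., ... [2,0,3]   b: true -> done,   false -> cond 3
  //   Branch,File 0, 2:21 -> 2:22 = ..., ... [3,0,0]   c: done either way

The same triples appear for "res2 = a && (b || c)" in mcdc-logical-scalar-ids.cpp below, and the bitmap indices there (M:0, 1, 2, 4, 8 for C:2 through C:6) are consistent with each decision reserving ceil(2^C / 8) bytes of the function's MC/DC bitmap.
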
diff --git a/clang/test/CoverageMapping/mcdc-logical-scalar-ids.cpp b/clang/test/CoverageMapping/mcdc-logical-scalar-ids.cpp new file mode 100644 index 0000000000000..c820b5df5ad3a --- /dev/null +++ b/clang/test/CoverageMapping/mcdc-logical-scalar-ids.cpp @@ -0,0 +1,110 @@ +// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++11 -fcoverage-mcdc -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s | FileCheck %s + +extern bool bar (bool, bool, bool, bool, bool); +bool func_scalar_and(bool a, bool b, bool c, bool d, bool e, bool f) { + bool res1 = a && b; + bool res2 = a && b && c; + bool res3 = a && b && c && d; + bool res4 = a && b && c && d && e; + bool res5 = a && b && c && d && e && f; + return bar(res1, res2, res3, res4, res5); +} + +// CHECK-LABEL: Decision,File 0, 5:17 -> 5:23 = M:0, C:2 +// CHECK-NEXT: Branch,File 0, 5:17 -> 5:18 = #1, (#0 - #1) [1,2,0] +// CHECK: Branch,File 0, 5:22 -> 5:23 = #2, (#1 - #2) [2,0,0] +// CHECK-LABEL: Decision,File 0, 6:17 -> 6:28 = M:1, C:3 +// CHECK-NEXT: Branch,File 0, 6:17 -> 6:18 = #5, (#0 - #5) [1,3,0] +// CHECK: Branch,File 0, 6:22 -> 6:23 = #6, (#5 - #6) [3,2,0] +// CHECK: Branch,File 0, 6:27 -> 6:28 = #4, (#3 - #4) [2,0,0] +// CHECK-LABEL: Decision,File 0, 7:17 -> 7:33 = M:2, C:4 +// CHECK-NEXT: Branch,File 0, 7:17 -> 7:18 = #11, (#0 - #11) [1,4,0] +// CHECK: Branch,File 0, 7:22 -> 7:23 = #12, (#11 - #12) [4,3,0] +// CHECK: Branch,File 0, 7:27 -> 7:28 = #10, (#9 - #10) [3,2,0] +// CHECK: Branch,File 0, 7:32 -> 7:33 = #8, (#7 - #8) [2,0,0] +// CHECK-LABEL: Decision,File 0, 8:17 -> 8:38 = M:4, C:5 +// CHECK-NEXT: Branch,File 0, 8:17 -> 8:18 = #19, (#0 - #19) [1,5,0] +// CHECK: Branch,File 0, 8:22 -> 8:23 = #20, (#19 - #20) [5,4,0] +// CHECK: Branch,File 0, 8:27 -> 8:28 = #18, (#17 - #18) [4,3,0] +// CHECK: Branch,File 0, 8:32 -> 8:33 = #16, (#15 - #16) [3,2,0] +// CHECK: Branch,File 0, 8:37 -> 8:38 = #14, (#13 - #14) [2,0,0] +// CHECK-LABEL: Decision,File 0, 9:17 -> 9:43 = M:8, C:6 +// CHECK-NEXT: Branch,File 0, 9:17 -> 9:18 = #29, (#0 - #29) [1,6,0] +// CHECK: Branch,File 0, 9:22 -> 9:23 = #30, (#29 - #30) [6,5,0] +// CHECK: Branch,File 0, 9:27 -> 9:28 = #28, (#27 - #28) [5,4,0] +// CHECK: Branch,File 0, 9:32 -> 9:33 = #26, (#25 - #26) [4,3,0] +// CHECK: Branch,File 0, 9:37 -> 9:38 = #24, (#23 - #24) [3,2,0] +// CHECK: Branch,File 0, 9:42 -> 9:43 = #22, (#21 - #22) [2,0,0] + +bool func_scalar_or(bool a, bool b, bool c, bool d, bool e, bool f) { + bool res1 = a || b; + bool res2 = a || b || c; + bool res3 = a || b || c || d; + bool res4 = a || b || c || d || e; + bool res5 = a || b || c || d || e || f; + return bar(res1, res2, res3, res4, res5); +} + +// CHECK-LABEL: Decision,File 0, 40:17 -> 40:23 = M:0, C:2 +// CHECK-NEXT: Branch,File 0, 40:17 -> 40:18 = (#0 - #1), #1 [1,0,2] +// CHECK: Branch,File 0, 40:22 -> 40:23 = (#1 - #2), #2 [2,0,0] +// CHECK-LABEL: Decision,File 0, 41:17 -> 41:28 = M:1, C:3 +// CHECK-NEXT: Branch,File 0, 41:17 -> 41:18 = (#0 - #5), #5 [1,0,3] +// CHECK: Branch,File 0, 41:22 -> 41:23 = (#5 - #6), #6 [3,0,2] +// CHECK: Branch,File 0, 41:27 -> 41:28 = (#3 - #4), #4 [2,0,0] +// CHECK-LABEL: Decision,File 0, 42:17 -> 42:33 = M:2, C:4 +// CHECK-NEXT: Branch,File 0, 42:17 -> 42:18 = (#0 - #11), #11 [1,0,4] +// CHECK: Branch,File 0, 42:22 -> 42:23 = (#11 - #12), #12 [4,0,3] +// CHECK: Branch,File 0, 42:27 -> 42:28 = (#9 - #10), #10 [3,0,2] +// CHECK: Branch,File 0, 42:32 -> 42:33 = (#7 - #8), #8 [2,0,0] +// CHECK-LABEL: Decision,File 0, 43:17 -> 43:38 = M:4, C:5 +// CHECK-NEXT: Branch,File 
0, 43:17 -> 43:18 = (#0 - #19), #19 [1,0,5] +// CHECK: Branch,File 0, 43:22 -> 43:23 = (#19 - #20), #20 [5,0,4] +// CHECK: Branch,File 0, 43:27 -> 43:28 = (#17 - #18), #18 [4,0,3] +// CHECK: Branch,File 0, 43:32 -> 43:33 = (#15 - #16), #16 [3,0,2] +// CHECK: Branch,File 0, 43:37 -> 43:38 = (#13 - #14), #14 [2,0,0] +// CHECK-LABEL: Decision,File 0, 44:17 -> 44:43 = M:8, C:6 +// CHECK-NEXT: Branch,File 0, 44:17 -> 44:18 = (#0 - #29), #29 [1,0,6] +// CHECK: Branch,File 0, 44:22 -> 44:23 = (#29 - #30), #30 [6,0,5] +// CHECK: Branch,File 0, 44:27 -> 44:28 = (#27 - #28), #28 [5,0,4] +// CHECK: Branch,File 0, 44:32 -> 44:33 = (#25 - #26), #26 [4,0,3] +// CHECK: Branch,File 0, 44:37 -> 44:38 = (#23 - #24), #24 [3,0,2] +// CHECK: Branch,File 0, 44:42 -> 44:43 = (#21 - #22), #22 [2,0,0] + + +bool func_scalar_mix(bool a, bool b, bool c, bool d, bool e, bool f) { + bool res1 = a || b; + bool res2 = a && (b || c); + bool res3 = (a || b) && (c || d); + bool res4 = a && (b || c) && (d || e); + bool res5 = (a || b) && (c || d) && (e || f); + return bar(res1, res2, res3, res4, res5); +} + +// CHECK-LABEL: Decision,File 0, 76:17 -> 76:23 = M:0, C:2 +// CHECK-NEXT: Branch,File 0, 76:17 -> 76:18 = (#0 - #1), #1 [1,0,2] +// CHECK: Branch,File 0, 76:22 -> 76:23 = (#1 - #2), #2 [2,0,0] +// CHECK-LABEL: Decision,File 0, 77:17 -> 77:30 = M:1, C:3 +// CHECK-NEXT: Branch,File 0, 77:17 -> 77:18 = #3, (#0 - #3) [1,2,0] +// CHECK: Branch,File 0, 77:23 -> 77:24 = (#3 - #4), #4 [2,0,3] +// CHECK: Branch,File 0, 77:28 -> 77:29 = (#4 - #5), #5 [3,0,0] +// CHECK-LABEL: Decision,File 0, 78:17 -> 78:37 = M:2, C:4 +// CHECK-NEXT: File 0 +// CHECK-NEXT: Branch,File 0, 78:18 -> 78:19 = (#0 - #7), #7 [1,2,3] +// CHECK: Branch,File 0, 78:23 -> 78:24 = (#7 - #8), #8 [3,2,0] +// CHECK: Branch,File 0, 78:30 -> 78:31 = (#6 - #9), #9 [2,0,4] +// CHECK: Branch,File 0, 78:35 -> 78:36 = (#9 - #10), #10 [4,0,0] +// CHECK-LABEL: Decision,File 0, 79:17 -> 79:42 = M:4, C:5 +// CHECK-NEXT: Branch,File 0, 79:17 -> 79:18 = #12, (#0 - #12) [1,3,0] +// CHECK: Branch,File 0, 79:23 -> 79:24 = (#12 - #13), #13 [3,2,4] +// CHECK: Branch,File 0, 79:28 -> 79:29 = (#13 - #14), #14 [4,2,0] +// CHECK: Branch,File 0, 79:35 -> 79:36 = (#11 - #15), #15 [2,0,5] +// CHECK: Branch,File 0, 79:40 -> 79:41 = (#15 - #16), #16 [5,0,0] +// CHECK-LABEL: Decision,File 0, 80:17 -> 80:49 = M:8, C:6 +// CHECK-NEXT: File 0 +// CHECK-NEXT: Branch,File 0, 80:18 -> 80:19 = (#0 - #19), #19 [1,3,4] +// CHECK: Branch,File 0, 80:23 -> 80:24 = (#19 - #20), #20 [4,3,0] +// CHECK: Branch,File 0, 80:30 -> 80:31 = (#18 - #21), #21 [3,2,5] +// CHECK: Branch,File 0, 80:35 -> 80:36 = (#21 - #22), #22 [5,2,0] +// CHECK: Branch,File 0, 80:42 -> 80:43 = (#17 - #23), #23 [2,0,6] +// CHECK: Branch,File 0, 80:47 -> 80:48 = (#23 - #24), #24 [6,0,0] diff --git a/clang/test/CoverageMapping/mcdc-logical-stmt-ids-all.cpp b/clang/test/CoverageMapping/mcdc-logical-stmt-ids-all.cpp new file mode 100644 index 0000000000000..5b13cc3507e96 --- /dev/null +++ b/clang/test/CoverageMapping/mcdc-logical-stmt-ids-all.cpp @@ -0,0 +1,131 @@ +// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++11 -fcoverage-mcdc -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s | FileCheck %s + +bool func_if_and(bool a, bool b, bool c, bool d, bool e, bool f) { + if (a && b && c && d && e && f) + return true; + return false; +} + +// CHECK-LABEL: Decision,File 0, 4:7 -> 4:33 = M:0, C:6 +// CHECK-NEXT: Branch,File 0, 4:7 -> 4:8 = #10, (#0 - #10) [1,6,0] +// CHECK: Branch,File 0, 4:12 -> 4:13 
= #11, (#10 - #11) [6,5,0] +// CHECK: Branch,File 0, 4:17 -> 4:18 = #9, (#8 - #9) [5,4,0] +// CHECK: Branch,File 0, 4:22 -> 4:23 = #7, (#6 - #7) [4,3,0] +// CHECK: Branch,File 0, 4:27 -> 4:28 = #5, (#4 - #5) [3,2,0] +// CHECK: Branch,File 0, 4:32 -> 4:33 = #3, (#2 - #3) [2,0,0] + +bool func_if_or(bool a, bool b, bool c, bool d, bool e, bool f) { + if (a || b || c || d || e || f) + return true; + return false; +} + +// CHECK-LABEL: Decision,File 0, 18:7 -> 18:33 = M:0, C:6 +// CHECK-NEXT: Branch,File 0, 18:7 -> 18:8 = (#0 - #10), #10 [1,0,6] +// CHECK: Branch,File 0, 18:12 -> 18:13 = (#10 - #11), #11 [6,0,5] +// CHECK: Branch,File 0, 18:17 -> 18:18 = (#8 - #9), #9 [5,0,4] +// CHECK: Branch,File 0, 18:22 -> 18:23 = (#6 - #7), #7 [4,0,3] +// CHECK: Branch,File 0, 18:27 -> 18:28 = (#4 - #5), #5 [3,0,2] +// CHECK: Branch,File 0, 18:32 -> 18:33 = (#2 - #3), #3 [2,0,0] + +bool func_while_and(bool a, bool b, bool c, bool d, bool e, bool f) { + while (a && b && c && d && e && f) { return true; } + return false; +} + +// CHECK-LABEL: Decision,File 0, 32:10 -> 32:36 = M:0, C:6 +// CHECK-NEXT: Branch,File 0, 32:10 -> 32:11 = #10, (#0 - #10) [1,6,0] +// CHECK: Branch,File 0, 32:15 -> 32:16 = #11, (#10 - #11) [6,5,0] +// CHECK: Branch,File 0, 32:20 -> 32:21 = #9, (#8 - #9) [5,4,0] +// CHECK: Branch,File 0, 32:25 -> 32:26 = #7, (#6 - #7) [4,3,0] +// CHECK: Branch,File 0, 32:30 -> 32:31 = #5, (#4 - #5) [3,2,0] +// CHECK: Branch,File 0, 32:35 -> 32:36 = #3, (#2 - #3) [2,0,0] + +bool func_while_or(bool a, bool b, bool c, bool d, bool e, bool f) { + while (a || b || c || d || e || f) { return true; } + return false; +} + +// CHECK-LABEL: Decision,File 0, 45:10 -> 45:36 = M:0, C:6 +// CHECK-NEXT: Branch,File 0, 45:10 -> 45:11 = (#0 - #10), #10 [1,0,6] +// CHECK: Branch,File 0, 45:15 -> 45:16 = (#10 - #11), #11 [6,0,5] +// CHECK: Branch,File 0, 45:20 -> 45:21 = (#8 - #9), #9 [5,0,4] +// CHECK: Branch,File 0, 45:25 -> 45:26 = (#6 - #7), #7 [4,0,3] +// CHECK: Branch,File 0, 45:30 -> 45:31 = (#4 - #5), #5 [3,0,2] +// CHECK: Branch,File 0, 45:35 -> 45:36 = (#2 - #3), #3 [2,0,0] + +bool func_for_and(bool a, bool b, bool c, bool d, bool e, bool f) { + for (;a && b && c && d && e && f;) { return true; } + return false; +} + +// CHECK-LABEL: Decision,File 0, 58:9 -> 58:35 = M:0, C:6 +// CHECK-NEXT: Branch,File 0, 58:9 -> 58:10 = #10, (#0 - #10) [1,6,0] +// CHECK: Branch,File 0, 58:14 -> 58:15 = #11, (#10 - #11) [6,5,0] +// CHECK: Branch,File 0, 58:19 -> 58:20 = #9, (#8 - #9) [5,4,0] +// CHECK: Branch,File 0, 58:24 -> 58:25 = #7, (#6 - #7) [4,3,0] +// CHECK: Branch,File 0, 58:29 -> 58:30 = #5, (#4 - #5) [3,2,0] +// CHECK: Branch,File 0, 58:34 -> 58:35 = #3, (#2 - #3) [2,0,0] + +bool func_for_or(bool a, bool b, bool c, bool d, bool e, bool f) { + for (;a || b || c || d || e || f;) { return true; } + return false; +} + +// CHECK-LABEL: Decision,File 0, 71:9 -> 71:35 = M:0, C:6 +// CHECK-NEXT: Branch,File 0, 71:9 -> 71:10 = (#0 - #10), #10 [1,0,6] +// CHECK: Branch,File 0, 71:14 -> 71:15 = (#10 - #11), #11 [6,0,5] +// CHECK: Branch,File 0, 71:19 -> 71:20 = (#8 - #9), #9 [5,0,4] +// CHECK: Branch,File 0, 71:24 -> 71:25 = (#6 - #7), #7 [4,0,3] +// CHECK: Branch,File 0, 71:29 -> 71:30 = (#4 - #5), #5 [3,0,2] +// CHECK: Branch,File 0, 71:34 -> 71:35 = (#2 - #3), #3 [2,0,0] + +bool func_do_and(bool a, bool b, bool c, bool d, bool e, bool f) { + do {} while (a && b && c && d && e && f); + return false; +} + +// CHECK-LABEL: Decision,File 0, 84:16 -> 84:42 = M:0, C:6 +// CHECK-NEXT: Branch,File 0, 84:16 -> 84:17 = #10, ((#0 + 
#1) - #10) [1,6,0] +// CHECK: Branch,File 0, 84:21 -> 84:22 = #11, (#10 - #11) [6,5,0] +// CHECK: Branch,File 0, 84:26 -> 84:27 = #9, (#8 - #9) [5,4,0] +// CHECK: Branch,File 0, 84:31 -> 84:32 = #7, (#6 - #7) [4,3,0] +// CHECK: Branch,File 0, 84:36 -> 84:37 = #5, (#4 - #5) [3,2,0] +// CHECK: Branch,File 0, 84:41 -> 84:42 = #3, (#2 - #3) [2,0,0] + +bool func_do_or(bool a, bool b, bool c, bool d, bool e, bool f) { + do {} while (a || b || c || d || e || f); + return false; +} + +// CHECK-LABEL: Decision,File 0, 97:16 -> 97:42 = M:0, C:6 +// CHECK-NEXT: Branch,File 0, 97:16 -> 97:17 = ((#0 + #1) - #10), #10 [1,0,6] +// CHECK: Branch,File 0, 97:21 -> 97:22 = (#10 - #11), #11 [6,0,5] +// CHECK: Branch,File 0, 97:26 -> 97:27 = (#8 - #9), #9 [5,0,4] +// CHECK: Branch,File 0, 97:31 -> 97:32 = (#6 - #7), #7 [4,0,3] +// CHECK: Branch,File 0, 97:36 -> 97:37 = (#4 - #5), #5 [3,0,2] +// CHECK: Branch,File 0, 97:41 -> 97:42 = (#2 - #3), #3 [2,0,0] + +bool func_ternary_and(bool a, bool b, bool c, bool d, bool e, bool f) { + return (a && b && c && d && e && f) ? true : false; +} + +// CHECK-LABEL: Decision,File 0, 110:11 -> 110:37 = M:0, C:6 +// CHECK-NEXT: Branch,File 0, 110:11 -> 110:12 = #10, (#0 - #10) [1,6,0] +// CHECK: Branch,File 0, 110:16 -> 110:17 = #11, (#10 - #11) [6,5,0] +// CHECK: Branch,File 0, 110:21 -> 110:22 = #9, (#8 - #9) [5,4,0] +// CHECK: Branch,File 0, 110:26 -> 110:27 = #7, (#6 - #7) [4,3,0] +// CHECK: Branch,File 0, 110:31 -> 110:32 = #5, (#4 - #5) [3,2,0] +// CHECK: Branch,File 0, 110:36 -> 110:37 = #3, (#2 - #3) [2,0,0] + +bool func_ternary_or(bool a, bool b, bool c, bool d, bool e, bool f) { + return (a || b || c || d || e || f) ? true : false; +} + +// CHECK-LABEL: Decision,File 0, 122:11 -> 122:37 = M:0, C:6 +// CHECK-NEXT: Branch,File 0, 122:11 -> 122:12 = (#0 - #10), #10 [1,0,6] +// CHECK: Branch,File 0, 122:16 -> 122:17 = (#10 - #11), #11 [6,0,5] +// CHECK: Branch,File 0, 122:21 -> 122:22 = (#8 - #9), #9 [5,0,4] +// CHECK: Branch,File 0, 122:26 -> 122:27 = (#6 - #7), #7 [4,0,3] +// CHECK: Branch,File 0, 122:31 -> 122:32 = (#4 - #5), #5 [3,0,2] +// CHECK: Branch,File 0, 122:36 -> 122:37 = (#2 - #3), #3 [2,0,0] diff --git a/clang/test/CoverageMapping/mcdc-logical-stmt-ids.cpp b/clang/test/CoverageMapping/mcdc-logical-stmt-ids.cpp new file mode 100644 index 0000000000000..99854ec27a3fb --- /dev/null +++ b/clang/test/CoverageMapping/mcdc-logical-stmt-ids.cpp @@ -0,0 +1,111 @@ +// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++11 -fcoverage-mcdc -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s | FileCheck %s + +bool func_if_and(bool a, bool b, bool c, bool d, bool e, bool f) { + if (a && b) + if (a && b && c) + if (a && b && c && d) + if (a && b && c && d && e) + if (a && b && c && d && e && f) + return true; + return false; +} + +// CHECK-LABEL: Decision,File 0, 4:7 -> 4:13 = M:0, C:2 +// CHECK-NEXT: Branch,File 0, 4:7 -> 4:8 = #2, (#0 - #2) [1,2,0] +// CHECK: Branch,File 0, 4:12 -> 4:13 = #3, (#2 - #3) [2,0,0] +// CHECK-LABEL: Decision,File 0, 5:9 -> 5:20 = M:1, C:3 +// CHECK-NEXT: Branch,File 0, 5:9 -> 5:10 = #7, (#1 - #7) [1,3,0] +// CHECK: Branch,File 0, 5:14 -> 5:15 = #8, (#7 - #8) [3,2,0] +// CHECK: Branch,File 0, 5:19 -> 5:20 = #6, (#5 - #6) [2,0,0] +// CHECK-LABEL: Decision,File 0, 6:11 -> 6:27 = M:2, C:4 +// CHECK-NEXT: Branch,File 0, 6:11 -> 6:12 = #14, (#4 - #14) [1,4,0] +// CHECK: Branch,File 0, 6:16 -> 6:17 = #15, (#14 - #15) [4,3,0] +// CHECK: Branch,File 0, 6:21 -> 6:22 = #13, (#12 - #13) [3,2,0] +// CHECK: Branch,File 
0, 6:26 -> 6:27 = #11, (#10 - #11) [2,0,0] +// CHECK-LABEL: Decision,File 0, 7:13 -> 7:34 = M:4, C:5 +// CHECK-NEXT: Branch,File 0, 7:13 -> 7:14 = #23, (#9 - #23) [1,5,0] +// CHECK: Branch,File 0, 7:18 -> 7:19 = #24, (#23 - #24) [5,4,0] +// CHECK: Branch,File 0, 7:23 -> 7:24 = #22, (#21 - #22) [4,3,0] +// CHECK: Branch,File 0, 7:28 -> 7:29 = #20, (#19 - #20) [3,2,0] +// CHECK: Branch,File 0, 7:33 -> 7:34 = #18, (#17 - #18) [2,0,0] +// CHECK-LABEL: Decision,File 0, 8:16 -> 8:42 = M:8, C:6 +// CHECK-NEXT: Branch,File 0, 8:16 -> 8:17 = #34, (#16 - #34) [1,6,0] +// CHECK: Branch,File 0, 8:21 -> 8:22 = #35, (#34 - #35) [6,5,0] +// CHECK: Branch,File 0, 8:26 -> 8:27 = #33, (#32 - #33) [5,4,0] +// CHECK: Branch,File 0, 8:31 -> 8:32 = #31, (#30 - #31) [4,3,0] +// CHECK: Branch,File 0, 8:36 -> 8:37 = #29, (#28 - #29) [3,2,0] +// CHECK: Branch,File 0, 8:41 -> 8:42 = #27, (#26 - #27) [2,0,0] + +bool func_if_or(bool a, bool b, bool c, bool d, bool e, bool f) { + if (a || b) + if (a || b || c) + if (a || b || c || d) + if (a || b || c || d || e) + if (a || b || c || d || e || f) + return true; + return false; +} + +// CHECK-LABEL: Decision,File 0, 40:7 -> 40:13 = M:0, C:2 +// CHECK-NEXT: Branch,File 0, 40:7 -> 40:8 = (#0 - #2), #2 [1,0,2] +// CHECK: Branch,File 0, 40:12 -> 40:13 = (#2 - #3), #3 [2,0,0] +// CHECK-LABEL: Decision,File 0, 41:9 -> 41:20 = M:1, C:3 +// CHECK-NEXT: Branch,File 0, 41:9 -> 41:10 = (#1 - #7), #7 [1,0,3] +// CHECK: Branch,File 0, 41:14 -> 41:15 = (#7 - #8), #8 [3,0,2] +// CHECK: Branch,File 0, 41:19 -> 41:20 = (#5 - #6), #6 [2,0,0] +// CHECK-LABEL: Decision,File 0, 42:11 -> 42:27 = M:2, C:4 +// CHECK-NEXT: Branch,File 0, 42:11 -> 42:12 = (#4 - #14), #14 [1,0,4] +// CHECK: Branch,File 0, 42:16 -> 42:17 = (#14 - #15), #15 [4,0,3] +// CHECK: Branch,File 0, 42:21 -> 42:22 = (#12 - #13), #13 [3,0,2] +// CHECK: Branch,File 0, 42:26 -> 42:27 = (#10 - #11), #11 [2,0,0] +// CHECK-LABEL: Decision,File 0, 43:13 -> 43:34 = M:4, C:5 +// CHECK-NEXT: Branch,File 0, 43:13 -> 43:14 = (#9 - #23), #23 [1,0,5] +// CHECK: Branch,File 0, 43:18 -> 43:19 = (#23 - #24), #24 [5,0,4] +// CHECK: Branch,File 0, 43:23 -> 43:24 = (#21 - #22), #22 [4,0,3] +// CHECK: Branch,File 0, 43:28 -> 43:29 = (#19 - #20), #20 [3,0,2] +// CHECK: Branch,File 0, 43:33 -> 43:34 = (#17 - #18), #18 [2,0,0] +// CHECK-LABEL: Decision,File 0, 44:16 -> 44:42 = M:8, C:6 +// CHECK-NEXT: Branch,File 0, 44:16 -> 44:17 = (#16 - #34), #34 [1,0,6] +// CHECK: Branch,File 0, 44:21 -> 44:22 = (#34 - #35), #35 [6,0,5] +// CHECK: Branch,File 0, 44:26 -> 44:27 = (#32 - #33), #33 [5,0,4] +// CHECK: Branch,File 0, 44:31 -> 44:32 = (#30 - #31), #31 [4,0,3] +// CHECK: Branch,File 0, 44:36 -> 44:37 = (#28 - #29), #29 [3,0,2] +// CHECK: Branch,File 0, 44:41 -> 44:42 = (#26 - #27), #27 [2,0,0] + +bool func_if_mix(bool a, bool b, bool c, bool d, bool e, bool f) { + if (a || b) + if (a && (b || c)) + if ((a || b) && (c || d)) + if (a && (b || c) && (d || e)) + if ((a || b) && (c || d) && (e || f)) + return true; + return false; +} + +// CHECK-LABEL: Decision,File 0, 76:7 -> 76:13 = M:0, C:2 +// CHECK-NEXT: Branch,File 0, 76:7 -> 76:8 = (#0 - #2), #2 [1,0,2] +// CHECK: Branch,File 0, 76:12 -> 76:13 = (#2 - #3), #3 [2,0,0] +// CHECK-LABEL: Decision,File 0, 77:9 -> 77:22 = M:1, C:3 +// CHECK-NEXT: Branch,File 0, 77:9 -> 77:10 = #5, (#1 - #5) [1,2,0] +// CHECK: Branch,File 0, 77:15 -> 77:16 = (#5 - #6), #6 [2,0,3] +// CHECK: Branch,File 0, 77:20 -> 77:21 = (#6 - #7), #7 [3,0,0] +// CHECK-LABEL: Decision,File 0, 78:11 -> 78:31 = M:2, C:4 +// CHECK-NEXT: File 
0 +// CHECK-NEXT: Branch,File 0, 78:12 -> 78:13 = (#4 - #10), #10 [1,2,3] +// CHECK: Branch,File 0, 78:17 -> 78:18 = (#10 - #11), #11 [3,2,0] +// CHECK: Branch,File 0, 78:24 -> 78:25 = (#9 - #12), #12 [2,0,4] +// CHECK: Branch,File 0, 78:29 -> 78:30 = (#12 - #13), #13 [4,0,0] +// CHECK-LABEL: Decision,File 0, 79:13 -> 79:38 = M:4, C:5 +// CHECK-NEXT: Branch,File 0, 79:13 -> 79:14 = #16, (#8 - #16) [1,3,0] +// CHECK: Branch,File 0, 79:19 -> 79:20 = (#16 - #17), #17 [3,2,4] +// CHECK: Branch,File 0, 79:24 -> 79:25 = (#17 - #18), #18 [4,2,0] +// CHECK: Branch,File 0, 79:31 -> 79:32 = (#15 - #19), #19 [2,0,5] +// CHECK: Branch,File 0, 79:36 -> 79:37 = (#19 - #20), #20 [5,0,0] +// CHECK-LABEL: Decision,File 0, 80:15 -> 80:47 = M:8, C:6 +// CHECK-NEXT: File 0 +// CHECK-NEXT: Branch,File 0, 80:16 -> 80:17 = (#14 - #24), #24 [1,3,4] +// CHECK: Branch,File 0, 80:21 -> 80:22 = (#24 - #25), #25 [4,3,0] +// CHECK: Branch,File 0, 80:28 -> 80:29 = (#23 - #26), #26 [3,2,5] +// CHECK: Branch,File 0, 80:33 -> 80:34 = (#26 - #27), #27 [5,2,0] +// CHECK: Branch,File 0, 80:40 -> 80:41 = (#22 - #28), #28 [2,0,6] +// CHECK: Branch,File 0, 80:45 -> 80:46 = (#28 - #29), #29 [6,0,0] diff --git a/clang/test/Profile/c-linkage-available_externally.c b/clang/test/Profile/c-linkage-available_externally.c index f6375e15f451a..010f191dc5e1b 100644 --- a/clang/test/Profile/c-linkage-available_externally.c +++ b/clang/test/Profile/c-linkage-available_externally.c @@ -1,8 +1,10 @@ // Make sure instrumentation data from available_externally functions doesn't // get thrown out and are emitted with the expected linkage. // RUN: %clang_cc1 -O2 -triple x86_64-apple-macosx10.9 -main-file-name c-linkage-available_externally.c %s -o - -emit-llvm -fprofile-instrument=clang | FileCheck %s +// RUN: %clang_cc1 -fcoverage-mcdc -O2 -triple x86_64-apple-macosx10.9 -main-file-name c-linkage-available_externally.c %s -o - -emit-llvm -fprofile-instrument=clang | FileCheck %s -check-prefix=MCDC // CHECK: @__profc_foo = linkonce_odr hidden global [1 x i64] zeroinitializer, section "__DATA,__llvm_prf_cnts", align 8 +// MCDC: @__profbm_foo = linkonce_odr hidden global [0 x i8] zeroinitializer, section "__DATA,__llvm_prf_bits", align 1 // CHECK: @__profd_foo = linkonce_odr hidden global {{.*}} i64 sub (i64 ptrtoint (ptr @__profc_foo to i64), i64 ptrtoint (ptr @__profd_foo to i64)), {{.*}}, section "__DATA,__llvm_prf_data,regular,live_support", align 8 inline int foo(void) { return 1; } diff --git a/clang/test/Profile/c-mcdc-class.cpp b/clang/test/Profile/c-mcdc-class.cpp new file mode 100644 index 0000000000000..2206a39ee4ffb --- /dev/null +++ b/clang/test/Profile/c-mcdc-class.cpp @@ -0,0 +1,104 @@ +// RUN: %clang_cc1 -triple %itanium_abi_triple %s -o - -emit-llvm -fprofile-instrument=clang -fcoverage-mapping -fcoverage-mcdc | FileCheck %s -check-prefix=MCDCCTOR +// RUN: %clang_cc1 -triple %itanium_abi_triple %s -o - -emit-llvm -fprofile-instrument=clang -fcoverage-mapping -fcoverage-mcdc | FileCheck %s -check-prefix=MCDCDTOR + +extern void foo(); +extern void bar(); +class Value { + public: + void setValue(int len); + int getValue(void); + Value(); // This is the constructor declaration + ~Value(); // This is the destructor declaration + + private: + int value; +}; + +// Member functions definitions including constructor +Value::Value(void) { + if (value != 2 || value != 6) + foo(); +} +Value::~Value(void) { + if (value != 2 || value != 3) + bar(); +} + +// MCDC BOOKKEEPING. 
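// The CHECK lines that follow boil down to the following hand-written C sketch
// of the instrumentation this patch emits for one decision (names such as
// mcdc_temp and bitmap_base are illustrative, not the actual IR value names):
//
//   uint32_t mcdc_temp = 0;                          // per-decision temp, zeroed on entry
//   mcdc_temp |= (uint32_t)cond0 << 0;               // each leaf condition ORs its value in
//   mcdc_temp |= (uint32_t)cond1 << 1;               // at its zero-based condition ID
//   uint8_t *byte = bitmap_base + (mcdc_temp >> 3);  // after the decision, pick the byte
//   *byte |= (uint8_t)(1u << (mcdc_temp & 7));       // and set the bit for this test vector
//
// Here bitmap_base corresponds to the function's @__profbm_* global plus the
// decision's bitmap index (0 for these single-decision functions). Note that
// the shift amounts are zero-based condition IDs, whereas the mapping regions
// dumped by the CoverageMapping tests number conditions from 1.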
+// MCDCCTOR: @__profbm__ZN5ValueC2Ev = private global [1 x i8] zeroinitializer +// MCDCCTOR: @__profc__ZN5ValueC2Ev = private global [4 x i64] zeroinitializer + +// ALLOCATE MCDC TEMP AND ZERO IT. +// MCDCCTOR-LABEL: @_ZN5ValueC2Ev( +// MCDCCTOR: %mcdc.addr = alloca i32, align 4 +// MCDCCTOR: store i32 0, ptr %mcdc.addr, align 4 + +// SHIFT FIRST CONDITION WITH ID = 0. +// MCDCCTOR: %[[LAB1:[0-9]+]] = load i32, ptr %value, align 4 +// MCDCCTOR-DAG: %[[BOOL:cmp[0-9]*]] = icmp ne i32 %[[LAB1]], 2 +// MCDCCTOR-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 +// MCDCCTOR-DAG: %[[LAB2:[0-9]+]] = zext i1 %[[BOOL]] to i32 +// MCDCCTOR-DAG: %[[LAB3:[0-9]+]] = shl i32 %[[LAB2]], 0 +// MCDCCTOR-DAG: %[[LAB4:[0-9]+]] = or i32 %[[TEMP]], %[[LAB3]] +// MCDCCTOR-DAG: store i32 %[[LAB4]], ptr %mcdc.addr, align 4 + +// SHIFT SECOND CONDITION WITH ID = 1. +// MCDCCTOR: %[[LAB1:[0-9]+]] = load i32, ptr %value2, align 4 +// MCDCCTOR-DAG: %[[BOOL:cmp[0-9]*]] = icmp ne i32 %[[LAB1]], 6 +// MCDCCTOR-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 +// MCDCCTOR-DAG: %[[LAB2:[0-9]+]] = zext i1 %[[BOOL]] to i32 +// MCDCCTOR-DAG: %[[LAB3:[0-9]+]] = shl i32 %[[LAB2]], 1 +// MCDCCTOR-DAG: %[[LAB4:[0-9]+]] = or i32 %[[TEMP]], %[[LAB3]] +// MCDCCTOR-DAG: store i32 %[[LAB4]], ptr %mcdc.addr, align 4 + +// UPDATE FINAL BITMASK WITH RESULT. +// MCDCCTOR-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 +// MCDCCTOR: %[[LAB1:[0-9]+]] = lshr i32 %[[TEMP]], 3 +// MCDCCTOR: %[[LAB2:[0-9]+]] = zext i32 %[[LAB1]] to i64 +// MCDCCTOR: %[[LAB3:[0-9]+]] = add i64 ptrtoint (ptr @__profbm__ZN5ValueC2Ev to i64), %[[LAB2]] +// MCDCCTOR: %[[LAB4:[0-9]+]] = inttoptr i64 %[[LAB3]] to ptr +// MCDCCTOR: %[[LAB5:[0-9]+]] = and i32 %[[TEMP]], 7 +// MCDCCTOR: %[[LAB6:[0-9]+]] = trunc i32 %[[LAB5]] to i8 +// MCDCCTOR: %[[LAB7:[0-9]+]] = shl i8 1, %[[LAB6]] +// MCDCCTOR: %mcdc.bits = load i8, ptr %[[LAB4]], align 1 +// MCDCCTOR: %[[LAB8:[0-9]+]] = or i8 %mcdc.bits, %[[LAB7]] +// MCDCCTOR: store i8 %[[LAB8]], ptr %[[LAB4]], align 1 + +// MCDCDTOR: @__profbm__ZN5ValueD2Ev = private global [1 x i8] zeroinitializer +// MCDCDTOR: @__profc__ZN5ValueD2Ev = private global [4 x i64] zeroinitializer + +// ALLOCATE MCDC TEMP AND ZERO IT. +// MCDCDTOR-LABEL: @_ZN5ValueD2Ev( +// MCDCDTOR: %mcdc.addr = alloca i32, align 4 +// MCDCDTOR: store i32 0, ptr %mcdc.addr, align 4 + +// SHIFT FIRST CONDITION WITH ID = 0. +// MCDCDTOR: %[[LAB1:[0-9]+]] = load i32, ptr %value, align 4 +// MCDCDTOR-DAG: %[[BOOL:cmp[0-9]*]] = icmp ne i32 %[[LAB1]], 2 +// MCDCDTOR-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 +// MCDCDTOR-DAG: %[[LAB2:[0-9]+]] = zext i1 %[[BOOL]] to i32 +// MCDCDTOR-DAG: %[[LAB3:[0-9]+]] = shl i32 %[[LAB2]], 0 +// MCDCDTOR-DAG: %[[LAB4:[0-9]+]] = or i32 %[[TEMP]], %[[LAB3]] +// MCDCDTOR-DAG: store i32 %[[LAB4]], ptr %mcdc.addr, align 4 + +// SHIFT SECOND CONDITION WITH ID = 1. +// MCDCDTOR: %[[LAB1:[0-9]+]] = load i32, ptr %value2, align 4 +// MCDCDTOR-DAG: %[[BOOL:cmp[0-9]*]] = icmp ne i32 %[[LAB1]], 3 +// MCDCDTOR-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 +// MCDCDTOR-DAG: %[[LAB2:[0-9]+]] = zext i1 %[[BOOL]] to i32 +// MCDCDTOR-DAG: %[[LAB3:[0-9]+]] = shl i32 %[[LAB2]], 1 +// MCDCDTOR-DAG: %[[LAB4:[0-9]+]] = or i32 %[[TEMP]], %[[LAB3]] +// MCDCDTOR-DAG: store i32 %[[LAB4]], ptr %mcdc.addr, align 4 + +// UPDATE FINAL BITMASK WITH RESULT. 
+// MCDCDTOR-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 +// MCDCDTOR: %[[LAB1:[0-9]+]] = lshr i32 %[[TEMP]], 3 +// MCDCDTOR: %[[LAB2:[0-9]+]] = zext i32 %[[LAB1]] to i64 +// MCDCDTOR: %[[LAB3:[0-9]+]] = add i64 ptrtoint (ptr @__profbm__ZN5ValueD2Ev to i64), %[[LAB2]] +// MCDCDTOR: %[[LAB4:[0-9]+]] = inttoptr i64 %[[LAB3]] to ptr +// MCDCDTOR: %[[LAB5:[0-9]+]] = and i32 %[[TEMP]], 7 +// MCDCDTOR: %[[LAB6:[0-9]+]] = trunc i32 %[[LAB5]] to i8 +// MCDCDTOR: %[[LAB7:[0-9]+]] = shl i8 1, %[[LAB6]] +// MCDCDTOR: %mcdc.bits = load i8, ptr %[[LAB4]], align 1 +// MCDCDTOR: %[[LAB8:[0-9]+]] = or i8 %mcdc.bits, %[[LAB7]] +// MCDCDTOR: store i8 %[[LAB8]], ptr %[[LAB4]], align 1 diff --git a/clang/test/Profile/c-mcdc-nested-ternary.c b/clang/test/Profile/c-mcdc-nested-ternary.c new file mode 100644 index 0000000000000..4b014e07f6df5 --- /dev/null +++ b/clang/test/Profile/c-mcdc-nested-ternary.c @@ -0,0 +1,68 @@ +// RUN: %clang_cc1 -triple %itanium_abi_triple %s -o - -emit-llvm -fprofile-instrument=clang -fcoverage-mapping -fcoverage-mcdc | FileCheck %s -check-prefix=MCDC +// RUN: %clang_cc1 -triple %itanium_abi_triple %s -o - -emit-llvm -fprofile-instrument=clang -fcoverage-mapping | FileCheck %s -check-prefix=NOMCDC + +int test(int b, int c, int d, int e, int f) { + return ((b ? c : d) && e && f); +} + +// NOMCDC-NOT: %mcdc.addr +// NOMCDC-NOT: __profbm_test + +// MCDC BOOKKEEPING. +// MCDC: @__profbm_test = private global [1 x i8] zeroinitializer + +// ALLOCATE MCDC TEMP AND ZERO IT. +// MCDC-LABEL: @test( +// MCDC: %mcdc.addr = alloca i32, align 4 +// MCDC: store i32 0, ptr %mcdc.addr, align 4 + +// TERNARY TRUE SHOULD SHIFT ID = 0 FOR CONDITION 'c'. +// MCDC-LABEL: cond.true: +// MCDC: %[[LAB1:[0-9]+]] = load i32, ptr %c.addr, align 4 +// MCDC-DAG: %[[BOOL:tobool[0-9]*]] = icmp ne i32 %[[LAB1]], 0 +// MCDC-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 +// MCDC-DAG: %[[LAB2:[0-9]+]] = zext i1 %[[BOOL]] to i32 +// MCDC-DAG: %[[LAB3:[0-9]+]] = shl i32 %[[LAB2]], 0 +// MCDC-DAG: %[[LAB4:[0-9]+]] = or i32 %[[TEMP]], %[[LAB3]] +// MCDC-DAG: store i32 %[[LAB4]], ptr %mcdc.addr, align 4 + +// TERNARY FALSE SHOULD SHIFT ID = 0 FOR CONDITION 'd'. +// MCDC-LABEL: cond.false: +// MCDC: %[[LAB1:[0-9]+]] = load i32, ptr %d.addr, align 4 +// MCDC-DAG: %[[BOOL:tobool[0-9]*]] = icmp ne i32 %[[LAB1]], 0 +// MCDC-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 +// MCDC-DAG: %[[LAB2:[0-9]+]] = zext i1 %[[BOOL]] to i32 +// MCDC-DAG: %[[LAB3:[0-9]+]] = shl i32 %[[LAB2]], 0 +// MCDC-DAG: %[[LAB4:[0-9]+]] = or i32 %[[TEMP]], %[[LAB3]] +// MCDC-DAG: store i32 %[[LAB4]], ptr %mcdc.addr, align 4 + +// SHIFT SECOND CONDITION WITH ID = 2. +// MCDC: %[[LAB1:[0-9]+]] = load i32, ptr %e.addr, align 4 +// MCDC-DAG: %[[BOOL:tobool[0-9]*]] = icmp ne i32 %[[LAB1]], 0 +// MCDC-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 +// MCDC-DAG: %[[LAB2:[0-9]+]] = zext i1 %[[BOOL]] to i32 +// MCDC-DAG: %[[LAB3:[0-9]+]] = shl i32 %[[LAB2]], 2 +// MCDC-DAG: %[[LAB4:[0-9]+]] = or i32 %[[TEMP]], %[[LAB3]] +// MCDC-DAG: store i32 %[[LAB4]], ptr %mcdc.addr, align 4 + +// SHIFT THIRD CONDITION WITH ID = 1. 
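// A note on the ID layout verified in this test: the nested ternary (b ? c : d)
// is a single leaf condition of the enclosing decision, which is why both the
// cond.true and cond.false arms above shift into bit 0. Per the ID-assignment
// walk added in CoverageMappingGen.cpp (stated here as a reading of that code,
// in the zero-based numbering these checks use), the outer && hands the next ID
// to its RHS (f gets ID 1) before the inner && is visited; the inner && then
// reuses ID 0 for its LHS (the ternary) and gives the following fresh ID to e
// (ID 2). Hence the shifts at ID 2 for e and ID 1 for f in this test.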
+// MCDC: %[[LAB1:[0-9]+]] = load i32, ptr %f.addr, align 4 +// MCDC-DAG: %[[BOOL:tobool[0-9]*]] = icmp ne i32 %[[LAB1]], 0 +// MCDC-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 +// MCDC-DAG: %[[LAB2:[0-9]+]] = zext i1 %[[BOOL]] to i32 +// MCDC-DAG: %[[LAB3:[0-9]+]] = shl i32 %[[LAB2]], 1 +// MCDC-DAG: %[[LAB4:[0-9]+]] = or i32 %[[TEMP]], %[[LAB3]] +// MCDC-DAG: store i32 %[[LAB4]], ptr %mcdc.addr, align 4 + +// UPDATE FINAL BITMASK WITH RESULT. +// MCDC-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 +// MCDC: %[[LAB1:[0-9]+]] = lshr i32 %[[TEMP]], 3 +// MCDC: %[[LAB2:[0-9]+]] = zext i32 %[[LAB1]] to i64 +// MCDC: %[[LAB3:[0-9]+]] = add i64 ptrtoint (ptr @__profbm_test to i64), %[[LAB2]] +// MCDC: %[[LAB4:[0-9]+]] = inttoptr i64 %[[LAB3]] to ptr +// MCDC: %[[LAB5:[0-9]+]] = and i32 %[[TEMP]], 7 +// MCDC: %[[LAB6:[0-9]+]] = trunc i32 %[[LAB5]] to i8 +// MCDC: %[[LAB7:[0-9]+]] = shl i8 1, %[[LAB6]] +// MCDC: %mcdc.bits = load i8, ptr %[[LAB4]], align 1 +// MCDC: %[[LAB8:[0-9]+]] = or i8 %mcdc.bits, %[[LAB7]] +// MCDC: store i8 %[[LAB8]], ptr %[[LAB4]], align 1 diff --git a/clang/test/Profile/c-mcdc-not.c b/clang/test/Profile/c-mcdc-not.c new file mode 100644 index 0000000000000..aa638b9680b84 --- /dev/null +++ b/clang/test/Profile/c-mcdc-not.c @@ -0,0 +1,88 @@ +// RUN: %clang_cc1 -triple %itanium_abi_triple %s -o - -emit-llvm -fprofile-instrument=clang -fcoverage-mapping -fcoverage-mcdc | FileCheck %s -check-prefix=MCDC +// RUN: %clang_cc1 -triple %itanium_abi_triple %s -o - -emit-llvm -fprofile-instrument=clang -fcoverage-mapping | FileCheck %s -check-prefix=NOMCDC + +int test(int a, int b, int c, int d, int e, int f) { + return ((!a && b) || ((!c && d) || (e && !f))); +} + +// NOMCDC-NOT: %mcdc.addr +// NOMCDC-NOT: __profbm_test + +// MCDC BOOKKEEPING. +// MCDC: @__profbm_test = private global [8 x i8] zeroinitializer +// MCDC: @__profc_test = private global [9 x i64] zeroinitializer + +// ALLOCATE MCDC TEMP AND ZERO IT. +// MCDC-LABEL: @test( +// MCDC: %mcdc.addr = alloca i32, align 4 +// MCDC: store i32 0, ptr %mcdc.addr, align 4 + +// SHIFT FIRST CONDITION WITH ID = 0. +// MCDC: %[[LAB1:[0-9]+]] = load i32, ptr %a.addr, align 4 +// MCDC-DAG: %[[BOOL:tobool[0-9]*]] = icmp ne i32 %[[LAB1]], 0 +// MCDC-DAG: %[[LNOT:lnot[0-9]*]] = xor i1 %[[BOOL]] +// MCDC-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 +// MCDC-DAG: %[[LAB2:[0-9]+]] = zext i1 %[[LNOT]] to i32 +// MCDC-DAG: %[[LAB3:[0-9]+]] = shl i32 %[[LAB2]], 0 +// MCDC-DAG: %[[LAB4:[0-9]+]] = or i32 %[[TEMP]], %[[LAB3]] +// MCDC-DAG: store i32 %[[LAB4]], ptr %mcdc.addr, align 4 + +// SHIFT SECOND CONDITION WITH ID = 2. +// MCDC: %[[LAB1:[0-9]+]] = load i32, ptr %b.addr, align 4 +// MCDC-DAG: %[[BOOL:tobool[0-9]*]] = icmp ne i32 %[[LAB1]], 0 +// MCDC-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 +// MCDC-DAG: %[[LAB2:[0-9]+]] = zext i1 %[[BOOL]] to i32 +// MCDC-DAG: %[[LAB3:[0-9]+]] = shl i32 %[[LAB2]], 2 +// MCDC-DAG: %[[LAB4:[0-9]+]] = or i32 %[[TEMP]], %[[LAB3]] +// MCDC-DAG: store i32 %[[LAB4]], ptr %mcdc.addr, align 4 + +// SHIFT THIRD CONDITION WITH ID = 1. 
+// MCDC: %[[LAB1:[0-9]+]] = load i32, ptr %c.addr, align 4 +// MCDC-DAG: %[[BOOL:tobool[0-9]*]] = icmp ne i32 %[[LAB1]], 0 +// MCDC-DAG: %[[LNOT:lnot[0-9]*]] = xor i1 %[[BOOL]] +// MCDC-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 +// MCDC-DAG: %[[LAB2:[0-9]+]] = zext i1 %[[LNOT]] to i32 +// MCDC-DAG: %[[LAB3:[0-9]+]] = shl i32 %[[LAB2]], 1 +// MCDC-DAG: %[[LAB4:[0-9]+]] = or i32 %[[TEMP]], %[[LAB3]] +// MCDC-DAG: store i32 %[[LAB4]], ptr %mcdc.addr, align 4 + +// SHIFT FOURTH CONDITION WITH ID = 4. +// MCDC: %[[LAB1:[0-9]+]] = load i32, ptr %d.addr, align 4 +// MCDC-DAG: %[[BOOL:tobool[0-9]*]] = icmp ne i32 %[[LAB1]], 0 +// MCDC-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 +// MCDC-DAG: %[[LAB2:[0-9]+]] = zext i1 %[[BOOL]] to i32 +// MCDC-DAG: %[[LAB3:[0-9]+]] = shl i32 %[[LAB2]], 4 +// MCDC-DAG: %[[LAB4:[0-9]+]] = or i32 %[[TEMP]], %[[LAB3]] +// MCDC-DAG: store i32 %[[LAB4]], ptr %mcdc.addr, align 4 + +// SHIFT FIFTH CONDITION WITH ID = 3. +// MCDC: %[[LAB1:[0-9]+]] = load i32, ptr %e.addr, align 4 +// MCDC-DAG: %[[BOOL:tobool[0-9]*]] = icmp ne i32 %[[LAB1]], 0 +// MCDC-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 +// MCDC-DAG: %[[LAB2:[0-9]+]] = zext i1 %[[BOOL]] to i32 +// MCDC-DAG: %[[LAB3:[0-9]+]] = shl i32 %[[LAB2]], 3 +// MCDC-DAG: %[[LAB4:[0-9]+]] = or i32 %[[TEMP]], %[[LAB3]] +// MCDC-DAG: store i32 %[[LAB4]], ptr %mcdc.addr, align 4 + +// SHIFT SIXTH CONDITION WITH ID = 5. +// MCDC: %[[LAB1:[0-9]+]] = load i32, ptr %f.addr, align 4 +// MCDC-DAG: %[[BOOL:tobool[0-9]*]] = icmp ne i32 %[[LAB1]], 0 +// MCDC-DAG: %[[LNOT:lnot[0-9]*]] = xor i1 %[[BOOL]] +// MCDC-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 +// MCDC-DAG: %[[LAB2:[0-9]+]] = zext i1 %[[LNOT]] to i32 +// MCDC-DAG: %[[LAB3:[0-9]+]] = shl i32 %[[LAB2]], 5 +// MCDC-DAG: %[[LAB4:[0-9]+]] = or i32 %[[TEMP]], %[[LAB3]] +// MCDC-DAG: store i32 %[[LAB4]], ptr %mcdc.addr, align 4 + +// UPDATE FINAL BITMASK WITH RESULT. 
+// MCDC-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 +// MCDC: %[[LAB1:[0-9]+]] = lshr i32 %[[TEMP]], 3 +// MCDC: %[[LAB2:[0-9]+]] = zext i32 %[[LAB1]] to i64 +// MCDC: %[[LAB3:[0-9]+]] = add i64 ptrtoint (ptr @__profbm_test to i64), %[[LAB2]] +// MCDC: %[[LAB4:[0-9]+]] = inttoptr i64 %[[LAB3]] to ptr +// MCDC: %[[LAB5:[0-9]+]] = and i32 %[[TEMP]], 7 +// MCDC: %[[LAB6:[0-9]+]] = trunc i32 %[[LAB5]] to i8 +// MCDC: %[[LAB7:[0-9]+]] = shl i8 1, %[[LAB6]] +// MCDC: %mcdc.bits = load i8, ptr %[[LAB4]], align 1 +// MCDC: %[[LAB8:[0-9]+]] = or i8 %mcdc.bits, %[[LAB7]] +// MCDC: store i8 %[[LAB8]], ptr %[[LAB4]], align 1 diff --git a/clang/test/Profile/c-mcdc.c b/clang/test/Profile/c-mcdc.c new file mode 100644 index 0000000000000..ac845d204853d --- /dev/null +++ b/clang/test/Profile/c-mcdc.c @@ -0,0 +1,102 @@ +// RUN: %clang_cc1 -triple %itanium_abi_triple %s -o - -emit-llvm -fprofile-instrument=clang -fcoverage-mapping -fcoverage-mcdc | FileCheck %s -check-prefix=MCDC +// RUN: %clang_cc1 -triple %itanium_abi_triple %s -o - -emit-llvm -fprofile-instrument=clang -fcoverage-mapping | FileCheck %s -check-prefix=NOMCDC +// RUN: %clang_cc1 -triple %itanium_abi_triple %s -o - -emit-llvm -fprofile-instrument=clang -fcoverage-mapping -fcoverage-mcdc -disable-llvm-passes | FileCheck %s -check-prefix=NOPROFPASS + +int test(int a, int b, int c, int d, int e, int f) { + return ((a && b) || ((c && d) || (e && f))); +} + +// NOMCDC-NOT: %mcdc.addr +// NOMCDC-NOT: __profbm_test +// NOPROFPASS-NOT: __profbm_test + +// MCDC BOOKKEEPING. +// MCDC: @__profbm_test = private global [8 x i8] zeroinitializer +// MCDC: @__profc_test = private global [9 x i64] zeroinitializer + +// ALLOCATE MCDC TEMP AND ZERO IT. +// NOPROFPASS-LABEL: @test( +// NOPROFPASS: call void @llvm.instrprof.mcdc.parameters(ptr @__profn_test, i64 [[HASH:[0-9]+]], i32 8) +// MCDC-LABEL: @test( +// MCDC: %mcdc.addr = alloca i32, align 4 +// MCDC: store i32 0, ptr %mcdc.addr, align 4 + +// SHIFT FIRST CONDITION WITH ID = 0. +// NOPROFPASS: call void @llvm.instrprof.mcdc.condbitmap.update(ptr @__profn_test, i64 [[HASH]], i32 0, ptr %mcdc.addr, i1 %tobool{{[0-9]*}}) +// MCDC: %[[LAB1:[0-9]+]] = load i32, ptr %a.addr, align 4 +// MCDC-DAG: %[[BOOL:tobool[0-9]*]] = icmp ne i32 %[[LAB1]], 0 +// MCDC-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 +// MCDC-DAG: %[[LAB2:[0-9]+]] = zext i1 %[[BOOL]] to i32 +// MCDC-DAG: %[[LAB3:[0-9]+]] = shl i32 %[[LAB2]], 0 +// MCDC-DAG: %[[LAB4:[0-9]+]] = or i32 %[[TEMP]], %[[LAB3]] +// MCDC-DAG: store i32 %[[LAB4]], ptr %mcdc.addr, align 4 + +// SHIFT SECOND CONDITION WITH ID = 2. +// NOPROFPASS-LABEL: land.lhs.true: +// NOPROFPASS: call void @llvm.instrprof.mcdc.condbitmap.update(ptr @__profn_test, i64 [[HASH]], i32 2, ptr %mcdc.addr, i1 %tobool{{[0-9]*}}) +// MCDC: %[[LAB1:[0-9]+]] = load i32, ptr %b.addr, align 4 +// MCDC-DAG: %[[BOOL:tobool[0-9]*]] = icmp ne i32 %[[LAB1]], 0 +// MCDC-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 +// MCDC-DAG: %[[LAB2:[0-9]+]] = zext i1 %[[BOOL]] to i32 +// MCDC-DAG: %[[LAB3:[0-9]+]] = shl i32 %[[LAB2]], 2 +// MCDC-DAG: %[[LAB4:[0-9]+]] = or i32 %[[TEMP]], %[[LAB3]] +// MCDC-DAG: store i32 %[[LAB4]], ptr %mcdc.addr, align 4 + +// SHIFT THIRD CONDITION WITH ID = 1. 
+// NOPROFPASS-LABEL: lor.rhs: +// NOPROFPASS: call void @llvm.instrprof.mcdc.condbitmap.update(ptr @__profn_test, i64 [[HASH]], i32 1, ptr %mcdc.addr, i1 %tobool{{[0-9]*}}) +// MCDC: %[[LAB1:[0-9]+]] = load i32, ptr %c.addr, align 4 +// MCDC-DAG: %[[BOOL:tobool[0-9]*]] = icmp ne i32 %[[LAB1]], 0 +// MCDC-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 +// MCDC-DAG: %[[LAB2:[0-9]+]] = zext i1 %[[BOOL]] to i32 +// MCDC-DAG: %[[LAB3:[0-9]+]] = shl i32 %[[LAB2]], 1 +// MCDC-DAG: %[[LAB4:[0-9]+]] = or i32 %[[TEMP]], %[[LAB3]] +// MCDC-DAG: store i32 %[[LAB4]], ptr %mcdc.addr, align 4 + +// SHIFT FOURTH CONDITION WITH ID = 4. +// NOPROFPASS-LABEL: land.lhs.true3: +// NOPROFPASS: call void @llvm.instrprof.mcdc.condbitmap.update(ptr @__profn_test, i64 [[HASH]], i32 4, ptr %mcdc.addr, i1 %tobool{{[0-9]*}}) +// MCDC: %[[LAB1:[0-9]+]] = load i32, ptr %d.addr, align 4 +// MCDC-DAG: %[[BOOL:tobool[0-9]*]] = icmp ne i32 %[[LAB1]], 0 +// MCDC-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 +// MCDC-DAG: %[[LAB2:[0-9]+]] = zext i1 %[[BOOL]] to i32 +// MCDC-DAG: %[[LAB3:[0-9]+]] = shl i32 %[[LAB2]], 4 +// MCDC-DAG: %[[LAB4:[0-9]+]] = or i32 %[[TEMP]], %[[LAB3]] +// MCDC-DAG: store i32 %[[LAB4]], ptr %mcdc.addr, align 4 + +// SHIFT FIFTH CONDITION WITH ID = 3. +// NOPROFPASS-LABEL: lor.rhs6: +// NOPROFPASS: call void @llvm.instrprof.mcdc.condbitmap.update(ptr @__profn_test, i64 [[HASH]], i32 3, ptr %mcdc.addr, i1 %tobool{{[0-9]*}}) +// MCDC: %[[LAB1:[0-9]+]] = load i32, ptr %e.addr, align 4 +// MCDC-DAG: %[[BOOL:tobool[0-9]*]] = icmp ne i32 %[[LAB1]], 0 +// MCDC-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 +// MCDC-DAG: %[[LAB2:[0-9]+]] = zext i1 %[[BOOL]] to i32 +// MCDC-DAG: %[[LAB3:[0-9]+]] = shl i32 %[[LAB2]], 3 +// MCDC-DAG: %[[LAB4:[0-9]+]] = or i32 %[[TEMP]], %[[LAB3]] +// MCDC-DAG: store i32 %[[LAB4]], ptr %mcdc.addr, align 4 + +// SHIFT SIXTH CONDITION WITH ID = 5. +// NOPROFPASS-LABEL: land.rhs: +// NOPROFPASS: call void @llvm.instrprof.mcdc.condbitmap.update(ptr @__profn_test, i64 [[HASH]], i32 5, ptr %mcdc.addr, i1 %tobool{{[0-9]*}}) +// MCDC: %[[LAB1:[0-9]+]] = load i32, ptr %f.addr, align 4 +// MCDC-DAG: %[[BOOL:tobool[0-9]*]] = icmp ne i32 %[[LAB1]], 0 +// MCDC-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 +// MCDC-DAG: %[[LAB2:[0-9]+]] = zext i1 %[[BOOL]] to i32 +// MCDC-DAG: %[[LAB3:[0-9]+]] = shl i32 %[[LAB2]], 5 +// MCDC-DAG: %[[LAB4:[0-9]+]] = or i32 %[[TEMP]], %[[LAB3]] +// MCDC-DAG: store i32 %[[LAB4]], ptr %mcdc.addr, align 4 + +// UPDATE FINAL BITMASK WITH RESULT. 
+// NOPROFPASS-LABEL: lor.end:
+// NOPROFPASS: call void @llvm.instrprof.mcdc.tvbitmap.update(ptr @__profn_test, i64 [[HASH]], i32 8, i32 0, ptr %mcdc.addr)
+// MCDC-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4
+// MCDC: %[[LAB1:[0-9]+]] = lshr i32 %[[TEMP]], 3
+// MCDC: %[[LAB2:[0-9]+]] = zext i32 %[[LAB1]] to i64
+// MCDC: %[[LAB3:[0-9]+]] = add i64 ptrtoint (ptr @__profbm_test to i64), %[[LAB2]]
+// MCDC: %[[LAB4:[0-9]+]] = inttoptr i64 %[[LAB3]] to ptr
+// MCDC: %[[LAB5:[0-9]+]] = and i32 %[[TEMP]], 7
+// MCDC: %[[LAB6:[0-9]+]] = trunc i32 %[[LAB5]] to i8
+// MCDC: %[[LAB7:[0-9]+]] = shl i8 1, %[[LAB6]]
+// MCDC: %mcdc.bits = load i8, ptr %[[LAB4]], align 1
+// MCDC: %[[LAB8:[0-9]+]] = or i8 %mcdc.bits, %[[LAB7]]
+// MCDC: store i8 %[[LAB8]], ptr %[[LAB4]], align 1
diff --git a/compiler-rt/test/profile/ContinuousSyncMode/image-with-mcdc.c b/compiler-rt/test/profile/ContinuousSyncMode/image-with-mcdc.c
new file mode 100644
index 0000000000000..748af46ee52fa
--- /dev/null
+++ b/compiler-rt/test/profile/ContinuousSyncMode/image-with-mcdc.c
@@ -0,0 +1,26 @@
+// REQUIRES: darwin
+
+// RUN: %clang_profgen -fcoverage-mapping -fcoverage-mcdc -O3 -o %t.exe %s
+// RUN: env LLVM_PROFILE_FILE="%c%t.profraw" %run %t.exe 3 3
+// RUN: llvm-profdata show --text --all-functions %t.profraw | FileCheck %s
+
+// CHECK: Num Bitmap Bytes:
+// CHECK-NEXT: $1
+// CHECK-NEXT: Bitmap Byte Values:
+// CHECK-NEXT: 8
+#include <stdio.h>
+#include <stdlib.h>
+extern int __llvm_profile_is_continuous_mode_enabled(void);
+int main(int argc, char *const argv[]) {
+  if (!__llvm_profile_is_continuous_mode_enabled())
+    return 1;
+
+  if (argc < 3)
+    return 1;
+
+  if ((atoi(argv[1]) > 2) && (atoi(argv[2]) > 2)) {
+    printf("Decision Satisfied");
+  }
+
+  return 0;
+}

From 0b45c7722c7661063176f4dcd108ac4b6242fd34 Mon Sep 17 00:00:00 2001
From: madanial0 <118996571+madanial0@users.noreply.github.com>
Date: Thu, 4 Jan 2024 14:18:43 -0500
Subject: [PATCH 267/313] [Flang] make ppc unsupported for x86_64 test case (NFC) (#73903)

The test case is for x86_64 only; mark PowerPC (AIX) as unsupported.

Co-authored-by: Mark Danial
---
 flang/test/Semantics/kinds04_q10.f90 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang/test/Semantics/kinds04_q10.f90 b/flang/test/Semantics/kinds04_q10.f90
index fbde6ed4c8cc1..3da619d24deec 100644
--- a/flang/test/Semantics/kinds04_q10.f90
+++ b/flang/test/Semantics/kinds04_q10.f90
@@ -7,7 +7,7 @@
 !
 ! This test is for x86_64, where exponent-letter 'q' is for
 ! 10-byte extended precision
-! UNSUPPORTED: system-windows
+! UNSUPPORTED: system-windows, system-aix
 ! REQUIRES: x86-registered-target
 
 subroutine s(var)

From 05216544a34eaf7aabf45df5f64d1c6c3e4e06c6 Mon Sep 17 00:00:00 2001
From: Tom Stellard
Date: Thu, 4 Jan 2024 11:25:09 -0800
Subject: [PATCH 268/313] [llvm] Add support for running tests as root (#75285)

There are a few tests that check access permissions, so they need to be
disabled when running the tests as root. The most common use case for
running tests as root is inside of a container. GitHub Actions, for
example, only supports running the root user inside of containers, so
this change is necessary in order to run the tests inside of a
container running in the GitHub Actions environment.
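For illustration only (this sketch is not part of the patch): a permission-sensitive
test is expected to opt out via the new lit feature roughly as below. The specific
tool invocation and the exact error text are placeholder assumptions, not taken from
any of the tests touched here.

    # Hypothetical lit test, skipped automatically for the root user, because a
    # chmod-based "unreadable" setup has no effect when running as root.
    # REQUIRES: non-root-user
    # RUN: rm -rf %t && mkdir -p %t && echo data > %t/secret.txt
    # RUN: chmod 0000 %t/secret.txt
    # RUN: not llvm-ar rc %t/out.a %t/secret.txt 2>&1 | FileCheck %s
    # CHECK: {{[Pp]ermission denied}}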
--- .../tools/llvm-ar/error-opening-permission.test | 1 + llvm/test/tools/llvm-dwarfdump/X86/output.s | 1 + llvm/test/tools/llvm-ifs/fail-file-write.test | 1 + .../llvm-ranlib/error-opening-permission.test | 1 + llvm/utils/lit/lit/llvm/config.py | 14 ++++++++++++++ 5 files changed, 18 insertions(+) diff --git a/llvm/test/tools/llvm-ar/error-opening-permission.test b/llvm/test/tools/llvm-ar/error-opening-permission.test index 4107bdfc044fe..b42f95329a3c7 100644 --- a/llvm/test/tools/llvm-ar/error-opening-permission.test +++ b/llvm/test/tools/llvm-ar/error-opening-permission.test @@ -1,6 +1,7 @@ ## Unsupported on windows as marking files "unreadable" ## is non-trivial on windows. # UNSUPPORTED: system-windows +# REQUIRES: non-root-user # RUN: rm -rf %t && mkdir -p %t # RUN: echo file1 > %t/1.txt diff --git a/llvm/test/tools/llvm-dwarfdump/X86/output.s b/llvm/test/tools/llvm-dwarfdump/X86/output.s index 37132eb55ca55..e7c9234ed74cf 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/output.s +++ b/llvm/test/tools/llvm-dwarfdump/X86/output.s @@ -1,3 +1,4 @@ +# REQUIRES: non-root-user # RUN: rm -f %t1.txt %t2.txt %t3.txt # RUN: llvm-mc %S/brief.s -filetype obj -triple x86_64-apple-darwin -o %t.o diff --git a/llvm/test/tools/llvm-ifs/fail-file-write.test b/llvm/test/tools/llvm-ifs/fail-file-write.test index d5232070c1d03..f13500f226205 100644 --- a/llvm/test/tools/llvm-ifs/fail-file-write.test +++ b/llvm/test/tools/llvm-ifs/fail-file-write.test @@ -1,6 +1,7 @@ ## Test failing to write output file on non-windows platforms. # UNSUPPORTED: system-windows +# REQUIRES: non-root-user # RUN: rm -rf %t.TestDir # RUN: mkdir %t.TestDir # RUN: touch %t.TestDir/Output.TestFile diff --git a/llvm/test/tools/llvm-ranlib/error-opening-permission.test b/llvm/test/tools/llvm-ranlib/error-opening-permission.test index 1b1bb0def78d7..be56962112e6b 100644 --- a/llvm/test/tools/llvm-ranlib/error-opening-permission.test +++ b/llvm/test/tools/llvm-ranlib/error-opening-permission.test @@ -1,5 +1,6 @@ ## Unsupported on windows as marking files "unreadable" is non-trivial on windows. # UNSUPPORTED: system-windows +# REQUIRES: non-root-user # RUN: rm -rf %t && split-file %s %t && cd %t # RUN: yaml2obj 1.yaml -o 1.o diff --git a/llvm/utils/lit/lit/llvm/config.py b/llvm/utils/lit/lit/llvm/config.py index 79094b839e772..8b0f2e6295ad1 100644 --- a/llvm/utils/lit/lit/llvm/config.py +++ b/llvm/utils/lit/lit/llvm/config.py @@ -13,6 +13,17 @@ lit_path_displayed = False +def user_is_root(): + # os.getuid() is not available on all platforms + try: + if os.getuid() == 0: + return True + except: + pass + + return False + + class LLVMConfig(object): def __init__(self, lit_config, config): self.lit_config = lit_config @@ -154,6 +165,9 @@ def __init__(self, lit_config, config): if re.match(r'^ppc64le.*-linux', target_triple): features.add('target=powerpc64le-linux') + if not user_is_root(): + features.add("non-root-user") + use_gmalloc = lit_config.params.get("use_gmalloc", None) if lit.util.pythonize_bool(use_gmalloc): # Allow use of an explicit path for gmalloc library. From 852596d804a1b07c01e30f2091acf9c8bf16bda4 Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Thu, 4 Jan 2024 11:29:00 -0800 Subject: [PATCH 269/313] [BasicAA] Guess reasonable contexts for separate storage hints (#76770) The definition of the pointer of the memory location being queried is always one such context. Even this conservative guess can be better than no guess at all in some cases. 
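For illustration only (not part of this change), the kind of source-level hint this
affects can be written with Clang's __builtin_assume_separate_storage, which, as far
as I understand, is lowered to an llvm.assume call carrying a "separate_storage"
operand bundle; the function below and its body are invented for the example.

    void scale_add(float *x, float *y, int n) {
      __builtin_assume_separate_storage(x, y);
      for (int i = 0; i < n; ++i)
        x[i] += 2.0f * y[i]; /* with a guessed context, BasicAA can report
                                NoAlias for x[i] and y[i] even for clients
                                such as the AliasSetTracker that pass no
                                context instruction */
    }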
Fixes #64666

Co-authored-by: David Goldblatt
---
 llvm/include/llvm/Analysis/ValueTracking.h    |  9 +++-
 llvm/lib/Analysis/BasicAliasAnalysis.cpp      | 30 ++++++++++---
 llvm/lib/Analysis/ValueTracking.cpp           |  7 +--
 .../BasicAA/separate_storage-alias-sets.ll    | 43 +++++++++++++++++++
 4 files changed, 79 insertions(+), 10 deletions(-)
 create mode 100644 llvm/test/Analysis/BasicAA/separate_storage-alias-sets.ll

diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index baa16306ebf5d..7360edfce1f39 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -810,9 +810,14 @@ bool isAssumeLikeIntrinsic(const Instruction *I);
 
 /// Return true if it is valid to use the assumptions provided by an
 /// assume intrinsic, I, at the point in the control-flow identified by the
-/// context instruction, CxtI.
+/// context instruction, CxtI. By default, ephemeral values of the assumption
+/// are treated as an invalid context, to prevent the assumption from being used
+/// to optimize away its argument. If the caller can ensure that this won't
+/// happen, it can call with AllowEphemerals set to true to get more valid
+/// assumptions.
 bool isValidAssumeForContext(const Instruction *I, const Instruction *CxtI,
-                             const DominatorTree *DT = nullptr);
+                             const DominatorTree *DT = nullptr,
+                             bool AllowEphemerals = false);
 
 enum class OverflowResult {
   /// Always overflows in the direction of signed/unsigned min value.
diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
index 9eb7e914687ce..a4a0846df7af1 100644
--- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
@@ -1543,7 +1543,7 @@ AliasResult BasicAAResult::aliasCheck(const Value *V1, LocationSize V1Size,
                      TLI, NullIsValidLocation)))
     return AliasResult::NoAlias;
 
-  if (CtxI && EnableSeparateStorageAnalysis) {
+  if (EnableSeparateStorageAnalysis) {
     for (AssumptionCache::ResultElem &Elem : AC.assumptionsFor(O1)) {
       if (!Elem || Elem.Index == AssumptionCache::ExprResultIdx)
         continue;
@@ -1559,10 +1559,30 @@ AliasResult BasicAAResult::aliasCheck(const Value *V1, LocationSize V1Size,
         const Value *HintO1 = getUnderlyingObject(Hint1);
         const Value *HintO2 = getUnderlyingObject(Hint2);
 
-        if (((O1 == HintO1 && O2 == HintO2) ||
-             (O1 == HintO2 && O2 == HintO1)) &&
-            isValidAssumeForContext(Assume, CtxI, DT))
-          return AliasResult::NoAlias;
+        auto ValidAssumeForPtrContext = [&](const Value *Ptr) {
+          if (const Instruction *PtrI = dyn_cast<Instruction>(Ptr)) {
+            return isValidAssumeForContext(Assume, PtrI, DT,
+                                           /* AllowEphemerals */ true);
+          }
+          if (const Argument *PtrA = dyn_cast<Argument>(Ptr)) {
+            const Instruction *FirstI =
+                &*PtrA->getParent()->getEntryBlock().begin();
+            return isValidAssumeForContext(Assume, FirstI, DT,
+                                           /* AllowEphemerals */ true);
+          }
+          return false;
+        };
+
+        if ((O1 == HintO1 && O2 == HintO2) || (O1 == HintO2 && O2 == HintO1)) {
+          // Note that we go back to V1 and V2 for the
+          // ValidAssumeForPtrContext checks; they're dominated by O1 and O2,
+          // so strictly more assumptions are valid for them.
+ if ((CtxI && isValidAssumeForContext(Assume, CtxI, DT, + /* AllowEphemerals */ true)) || + ValidAssumeForPtrContext(V1) || ValidAssumeForPtrContext(V2)) { + return AliasResult::NoAlias; + } + } } } } diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 439127e5c9540..940ae9eb7ee29 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -485,7 +485,8 @@ bool llvm::isAssumeLikeIntrinsic(const Instruction *I) { bool llvm::isValidAssumeForContext(const Instruction *Inv, const Instruction *CxtI, - const DominatorTree *DT) { + const DominatorTree *DT, + bool AllowEphemerals) { // There are two restrictions on the use of an assume: // 1. The assume must dominate the context (or the control flow must // reach the assume whenever it reaches the context). @@ -503,7 +504,7 @@ bool llvm::isValidAssumeForContext(const Instruction *Inv, // Don't let an assume affect itself - this would cause the problems // `isEphemeralValueOf` is trying to prevent, and it would also make // the loop below go out of bounds. - if (Inv == CxtI) + if (!AllowEphemerals && Inv == CxtI) return false; // The context comes first, but they're both in the same block. @@ -516,7 +517,7 @@ bool llvm::isValidAssumeForContext(const Instruction *Inv, if (!isGuaranteedToTransferExecutionToSuccessor(Range, 15)) return false; - return !isEphemeralValueOf(Inv, CxtI); + return AllowEphemerals || !isEphemeralValueOf(Inv, CxtI); } // Inv and CxtI are in different blocks. diff --git a/llvm/test/Analysis/BasicAA/separate_storage-alias-sets.ll b/llvm/test/Analysis/BasicAA/separate_storage-alias-sets.ll new file mode 100644 index 0000000000000..7bd87d1b227b3 --- /dev/null +++ b/llvm/test/Analysis/BasicAA/separate_storage-alias-sets.ll @@ -0,0 +1,43 @@ +; We want BasicAA to make a reasonable conservative guess as to context for +; separate storage hints. This lets alias analysis users (such as the alias set +; tracker) who can't be context-sensitive still get the benefits of hints. 
+ +; RUN: opt < %s -basic-aa-separate-storage -S -passes=print-alias-sets 2>&1 | FileCheck %s + +declare void @llvm.assume(i1) + +; CHECK-LABEL: Alias sets for function 'arg_arg' +; CHECK: AliasSet[{{.*}}, 1] must alias, Ref Pointers: (ptr %a1, LocationSize::precise(1)) +; CHECK: AliasSet[{{.*}}, 1] must alias, Ref Pointers: (ptr %a2, LocationSize::precise(1)) +define void @arg_arg(ptr %a1, ptr %a2) { +entry: + call void @llvm.assume(i1 true) [ "separate_storage"(ptr %a1, ptr %a2) ] + %0 = load i8, ptr %a1 + %1 = load i8, ptr %a2 + ret void +} + +; CHECK-LABEL: Alias sets for function 'arg_inst' +; CHECK: AliasSet[{{.*}}, 1] must alias, Ref Pointers: (ptr %a1, LocationSize::precise(1)) +; CHECK: AliasSet[{{.*}}, 1] must alias, Ref Pointers: (ptr %0, LocationSize::precise(1)) +define void @arg_inst(ptr %a1, ptr %a2) { +entry: + %0 = getelementptr inbounds i8, ptr %a2, i64 20 + call void @llvm.assume(i1 true) [ "separate_storage"(ptr %a1, ptr %0) ] + %1 = load i8, ptr %a1 + %2 = load i8, ptr %0 + ret void +} + +; CHECK-LABEL: Alias sets for function 'inst_inst' +; CHECK: AliasSet[{{.*}}, 1] must alias, Ref Pointers: (ptr %0, LocationSize::precise(1)) +; CHECK: AliasSet[{{.*}}, 1] must alias, Ref Pointers: (ptr %1, LocationSize::precise(1)) +define void @inst_inst(ptr %a1, ptr %a2) { +entry: + %0 = getelementptr inbounds i8, ptr %a1, i64 20 + %1 = getelementptr inbounds i8, ptr %a2, i64 20 + call void @llvm.assume(i1 true) [ "separate_storage"(ptr %0, ptr %1) ] + %2 = load i8, ptr %0 + %3 = load i8, ptr %1 + ret void +} From 8f40783944fcdd33a2ad134be0d26a671202fd85 Mon Sep 17 00:00:00 2001 From: Felipe de Azevedo Piovezan Date: Thu, 4 Jan 2024 16:30:49 -0300 Subject: [PATCH 270/313] [lldb][nfc] Mark function as const (#76974) This function has no mutable behavior --- lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp | 2 +- lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp index 7c253553d57b4..b718f98340a70 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp @@ -48,7 +48,7 @@ DebugNamesDWARFIndex::GetUnits(const DebugNames &debug_names) { } std::optional -DebugNamesDWARFIndex::ToDIERef(const DebugNames::Entry &entry) { +DebugNamesDWARFIndex::ToDIERef(const DebugNames::Entry &entry) const { // Look for a DWARF unit offset (CU offset or local TU offset) as they are // both offsets into the .debug_info section. 
std::optional unit_offset = entry.getCUOffset(); diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.h b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.h index 7ce630a56137d..cca0913c4124c 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.h @@ -79,7 +79,7 @@ class DebugNamesDWARFIndex : public DWARFIndex { std::unique_ptr m_debug_names_up; ManualDWARFIndex m_fallback; - std::optional ToDIERef(const DebugNames::Entry &entry); + std::optional ToDIERef(const DebugNames::Entry &entry) const; bool ProcessEntry(const DebugNames::Entry &entry, llvm::function_ref callback); From 166bd4e1f18da221621953bd5943c1a8d17201fe Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Thu, 4 Jan 2024 11:30:57 -0800 Subject: [PATCH 271/313] [workflows] Build a container for running CI on github actions (#75286) Using a container will allow us to have similar testing environments on both the GitHub hosted runners and the self-hosted runners. --- .github/workflows/build-ci-container.yml | 60 +++++++++++++++++++ .../containers/github-action-ci/Dockerfile | 48 +++++++++++++++ 2 files changed, 108 insertions(+) create mode 100644 .github/workflows/build-ci-container.yml create mode 100644 .github/workflows/containers/github-action-ci/Dockerfile diff --git a/.github/workflows/build-ci-container.yml b/.github/workflows/build-ci-container.yml new file mode 100644 index 0000000000000..ad3d50d4d578a --- /dev/null +++ b/.github/workflows/build-ci-container.yml @@ -0,0 +1,60 @@ + +name: Build CI Container + +permissions: + contents: read + +on: + push: + branches: + - main + paths: + - .github/workflows/build-ci-container.yml + - '.github/workflows/containers/github-action-ci/**' + pull_request: + branches: + - main + paths: + - .github/workflows/build-ci-container.yml + - '.github/workflows/containers/github-action-ci/**' + +jobs: + build-ci-container: + if: github.repository_owner == 'llvm' + runs-on: ubuntu-latest + permissions: + packages: write + steps: + - name: Write Variables + id: vars + run: | + tag=`date +%s` + container_name="ghcr.io/$GITHUB_REPOSITORY_OWNER/ci-ubuntu-22.04" + echo "container-name=$container_name" >> $GITHUB_OUTPUT + echo "container-name-tag=$container_name:$tag" >> $GITHUB_OUTPUT + + - name: Checkout LLVM + uses: actions/checkout@v4 + with: + sparse-checkout: .github/workflows/containers/github-action-ci/ + + - name: Build Container + working-directory: ./.github/workflows/containers/github-action-ci/ + run: | + podman build -t ${{ steps.vars.outputs.container-name-tag }} . 
+ podman tag ${{ steps.vars.outputs.container-name-tag }} ${{ steps.vars.outputs.container-name }}:latest + + - name: Test Container + run: | + for image in ${{ steps.vars.outputs.container-name-tag }} ${{ steps.vars.outputs.container-name }}; do + podman run --rm -it $image /usr/bin/bash -x -c 'printf '\''#include \nint main(int argc, char **argv) { std::cout << "Hello\\n"; }'\'' | clang++ -x c++ - && ./a.out | grep Hello' + done + + - name: Push Container + if: github.event_name == 'push' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + podman login -u ${{ github.actor }} -p $GITHUB_TOKEN ghcr.io + podman push ${{ steps.vars.outputs.container-name-tag }} + podman push ${{ steps.vars.outputs.container-name }}:latest diff --git a/.github/workflows/containers/github-action-ci/Dockerfile b/.github/workflows/containers/github-action-ci/Dockerfile new file mode 100644 index 0000000000000..d91a7ad3a9d06 --- /dev/null +++ b/.github/workflows/containers/github-action-ci/Dockerfile @@ -0,0 +1,48 @@ +FROM docker.io/library/ubuntu:22.04 as base +ENV LLVM_SYSROOT=/opt/llvm/ + +FROM base as toolchain +ENV LLVM_MAJOR=17 +ENV LLVM_VERSION=${LLVM_MAJOR}.0.6 +ENV LLVM_DIRNAME=clang+llvm-${LLVM_VERSION}-x86_64-linux-gnu-ubuntu-22.04 +ENV LLVM_FILENAME=${LLVM_DIRNAME}.tar.xz + +RUN apt-get update && \ + apt-get install -y \ + curl \ + xz-utils + +RUN mkdir -p $LLVM_SYSROOT/bin/ $LLVM_SYSROOT/lib/ + +RUN curl -O -L https://github.com/llvm/llvm-project/releases/download/llvmorg-$LLVM_VERSION/$LLVM_FILENAME + +RUN tar -C $LLVM_SYSROOT --strip-components=1 -xJf $LLVM_FILENAME \ + $LLVM_DIRNAME/bin/clang \ + $LLVM_DIRNAME/bin/clang++ \ + $LLVM_DIRNAME/bin/clang-cl \ + $LLVM_DIRNAME/bin/clang-$LLVM_MAJOR \ + $LLVM_DIRNAME/bin/lld \ + $LLVM_DIRNAME/bin/ld.lld \ + $LLVM_DIRNAME/lib/clang/ + + +FROM base + +COPY --from=toolchain $LLVM_SYSROOT $LLVM_SYSROOT + +# Need to install curl for hendrikmuhs/ccache-action +# Need nodejs for some of the GitHub actions. +# Need perl-modules for clang analyzer tests. +RUN apt-get update && \ + apt-get install -y \ + binutils \ + cmake \ + curl \ + libstdc++-11-dev \ + ninja-build \ + nodejs \ + perl-modules \ + python3-psutil + +ENV LLVM_SYSROOT=$LLVM_SYSROOT +ENV PATH=${LLVM_SYSROOT}/bin:${PATH} From 45651c357ae4e39a115bae6a936151b1741abf70 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 4 Jan 2024 11:56:32 -0800 Subject: [PATCH 272/313] [RISCV] Fix indentation in riscv_sifive_vector.td. 
NFC --- clang/include/clang/Basic/riscv_sifive_vector.td | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/clang/include/clang/Basic/riscv_sifive_vector.td b/clang/include/clang/Basic/riscv_sifive_vector.td index 0d471f6c554c2..ef5114d6105e4 100644 --- a/clang/include/clang/Basic/riscv_sifive_vector.td +++ b/clang/include/clang/Basic/riscv_sifive_vector.td @@ -121,11 +121,11 @@ multiclass RVVVQMACCDODBuiltinSet> suffixes_prototypes> { } multiclass RVVVQMACCQOQBuiltinSet> suffixes_prototypes> { - let OverloadedName = NAME, - Name = NAME, - HasMasked = false, - Log2LMUL = [-1, 0, 1, 2] in - defm NAME : RVVOutOp1Op2BuiltinSet; + let OverloadedName = NAME, + Name = NAME, + HasMasked = false, + Log2LMUL = [-1, 0, 1, 2] in + defm NAME : RVVOutOp1Op2BuiltinSet; } multiclass RVVVFNRCLIPBuiltinSet { From b5a3e9639291359d5c9e16f0610393d92bc7d4c2 Mon Sep 17 00:00:00 2001 From: ChipsSpectre Date: Thu, 4 Jan 2024 21:04:54 +0100 Subject: [PATCH 273/313] [Clang][Parser] Fix crash of clang when using C++ constructs like :: in C code (#74926) Ensure we do not try to parse a nested-name-specifier when parsing an ill-formed file in C mode. Fixes #73559 --- clang/docs/ReleaseNotes.rst | 6 ++++-- clang/lib/Parse/ParseDecl.cpp | 3 ++- clang/lib/Parse/ParseDeclCXX.cpp | 2 ++ clang/test/Parser/cxx-in-c.c | 3 +++ 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index e5de042cebd4c..ce7599ad34bea 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -686,9 +686,11 @@ Bug Fixes in This Version (`#65568 `_) - Fix an issue where clang doesn't respect detault template arguments that are added in a later redeclaration for CTAD. - Fixes (#69987 `_) + Fixes (`#69987 `_) - Fix an issue where CTAD fails for explicit type conversion. - Fixes (#64347 `_) + Fixes (`#64347 `_) +- Fix crash when using C++ only tokens like ``::`` in C compiler clang. + Fixes (`#73559 `_) Bug Fixes to Compiler Builtins diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index ed006f9d67de4..b60ae293ef8c2 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -3483,7 +3483,8 @@ void Parser::ParseDeclarationSpecifiers( case tok::coloncolon: // ::foo::bar // C++ scope specifier. Annotate and loop, or bail out on error. 
- if (TryAnnotateCXXScopeToken(EnteringContext)) { + if (getLangOpts().CPlusPlus && + TryAnnotateCXXScopeToken(EnteringContext)) { if (!DS.hasTypeSpecifier()) DS.SetTypeSpecError(); goto DoneWithDeclSpec; diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp index 910112ecae964..d97081da4200d 100644 --- a/clang/lib/Parse/ParseDeclCXX.cpp +++ b/clang/lib/Parse/ParseDeclCXX.cpp @@ -2679,6 +2679,8 @@ Parser::ParseCXXClassMemberDeclaration(AccessSpecifier AS, ParsedAttributes &AccessAttrs, const ParsedTemplateInfo &TemplateInfo, ParsingDeclRAIIObject *TemplateDiags) { + assert(getLangOpts().CPlusPlus && + "ParseCXXClassMemberDeclaration should only be called in C++ mode"); if (Tok.is(tok::at)) { if (getLangOpts().ObjC && NextToken().isObjCAtKeyword(tok::objc_defs)) Diag(Tok, diag::err_at_defs_cxx); diff --git a/clang/test/Parser/cxx-in-c.c b/clang/test/Parser/cxx-in-c.c index f5fa39bd0cb99..034a44cdf12bf 100644 --- a/clang/test/Parser/cxx-in-c.c +++ b/clang/test/Parser/cxx-in-c.c @@ -3,3 +3,6 @@ // PR9137 void f0(int x) : {}; // expected-error{{expected function body after function declarator}} void f1(int x) try {}; // expected-error{{expected function body after function declarator}} + +// GH73559 +::; // expected-error{{expected identifier or '('}} From 9d829784d4c2b4f82e0b301eaa840c9ed192e919 Mon Sep 17 00:00:00 2001 From: Jonathan Thackray Date: Thu, 4 Jan 2024 20:06:47 +0000 Subject: [PATCH 274/313] [AArch64] Correct features for Arm Cortex-A78C, Cortex-X1C and Cortex-X2 (#76932) Remove AArch64::AEK_FP16ML from Arm Cortex-A78C definition, as this is not supported, according to the Technical Reference Manual: https://developer.arm.com/documentation/102226/latest/ Also add AArch64::AEK_FLAGM (Flag Manipulation) to Arm Cortex-X1C and Arm Cortex-X2 as these were missing previously, but are supported, according to the Technical Reference Manuals: https://developer.arm.com/documentation/101968/latest/ https://developer.arm.com/documentation/101803/latest/ Fixes #62383 --- llvm/include/llvm/TargetParser/AArch64TargetParser.h | 7 +++---- llvm/lib/Target/AArch64/AArch64.td | 2 +- llvm/unittests/TargetParser/TargetParserTest.cpp | 6 +++--- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index 53dc2be825f28..2fe4d5eeb742c 100644 --- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -482,8 +482,7 @@ inline constexpr CpuInfo CpuInfos[] = { (AArch64::ExtensionBitset( {AArch64::AEK_AES, AArch64::AEK_SHA2, AArch64::AEK_FP16, AArch64::AEK_DOTPROD, AArch64::AEK_RCPC, AArch64::AEK_SSBS, - AArch64::AEK_PROFILE, AArch64::AEK_FLAGM, AArch64::AEK_PAUTH, - AArch64::AEK_FP16FML}))}, + AArch64::AEK_PROFILE, AArch64::AEK_FLAGM, AArch64::AEK_PAUTH}))}, {"cortex-a710", ARMV9A, (AArch64::ExtensionBitset( {AArch64::AEK_MTE, AArch64::AEK_PAUTH, AArch64::AEK_FLAGM, @@ -514,13 +513,13 @@ inline constexpr CpuInfo CpuInfos[] = { (AArch64::ExtensionBitset( {AArch64::AEK_AES, AArch64::AEK_SHA2, AArch64::AEK_FP16, AArch64::AEK_DOTPROD, AArch64::AEK_RCPC, AArch64::AEK_SSBS, - AArch64::AEK_PAUTH, AArch64::AEK_PROFILE}))}, + AArch64::AEK_PAUTH, AArch64::AEK_PROFILE, AArch64::AEK_FLAGM}))}, {"cortex-x2", ARMV9A, (AArch64::ExtensionBitset( {AArch64::AEK_MTE, AArch64::AEK_BF16, AArch64::AEK_I8MM, AArch64::AEK_PAUTH, AArch64::AEK_SSBS, AArch64::AEK_SB, AArch64::AEK_SVE, AArch64::AEK_SVE2, AArch64::AEK_SVE2BITPERM, - 
AArch64::AEK_FP16FML}))}, + AArch64::AEK_FP16FML, AArch64::AEK_FLAGM}))}, {"cortex-x3", ARMV9A, (AArch64::ExtensionBitset( {AArch64::AEK_SVE, AArch64::AEK_PERFMON, AArch64::AEK_PROFILE, diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 68f452039c9b6..d5e8ed101d1cd 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -1405,7 +1405,7 @@ def ProcessorFeatures { FeatureSSBS]; list A78C = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, FeatureNEON, FeatureFullFP16, FeatureDotProd, - FeatureFlagM, FeatureFP16FML, FeaturePAuth, + FeatureFlagM, FeaturePAuth, FeaturePerfMon, FeatureRCPC, FeatureSPE, FeatureSSBS]; list A710 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon, diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp index 92bd4da1d3a47..6b2cbb33ba8d0 100644 --- a/llvm/unittests/TargetParser/TargetParserTest.cpp +++ b/llvm/unittests/TargetParser/TargetParserTest.cpp @@ -1197,7 +1197,7 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_RAS, AArch64::AEK_LSE, AArch64::AEK_RDM, AArch64::AEK_FP16, AArch64::AEK_DOTPROD, AArch64::AEK_RCPC, AArch64::AEK_SSBS, AArch64::AEK_PROFILE, AArch64::AEK_FLAGM, - AArch64::AEK_PAUTH, AArch64::AEK_FP16FML})), + AArch64::AEK_PAUTH})), "8.2-A"), ARMCPUTestParams( "cortex-a710", "armv9-a", "neon-fp-armv8", @@ -1289,7 +1289,7 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_FP, AArch64::AEK_RDM, AArch64::AEK_SIMD, AArch64::AEK_RAS, AArch64::AEK_LSE, AArch64::AEK_FP16, AArch64::AEK_DOTPROD, AArch64::AEK_RCPC, AArch64::AEK_SSBS, - AArch64::AEK_PAUTH, AArch64::AEK_PROFILE})), + AArch64::AEK_PAUTH, AArch64::AEK_PROFILE, AArch64::AEK_FLAGM})), "8.2-A"), ARMCPUTestParams( "cortex-x2", "armv9-a", "neon-fp-armv8", @@ -1300,7 +1300,7 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_PAUTH, AArch64::AEK_I8MM, AArch64::AEK_BF16, AArch64::AEK_SVE, AArch64::AEK_SVE2, AArch64::AEK_SVE2BITPERM, AArch64::AEK_SSBS, AArch64::AEK_SB, AArch64::AEK_FP16, - AArch64::AEK_FP16FML})), + AArch64::AEK_FP16FML, AArch64::AEK_FLAGM})), "9-A"), ARMCPUTestParams( "cortex-x3", "armv9-a", "neon-fp-armv8", From cd3942059eed7b7185f26bc583ac287a995db0d0 Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Thu, 4 Jan 2024 14:08:30 -0600 Subject: [PATCH 275/313] [SeperateConstOffsetFromGEP] Pre-commit tests for or disjoint handling (#76972) 1. Adds tests for the existing interpretation of `or` as `add` in SeperateConstOffsetFromGEP. 2. Pre-commits a test for `or disjoint`. --- .../split-gep-or-as-add.ll | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 llvm/test/Transforms/SeparateConstOffsetFromGEP/split-gep-or-as-add.ll diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/split-gep-or-as-add.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/split-gep-or-as-add.ll new file mode 100644 index 0000000000000..45154f5a68f92 --- /dev/null +++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/split-gep-or-as-add.ll @@ -0,0 +1,61 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes="separate-const-offset-from-gep" < %s | FileCheck %s + +;; Check that or operations, either with operands with no bits in common or that +;; are disjoint are lowered into constant GEPs. Note that because this is a +;; target-independent test, the GEP seperator will lower the seperated-off constant +;; part to ptrtoint-based arithmetic. 
+ +define void @testOrDoesntSplit(ptr %p) { +; CHECK-LABEL: define void @testOrDoesntSplit( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[VAR:%.*]] = tail call i64 @foo() +; CHECK-NEXT: [[OFF:%.*]] = or i64 [[VAR]], 10 +; CHECK-NEXT: [[Q:%.*]] = getelementptr i8, ptr [[P]], i64 [[OFF]] +; CHECK-NEXT: store i8 0, ptr [[Q]], align 1 +; CHECK-NEXT: ret void +; + %var = tail call i64 @foo() + %off = or i64 %var, 10 + %q = getelementptr i8, ptr %p, i64 %off + store i8 0, ptr %q + ret void +} + +define void @testNoBitsInCommonOrSplits(ptr %p) { +; CHECK-LABEL: define void @testNoBitsInCommonOrSplits( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[VAR:%.*]] = tail call i64 @foo() +; CHECK-NEXT: [[VAR_HIGH:%.*]] = and i64 [[VAR]], -16 +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], [[VAR_HIGH]] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], 10 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: store i8 0, ptr [[TMP4]], align 1 +; CHECK-NEXT: ret void +; + %var = tail call i64 @foo() + %var.high = and i64 %var, -16 + %off = or i64 %var.high, 10 + %q = getelementptr i8, ptr %p, i64 %off + store i8 0, ptr %q + ret void +} + +define void @testDisjointOrSplits(ptr %p) { +; CHECK-LABEL: define void @testDisjointOrSplits( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[VAR:%.*]] = tail call i64 @foo() +; CHECK-NEXT: [[OFF:%.*]] = or disjoint i64 [[VAR]], 10 +; CHECK-NEXT: [[Q:%.*]] = getelementptr i8, ptr [[P]], i64 [[OFF]] +; CHECK-NEXT: store i8 0, ptr [[Q]], align 1 +; CHECK-NEXT: ret void +; + %var = tail call i64 @foo() + %off = or disjoint i64 %var, 10 + %q = getelementptr i8, ptr %p, i64 %off + store i8 0, ptr %q + ret void +} + +declare i64 @foo() From c398923f32ff5373964e1288d9de37403c2399dd Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Thu, 4 Jan 2024 12:14:32 -0800 Subject: [PATCH 276/313] [CMake][runtimes] Check LLVM_ENABLE_PROJECTS for libc (#76845) Only some targets may be building llvm-libc in which case the top-level LLVM_ENABLE_RUNTIMES variable won't contain libc. Rather, we can check LLVM_ENABLE_PROJECTS since libc is required to be included there for libc-hdrgen to be built (and when LLVM_ENABLE_RUNTIMES contains libc, we automatically include libc in LLVM_ENABLE_PROJECTS as well). --- llvm/runtimes/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt index db50b6d1e8f8c..b9acb862cc5cb 100644 --- a/llvm/runtimes/CMakeLists.txt +++ b/llvm/runtimes/CMakeLists.txt @@ -423,7 +423,7 @@ if(runtimes) endif() endforeach() endif() - if("libc" IN_LIST LLVM_ENABLE_RUNTIMES AND + if("libc" IN_LIST LLVM_ENABLE_PROJECTS AND (LLVM_LIBC_FULL_BUILD OR LIBC_GPU_BUILD OR LIBC_GPU_ARCHITECTURES)) if(TARGET libc-hdrgen) set(libc_tools libc-hdrgen) From 71c17424b5d5c22c0ce6b4c41acaa0401515baca Mon Sep 17 00:00:00 2001 From: "Oleksandr \"Alex\" Zinenko" Date: Thu, 4 Jan 2024 21:33:51 +0100 Subject: [PATCH 277/313] [mlir][TD] update more tests to use the "main" interpreter pass (#76963) Update several tests under mlir/test/Dialect/Transform to use the "main" transform interpreter pass with named entry points rather than the test interpreter pass. This helped discover a logic error in the expensive checks mechanism that was exiting too early. 
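For reference (this snippet is not part of the diff below), the named-entry-point
form that the updated tests converge on looks roughly like the following; the op
being matched is only a placeholder.

    module attributes {transform.with_named_sequence} {
      // The transform-interpreter pass looks up @__transform_main as its entry point.
      transform.named_sequence @__transform_main(%root: !transform.any_op) {
        %0 = transform.structured.match ops{["func.func"]} in %root
            : (!transform.any_op) -> !transform.any_op
        transform.yield
      }
    }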
--- .../Transform/IR/TransformInterfaces.cpp | 7 +- .../Transform/apply-foreach-nested.mlir | 26 +- .../Dialect/Transform/expensive-checks.mlir | 395 +-- .../Transform/selective-targeting.mlir | 104 +- .../Dialect/Transform/test-interpreter.mlir | 2127 +++++++++-------- 5 files changed, 1464 insertions(+), 1195 deletions(-) diff --git a/mlir/lib/Dialect/Transform/IR/TransformInterfaces.cpp b/mlir/lib/Dialect/Transform/IR/TransformInterfaces.cpp index cd66a0e566f6c..371ad904dcae5 100644 --- a/mlir/lib/Dialect/Transform/IR/TransformInterfaces.cpp +++ b/mlir/lib/Dialect/Transform/IR/TransformInterfaces.cpp @@ -629,9 +629,6 @@ void transform::TransformState::recordOpHandleInvalidation( // the IR nested in each payload op associated with the given handle and look // for handles associated with each operation and value. for (const auto &[region, mapping] : llvm::reverse(mappings)) { - // Stop lookup when reaching a region that is isolated from above. - if (region->getParentOp()->hasTrait()) - break; // Go over all op handle mappings and mark as invalidated any handle // pointing to any of the payload ops associated with the given handle or // any op nested in them. @@ -652,6 +649,10 @@ void transform::TransformState::recordOpHandleInvalidation( payloadValue, valueHandle, newlyInvalidated); } + + // Stop lookup when reaching a region that is isolated from above. + if (region->getParentOp()->hasTrait()) + break; } } diff --git a/mlir/test/Dialect/Transform/apply-foreach-nested.mlir b/mlir/test/Dialect/Transform/apply-foreach-nested.mlir index d2f71644bbc51..23f45b97bc2f8 100644 --- a/mlir/test/Dialect/Transform/apply-foreach-nested.mlir +++ b/mlir/test/Dialect/Transform/apply-foreach-nested.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s --split-input-file --verify-diagnostics \ -// RUN: --pass-pipeline="builtin.module(test-transform-dialect-interpreter{enable-expensive-checks=1 bind-first-extra-to-ops=scf.for})" +// RUN: --transform-interpreter func.func private @bar() @@ -17,12 +17,15 @@ func.func @foo() { return } -transform.sequence failures(suppress) { -^bb0(%arg0: !transform.any_op, %arg1: !transform.op<"scf.for">): - %1 = transform.test_reverse_payload_ops %arg1 : (!transform.op<"scf.for">) -> !transform.op<"scf.for"> - // expected-error @below {{transform operation consumes a handle pointing to an ancestor payload operation before its descendant}} - // expected-note @below {{the ancestor is likely erased or rewritten before the descendant is accessed, leading to undefined behavior}} - transform.test_consume_operand_each %1 : !transform.op<"scf.for"> +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %0 = transform.structured.match ops{["scf.for"]} in %arg0 : (!transform.any_op) -> !transform.op<"scf.for"> + %1 = transform.test_reverse_payload_ops %0 : (!transform.op<"scf.for">) -> !transform.op<"scf.for"> + // expected-error @below {{transform operation consumes a handle pointing to an ancestor payload operation before its descendant}} + // expected-note @below {{the ancestor is likely erased or rewritten before the descendant is accessed, leading to undefined behavior}} + transform.test_consume_operand_each %1 : !transform.op<"scf.for"> + transform.yield + } } // ----- @@ -42,7 +45,10 @@ func.func @foo() { } // No error here, processing ancestors before descendants. 
-transform.sequence failures(suppress) { -^bb0(%arg0: !transform.any_op, %arg1: !transform.op<"scf.for">): - transform.test_consume_operand_each %arg1 : !transform.op<"scf.for"> +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %0 = transform.structured.match ops{["scf.for"]} in %arg0 : (!transform.any_op) -> !transform.op<"scf.for"> + transform.test_consume_operand_each %0 : !transform.op<"scf.for"> + transform.yield + } } diff --git a/mlir/test/Dialect/Transform/expensive-checks.mlir b/mlir/test/Dialect/Transform/expensive-checks.mlir index ee9a5af805524..b532e1892b1ad 100644 --- a/mlir/test/Dialect/Transform/expensive-checks.mlir +++ b/mlir/test/Dialect/Transform/expensive-checks.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt --test-transform-dialect-interpreter='enable-expensive-checks=1' --split-input-file --verify-diagnostics %s +// RUN: mlir-opt --transform-interpreter --split-input-file --verify-diagnostics %s // expected-note @below {{ancestor payload op}} func.func @func() { @@ -6,24 +6,29 @@ func.func @func() { return } -transform.with_pdl_patterns { -^bb0(%arg0: !transform.any_op): - pdl.pattern @return : benefit(1) { - %0 = operands - %1 = types - %2 = operation "func.return"(%0 : !pdl.range) -> (%1 : !pdl.range) - rewrite %2 with "transform.dialect" - } - - sequence %arg0 : !transform.any_op failures(propagate) { - ^bb1(%arg1: !transform.any_op): - // expected-note @below {{handle to invalidated ops}} - %0 = pdl_match @return in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op - // expected-note @below {{invalidated by this transform op that consumes its operand #0}} - test_consume_operand %1 : !transform.any_op - // expected-error @below {{op uses a handle invalidated by a previously executed transform op}} - test_print_remark_at_operand %0, "remark" : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + ^bb0(%arg0: !transform.any_op): + pdl.pattern @return : benefit(1) { + %0 = operands + %1 = types + %2 = operation "func.return"(%0 : !pdl.range) -> (%1 : !pdl.range) + rewrite %2 with "transform.dialect" + } + + sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + // expected-note @below {{handle to invalidated ops}} + %0 = pdl_match @return in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op + // expected-note @below {{invalidated by this transform op that consumes its operand #0}} + test_consume_operand %1 : !transform.any_op + // expected-error @below {{op uses a handle invalidated by a previously executed transform op}} + test_print_remark_at_operand %0, "remark" : !transform.any_op + } + } + transform.yield } } @@ -35,29 +40,34 @@ func.func @func1() { } func.func private @func2() -transform.with_pdl_patterns { -^bb0(%arg0: !transform.any_op): - pdl.pattern @func : benefit(1) { - %0 = operands - %1 = types - %2 = operation "func.func"(%0 : !pdl.range) -> (%1 : !pdl.range) - rewrite %2 with "transform.dialect" - } - pdl.pattern @return : benefit(1) { - %0 = operands - %1 = types - %2 = operation "func.return"(%0 : !pdl.range) -> (%1 : !pdl.range) - rewrite %2 with "transform.dialect" - } - - sequence %arg0 : !transform.any_op 
failures(propagate) { - ^bb1(%arg1: !transform.any_op): - %0 = pdl_match @func in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = pdl_match @return in %arg1 : (!transform.any_op) -> !transform.any_op - %2 = replicate num(%0) %1 : !transform.any_op, !transform.any_op - // expected-error @below {{a handle passed as operand #0 and consumed by this operation points to a payload entity more than once}} - test_consume_operand %2 : !transform.any_op - test_print_remark_at_operand %0, "remark" : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + ^bb0(%arg0: !transform.any_op): + pdl.pattern @func : benefit(1) { + %0 = operands + %1 = types + %2 = operation "func.func"(%0 : !pdl.range) -> (%1 : !pdl.range) + rewrite %2 with "transform.dialect" + } + pdl.pattern @return : benefit(1) { + %0 = operands + %1 = types + %2 = operation "func.return"(%0 : !pdl.range) -> (%1 : !pdl.range) + rewrite %2 with "transform.dialect" + } + + sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + %0 = pdl_match @func in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = pdl_match @return in %arg1 : (!transform.any_op) -> !transform.any_op + %2 = replicate num(%0) %1 : !transform.any_op, !transform.any_op + // expected-error @below {{a handle passed as operand #0 and consumed by this operation points to a payload entity more than once}} + test_consume_operand %2 : !transform.any_op + test_print_remark_at_operand %0, "remark" : !transform.any_op + } + } + transform.yield } } @@ -66,10 +76,9 @@ transform.with_pdl_patterns { // expected-note @below {{ancestor payload op}} // expected-note @below {{nested payload op}} -module { +module attributes {transform.with_named_sequence} { - transform.sequence failures(propagate) { - ^bb0(%0: !transform.any_op): + transform.named_sequence @__transform_main(%0: !transform.any_op) { %1 = transform.test_copy_payload %0 : (!transform.any_op) -> !transform.any_op // expected-note @below {{handle to invalidated ops}} %2 = transform.test_copy_payload %0 : (!transform.any_op) ->!transform.any_op @@ -77,6 +86,7 @@ module { transform.test_consume_operand %1 : !transform.any_op // expected-error @below {{op uses a handle invalidated by a previously executed transform op}} transform.test_consume_operand %2 : !transform.any_op + transform.yield } } @@ -84,10 +94,9 @@ module { // expected-note @below {{ancestor payload op}} // expected-note @below {{nested payload op}} -module { +module attributes {transform.with_named_sequence} { - transform.sequence failures(propagate) { - ^bb0(%0: !transform.any_op): + transform.named_sequence @__transform_main(%0: !transform.any_op) { %1 = transform.test_copy_payload %0 : (!transform.any_op) -> !transform.any_op // expected-note @below {{handle to invalidated ops}} %2 = transform.test_copy_payload %0 : (!transform.any_op) -> !transform.any_op @@ -97,6 +106,7 @@ module { // expected-error @below {{op uses a handle invalidated by a previously executed transform op}} // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities}} transform.test_consume_operand %1, %2 : !transform.any_op, !transform.any_op + transform.yield } } @@ -104,13 +114,13 @@ module { // Deduplication attribute allows "merge_handles" to take repeated operands. 
-module { +module attributes {transform.with_named_sequence} { - transform.sequence failures(propagate) { - ^bb0(%0: !transform.any_op): + transform.named_sequence @__transform_main(%0: !transform.any_op) { %1 = transform.test_copy_payload %0 : (!transform.any_op) -> !transform.any_op %2 = transform.test_copy_payload %0 : (!transform.any_op) -> !transform.any_op transform.merge_handles %1, %2 { deduplicate } : !transform.any_op + transform.yield } } // ----- @@ -118,16 +128,18 @@ module { // expected-note @below {{payload value}} %0 = "test.match_anchor"() : () -> (i32) -transform.sequence failures(propagate) { -^bb1(%arg0: !transform.any_op): - %2 = transform.structured.match ops{["test.match_anchor"]} in %arg0 : (!transform.any_op) -> !transform.any_op - %3 = test_produce_value_handle_to_result %2, 0 : (!transform.any_op) -> !transform.any_value - // expected-note @below {{invalidated handle}} - %4 = test_produce_value_handle_to_result %2, 0 : (!transform.any_op) -> !transform.any_value - // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates handles to the same values as associated with it}} - test_consume_operand %3 : !transform.any_value - // expected-error @below {{op uses a handle invalidated by a previously executed transform op}} - test_consume_operand %4 : !transform.any_value +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %2 = transform.structured.match ops{["test.match_anchor"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %3 = transform.test_produce_value_handle_to_result %2, 0 : (!transform.any_op) -> !transform.any_value + // expected-note @below {{invalidated handle}} + %4 = transform.test_produce_value_handle_to_result %2, 0 : (!transform.any_op) -> !transform.any_value + // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates handles to the same values as associated with it}} + transform.test_consume_operand %3 : !transform.any_value + // expected-error @below {{op uses a handle invalidated by a previously executed transform op}} + transform.test_consume_operand %4 : !transform.any_value + transform.yield + } } // ----- @@ -137,15 +149,17 @@ transform.sequence failures(propagate) { // expected-note @below {{op defining the value as result #0}} %0 = "test.match_anchor"() : () -> (i32) -transform.sequence failures(propagate) { -^bb1(%arg0: !transform.any_op): - %2 = transform.structured.match ops{["test.match_anchor"]} in %arg0 : (!transform.any_op) -> !transform.any_op - // expected-note @below {{invalidated handle}} - %3 = test_produce_value_handle_to_result %2, 0 : (!transform.any_op) -> !transform.any_value - // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}} - test_consume_operand %2 : !transform.any_op - // expected-error @below {{op uses a handle invalidated by a previously executed transform op}} - test_consume_operand %3 : !transform.any_value +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %2 = transform.structured.match ops{["test.match_anchor"]} in %arg0 : (!transform.any_op) -> !transform.any_op + // expected-note @below {{invalidated handle}} + %3 = transform.test_produce_value_handle_to_result %2, 0 : (!transform.any_op) -> !transform.any_value + // 
expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}} + transform.test_consume_operand %2 : !transform.any_op + // expected-error @below {{op uses a handle invalidated by a previously executed transform op}} + transform.test_consume_operand %3 : !transform.any_value + transform.yield + } } // ----- @@ -159,16 +173,18 @@ transform.sequence failures(propagate) { "test.region_terminator"() : () -> () }) : () -> () -transform.sequence failures(propagate) { -^bb1(%arg0: !transform.any_op): - %1 = transform.structured.match ops{["test.match_anchor_1"]} in %arg0 : (!transform.any_op) -> !transform.any_op - %2 = transform.structured.match ops{["test.match_anchor_2"]} in %arg0 : (!transform.any_op) -> !transform.any_op - // expected-note @below {{invalidated handle}} - %3 = test_produce_value_handle_to_result %2, 0 : (!transform.any_op) -> !transform.any_value - // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}} - test_consume_operand %1 : !transform.any_op - // expected-error @below {{op uses a handle invalidated by a previously executed transform op}} - test_consume_operand %3 : !transform.any_value +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %1 = transform.structured.match ops{["test.match_anchor_1"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %2 = transform.structured.match ops{["test.match_anchor_2"]} in %arg0 : (!transform.any_op) -> !transform.any_op + // expected-note @below {{invalidated handle}} + %3 = transform.test_produce_value_handle_to_result %2, 0 : (!transform.any_op) -> !transform.any_value + // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}} + transform.test_consume_operand %1 : !transform.any_op + // expected-error @below {{op uses a handle invalidated by a previously executed transform op}} + transform.test_consume_operand %3 : !transform.any_value + transform.yield + } } // ----- @@ -182,16 +198,18 @@ transform.sequence failures(propagate) { "test.region_terminator"() : () -> () }) : () -> () -transform.sequence failures(propagate) { -^bb1(%arg0: !transform.any_op): - %1 = transform.structured.match ops{["test.match_anchor_1"]} in %arg0 : (!transform.any_op) -> !transform.any_op - %2 = transform.structured.match ops{["test.match_anchor_2"]} in %arg0 : (!transform.any_op) -> !transform.any_op - // expected-note @below {{invalidated handle}} - %3 = test_produce_value_handle_to_argument_of_parent_block %2, 0 : (!transform.any_op) -> !transform.any_value - // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}} - test_consume_operand %1 : !transform.any_op - // expected-error @below {{op uses a handle invalidated by a previously executed transform op}} - test_consume_operand %3 : !transform.any_value +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %1 = transform.structured.match ops{["test.match_anchor_1"]} in %arg0 : (!transform.any_op) -> 
!transform.any_op + %2 = transform.structured.match ops{["test.match_anchor_2"]} in %arg0 : (!transform.any_op) -> !transform.any_op + // expected-note @below {{invalidated handle}} + %3 = transform.test_produce_value_handle_to_argument_of_parent_block %2, 0 : (!transform.any_op) -> !transform.any_value + // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}} + transform.test_consume_operand %1 : !transform.any_op + // expected-error @below {{op uses a handle invalidated by a previously executed transform op}} + transform.test_consume_operand %3 : !transform.any_value + transform.yield + } } // ----- @@ -208,16 +226,18 @@ transform.sequence failures(propagate) { }): () -> () }) : () -> () -transform.sequence failures(propagate) { -^bb1(%arg0: !transform.any_op): - %1 = transform.structured.match ops{["test.match_anchor_1"]} in %arg0 : (!transform.any_op) -> !transform.any_op - %2 = transform.structured.match ops{["test.match_anchor_2"]} in %arg0 : (!transform.any_op) -> !transform.any_op - // expected-note @below {{invalidated handle}} - %3 = test_produce_value_handle_to_argument_of_parent_block %2, 0 : (!transform.any_op) -> !transform.any_value - // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}} - test_consume_operand %1 : !transform.any_op - // expected-error @below {{op uses a handle invalidated by a previously executed transform op}} - test_consume_operand %3 : !transform.any_value +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %1 = transform.structured.match ops{["test.match_anchor_1"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %2 = transform.structured.match ops{["test.match_anchor_2"]} in %arg0 : (!transform.any_op) -> !transform.any_op + // expected-note @below {{invalidated handle}} + %3 = transform.test_produce_value_handle_to_argument_of_parent_block %2, 0 : (!transform.any_op) -> !transform.any_value + // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}} + transform.test_consume_operand %1 : !transform.any_op + // expected-error @below {{op uses a handle invalidated by a previously executed transform op}} + transform.test_consume_operand %3 : !transform.any_value + transform.yield + } } // ----- @@ -227,15 +247,17 @@ transform.sequence failures(propagate) { // expected-note @below {{consumed handle points to this payload value}} %0 = "test.match_anchor"() : () -> (i32) -transform.sequence failures(propagate) { -^bb1(%arg0: !transform.any_op): - // expected-note @below {{handle to invalidated ops}} - %2 = transform.structured.match ops{["test.match_anchor"]} in %arg0 : (!transform.any_op) -> !transform.any_op - %3 = test_produce_value_handle_to_result %2, 0 : (!transform.any_op) -> !transform.any_value - // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}} - test_consume_operand %3 : !transform.any_value - // expected-error @below {{op uses a handle invalidated by a previously executed transform op}} - 
test_consume_operand %2 : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + // expected-note @below {{handle to invalidated ops}} + %2 = transform.structured.match ops{["test.match_anchor"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %3 = transform.test_produce_value_handle_to_result %2, 0 : (!transform.any_op) -> !transform.any_value + // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}} + transform.test_consume_operand %3 : !transform.any_value + // expected-error @below {{op uses a handle invalidated by a previously executed transform op}} + transform.test_consume_operand %2 : !transform.any_op + transform.yield + } } // ----- @@ -249,16 +271,18 @@ transform.sequence failures(propagate) { "test.region_terminator"() : () -> () }) : () -> (i32) -transform.sequence failures(propagate) { -^bb1(%arg0: !transform.any_op): - %1 = transform.structured.match ops{["test.match_anchor_1"]} in %arg0 : (!transform.any_op) -> !transform.any_op - // expected-note @below {{handle to invalidated ops}} - %2 = transform.structured.match ops{["test.match_anchor_2"]} in %arg0 : (!transform.any_op) -> !transform.any_op - %3 = test_produce_value_handle_to_result %1, 0 : (!transform.any_op) -> !transform.any_value - // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}} - test_consume_operand %3 : !transform.any_value - // expected-error @below {{op uses a handle invalidated by a previously executed transform op}} - test_consume_operand %2 : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %1 = transform.structured.match ops{["test.match_anchor_1"]} in %arg0 : (!transform.any_op) -> !transform.any_op + // expected-note @below {{handle to invalidated ops}} + %2 = transform.structured.match ops{["test.match_anchor_2"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %3 = transform.test_produce_value_handle_to_result %1, 0 : (!transform.any_op) -> !transform.any_value + // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}} + transform.test_consume_operand %3 : !transform.any_value + // expected-error @below {{op uses a handle invalidated by a previously executed transform op}} + transform.test_consume_operand %2 : !transform.any_op + transform.yield + } } @@ -273,15 +297,17 @@ transform.sequence failures(propagate) { "test.region_terminator"() : () -> () }) : () -> () -transform.sequence failures(propagate) { -^bb1(%arg0: !transform.any_op): - // expected-note @below {{handle to invalidated ops}} - %2 = transform.structured.match ops{["test.match_anchor_2"]} in %arg0 : (!transform.any_op) -> !transform.any_op - %3 = test_produce_value_handle_to_argument_of_parent_block %2, 0 : (!transform.any_op) -> !transform.any_value - // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}} - test_consume_operand %3 : !transform.any_value - // expected-error 
@below {{op uses a handle invalidated by a previously executed transform op}} - test_consume_operand %2 : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + // expected-note @below {{handle to invalidated ops}} + %2 = transform.structured.match ops{["test.match_anchor_2"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %3 = transform.test_produce_value_handle_to_argument_of_parent_block %2, 0 : (!transform.any_op) -> !transform.any_value + // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}} + transform.test_consume_operand %3 : !transform.any_value + // expected-error @below {{op uses a handle invalidated by a previously executed transform op}} + transform.test_consume_operand %2 : !transform.any_op + transform.yield + } } // ----- @@ -299,16 +325,18 @@ transform.sequence failures(propagate) { "test.match_anchor_1"() : () -> () }) : () -> () -transform.sequence failures(propagate) { -^bb1(%arg0: !transform.any_op): - %1 = transform.structured.match ops{["test.match_anchor_1"]} in %arg0 : (!transform.any_op) -> !transform.any_op - // expected-note @below {{handle to invalidated ops}} - %2 = transform.structured.match ops{["test.match_anchor_2"]} in %arg0 : (!transform.any_op) -> !transform.any_op - %3 = test_produce_value_handle_to_argument_of_parent_block %1, 0 : (!transform.any_op) -> !transform.any_value - // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}} - test_consume_operand %3 : !transform.any_value - // expected-error @below {{op uses a handle invalidated by a previously executed transform op}} - test_consume_operand %2 : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %1 = transform.structured.match ops{["test.match_anchor_1"]} in %arg0 : (!transform.any_op) -> !transform.any_op + // expected-note @below {{handle to invalidated ops}} + %2 = transform.structured.match ops{["test.match_anchor_2"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %3 = transform.test_produce_value_handle_to_argument_of_parent_block %1, 0 : (!transform.any_op) -> !transform.any_value + // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}} + transform.test_consume_operand %3 : !transform.any_value + // expected-error @below {{op uses a handle invalidated by a previously executed transform op}} + transform.test_consume_operand %2 : !transform.any_op + transform.yield + } } // ----- @@ -323,24 +351,28 @@ transform.sequence failures(propagate) { "test.match_anchor_2"() : () -> () }) : () -> () -transform.sequence failures(propagate) { -^bb1(%arg0: !transform.any_op): - %1 = transform.structured.match ops{["test.match_anchor_1"]} in %arg0 : (!transform.any_op) -> !transform.any_op - %2 = transform.structured.match ops{["test.match_anchor_2"]} in %arg0 : (!transform.any_op) -> !transform.any_op - %3 = test_produce_value_handle_to_argument_of_parent_block %1, 0 : (!transform.any_op) -> !transform.any_value - test_consume_operand %3 : !transform.any_value - test_consume_operand %2 
: !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %1 = transform.structured.match ops{["test.match_anchor_1"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %2 = transform.structured.match ops{["test.match_anchor_2"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %3 = transform.test_produce_value_handle_to_argument_of_parent_block %1, 0 : (!transform.any_op) -> !transform.any_value + transform.test_consume_operand %3 : !transform.any_value + transform.test_consume_operand %2 : !transform.any_op + transform.yield + } } // ----- -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - %0 = transform.test_produce_empty_payload : !transform.any_op - // expected-note @below {{invalidated by this transform op that consumes its operand #0}} - transform.test_consume_operand %0 : !transform.any_op - // expected-error @below {{uses a handle associated with empty payload and invalidated by a previously executed transform op}} - transform.test_print_remark_at_operand %0, "remark" : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %0 = transform.test_produce_empty_payload : !transform.any_op + // expected-note @below {{invalidated by this transform op that consumes its operand #0}} + transform.test_consume_operand %0 : !transform.any_op + // expected-error @below {{uses a handle associated with empty payload and invalidated by a previously executed transform op}} + transform.test_print_remark_at_operand %0, "remark" : !transform.any_op + transform.yield + } } // ----- @@ -352,9 +384,8 @@ transform.sequence failures(propagate) { // invalidate the handle to the root module thus invalidating all other handles. // expected-note @below {{ancestor payload op}} -module { - transform.sequence failures(propagate) { - ^bb0(%arg0: !transform.any_op): +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { // expected-note @below {{handle to invalidated ops}} // expected-note @below {{nested payload op}} %0 = transform.test_produce_self_handle_or_forward_operand : () -> !transform.any_op @@ -362,6 +393,7 @@ module { transform.test_consume_operand %arg0 : !transform.any_op // expected-error @below {{uses a handle invalidated by a previously executed transform op}} transform.test_consume_operand %0 { allow_repeated_handles } : !transform.any_op + transform.yield } } @@ -370,11 +402,13 @@ module { // Re-entering the region should not trigger the consumption error from previous // execution of the region. -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - transform.test_re_enter_region { - %0 = transform.test_produce_self_handle_or_forward_operand : () -> !transform.any_op - transform.test_consume_operand %0 : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + transform.test_re_enter_region { + %0 = transform.test_produce_self_handle_or_forward_operand : () -> !transform.any_op + transform.test_consume_operand %0 : !transform.any_op + transform.yield + } transform.yield } } @@ -384,12 +418,14 @@ transform.sequence failures(propagate) { // Re-entering the region should not trigger the consumption error from previous // execution of the region. 
-transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - %0 = transform.test_produce_self_handle_or_forward_operand : () -> !transform.any_op - transform.test_re_enter_region %0 : !transform.any_op { - ^bb0(%arg1: !transform.any_op): - transform.test_consume_operand %arg1 : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %0 = transform.test_produce_self_handle_or_forward_operand : () -> !transform.any_op + transform.test_re_enter_region %0 : !transform.any_op { + ^bb0(%arg1: !transform.any_op): + transform.test_consume_operand %arg1 : !transform.any_op + transform.yield + } transform.yield } } @@ -397,16 +433,17 @@ transform.sequence failures(propagate) { // ----- // Consuming the same handle repeatedly in the region should trigger an error. - -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - // expected-note @below {{payload op}} - // expected-note @below {{handle to invalidated ops}} - %0 = transform.test_produce_self_handle_or_forward_operand : () -> !transform.any_op - transform.test_re_enter_region { - // expected-error @below {{op uses a handle invalidated by a previously executed transform op}} - // expected-note @below {{invalidated by this transform op}} - transform.test_consume_operand %0 : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + // expected-note @below {{payload op}} + // expected-note @below {{handle to invalidated ops}} + %0 = transform.test_produce_self_handle_or_forward_operand : () -> !transform.any_op + transform.test_re_enter_region { + // expected-error @below {{op uses a handle invalidated by a previously executed transform op}} + // expected-note @below {{invalidated by this transform op}} + transform.test_consume_operand %0 : !transform.any_op + transform.yield + } transform.yield } } @@ -424,8 +461,8 @@ module @named_inclusion_and_consumption attributes { transform.with_named_sequen transform.yield } - transform.sequence failures(propagate) { - ^bb0(%arg0: !transform.any_op): - include @foo failures(propagate) (%arg0) : (!transform.any_op) -> () + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + transform.include @foo failures(propagate) (%arg0) : (!transform.any_op) -> () + transform.yield } } diff --git a/mlir/test/Dialect/Transform/selective-targeting.mlir b/mlir/test/Dialect/Transform/selective-targeting.mlir index 41a0d5091ec20..e88104315649a 100644 --- a/mlir/test/Dialect/Transform/selective-targeting.mlir +++ b/mlir/test/Dialect/Transform/selective-targeting.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -test-transform-dialect-interpreter --split-input-file | FileCheck %s +// RUN: mlir-opt %s --transform-interpreter --split-input-file | FileCheck %s // CHECK-LABEL: func.func @matmul_tensors_1( func.func @matmul_tensors_1( @@ -52,35 +52,40 @@ func.func @matmul_tensors_3( func.return %0 : tensor<128x128xf32> } -transform.with_pdl_patterns { -^bb0(%arg0: !transform.any_op): - // Match matmul operations inside @matmul_tensors with test.attrA set. 
- pdl.pattern @pdl_target_attrA : benefit(1) { - %args = operands - %results = types - %attr = attribute - %0 = operation "linalg.matmul"(%args : !pdl.range) {"test.attrA" = %attr}-> (%results : !pdl.range) - // TODO: we don't want this, but it is the required terminator for pdl.pattern - rewrite %0 with "transform.dialect" - } +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root : !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + ^bb0(%arg0: !transform.any_op): + // Match matmul operations inside @matmul_tensors with test.attrA set. + pdl.pattern @pdl_target_attrA : benefit(1) { + %args = operands + %results = types + %attr = attribute + %0 = operation "linalg.matmul"(%args : !pdl.range) {"test.attrA" = %attr}-> (%results : !pdl.range) + // TODO: we don't want this, but it is the required terminator for pdl.pattern + rewrite %0 with "transform.dialect" + } - // Match matmul operations inside @matmul_tensors with test.attrC set. - pdl.pattern @pdl_target_attrC : benefit(1) { - %args = operands - %results = types - %attr = attribute - %0 = operation "linalg.matmul"(%args : !pdl.range) {"test.attrC" = %attr}-> (%results : !pdl.range) - // TODO: we don't want this, but it is the required terminator for pdl.pattern - rewrite %0 with "transform.dialect" - } + // Match matmul operations inside @matmul_tensors with test.attrC set. + pdl.pattern @pdl_target_attrC : benefit(1) { + %args = operands + %results = types + %attr = attribute + %0 = operation "linalg.matmul"(%args : !pdl.range) {"test.attrC" = %attr}-> (%results : !pdl.range) + // TODO: we don't want this, but it is the required terminator for pdl.pattern + rewrite %0 with "transform.dialect" + } - transform.sequence %arg0 : !transform.any_op failures(propagate) { - ^bb1(%arg1: !transform.any_op): - %0 = pdl_match @pdl_target_attrA in %arg1 : (!transform.any_op) -> !transform.any_op - transform.structured.tile_using_for %0 [4, 4, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) - %1 = pdl_match @pdl_target_attrC in %arg1 : (!transform.any_op) -> !transform.any_op - %2 = get_parent_op %1 {isolated_from_above} : (!transform.any_op) -> !transform.any_op - transform.structured.vectorize_children_and_apply_patterns %2 : (!transform.any_op) -> !transform.any_op + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + %0 = pdl_match @pdl_target_attrA in %arg1 : (!transform.any_op) -> !transform.any_op + transform.structured.tile_using_for %0 [4, 4, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + %1 = pdl_match @pdl_target_attrC in %arg1 : (!transform.any_op) -> !transform.any_op + %2 = get_parent_op %1 {isolated_from_above} : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize_children_and_apply_patterns %2 : (!transform.any_op) -> !transform.any_op + } + } + transform.yield } } @@ -110,22 +115,27 @@ func.func @vectorize_none( func.return %0 : tensor<128x128xf32> } -transform.with_pdl_patterns { -^bb0(%arg0: !transform.any_op): - pdl.pattern @pdl_target : benefit(1) { - %args = operands - %results = types - %attr = attribute - %0 = operation "linalg.matmul"(%args : !pdl.range) {"test.attrA" = %attr}-> (%results : !pdl.range) - // TODO: we don't want this, but it is the required terminator for pdl.pattern - rewrite %0 with "transform.dialect" - } +module attributes 
{transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root : !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + ^bb0(%arg0: !transform.any_op): + pdl.pattern @pdl_target : benefit(1) { + %args = operands + %results = types + %attr = attribute + %0 = operation "linalg.matmul"(%args : !pdl.range) {"test.attrA" = %attr}-> (%results : !pdl.range) + // TODO: we don't want this, but it is the required terminator for pdl.pattern + rewrite %0 with "transform.dialect" + } - transform.sequence %arg0 : !transform.any_op failures(propagate) { - ^bb1(%arg1: !transform.any_op): - %0 = pdl_match @pdl_target in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op - transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + %0 = pdl_match @pdl_target in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op + } + } + transform.yield } } @@ -148,7 +158,9 @@ func.func @vectorize_all( return %1 : tensor<128x128xf32> } -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - transform.structured.vectorize_children_and_apply_patterns %arg0 : (!transform.any_op) -> !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + transform.structured.vectorize_children_and_apply_patterns %arg0 : (!transform.any_op) -> !transform.any_op + transform.yield + } } diff --git a/mlir/test/Dialect/Transform/test-interpreter.mlir b/mlir/test/Dialect/Transform/test-interpreter.mlir index a39e6f94cb34f..3bbf875ef309e 100644 --- a/mlir/test/Dialect/Transform/test-interpreter.mlir +++ b/mlir/test/Dialect/Transform/test-interpreter.mlir @@ -1,29 +1,35 @@ -// RUN: mlir-opt %s --test-transform-dialect-interpreter -allow-unregistered-dialect --split-input-file --verify-diagnostics | FileCheck %s +// RUN: mlir-opt %s --transform-interpreter -allow-unregistered-dialect --split-input-file --verify-diagnostics | FileCheck %s // UNSUPPORTED: target=aarch64-pc-windows-msvc -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - // expected-remark @below {{applying transformation}} - transform.test_transform_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + // expected-remark @below {{applying transformation}} + transform.test_transform_op + transform.yield + } } // ----- -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - %0 = transform.test_produce_self_handle_or_forward_operand { foo = "bar" } : () -> !transform.any_op - // expected-remark @below {{succeeded}} - transform.test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %0 = transform.test_produce_self_handle_or_forward_operand { foo = "bar" } : () -> !transform.any_op + // expected-remark @below {{succeeded}} + transform.test_consume_operand_of_op_kind_or_fail %0, 
"transform.test_produce_self_handle_or_forward_operand" : !transform.any_op + transform.yield + } } // ----- -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - %0 = transform.test_produce_self_handle_or_forward_operand { foo = "bar" } : () -> !transform.any_op - // expected-error @below {{expected the operand to be associated a payload op of kind transform.sequence got transform.test_produce_self_handle_or_forward_operand}} - transform.test_consume_operand_of_op_kind_or_fail %0, "transform.sequence" : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %0 = transform.test_produce_self_handle_or_forward_operand { foo = "bar" } : () -> !transform.any_op + // expected-error @below {{expected the operand to be associated a payload op of kind transform.sequence got transform.test_produce_self_handle_or_forward_operand}} + transform.test_consume_operand_of_op_kind_or_fail %0, "transform.sequence" : !transform.any_op + transform.yield + } } // ----- @@ -31,56 +37,64 @@ transform.sequence failures(propagate) { // It is okay to have multiple handles to the same payload op as long // as only one of them is consumed. The expensive checks mode is necessary // to detect double-consumption. -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - %0 = transform.test_produce_self_handle_or_forward_operand { foo = "bar" } : () -> !transform.any_op - %1 = transform.test_copy_payload %0 : (!transform.any_op) -> !transform.any_op - // expected-remark @below {{succeeded}} - transform.test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %0 = transform.test_produce_self_handle_or_forward_operand { foo = "bar" } : () -> !transform.any_op + %1 = transform.test_copy_payload %0 : (!transform.any_op) -> !transform.any_op + // expected-remark @below {{succeeded}} + transform.test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" : !transform.any_op + transform.yield + } } // ----- -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - sequence %arg0 : !transform.any_op failures(propagate) { - ^bb0(%arg1: !transform.any_op): - // expected-remark @below {{applying transformation "a"}} - test_transform_op "a" - // expected-remark @below {{applying transformation "b"}} - test_transform_op "b" - // expected-remark @below {{applying transformation "c"}} - test_transform_op "c" +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb0(%arg1: !transform.any_op): + // expected-remark @below {{applying transformation "a"}} + test_transform_op "a" + // expected-remark @below {{applying transformation "b"}} + test_transform_op "b" + // expected-remark @below {{applying transformation "c"}} + test_transform_op "c" + } + // expected-remark @below {{applying transformation "d"}} + transform.test_transform_op "d" + // expected-remark @below {{applying transformation "e"}} + transform.test_transform_op "e" + transform.yield } - // expected-remark @below {{applying transformation "d"}} - test_transform_op "d" - // expected-remark @below {{applying transformation "e"}} - test_transform_op "e" } // 
----- -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - %0 = test_produce_self_handle_or_forward_operand : () -> !transform.any_op - sequence %0 : !transform.any_op failures(propagate) { - ^bb0(%arg1: !transform.any_op): - // expected-remark @below {{succeeded}} - test_consume_operand_of_op_kind_or_fail %arg1, "transform.test_produce_self_handle_or_forward_operand" : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %0 = transform.test_produce_self_handle_or_forward_operand : () -> !transform.any_op + transform.sequence %0 : !transform.any_op failures(propagate) { + ^bb0(%arg1: !transform.any_op): + // expected-remark @below {{succeeded}} + test_consume_operand_of_op_kind_or_fail %arg1, "transform.test_produce_self_handle_or_forward_operand" : !transform.any_op + } + transform.yield } } // ----- -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - %0 = sequence %arg0 : !transform.any_op -> !transform.any_op failures(propagate) { - ^bb0(%arg1: !transform.any_op): - %1 = test_produce_self_handle_or_forward_operand : () -> !transform.any_op - yield %1 : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %0 = transform.sequence %arg0 : !transform.any_op -> !transform.any_op failures(propagate) { + ^bb0(%arg1: !transform.any_op): + %1 = test_produce_self_handle_or_forward_operand : () -> !transform.any_op + yield %1 : !transform.any_op + } + // expected-remark @below {{succeeded}} + transform.test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" : !transform.any_op + transform.yield } - // expected-remark @below {{succeeded}} - test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" : !transform.any_op } // ----- @@ -98,19 +112,24 @@ func.func @bar() { return } -transform.with_pdl_patterns { -^bb0(%arg0: !transform.any_op): - pdl.pattern @const : benefit(1) { - %r = pdl.types - %0 = pdl.operation "arith.constant" -> (%r : !pdl.range) - pdl.rewrite %0 with "transform.dialect" - } - - transform.sequence %arg0 : !transform.any_op failures(propagate) { - ^bb1(%arg1: !transform.any_op): - %f = pdl_match @const in %arg1 : (!transform.any_op) -> !transform.any_op - %m = get_parent_op %f {isolated_from_above} : (!transform.any_op) -> !transform.any_op - test_print_remark_at_operand %m, "parent function" : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + ^bb0(%arg0: !transform.any_op): + pdl.pattern @const : benefit(1) { + %r = pdl.types + %0 = pdl.operation "arith.constant" -> (%r : !pdl.range) + pdl.rewrite %0 with "transform.dialect" + } + + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + %f = pdl_match @const in %arg1 : (!transform.any_op) -> !transform.any_op + %m = get_parent_op %f {isolated_from_above} : (!transform.any_op) -> !transform.any_op + test_print_remark_at_operand %m, "parent function" : !transform.any_op + } + } + transform.yield } } @@ -130,14 +149,15 @@ func.func @test_get_nth_parent() { }) : () -> () } -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - %f = transform.structured.match ops{["test.bar"]} in %arg0 : 
(!transform.any_op) -> !transform.any_op - %parent = get_parent_op %f {nth_parent = 1, op_name = "test.foo"} : (!transform.any_op) -> !transform.any_op - test_print_remark_at_operand %parent, "1st parent" : !transform.any_op - %parent2 = get_parent_op %f {nth_parent = 2, op_name = "test.foo"} : (!transform.any_op) -> !transform.any_op - test_print_remark_at_operand %parent2, "2nd parent" : !transform.any_op - transform.yield +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %f = transform.structured.match ops{["test.bar"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %parent = transform.get_parent_op %f {nth_parent = 1, op_name = "test.foo"} : (!transform.any_op) -> !transform.any_op + transform.test_print_remark_at_operand %parent, "1st parent" : !transform.any_op + %parent2 = transform.get_parent_op %f {nth_parent = 2, op_name = "test.foo"} : (!transform.any_op) -> !transform.any_op + transform.test_print_remark_at_operand %parent2, "2nd parent" : !transform.any_op + transform.yield + } } // ----- @@ -147,32 +167,37 @@ func.func @foo() { return } -transform.with_pdl_patterns { -^bb0(%arg0: !transform.any_op): - pdl.pattern @match_func : benefit(1) { - %0 = pdl.operands - %1 = pdl.types - %2 = pdl.operation "func.func"(%0 : !pdl.range) -> (%1 : !pdl.range) - pdl.rewrite %2 with "transform.dialect" - } - - transform.sequence %arg0 : !transform.any_op failures(propagate) { - ^bb1(%arg1: !transform.any_op): - // This is necessary to run the transformation on something other than the - // top-level module, "alternatives" cannot be run on that. - %0 = pdl_match @match_func in %arg1 : (!transform.any_op) -> !transform.any_op - transform.alternatives %0 : !transform.any_op { - ^bb2(%arg2: !transform.any_op): - %1 = transform.test_produce_self_handle_or_forward_operand : () -> !transform.any_op - // This operation fails, which triggers the next alternative without - // reporting the error. - transform.test_consume_operand_of_op_kind_or_fail %1, "transform.sequence" : !transform.any_op - }, { - ^bb2(%arg2: !transform.any_op): - %1 = transform.test_produce_self_handle_or_forward_operand : () -> !transform.any_op - // expected-remark @below {{succeeded}} - transform.test_consume_operand_of_op_kind_or_fail %1, "transform.test_produce_self_handle_or_forward_operand" : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + ^bb0(%arg0: !transform.any_op): + pdl.pattern @match_func : benefit(1) { + %0 = pdl.operands + %1 = pdl.types + %2 = pdl.operation "func.func"(%0 : !pdl.range) -> (%1 : !pdl.range) + pdl.rewrite %2 with "transform.dialect" + } + + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + // This is necessary to run the transformation on something other than the + // top-level module, "alternatives" cannot be run on that. + %0 = pdl_match @match_func in %arg1 : (!transform.any_op) -> !transform.any_op + transform.alternatives %0 : !transform.any_op { + ^bb2(%arg2: !transform.any_op): + %1 = transform.test_produce_self_handle_or_forward_operand : () -> !transform.any_op + // This operation fails, which triggers the next alternative without + // reporting the error. 
+ transform.test_consume_operand_of_op_kind_or_fail %1, "transform.sequence" : !transform.any_op + }, { + ^bb2(%arg2: !transform.any_op): + %1 = transform.test_produce_self_handle_or_forward_operand : () -> !transform.any_op + // expected-remark @below {{succeeded}} + transform.test_consume_operand_of_op_kind_or_fail %1, "transform.test_produce_self_handle_or_forward_operand" : !transform.any_op + } + } } + transform.yield } } @@ -185,26 +210,31 @@ func.func @foo() { return } -transform.with_pdl_patterns { -^bb0(%arg0: !transform.any_op): - pdl.pattern @match_call : benefit(1) { - %0 = pdl.operands - %1 = pdl.types - %2 = pdl.operation "func.call"(%0 : !pdl.range) -> (%1 : !pdl.range) - pdl.rewrite %2 with "transform.dialect" - } - - transform.sequence %arg0 : !transform.any_op failures(propagate) { - ^bb1(%arg1: !transform.any_op): - %0 = pdl_match @match_call in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op - // expected-error @below {{all alternatives failed}} - transform.alternatives %1 : !transform.any_op { - ^bb2(%arg2: !transform.any_op): - %2 = transform.pdl_match @match_call in %arg2 : (!transform.any_op) -> !transform.any_op - // expected-remark @below {{applying}} - transform.test_emit_remark_and_erase_operand %2, "applying" {fail_after_erase} : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + ^bb0(%arg0: !transform.any_op): + pdl.pattern @match_call : benefit(1) { + %0 = pdl.operands + %1 = pdl.types + %2 = pdl.operation "func.call"(%0 : !pdl.range) -> (%1 : !pdl.range) + pdl.rewrite %2 with "transform.dialect" + } + + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + %0 = pdl_match @match_call in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op + // expected-error @below {{all alternatives failed}} + transform.alternatives %1 : !transform.any_op { + ^bb2(%arg2: !transform.any_op): + %2 = transform.pdl_match @match_call in %arg2 : (!transform.any_op) -> !transform.any_op + // expected-remark @below {{applying}} + transform.test_emit_remark_and_erase_operand %2, "applying" {fail_after_erase} : !transform.any_op + } + } } + transform.yield } } @@ -218,35 +248,40 @@ func.func @foo() { return } -transform.with_pdl_patterns { -^bb0(%arg0: !transform.any_op): - pdl.pattern @match_call : benefit(1) { - %0 = pdl.operands - %1 = pdl.types - %2 = pdl.operation "func.call"(%0 : !pdl.range) -> (%1 : !pdl.range) - pdl.rewrite %2 with "transform.dialect" - } - - transform.sequence %arg0 : !transform.any_op failures(propagate) { - ^bb1(%arg1: !transform.any_op): - %0 = pdl_match @match_call in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op - transform.alternatives %1 : !transform.any_op { - ^bb2(%arg2: !transform.any_op): - %2 = transform.pdl_match @match_call in %arg2 : (!transform.any_op) -> !transform.any_op - // expected-remark @below {{applying}} - transform.test_emit_remark_and_erase_operand %2, "applying" {fail_after_erase} : !transform.any_op - }, { - ^bb2(%arg2: !transform.any_op): - %2 = transform.pdl_match @match_call in %arg2 : (!transform.any_op) -> !transform.any_op - transform.test_print_remark_at_operand 
%2, "still here" : !transform.any_op - // This alternative succeeds. - }, { - ^bb2(%arg2: !transform.any_op): - // This alternative is never run, so we must not have a remark here. - %2 = transform.pdl_match @match_call in %arg2 : (!transform.any_op) -> !transform.any_op - transform.test_emit_remark_and_erase_operand %2, "should not happen" {fail_after_erase} : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + ^bb0(%arg0: !transform.any_op): + pdl.pattern @match_call : benefit(1) { + %0 = pdl.operands + %1 = pdl.types + %2 = pdl.operation "func.call"(%0 : !pdl.range) -> (%1 : !pdl.range) + pdl.rewrite %2 with "transform.dialect" + } + + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + %0 = pdl_match @match_call in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op + transform.alternatives %1 : !transform.any_op { + ^bb2(%arg2: !transform.any_op): + %2 = transform.pdl_match @match_call in %arg2 : (!transform.any_op) -> !transform.any_op + // expected-remark @below {{applying}} + transform.test_emit_remark_and_erase_operand %2, "applying" {fail_after_erase} : !transform.any_op + }, { + ^bb2(%arg2: !transform.any_op): + %2 = transform.pdl_match @match_call in %arg2 : (!transform.any_op) -> !transform.any_op + transform.test_print_remark_at_operand %2, "still here" : !transform.any_op + // This alternative succeeds. + }, { + ^bb2(%arg2: !transform.any_op): + // This alternative is never run, so we must not have a remark here. + %2 = transform.pdl_match @match_call in %arg2 : (!transform.any_op) -> !transform.any_op + transform.test_emit_remark_and_erase_operand %2, "should not happen" {fail_after_erase} : !transform.any_op + } + } } + transform.yield } } @@ -259,30 +294,35 @@ func.func @erase_call() { return } -transform.with_pdl_patterns { -^bb0(%arg0: !transform.any_op): - pdl.pattern @match_call : benefit(1) { - %0 = pdl.operands - %1 = pdl.types - %2 = pdl.operation "func.call"(%0 : !pdl.range) -> (%1 : !pdl.range) - pdl.rewrite %2 with "transform.dialect" - } - - transform.sequence %arg0 : !transform.any_op failures(propagate) { - ^bb1(%arg1: !transform.any_op): - %0 = pdl_match @match_call in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op - transform.alternatives %1 : !transform.any_op { - ^bb2(%arg2: !transform.any_op): - %2 = transform.pdl_match @match_call in %arg2 : (!transform.any_op) -> !transform.any_op - // expected-remark @below {{applying}} - transform.test_emit_remark_and_erase_operand %2, "applying" {fail_after_erase} : !transform.any_op - }, { - ^bb2(%arg2: !transform.any_op): - %2 = transform.pdl_match @match_call in %arg2 : (!transform.any_op) -> !transform.any_op - // expected-remark @below {{applying second time}} - transform.test_emit_remark_and_erase_operand %2, "applying second time" : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + ^bb0(%arg0: !transform.any_op): + pdl.pattern @match_call : benefit(1) { + %0 = pdl.operands + %1 = pdl.types + %2 = pdl.operation "func.call"(%0 : !pdl.range) -> (%1 : !pdl.range) + pdl.rewrite %2 with 
"transform.dialect" + } + + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + %0 = pdl_match @match_call in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op + transform.alternatives %1 : !transform.any_op { + ^bb2(%arg2: !transform.any_op): + %2 = transform.pdl_match @match_call in %arg2 : (!transform.any_op) -> !transform.any_op + // expected-remark @below {{applying}} + transform.test_emit_remark_and_erase_operand %2, "applying" {fail_after_erase} : !transform.any_op + }, { + ^bb2(%arg2: !transform.any_op): + %2 = transform.pdl_match @match_call in %arg2 : (!transform.any_op) -> !transform.any_op + // expected-remark @below {{applying second time}} + transform.test_emit_remark_and_erase_operand %2, "applying second time" : !transform.any_op + } + } } + transform.yield } } @@ -295,43 +335,48 @@ func.func @foo() { return } -transform.with_pdl_patterns { -^bb0(%arg0: !transform.any_op): - pdl.pattern @match_call : benefit(1) { - %0 = pdl.operands - %1 = pdl.types - %2 = pdl.operation "func.call"(%0 : !pdl.range) -> (%1 : !pdl.range) - pdl.rewrite %2 with "transform.dialect" - } - - transform.sequence %arg0 : !transform.any_op failures(propagate) { - ^bb1(%arg1: !transform.any_op): - %0 = pdl_match @match_call in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op - %2 = transform.alternatives %1 : !transform.any_op -> !transform.any_op { - ^bb2(%arg2: !transform.any_op): - %3 = transform.pdl_match @match_call in %arg2 : (!transform.any_op) -> !transform.any_op - // expected-remark @below {{applying}} - transform.test_emit_remark_and_erase_operand %3, "applying" {fail_after_erase} : !transform.any_op - %4 = transform.test_produce_self_handle_or_forward_operand %3 : (!transform.any_op) -> !transform.any_op - transform.yield %4 : !transform.any_op - }, { - ^bb2(%arg2: !transform.any_op): - %4 = transform.test_produce_self_handle_or_forward_operand : () -> !transform.any_op - transform.yield %4 : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + ^bb0(%arg0: !transform.any_op): + pdl.pattern @match_call : benefit(1) { + %0 = pdl.operands + %1 = pdl.types + %2 = pdl.operation "func.call"(%0 : !pdl.range) -> (%1 : !pdl.range) + pdl.rewrite %2 with "transform.dialect" + } + + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + %0 = pdl_match @match_call in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op + %2 = transform.alternatives %1 : !transform.any_op -> !transform.any_op { + ^bb2(%arg2: !transform.any_op): + %3 = transform.pdl_match @match_call in %arg2 : (!transform.any_op) -> !transform.any_op + // expected-remark @below {{applying}} + transform.test_emit_remark_and_erase_operand %3, "applying" {fail_after_erase} : !transform.any_op + %4 = transform.test_produce_self_handle_or_forward_operand %3 : (!transform.any_op) -> !transform.any_op + transform.yield %4 : !transform.any_op + }, { + ^bb2(%arg2: !transform.any_op): + %4 = transform.test_produce_self_handle_or_forward_operand : () -> !transform.any_op + transform.yield %4 : !transform.any_op + } + // The first alternative 
failed, so the returned value is taken from the + // second alternative, associated test_produce_self_handle_or_forward_operand rather + // than pdl_match. + // expected-remark @below {{succeeded}} + transform.test_consume_operand_of_op_kind_or_fail %2, "transform.test_produce_self_handle_or_forward_operand" : !transform.any_op + } } - // The first alternative failed, so the returned value is taken from the - // second alternative, associated test_produce_self_handle_or_forward_operand rather - // than pdl_match. - // expected-remark @below {{succeeded}} - transform.test_consume_operand_of_op_kind_or_fail %2, "transform.test_produce_self_handle_or_forward_operand" : !transform.any_op + transform.yield } } // ----- // expected-note @below {{scope}} -module { +module attributes {transform.with_named_sequence} { func.func @foo() { %0 = arith.constant 0 : i32 return @@ -343,8 +388,7 @@ module { return } - transform.sequence failures(propagate) { - ^bb1(%arg1: !transform.any_op): + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { // expected-error @below {{scope must not contain the transforms being applied}} transform.alternatives %arg1 : !transform.any_op { ^bb2(%arg2: !transform.any_op): @@ -355,6 +399,7 @@ module { %0 = transform.test_produce_self_handle_or_forward_operand : () -> !transform.any_op transform.test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" : !transform.any_op } + transform.yield } } @@ -368,24 +413,29 @@ func.func @foo(%arg0: index, %arg1: index, %arg2: index) { return } -transform.with_pdl_patterns { -^bb0(%arg0: !transform.any_op): - pdl.pattern @match_const : benefit(1) { - %0 = pdl.operands - %1 = pdl.types - %2 = pdl.operation "arith.constant"(%0 : !pdl.range) -> (%1 : !pdl.range) - pdl.rewrite %2 with "transform.dialect" - } - - - sequence %arg0 : !transform.any_op failures(propagate) { - ^bb1(%arg1: !transform.any_op): - %0 = transform.pdl_match @match_const in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = transform.get_parent_op %0 {op_name = "scf.for"} : (!transform.any_op) -> !transform.any_op - // expected-error @below {{only isolated-from-above ops can be alternative scopes}} - alternatives %1 : !transform.any_op { - ^bb2(%arg2: !transform.any_op): +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + ^bb0(%arg0: !transform.any_op): + pdl.pattern @match_const : benefit(1) { + %0 = pdl.operands + %1 = pdl.types + %2 = pdl.operation "arith.constant"(%0 : !pdl.range) -> (%1 : !pdl.range) + pdl.rewrite %2 with "transform.dialect" + } + + + sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + %0 = transform.pdl_match @match_const in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.get_parent_op %0 {op_name = "scf.for"} : (!transform.any_op) -> !transform.any_op + // expected-error @below {{only isolated-from-above ops can be alternative scopes}} + alternatives %1 : !transform.any_op { + ^bb2(%arg2: !transform.any_op): + } + } } + transform.yield } } // ----- @@ -396,21 +446,26 @@ func.func @foo() { return } -transform.with_pdl_patterns { -^bb0(%arg0: !transform.any_op): - pdl.pattern @some : benefit(1) { - %0 = pdl.operands - %1 = pdl.types - %2 = pdl.operation "op"(%0 : !pdl.range) -> (%1 : !pdl.range) - pdl.rewrite %2 with "transform.dialect" - } - - transform.sequence %arg0 : !transform.any_op 
failures(propagate) { - ^bb0(%arg1: !transform.any_op): - %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op - // expected-error @below {{application of transform.test_wrong_number_of_results expected to produce 3 results (actually produced 1).}} - // expected-note @below {{if you need variadic results, consider a generic `apply` instead of the specialized `applyToOne`.}} - transform.test_wrong_number_of_results %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + ^bb0(%arg0: !transform.any_op): + pdl.pattern @some : benefit(1) { + %0 = pdl.operands + %1 = pdl.types + %2 = pdl.operation "op"(%0 : !pdl.range) -> (%1 : !pdl.range) + pdl.rewrite %2 with "transform.dialect" + } + + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb0(%arg1: !transform.any_op): + %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error @below {{application of transform.test_wrong_number_of_results expected to produce 3 results (actually produced 1).}} + // expected-note @below {{if you need variadic results, consider a generic `apply` instead of the specialized `applyToOne`.}} + transform.test_wrong_number_of_results %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + } + } + transform.yield } } @@ -423,21 +478,26 @@ func.func @foo() { return } -transform.with_pdl_patterns { -^bb0(%arg0: !transform.any_op): - pdl.pattern @some : benefit(1) { - %0 = pdl.operands - %1 = pdl.types - %2 = pdl.operation "op"(%0 : !pdl.range) -> (%1 : !pdl.range) - pdl.rewrite %2 with "transform.dialect" - } - - transform.sequence %arg0 : !transform.any_op failures(propagate) { - ^bb0(%arg1: !transform.any_op): - %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op - // expected-error @below {{application of transform.test_wrong_number_of_multi_results expected to produce 1 results (actually produced 0)}} - // expected-note @below {{if you need variadic results, consider a generic `apply` instead of the specialized `applyToOne`.}} - transform.test_wrong_number_of_multi_results %0 : (!transform.any_op) -> (!transform.any_op) +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + ^bb0(%arg0: !transform.any_op): + pdl.pattern @some : benefit(1) { + %0 = pdl.operands + %1 = pdl.types + %2 = pdl.operation "op"(%0 : !pdl.range) -> (%1 : !pdl.range) + pdl.rewrite %2 with "transform.dialect" + } + + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb0(%arg1: !transform.any_op): + %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error @below {{application of transform.test_wrong_number_of_multi_results expected to produce 1 results (actually produced 0)}} + // expected-note @below {{if you need variadic results, consider a generic `apply` instead of the specialized `applyToOne`.}} + transform.test_wrong_number_of_multi_results %0 : (!transform.any_op) -> (!transform.any_op) + } + } + transform.yield } } @@ -450,20 +510,25 @@ func.func @foo() { return } -transform.with_pdl_patterns { -^bb0(%arg0: !transform.any_op): - pdl.pattern @some : benefit(1) { - %0 = pdl.operands - %1 = pdl.types - %2 = pdl.operation 
"op"(%0 : !pdl.range) -> (%1 : !pdl.range) - pdl.rewrite %2 with "transform.dialect" - } - - transform.sequence %arg0 : !transform.any_op failures(propagate) { - ^bb0(%arg1: !transform.any_op): - %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op - // Transform matches 3 ops and produces 2 results. - %1:2 = transform.test_correct_number_of_multi_results %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + ^bb0(%arg0: !transform.any_op): + pdl.pattern @some : benefit(1) { + %0 = pdl.operands + %1 = pdl.types + %2 = pdl.operation "op"(%0 : !pdl.range) -> (%1 : !pdl.range) + pdl.rewrite %2 with "transform.dialect" + } + + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb0(%arg1: !transform.any_op): + %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op + // Transform matches 3 ops and produces 2 results. + %1:2 = transform.test_correct_number_of_multi_results %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + } + } + transform.yield } } @@ -474,20 +539,25 @@ func.func @foo() { return } -transform.with_pdl_patterns { -^bb0(%arg0: !transform.any_op): - pdl.pattern @some : benefit(1) { - %0 = pdl.operands - %1 = pdl.types - %2 = pdl.operation "op"(%0 : !pdl.range) -> (%1 : !pdl.range) - pdl.rewrite %2 with "transform.dialect" - } - - transform.sequence %arg0 : !transform.any_op failures(propagate) { - ^bb0(%arg1: !transform.any_op): - %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op - // Transform fails to match any but still produces 2 results. - %1:2 = transform.test_correct_number_of_multi_results %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + ^bb0(%arg0: !transform.any_op): + pdl.pattern @some : benefit(1) { + %0 = pdl.operands + %1 = pdl.types + %2 = pdl.operation "op"(%0 : !pdl.range) -> (%1 : !pdl.range) + pdl.rewrite %2 with "transform.dialect" + } + + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb0(%arg1: !transform.any_op): + %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op + // Transform fails to match any but still produces 2 results. 
+ %1:2 = transform.test_correct_number_of_multi_results %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + } + } + transform.yield } } @@ -500,19 +570,24 @@ func.func @foo() { return } -transform.with_pdl_patterns { -^bb0(%arg0: !transform.any_op): - pdl.pattern @some : benefit(1) { - %0 = pdl.operands - %1 = pdl.types - %2 = pdl.operation "op"(%0 : !pdl.range) -> (%1 : !pdl.range) - pdl.rewrite %2 with "transform.dialect" - } - - transform.sequence %arg0 : !transform.any_op failures(propagate) { - ^bb0(%arg1: !transform.any_op): - %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op - transform.test_mixed_null_and_non_null_results %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + ^bb0(%arg0: !transform.any_op): + pdl.pattern @some : benefit(1) { + %0 = pdl.operands + %1 = pdl.types + %2 = pdl.operation "op"(%0 : !pdl.range) -> (%1 : !pdl.range) + pdl.rewrite %2 with "transform.dialect" + } + + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb0(%arg1: !transform.any_op): + %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op + transform.test_mixed_null_and_non_null_results %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + } + } + transform.yield } } @@ -530,27 +605,32 @@ func.func @foo(%arg0: index) { return } -transform.with_pdl_patterns { -^bb0(%arg0: !transform.any_op): - pdl.pattern @addi : benefit(1) { - %0 = pdl.operands - %1 = pdl.types - %2 = pdl.operation "arith.addi"(%0 : !pdl.range) -> (%1 : !pdl.range) - pdl.rewrite %2 with "transform.dialect" - } - pdl.pattern @subi : benefit(1) { - %0 = pdl.operands - %1 = pdl.types - %2 = pdl.operation "arith.subi"(%0 : !pdl.range) -> (%1 : !pdl.range) - pdl.rewrite %2 with "transform.dialect" - } - - transform.sequence %arg0 : !transform.any_op failures(propagate) { - ^bb0(%arg1: !transform.any_op): - %0 = pdl_match @addi in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = pdl_match @subi in %arg1 : (!transform.any_op) -> !transform.any_op - %2 = merge_handles %0, %1 : !transform.any_op - test_print_remark_at_operand %2, "matched" : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + ^bb0(%arg0: !transform.any_op): + pdl.pattern @addi : benefit(1) { + %0 = pdl.operands + %1 = pdl.types + %2 = pdl.operation "arith.addi"(%0 : !pdl.range) -> (%1 : !pdl.range) + pdl.rewrite %2 with "transform.dialect" + } + pdl.pattern @subi : benefit(1) { + %0 = pdl.operands + %1 = pdl.types + %2 = pdl.operation "arith.subi"(%0 : !pdl.range) -> (%1 : !pdl.range) + pdl.rewrite %2 with "transform.dialect" + } + + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb0(%arg1: !transform.any_op): + %0 = pdl_match @addi in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = pdl_match @subi in %arg1 : (!transform.any_op) -> !transform.any_op + %2 = merge_handles %0, %1 : !transform.any_op + test_print_remark_at_operand %2, "matched" : !transform.any_op + } + } + transform.yield } } @@ -561,23 +641,28 @@ func.func @foo(%arg0: index) { return } -transform.with_pdl_patterns { -^bb0(%arg0: !transform.any_op): - pdl.pattern @addi : benefit(1) { - %0 = pdl.operands - %1 = pdl.types - %2 = 
pdl.operation "arith.addi"(%0 : !pdl.range) -> (%1 : !pdl.range) - pdl.rewrite %2 with "transform.dialect" - } - - transform.sequence %arg0 : !transform.any_op failures(propagate) { - ^bb0(%arg1: !transform.any_op): - %0 = pdl_match @addi in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = pdl_match @addi in %arg1 : (!transform.any_op) -> !transform.any_op - %2 = merge_handles deduplicate %0, %1 : !transform.any_op - %3 = num_associations %2 : (!transform.any_op) -> !transform.param - // expected-remark @below {{1}} - test_print_param %3 : !transform.param +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + ^bb0(%arg0: !transform.any_op): + pdl.pattern @addi : benefit(1) { + %0 = pdl.operands + %1 = pdl.types + %2 = pdl.operation "arith.addi"(%0 : !pdl.range) -> (%1 : !pdl.range) + pdl.rewrite %2 with "transform.dialect" + } + + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb0(%arg1: !transform.any_op): + %0 = pdl_match @addi in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = pdl_match @addi in %arg1 : (!transform.any_op) -> !transform.any_op + %2 = merge_handles deduplicate %0, %1 : !transform.any_op + %3 = num_associations %2 : (!transform.any_op) -> !transform.param + // expected-remark @below {{1}} + test_print_param %3 : !transform.param + } + } + transform.yield } } @@ -590,20 +675,25 @@ func.func @foo() { return } -transform.with_pdl_patterns { -^bb0(%arg0: !transform.any_op): - pdl.pattern @some : benefit(1) { - %0 = pdl.operands - %1 = pdl.types - %2 = pdl.operation "op"(%0 : !pdl.range) -> (%1 : !pdl.range) - pdl.rewrite %2 with "transform.dialect" - } - - transform.sequence %arg0 : !transform.any_op failures(propagate) { - ^bb0(%arg1: !transform.any_op): - %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op - // expected-error @below {{failed to apply}} - transform.test_mixed_success_and_silenceable %0 : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + ^bb0(%arg0: !transform.any_op): + pdl.pattern @some : benefit(1) { + %0 = pdl.operands + %1 = pdl.types + %2 = pdl.operation "op"(%0 : !pdl.range) -> (%1 : !pdl.range) + pdl.rewrite %2 with "transform.dialect" + } + + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb0(%arg1: !transform.any_op): + %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error @below {{failed to apply}} + transform.test_mixed_success_and_silenceable %0 : !transform.any_op + } + } + transform.yield } } @@ -614,21 +704,26 @@ func.func @foo() { return } -transform.with_pdl_patterns { -^bb0(%arg0: !transform.any_op): - pdl.pattern @some : benefit(1) { - %0 = pdl.operands - %1 = pdl.types - %2 = pdl.operation "op"(%0 : !pdl.range) -> (%1 : !pdl.range) - pdl.rewrite %2 with "transform.dialect" - } - - transform.sequence %arg0 : !transform.any_op failures(suppress) { - ^bb0(%arg1: !transform.any_op): - %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op - // Not expecting error here because we are suppressing it. 
- // expected-remark @below {{foo}} - test_emit_remark_and_erase_operand %0, "foo" {fail_after_erase} : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + ^bb0(%arg0: !transform.any_op): + pdl.pattern @some : benefit(1) { + %0 = pdl.operands + %1 = pdl.types + %2 = pdl.operation "op"(%0 : !pdl.range) -> (%1 : !pdl.range) + pdl.rewrite %2 with "transform.dialect" + } + + transform.sequence %arg0 : !transform.any_op failures(suppress) { + ^bb0(%arg1: !transform.any_op): + %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op + // Not expecting error here because we are suppressing it. + // expected-remark @below {{foo}} + test_emit_remark_and_erase_operand %0, "foo" {fail_after_erase} : !transform.any_op + } + } + transform.yield } } @@ -639,52 +734,61 @@ func.func @foo() { return } -transform.with_pdl_patterns { -^bb0(%arg0: !transform.any_op): - pdl.pattern @some : benefit(1) { - %0 = pdl.operands - %1 = pdl.types - %2 = pdl.operation "op"(%0 : !pdl.range) -> (%1 : !pdl.range) - pdl.rewrite %2 with "transform.dialect" - } - - transform.sequence %arg0 : !transform.any_op failures(propagate) { - ^bb0(%arg1: !transform.any_op): - %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op - // expected-error @below {{silenceable error}} - // expected-remark @below {{foo}} - test_emit_remark_and_erase_operand %0, "foo" {fail_after_erase} : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + ^bb0(%arg0: !transform.any_op): + pdl.pattern @some : benefit(1) { + %0 = pdl.operands + %1 = pdl.types + %2 = pdl.operation "op"(%0 : !pdl.range) -> (%1 : !pdl.range) + pdl.rewrite %2 with "transform.dialect" + } + + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb0(%arg1: !transform.any_op): + %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error @below {{silenceable error}} + // expected-remark @below {{foo}} + test_emit_remark_and_erase_operand %0, "foo" {fail_after_erase} : !transform.any_op + } + } + transform.yield } } // ----- -module { +module attributes {transform.with_named_sequence} { func.func private @foo() func.func private @bar() - transform.with_pdl_patterns { - ^bb0(%arg0: !transform.any_op): - pdl.pattern @func : benefit(1) { - %0 = pdl.operands - %1 = pdl.types - %2 = pdl.operation "func.func"(%0 : !pdl.range) -> (%1 : !pdl.range) - pdl.rewrite %2 with "transform.dialect" - } - - transform.sequence %arg0 : !transform.any_op failures(propagate) { - ^bb0(%arg1: !transform.any_op): - %0 = pdl_match @func in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = replicate num(%0) %arg1 : !transform.any_op, !transform.any_op - %p = num_associations %1 : (!transform.any_op) -> !transform.param - // expected-remark @below {{2}} - test_print_param %p : !transform.param - %2 = replicate num(%0) %1 : !transform.any_op, !transform.any_op - %p2 = num_associations %2 : (!transform.any_op) -> !transform.param - // expected-remark @below {{4}} - test_print_param %p2 : !transform.param + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + + ^bb0(%arg0: !transform.any_op): + pdl.pattern @func : benefit(1) { + %0 = pdl.operands + %1 = 
pdl.types + %2 = pdl.operation "func.func"(%0 : !pdl.range) -> (%1 : !pdl.range) + pdl.rewrite %2 with "transform.dialect" + } + + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb0(%arg1: !transform.any_op): + %0 = pdl_match @func in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = replicate num(%0) %arg1 : !transform.any_op, !transform.any_op + %p = num_associations %1 : (!transform.any_op) -> !transform.param + // expected-remark @below {{2}} + test_print_param %p : !transform.param + %2 = replicate num(%0) %1 : !transform.any_op, !transform.any_op + %p2 = num_associations %2 : (!transform.any_op) -> !transform.param + // expected-remark @below {{4}} + test_print_param %p2 : !transform.param + } } + transform.yield } } @@ -698,24 +802,29 @@ func.func @bar() { return } -transform.with_pdl_patterns { -^bb0(%arg0: !transform.any_op): - pdl.pattern @const : benefit(1) { - %r = pdl.types - %0 = pdl.operation "arith.constant" -> (%r : !pdl.range) - pdl.rewrite %0 with "transform.dialect" - } - - transform.sequence %arg0 : !transform.any_op failures(propagate) { - ^bb1(%arg1: !transform.any_op): - %f = pdl_match @const in %arg1 : (!transform.any_op) -> !transform.any_op - transform.foreach %f : !transform.any_op { - ^bb2(%arg2: !transform.any_op): - %p = transform.num_associations %arg2 : (!transform.any_op) -> !transform.param - // expected-remark @below {{1}} - transform.test_print_param %p : !transform.param - transform.test_print_remark_at_operand %arg2, "transform applied" : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + ^bb0(%arg0: !transform.any_op): + pdl.pattern @const : benefit(1) { + %r = pdl.types + %0 = pdl.operation "arith.constant" -> (%r : !pdl.range) + pdl.rewrite %0 with "transform.dialect" + } + + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + %f = pdl_match @const in %arg1 : (!transform.any_op) -> !transform.any_op + transform.foreach %f : !transform.any_op { + ^bb2(%arg2: !transform.any_op): + %p = transform.num_associations %arg2 : (!transform.any_op) -> !transform.param + // expected-remark @below {{1}} + transform.test_print_param %p : !transform.param + transform.test_print_remark_at_operand %arg2, "transform applied" : !transform.any_op + } + } } + transform.yield } } @@ -731,13 +840,15 @@ func.func @consume_in_foreach() { return } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %f = transform.structured.match ops{["arith.constant"]} in %arg1 : (!transform.any_op) -> !transform.any_op - transform.foreach %f : !transform.any_op { - ^bb2(%arg2: !transform.any_op): - // expected-remark @below {{erasing}} - transform.test_emit_remark_and_erase_operand %arg2, "erasing" : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %f = transform.structured.match ops{["arith.constant"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.foreach %f : !transform.any_op { + ^bb2(%arg2: !transform.any_op): + // expected-remark @below {{erasing}} + transform.test_emit_remark_and_erase_operand %arg2, "erasing" : !transform.any_op + } + transform.yield } } @@ -761,33 +872,38 @@ func.func @bar() { return } -transform.with_pdl_patterns { -^bb0(%arg0: !transform.any_op): - pdl.pattern @const : benefit(1) { - %r = 
pdl.types - %0 = pdl.operation "arith.constant" -> (%r : !pdl.range) - pdl.rewrite %0 with "transform.dialect" - } - - pdl.pattern @execute_region : benefit(1) { - %r = pdl.types - %0 = pdl.operation "scf.execute_region" -> (%r : !pdl.range) - pdl.rewrite %0 with "transform.dialect" - } - - transform.sequence %arg0 : !transform.any_op failures(propagate) { - ^bb1(%arg1: !transform.any_op): - %f = pdl_match @execute_region in %arg1 : (!transform.any_op) -> !transform.any_op - %results = transform.foreach %f : !transform.any_op -> !transform.any_op { - ^bb2(%arg2: !transform.any_op): - %g = transform.pdl_match @const in %arg2 : (!transform.any_op) -> !transform.any_op - transform.yield %g : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + ^bb0(%arg0: !transform.any_op): + pdl.pattern @const : benefit(1) { + %r = pdl.types + %0 = pdl.operation "arith.constant" -> (%r : !pdl.range) + pdl.rewrite %0 with "transform.dialect" + } + + pdl.pattern @execute_region : benefit(1) { + %r = pdl.types + %0 = pdl.operation "scf.execute_region" -> (%r : !pdl.range) + pdl.rewrite %0 with "transform.dialect" + } + + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + %f = pdl_match @execute_region in %arg1 : (!transform.any_op) -> !transform.any_op + %results = transform.foreach %f : !transform.any_op -> !transform.any_op { + ^bb2(%arg2: !transform.any_op): + %g = transform.pdl_match @const in %arg2 : (!transform.any_op) -> !transform.any_op + transform.yield %g : !transform.any_op + } + + %p = transform.num_associations %results : (!transform.any_op) -> !transform.param + // expected-remark @below {{3}} + transform.test_print_param %p : !transform.param + transform.test_print_remark_at_operand %results, "transform applied" : !transform.any_op + } } - - %p = transform.num_associations %results : (!transform.any_op) -> !transform.param - // expected-remark @below {{3}} - transform.test_print_param %p : !transform.param - transform.test_print_remark_at_operand %results, "transform applied" : !transform.any_op + transform.yield } } @@ -800,11 +916,13 @@ func.func @get_parent_for_op_no_loop(%arg0: index, %arg1: index) { return } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %addi = transform.structured.match ops{["arith.addi"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %muli = get_producer_of_operand %addi[0] : (!transform.any_op) -> !transform.any_op - transform.test_print_remark_at_operand %muli, "found muli" : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %addi = transform.structured.match ops{["arith.addi"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %muli = transform.get_producer_of_operand %addi[0] : (!transform.any_op) -> !transform.any_op + transform.test_print_remark_at_operand %muli, "found muli" : !transform.any_op + transform.yield + } } // ----- @@ -815,12 +933,13 @@ func.func @get_parent_for_op_no_loop(%arg0: index, %arg1: index) { return } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %muli = transform.structured.match ops{["arith.muli"]} in %arg1 : (!transform.any_op) -> !transform.any_op - // expected-error @below {{could not find a producer for operand number: 0 of}} - %bbarg = get_producer_of_operand %muli[0] : 
(!transform.any_op) -> !transform.any_op - +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %muli = transform.structured.match ops{["arith.muli"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error @below {{could not find a producer for operand number: 0 of}} + %bbarg = transform.get_producer_of_operand %muli[0] : (!transform.any_op) -> !transform.any_op + transform.yield + } } // ----- @@ -832,11 +951,13 @@ func.func @get_consumer(%arg0: index, %arg1: index) { return } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %muli = transform.structured.match ops{["arith.muli"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %addi = get_consumers_of_result %muli[0] : (!transform.any_op) -> !transform.any_op - transform.test_print_remark_at_operand %addi, "found addi" : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %muli = transform.structured.match ops{["arith.muli"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %addi = transform.get_consumers_of_result %muli[0] : (!transform.any_op) -> !transform.any_op + transform.test_print_remark_at_operand %addi, "found addi" : !transform.any_op + transform.yield + } } // ----- @@ -847,12 +968,13 @@ func.func @get_consumer_fail_1(%arg0: index, %arg1: index) { return } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %muli = transform.structured.match ops{["arith.muli"]} in %arg1 : (!transform.any_op) -> !transform.any_op - // expected-error @below {{handle must be mapped to exactly one payload op}} - %bbarg = get_consumers_of_result %muli[0] : (!transform.any_op) -> !transform.any_op - +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %muli = transform.structured.match ops{["arith.muli"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error @below {{handle must be mapped to exactly one payload op}} + %bbarg = transform.get_consumers_of_result %muli[0] : (!transform.any_op) -> !transform.any_op + transform.yield + } } // ----- @@ -862,12 +984,13 @@ func.func @get_consumer_fail_2(%arg0: index, %arg1: index) { return } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %muli = transform.structured.match ops{["arith.muli"]} in %arg1 : (!transform.any_op) -> !transform.any_op - // expected-error @below {{result number overflow}} - %bbarg = get_consumers_of_result %muli[1] : (!transform.any_op) -> !transform.any_op - +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %muli = transform.structured.match ops{["arith.muli"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error @below {{result number overflow}} + %bbarg = transform.get_consumers_of_result %muli[1] : (!transform.any_op) -> !transform.any_op + transform.yield + } } // ----- @@ -878,16 +1001,18 @@ func.func @split_handle(%a: index, %b: index, %c: index) { return } -transform.sequence failures(propagate) { -^bb1(%fun: !transform.any_op): - %muli = transform.structured.match ops{["arith.muli"]} in %fun : (!transform.any_op) -> !transform.any_op - %h:2 = split_handle %muli : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - %p = transform.num_associations %h#0 : (!transform.any_op) -> !transform.param - // 
expected-remark @below {{1}} - transform.test_print_param %p : !transform.param - %muli_2 = transform.structured.match ops{["arith.muli"]} in %fun : (!transform.any_op) -> !transform.any_op - // expected-error @below {{expected to contain 3 payload ops but it contains 2 payload ops}} - %h_2:3 = split_handle %muli_2 : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%fun: !transform.any_op) { + %muli = transform.structured.match ops{["arith.muli"]} in %fun : (!transform.any_op) -> !transform.any_op + %h:2 = transform.split_handle %muli : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %p = transform.num_associations %h#0 : (!transform.any_op) -> !transform.param + // expected-remark @below {{1}} + transform.test_print_param %p : !transform.param + %muli_2 = transform.structured.match ops{["arith.muli"]} in %fun : (!transform.any_op) -> !transform.any_op + // expected-error @below {{expected to contain 3 payload ops but it contains 2 payload ops}} + %h_2:3 = transform.split_handle %muli_2 : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } } // ----- @@ -898,19 +1023,24 @@ func.func @split_handle(%a: index, %b: index, %c: index) { return } -transform.sequence failures(suppress) { -^bb1(%fun: !transform.any_op): - %muli = transform.structured.match ops{["arith.muli"]} in %fun : (!transform.any_op) -> !transform.any_op - %h:2 = split_handle %muli : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - %p = transform.num_associations %h#0 : (!transform.any_op) -> !transform.param - // expected-remark @below {{1}} - transform.test_print_param %p : !transform.param - %muli_2 = transform.structured.match ops{["arith.muli"]} in %fun : (!transform.any_op) -> !transform.any_op - // Silenceable failure and all handles are now empty. - %h_2:3 = split_handle %muli_2 : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) - %p2 = transform.num_associations %h_2#0 : (!transform.any_op) -> !transform.param - // expected-remark @below {{0}} - transform.test_print_param %p2 : !transform.param +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.sequence %root : !transform.any_op failures(suppress) { + ^bb1(%fun: !transform.any_op): + %muli = transform.structured.match ops{["arith.muli"]} in %fun : (!transform.any_op) -> !transform.any_op + %h:2 = split_handle %muli : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %p = transform.num_associations %h#0 : (!transform.any_op) -> !transform.param + // expected-remark @below {{1}} + transform.test_print_param %p : !transform.param + %muli_2 = transform.structured.match ops{["arith.muli"]} in %fun : (!transform.any_op) -> !transform.any_op + // Silenceable failure and all handles are now empty. 
+ %h_2:3 = split_handle %muli_2 : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + %p2 = transform.num_associations %h_2#0 : (!transform.any_op) -> !transform.param + // expected-remark @below {{0}} + transform.test_print_param %p2 : !transform.param + } + transform.yield + } } // ----- @@ -921,20 +1051,22 @@ func.func @split_handle(%a: index, %b: index, %c: index) { return } -transform.sequence failures(propagate) { -^bb1(%fun: !transform.any_op): - %muli_2 = transform.structured.match ops{["arith.muli"]} in %fun : (!transform.any_op) -> !transform.any_op - // No error, last result handle is empty. - %h:3 = split_handle %muli_2 {fail_on_payload_too_small = false} : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) - %p = transform.num_associations %h#0 : (!transform.any_op) -> !transform.param - // expected-remark @below {{1}} - transform.test_print_param %p : !transform.param - %p2 = transform.num_associations %h#1 : (!transform.any_op) -> !transform.param - // expected-remark @below {{1}} - transform.test_print_param %p2 : !transform.param - %p3 = transform.num_associations %h#2 : (!transform.any_op) -> !transform.param - // expected-remark @below {{0}} - transform.test_print_param %p3 : !transform.param +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%fun: !transform.any_op) { + %muli_2 = transform.structured.match ops{["arith.muli"]} in %fun : (!transform.any_op) -> !transform.any_op + // No error, last result handle is empty. + %h:3 = transform.split_handle %muli_2 {fail_on_payload_too_small = false} : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + %p = transform.num_associations %h#0 : (!transform.any_op) -> !transform.param + // expected-remark @below {{1}} + transform.test_print_param %p : !transform.param + %p2 = transform.num_associations %h#1 : (!transform.any_op) -> !transform.param + // expected-remark @below {{1}} + transform.test_print_param %p2 : !transform.param + %p3 = transform.num_associations %h#2 : (!transform.any_op) -> !transform.param + // expected-remark @below {{0}} + transform.test_print_param %p3 : !transform.param + transform.yield + } } // ----- @@ -947,16 +1079,18 @@ func.func @split_handle(%a: index, %b: index, %c: index) { return } -transform.sequence failures(propagate) { -^bb1(%fun: !transform.any_op): - %muli_2 = transform.structured.match ops{["arith.muli"]} in %fun : (!transform.any_op) -> !transform.any_op - %h:2 = split_handle %muli_2 {overflow_result = 0} : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - %p = transform.num_associations %h#0 : (!transform.any_op) -> !transform.param - // expected-remark @below {{3}} - transform.test_print_param %p : !transform.param - %p2 = transform.num_associations %h#1 : (!transform.any_op) -> !transform.param - // expected-remark @below {{1}} - transform.test_print_param %p2 : !transform.param +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%fun: !transform.any_op) { + %muli_2 = transform.structured.match ops{["arith.muli"]} in %fun : (!transform.any_op) -> !transform.any_op + %h:2 = transform.split_handle %muli_2 {overflow_result = 0} : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %p = transform.num_associations %h#0 : (!transform.any_op) -> !transform.param + // expected-remark @below {{3}} + transform.test_print_param %p : !transform.param + %p2 = transform.num_associations 
%h#1 : (!transform.any_op) -> !transform.param + // expected-remark @below {{1}} + transform.test_print_param %p2 : !transform.param + transform.yield + } } // ----- @@ -964,18 +1098,23 @@ transform.sequence failures(propagate) { "test.some_op"() : () -> () "other_dialect.other_op"() : () -> () -transform.with_pdl_patterns { -^bb0(%arg0: !transform.any_op): - pdl.pattern @some : benefit(1) { - %0 = pdl.operation "test.some_op" - pdl.rewrite %0 with "transform.dialect" - } - - sequence %arg0 : !transform.any_op failures(propagate) { - ^bb1(%arg1: !transform.any_op): - %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op - %2 = transform.cast %0 : !transform.any_op to !transform.test_dialect_op - transform.cast %2 : !transform.test_dialect_op to !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + ^bb0(%arg0: !transform.any_op): + pdl.pattern @some : benefit(1) { + %0 = pdl.operation "test.some_op" + pdl.rewrite %0 with "transform.dialect" + } + + sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op + %2 = transform.cast %0 : !transform.any_op to !transform.test_dialect_op + transform.cast %2 : !transform.test_dialect_op to !transform.any_op + } + } + transform.yield } } @@ -984,19 +1123,24 @@ transform.with_pdl_patterns { "test.some_op"() : () -> () "other_dialect.other_op"() : () -> () -transform.with_pdl_patterns { -^bb0(%arg0: !transform.any_op): - pdl.pattern @other : benefit(1) { - %0 = pdl.operation "other_dialect.other_op" - pdl.rewrite %0 with "transform.dialect" - } - - sequence %arg0 : !transform.any_op failures(propagate) { - ^bb1(%arg1: !transform.any_op): - %0 = pdl_match @other in %arg1 : (!transform.any_op) -> !transform.any_op - // expected-error @below {{expected the payload operation to belong to the 'test' dialect}} - %2 = transform.cast %0 : !transform.any_op to !transform.test_dialect_op - transform.cast %2 : !transform.test_dialect_op to !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + ^bb0(%arg0: !transform.any_op): + pdl.pattern @other : benefit(1) { + %0 = pdl.operation "other_dialect.other_op" + pdl.rewrite %0 with "transform.dialect" + } + + sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + %0 = pdl_match @other in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error @below {{expected the payload operation to belong to the 'test' dialect}} + %2 = transform.cast %0 : !transform.any_op to !transform.test_dialect_op + transform.cast %2 : !transform.test_dialect_op to !transform.any_op + } + } + transform.yield } } @@ -1005,18 +1149,23 @@ transform.with_pdl_patterns { "test.some_op"() : () -> () "other_dialect.other_op"() : () -> () -transform.with_pdl_patterns { -^bb0(%arg0: !transform.any_op): - pdl.pattern @some : benefit(1) { - %0 = pdl.operation "test.some_op" - pdl.rewrite %0 with "transform.dialect" - } - - sequence %arg0 : !transform.any_op failures(propagate) { - ^bb1(%arg1: !transform.any_op): - %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op - %2 = transform.cast %0 : !transform.any_op to !transform.op<"test.some_op"> - transform.cast %2 : 
!transform.op<"test.some_op"> to !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + ^bb0(%arg0: !transform.any_op): + pdl.pattern @some : benefit(1) { + %0 = pdl.operation "test.some_op" + pdl.rewrite %0 with "transform.dialect" + } + + sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op + %2 = transform.cast %0 : !transform.any_op to !transform.op<"test.some_op"> + transform.cast %2 : !transform.op<"test.some_op"> to !transform.any_op + } + } + transform.yield } } @@ -1026,42 +1175,52 @@ transform.with_pdl_patterns { // expected-note @below {{payload operation}} "other_dialect.other_op"() : () -> () -transform.with_pdl_patterns { -^bb0(%arg0: !transform.any_op): - pdl.pattern @other : benefit(1) { - %0 = pdl.operation "other_dialect.other_op" - pdl.rewrite %0 with "transform.dialect" - } - - sequence %arg0 : !transform.any_op failures(propagate) { - ^bb1(%arg1: !transform.any_op): - %0 = pdl_match @other in %arg1 : (!transform.any_op) -> !transform.any_op - // expected-error @below {{incompatible payload operation name}} - %2 = transform.cast %0 : !transform.any_op to !transform.op<"test.some_op"> - transform.cast %2 : !transform.op<"test.some_op"> to !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + ^bb0(%arg0: !transform.any_op): + pdl.pattern @other : benefit(1) { + %0 = pdl.operation "other_dialect.other_op" + pdl.rewrite %0 with "transform.dialect" + } + + sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + %0 = pdl_match @other in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error @below {{incompatible payload operation name}} + %2 = transform.cast %0 : !transform.any_op to !transform.op<"test.some_op"> + transform.cast %2 : !transform.op<"test.some_op"> to !transform.any_op + } + } + transform.yield } } // ----- -transform.with_pdl_patterns { -^bb0(%arg0: !transform.any_op): - transform.sequence %arg0 : !transform.any_op failures(propagate) { - ^bb0(%arg1: !transform.any_op): - %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op - // here, the handles nested under are {%arg0, %arg1, %0} - // expected-remark @below {{3 handles nested under}} - transform.test_report_number_of_tracked_handles_nested_under %arg1 : !transform.any_op - // expected-remark @below {{erased}} - transform.test_emit_remark_and_erase_operand %0, "erased" : !transform.any_op - // here, the handles nested under are only {%arg0, %arg1} - // expected-remark @below {{2 handles nested under}} - transform.test_report_number_of_tracked_handles_nested_under %arg1 : !transform.any_op - } +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + ^bb0(%arg0: !transform.any_op): + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb0(%arg1: !transform.any_op): + %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op + // here, the handles nested under are {%root, %arg0, %arg1, %0} + // expected-remark @below {{4 handles nested under}} + 
transform.test_report_number_of_tracked_handles_nested_under %arg1 : !transform.any_op + // expected-remark @below {{erased}} + transform.test_emit_remark_and_erase_operand %0, "erased" : !transform.any_op + // here, the handles nested under are only {%root, %arg0, %arg1} + // expected-remark @below {{3 handles nested under}} + transform.test_report_number_of_tracked_handles_nested_under %arg1 : !transform.any_op + } - pdl.pattern @some : benefit(1) { - %0 = pdl.operation "test.some_op" - pdl.rewrite %0 with "transform.dialect" + pdl.pattern @some : benefit(1) { + %0 = pdl.operation "test.some_op" + pdl.rewrite %0 with "transform.dialect" + } + } + transform.yield } } @@ -1075,66 +1234,84 @@ func.func @split_handle(%a: index, %b: index, %c: index) { return } -transform.sequence -> !transform.any_op failures(propagate) { -^bb1(%fun: !transform.any_op): - %muli = transform.structured.match ops{["arith.muli"]} in %fun : (!transform.any_op) -> !transform.any_op - // expected-error @below {{expected to contain 3 payload ops but it contains 2 payload ops}} - %h_2:3 = split_handle %muli : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) - /// Test that yield does not crash in the presence of silenceable error in - /// propagate mode. - yield %fun : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.sequence %root : !transform.any_op -> !transform.any_op failures(propagate) { + ^bb1(%fun: !transform.any_op): + %muli = transform.structured.match ops{["arith.muli"]} in %fun : (!transform.any_op) -> !transform.any_op + // expected-error @below {{expected to contain 3 payload ops but it contains 2 payload ops}} + %h_2:3 = split_handle %muli : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + /// Test that yield does not crash in the presence of silenceable error in + /// propagate mode. + yield %fun : !transform.any_op + } + transform.yield + } } // ----- -transform.sequence -> !transform.any_op failures(suppress) { -^bb0(%arg0: !transform.any_op): - %muli = transform.structured.match ops{["arith.muli"]} in %arg0 : (!transform.any_op) -> !transform.any_op - // Edge case propagating empty handles in splitting. - %0:3 = split_handle %muli : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) - // Test does not crash when accessing the empty handle. - yield %0#0 : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.sequence %root : !transform.any_op -> !transform.any_op failures(suppress) { + ^bb0(%arg0: !transform.any_op): + %muli = transform.structured.match ops{["arith.muli"]} in %arg0 : (!transform.any_op) -> !transform.any_op + // Edge case propagating empty handles in splitting. + %0:3 = split_handle %muli : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + // Test does not crash when accessing the empty handle. 
+ yield %0#0 : !transform.any_op + } + transform.yield + } } // ----- -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - %0 = transform.test_produce_param (0 : i32) : !transform.test_dialect_param - // expected-remark @below {{0 : i32}} - transform.test_print_param %0 : !transform.test_dialect_param +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %0 = transform.test_produce_param (0 : i32) : !transform.test_dialect_param + // expected-remark @below {{0 : i32}} + transform.test_print_param %0 : !transform.test_dialect_param + transform.yield + } } // ----- -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - // expected-error @below {{expected the type of the parameter attribute ('i32') to match the parameter type ('i64')}} - transform.test_produce_param (0 : i32) : !transform.param +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + // expected-error @below {{expected the type of the parameter attribute ('i32') to match the parameter type ('i64')}} + transform.test_produce_param (0 : i32) : !transform.param + transform.yield + } } // ----- -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - %0 = transform.test_add_to_param 40 - %1 = transform.test_add_to_param %0, 2 - // expected-remark @below {{42 : i32}} - transform.test_print_param %1 : !transform.test_dialect_param +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %0 = transform.test_add_to_param 40 + %1 = transform.test_add_to_param %0, 2 + // expected-remark @below {{42 : i32}} + transform.test_print_param %1 : !transform.test_dialect_param + transform.yield + } } // ----- -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op - %1 = transform.test_produce_param_with_number_of_test_ops %0 : !transform.any_op - // expected-remark @below {{1 : i32, 3 : i32}} - transform.test_print_param %1 : !transform.test_dialect_param - %2 = transform.test_add_to_param %1, 100 - // expected-remark @below {{101 : i32, 103 : i32}} - transform.test_print_param %2 : !transform.test_dialect_param +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %1 = transform.test_produce_param_with_number_of_test_ops %0 : !transform.any_op + // expected-remark @below {{1 : i32, 3 : i32}} + transform.test_print_param %1 : !transform.test_dialect_param + %2 = transform.test_add_to_param %1, 100 + // expected-remark @below {{101 : i32, 103 : i32}} + transform.test_print_param %2 : !transform.test_dialect_param + transform.yield + } } func.func private @one_test_op(%arg0: i32) { @@ -1152,13 +1329,13 @@ func.func private @three_test_ops(%arg0: i32) { // ----- // expected-note @below {{when applied to this op}} -module { - transform.sequence failures(propagate) { - ^bb0(%arg0: !transform.any_op): +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { // expected-error @below {{expected to produce an Operation * for result #0}} transform.test_produce_transform_param_or_forward_operand %arg0 { 
first_result_is_param } : (!transform.any_op) -> (!transform.any_op, !transform.param) + transform.yield } } @@ -1166,92 +1343,88 @@ module { // Should not fail. -module { - transform.sequence failures(propagate) { - ^bb0(%arg0: !transform.any_op): +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { transform.test_produce_transform_param_or_forward_operand %arg0 { first_result_is_null } : (!transform.any_op) -> (!transform.any_op, !transform.param) + transform.yield } } // ----- // expected-note @below {{when applied to this op}} -module { - transform.sequence failures(propagate) { - ^bb0(%arg0: !transform.any_op): +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { // expected-error @below {{expected to produce an Attribute for result #1}} transform.test_produce_transform_param_or_forward_operand %arg0 { second_result_is_handle } : (!transform.any_op) -> (!transform.any_op, !transform.param) + transform.yield } } // ----- // expected-note @below {{when applied to this op}} -module { - transform.sequence failures(propagate) { - ^bb0(%arg0: !transform.any_op): +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { // expected-error @below {{expected to produce a Value for result #0}} transform.test_produce_transform_param_or_forward_operand %arg0 { second_result_is_handle } : (!transform.any_op) -> (!transform.any_value, !transform.param) + transform.yield } } // ----- -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - // expected-error @below {{attempting to assign a null payload op to this transform value}} - %0 = transform.test_produce_null_payload : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + // expected-error @below {{attempting to assign a null payload op to this transform value}} + %0 = transform.test_produce_null_payload : !transform.any_op + transform.yield + } } // ----- -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - // expected-error @below {{attempting to assign a null parameter to this transform value}} - %0 = transform.test_produce_null_param : !transform.param +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + // expected-error @below {{attempting to assign a null parameter to this transform value}} + %0 = transform.test_produce_null_param : !transform.param + transform.yield + } } // ----- -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - // expected-error @below {{attempting to assign a null payload value to this transform handle}} - %0 = transform.test_produce_null_value : !transform.any_value +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + // expected-error @below {{attempting to assign a null payload value to this transform handle}} + %0 = transform.test_produce_null_value : !transform.any_value + transform.yield + } } // ----- -// expected-error @below {{could not find a nested top-level transform op}} -// expected-note @below {{use the 'transform-file-name' option to provide transform as external file}} +// expected-error @below {{could not find a nested named sequence with name: __transform_main}} +// expected-error 
@below {{could not find transform entry point: __transform_main in either payload or transform module}} module { } // ----- -// expected-note @below {{previous top-level transform op}} -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): -} - -// expected-error @below {{more than one top-level transform op}} -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): -} - -// ----- - -transform.sequence failures(propagate) { -// expected-remark @below {{value handle}} -// expected-note @below {{value handle points to a block argument #0 in block #0 in region #0}} -^bb1(%arg0: !transform.any_op): - %0 = test_produce_value_handle_to_self_operand %arg0 : (!transform.any_op) -> !transform.any_value - test_print_remark_at_operand_value %0, "value handle" : !transform.any_value +module attributes {transform.with_named_sequence} { + // expected-remark @below {{value handle}} + // expected-note @below {{value handle points to a block argument #0 in block #0 in region #0}} + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %0 = transform.test_produce_value_handle_to_self_operand %arg0 : (!transform.any_op) -> !transform.any_value + transform.test_print_remark_at_operand_value %0, "value handle" : !transform.any_value + transform.yield + } } // ----- @@ -1263,11 +1436,13 @@ transform.sequence failures(propagate) { // expected-note @below {{value handle points to an op result #1}} %1:3 = "test.get_three_results"() : () -> (i32, i32, f32) -transform.sequence failures(propagate) { -^bb1(%arg0: !transform.any_op): - %2 = transform.structured.match ops{["test.get_two_results", "test.get_three_results"]} in %arg0 : (!transform.any_op) -> !transform.any_op - %3 = test_produce_value_handle_to_result %2, 1 : (!transform.any_op) -> !transform.any_value - test_print_remark_at_operand_value %3, "result handle" : !transform.any_value +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %2 = transform.structured.match ops{["test.get_two_results", "test.get_three_results"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %3 = transform.test_produce_value_handle_to_result %2, 1 : (!transform.any_op) -> !transform.any_value + transform.test_print_remark_at_operand_value %3, "result handle" : !transform.any_value + transform.yield + } } // ----- @@ -1285,21 +1460,25 @@ transform.sequence failures(propagate) { "test.regon_terminator"() : () -> () }) : () -> () -transform.sequence failures(propagate) { -^bb1(%arg0: !transform.any_op): - %2 = transform.structured.match ops{["test.match_anchor"]} in %arg0 : (!transform.any_op) -> !transform.any_op - %3 = test_produce_value_handle_to_argument_of_parent_block %2, 2 : (!transform.any_op) -> !transform.any_value - test_print_remark_at_operand_value %3, "block argument handle" : !transform.any_value +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %2 = transform.structured.match ops{["test.match_anchor"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %3 = transform.test_produce_value_handle_to_argument_of_parent_block %2, 2 : (!transform.any_op) -> !transform.any_value + transform.test_print_remark_at_operand_value %3, "block argument handle" : !transform.any_value + transform.yield + } } // ----- -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - // expected-note @below {{value defined here with type 
'!transform.test_dialect_param'}} - %0 = test_produce_param_with_number_of_test_ops %arg0 : !transform.any_op - // expected-error @below {{unexpectedly consumed a value that is not a handle as operand #0}} - test_consume_operand %0 : !transform.test_dialect_param +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + // expected-note @below {{value defined here with type '!transform.test_dialect_param'}} + %0 = transform.test_produce_param_with_number_of_test_ops %arg0 : !transform.any_op + // expected-error @below {{unexpectedly consumed a value that is not a handle as operand #0}} + transform.test_consume_operand %0 : !transform.test_dialect_param + transform.yield + } } // ----- @@ -1311,11 +1490,13 @@ func.func @get_result_of_op(%arg0: index, %arg1: index) -> index { return %r : index } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %addi = transform.structured.match ops{["arith.addi"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %result = transform.get_result %addi[0] : (!transform.any_op) -> !transform.any_value - transform.test_print_remark_at_operand_value %result, "addi result" : !transform.any_value +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %addi = transform.structured.match ops{["arith.addi"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %result = transform.get_result %addi[0] : (!transform.any_op) -> !transform.any_value + transform.test_print_remark_at_operand_value %result, "addi result" : !transform.any_value + transform.yield + } } // ----- @@ -1326,12 +1507,14 @@ func.func @get_out_of_bounds_result_of_op(%arg0: index, %arg1: index) -> index { return %r : index } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %addi = transform.structured.match ops{["arith.addi"]} in %arg1 : (!transform.any_op) -> !transform.any_op - // expected-error @below {{targeted op does not have enough results}} - %result = transform.get_result %addi[1] : (!transform.any_op) -> !transform.any_value - transform.test_print_remark_at_operand_value %result, "addi result" : !transform.any_value +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %addi = transform.structured.match ops{["arith.addi"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error @below {{targeted op does not have enough results}} + %result = transform.get_result %addi[1] : (!transform.any_op) -> !transform.any_value + transform.test_print_remark_at_operand_value %result, "addi result" : !transform.any_value + transform.yield + } } // ----- @@ -1342,12 +1525,14 @@ func.func @get_result_of_op(%arg0: index, %arg1: index) -> index { return %r : index } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %addi = transform.structured.match ops{["arith.addi"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %result = transform.get_result %addi[0] : (!transform.any_op) -> !transform.any_value - %op = transform.get_defining_op %result : (!transform.any_value) -> !transform.any_op - transform.test_print_remark_at_operand %op, "matched" : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %addi = transform.structured.match ops{["arith.addi"]} in %arg1 : (!transform.any_op) -> !transform.any_op 
+ %result = transform.get_result %addi[0] : (!transform.any_op) -> !transform.any_value + %op = transform.get_defining_op %result : (!transform.any_value) -> !transform.any_op + transform.test_print_remark_at_operand %op, "matched" : !transform.any_op + transform.yield + } } // ----- @@ -1358,13 +1543,15 @@ func.func @get_result_of_op_bbarg(%arg0: index, %arg1: index) -> index { return %r : index } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %addi = transform.structured.match ops{["arith.addi"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %bbarg = test_produce_value_handle_to_argument_of_parent_block %addi, 0 : (!transform.any_op) -> !transform.any_value - // expected-error @below {{cannot get defining op of block argument}} - %op = transform.get_defining_op %bbarg : (!transform.any_value) -> !transform.any_op - transform.test_print_remark_at_operand %op, "matched" : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %addi = transform.structured.match ops{["arith.addi"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %bbarg = transform.test_produce_value_handle_to_argument_of_parent_block %addi, 0 : (!transform.any_op) -> !transform.any_value + // expected-error @below {{cannot get defining op of block argument}} + %op = transform.get_defining_op %bbarg : (!transform.any_value) -> !transform.any_op + transform.test_print_remark_at_operand %op, "matched" : !transform.any_op + transform.yield + } } // ----- @@ -1377,9 +1564,9 @@ module @named_inclusion attributes { transform.with_named_sequence } { transform.yield } - transform.sequence failures(propagate) { - ^bb0(%arg0: !transform.any_op): - include @foo failures(propagate) (%arg0) : (!transform.any_op) -> () + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + transform.include @foo failures(propagate) (%arg0) : (!transform.any_op) -> () + transform.yield } } @@ -1400,9 +1587,9 @@ module @named_inclusion_in_named attributes { transform.with_named_sequence } { transform.yield } - transform.sequence failures(propagate) { - ^bb0(%arg0: !transform.any_op): + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { transform.include @bar failures(suppress) (%arg0) : (!transform.any_op) -> () + transform.yield } } @@ -1418,12 +1605,12 @@ module @named_operands attributes { transform.with_named_sequence } { transform.yield } - transform.sequence failures(propagate) { // expected-remark @below {{value}} // expected-note @below {{value handle points to a block argument #0 in block #0 in region #0}} - ^bb0(%arg0: !transform.any_op): + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { %0 = transform.test_produce_value_handle_to_self_operand %arg0 : (!transform.any_op) -> !transform.any_value - include @foo failures(propagate) (%arg0, %0) : (!transform.any_op, !transform.any_value) -> () + transform.include @foo failures(propagate) (%arg0, %0) : (!transform.any_op, !transform.any_value) -> () + transform.yield } } @@ -1439,11 +1626,11 @@ module @named_return attributes { transform.with_named_sequence } { transform.yield %arg0, %0 : !transform.any_op, !transform.any_value } - transform.sequence failures(propagate) { - ^bb0(%arg0: !transform.any_op): - %0:2 = include @foo failures(propagate) (%arg0) : (!transform.any_op) -> (!transform.any_op, !transform.any_value) + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %0:2 = 
transform.include @foo failures(propagate) (%arg0) : (!transform.any_op) -> (!transform.any_op, !transform.any_value) transform.test_print_remark_at_operand %0#0, "operation" : !transform.any_op transform.test_print_remark_at_operand_value %0#1, "value" : !transform.any_value + transform.yield } } @@ -1469,8 +1656,7 @@ module attributes { transform.with_named_sequence } { transform.yield } - transform.sequence failures(propagate) { - ^bb0(%root: !transform.any_op): + transform.named_sequence @__transform_main(%root: !transform.any_op) { transform.foreach_match in %root @match1 -> @action1, @match2 -> @action2 @@ -1493,11 +1679,11 @@ module attributes { transform.with_named_sequence } { transform.named_sequence @match(!transform.any_op {transform.readonly}) transform.named_sequence @action() - transform.sequence failures(propagate) { - ^bb0(%root: !transform.any_op): + transform.named_sequence @__transform_main(%root: !transform.any_op) { // expected-error @below {{unresolved external symbol @match}} transform.foreach_match in %root @match -> @action : (!transform.any_op) -> !transform.any_op + transform.yield } } @@ -1509,11 +1695,11 @@ module attributes { transform.with_named_sequence } { } transform.named_sequence @action() - transform.sequence failures(propagate) { - ^bb0(%root: !transform.any_op): + transform.named_sequence @__transform_main(%root: !transform.any_op) { // expected-error @below {{unresolved external symbol @action}} transform.foreach_match in %root @match -> @action : (!transform.any_op) -> !transform.any_op + transform.yield } } @@ -1529,17 +1715,17 @@ module attributes { transform.with_named_sequence } { transform.yield } - transform.sequence failures(propagate) { - ^bb0(%root: !transform.any_op): + transform.named_sequence @__transform_main(%root: !transform.any_op) { transform.foreach_match in %root @match -> @action : (!transform.any_op) -> !transform.any_op + transform.yield } } // ----- module attributes { transform.with_named_sequence } { - transform.named_sequence @match_func(%arg0: !transform.any_op {transform.readonly}) + transform.named_sequence @match_func(%arg0: !transform.any_op {transform.readonly}) -> !transform.any_op { transform.match.operation_name %arg0 ["func.func"] : !transform.any_op transform.yield %arg0 : !transform.any_op @@ -1550,8 +1736,7 @@ module attributes { transform.with_named_sequence } { transform.yield } - transform.sequence failures(propagate) { - ^bb(%arg0: !transform.any_op): + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { transform.foreach_match in %arg0 @match_func -> @print_func : (!transform.any_op) -> !transform.any_op transform.yield } @@ -1570,7 +1755,7 @@ module attributes { transform.with_named_sequence } { // ----- module attributes { transform.with_named_sequence } { - transform.named_sequence @eq_1(%arg0: !transform.any_op {transform.readonly}) + transform.named_sequence @eq_1(%arg0: !transform.any_op {transform.readonly}) -> !transform.any_op { transform.match.operation_name %arg0 ["func.func"] : !transform.any_op %0 = transform.test_produce_param_with_number_of_test_ops %arg0 : !transform.any_op @@ -1580,7 +1765,7 @@ module attributes { transform.with_named_sequence } { transform.yield %arg0 : !transform.any_op } - transform.named_sequence @ne_0(%arg0: !transform.any_op {transform.readonly}) + transform.named_sequence @ne_0(%arg0: !transform.any_op {transform.readonly}) -> !transform.any_op { transform.match.operation_name %arg0 ["func.func"] : !transform.any_op %0 = 
transform.test_produce_param_with_number_of_test_ops %arg0 : !transform.any_op @@ -1590,7 +1775,7 @@ module attributes { transform.with_named_sequence } { transform.yield %arg0 : !transform.any_op } - transform.named_sequence @gt_m1(%arg0: !transform.any_op {transform.readonly}) + transform.named_sequence @gt_m1(%arg0: !transform.any_op {transform.readonly}) -> !transform.any_op { transform.match.operation_name %arg0 ["func.func"] : !transform.any_op %0 = transform.test_produce_param_with_number_of_test_ops %arg0 : !transform.any_op @@ -1600,7 +1785,7 @@ module attributes { transform.with_named_sequence } { transform.yield %arg0 : !transform.any_op } - transform.named_sequence @ge_1(%arg0: !transform.any_op {transform.readonly}) + transform.named_sequence @ge_1(%arg0: !transform.any_op {transform.readonly}) -> !transform.any_op { transform.match.operation_name %arg0 ["func.func"] : !transform.any_op %0 = transform.test_produce_param_with_number_of_test_ops %arg0 : !transform.any_op @@ -1610,7 +1795,7 @@ module attributes { transform.with_named_sequence } { transform.yield %arg0 : !transform.any_op } - transform.named_sequence @lt_1(%arg0: !transform.any_op {transform.readonly}) + transform.named_sequence @lt_1(%arg0: !transform.any_op {transform.readonly}) -> !transform.any_op { transform.match.operation_name %arg0 ["func.func"] : !transform.any_op %0 = transform.test_produce_param_with_number_of_test_ops %arg0 : !transform.any_op @@ -1620,7 +1805,7 @@ module attributes { transform.with_named_sequence } { transform.yield %arg0 : !transform.any_op } - transform.named_sequence @le_1(%arg0: !transform.any_op {transform.readonly}) + transform.named_sequence @le_1(%arg0: !transform.any_op {transform.readonly}) -> !transform.any_op { transform.match.operation_name %arg0 ["func.func"] : !transform.any_op %0 = transform.test_produce_param_with_number_of_test_ops %arg0 : !transform.any_op @@ -1634,8 +1819,7 @@ module attributes { transform.with_named_sequence } { transform.yield } - transform.sequence failures(propagate) { - ^bb(%arg0: !transform.any_op): + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { %0 = transform.foreach_match in %arg0 @eq_1 -> @do_nothing : (!transform.any_op) -> !transform.any_op %1 = transform.foreach_match in %0 @ne_0 -> @do_nothing : (!transform.any_op) -> !transform.any_op %2 = transform.foreach_match in %1 @gt_m1 -> @do_nothing : (!transform.any_op) -> !transform.any_op @@ -1675,47 +1859,49 @@ func.func @test_tracked_rewrite() { func.return } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %0 = transform.structured.match ops{["transform.test_dummy_payload_op"]} in %arg1 : (!transform.any_op) -> !transform.any_op - // expected-remark @below {{2 iterations}} - transform.test_tracked_rewrite %0 : (!transform.any_op) -> () - // One replacement op (test.drop_mapping) is dropped from the mapping. - %p = num_associations %0 : (!transform.any_op) -> !transform.param - // expected-remark @below {{2}} - test_print_param %p : !transform.param +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %0 = transform.structured.match ops{["transform.test_dummy_payload_op"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-remark @below {{2 iterations}} + transform.test_tracked_rewrite %0 : (!transform.any_op) -> () + // One replacement op (test.drop_mapping) is dropped from the mapping. 
+ %p = transform.num_associations %0 : (!transform.any_op) -> !transform.param + // expected-remark @below {{2}} + transform.test_print_param %p : !transform.param + transform.yield + } } // ----- // Parameter deduplication happens by value -module { +module attributes {transform.with_named_sequence} { - transform.sequence failures(propagate) { - ^bb0(%0: !transform.any_op): + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { %1 = transform.param.constant 1 -> !transform.param %2 = transform.param.constant 1 -> !transform.param %3 = transform.param.constant 2 -> !transform.param %4 = transform.merge_handles %1, %2 { deduplicate } : !transform.param - %p = num_associations %4 : (!transform.param) -> !transform.param + %p = transform.num_associations %4 : (!transform.param) -> !transform.param // expected-remark @below {{1}} - test_print_param %p : !transform.param + transform.test_print_param %p : !transform.param %5 = transform.merge_handles %1, %1 { deduplicate } : !transform.param - %p2 = num_associations %5 : (!transform.param) -> !transform.param + %p2 = transform.num_associations %5 : (!transform.param) -> !transform.param // expected-remark @below {{1}} - test_print_param %p2 : !transform.param + transform.test_print_param %p2 : !transform.param %6 = transform.merge_handles %1, %3 { deduplicate } : !transform.param - %p3 = num_associations %6 : (!transform.param) -> !transform.param + %p3 = transform.num_associations %6 : (!transform.param) -> !transform.param // expected-remark @below {{2}} - test_print_param %p3 : !transform.param + transform.test_print_param %p3 : !transform.param %7 = transform.merge_handles %1, %1, %2, %3 : !transform.param - %p4 = num_associations %7 : (!transform.param) -> !transform.param + %p4 = transform.num_associations %7 : (!transform.param) -> !transform.param // expected-remark @below {{4}} - test_print_param %p4 : !transform.param + transform.test_print_param %p4 : !transform.param + transform.yield } } @@ -1723,32 +1909,34 @@ module { %0:3 = "test.get_two_results"() : () -> (i32, i32, f32) -transform.sequence failures(propagate) { -^bb1(%arg0: !transform.any_op): - %1 = transform.structured.match ops{["test.get_two_results"]} in %arg0 : (!transform.any_op) -> !transform.any_op - %2 = test_produce_value_handle_to_result %1, 0 : (!transform.any_op) -> !transform.any_value - %3 = test_produce_value_handle_to_result %1, 1 : (!transform.any_op) -> !transform.any_value +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %1 = transform.structured.match ops{["test.get_two_results"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %2 = transform.test_produce_value_handle_to_result %1, 0 : (!transform.any_op) -> !transform.any_value + %3 = transform.test_produce_value_handle_to_result %1, 1 : (!transform.any_op) -> !transform.any_value - %4 = transform.merge_handles %2, %2 { deduplicate } : !transform.any_value - %p = num_associations %4 : (!transform.any_value) -> !transform.param - // expected-remark @below {{1}} - test_print_param %p : !transform.param + %4 = transform.merge_handles %2, %2 { deduplicate } : !transform.any_value + %p = transform.num_associations %4 : (!transform.any_value) -> !transform.param + // expected-remark @below {{1}} + transform.test_print_param %p : !transform.param - %5 = transform.merge_handles %2, %3 { deduplicate } : !transform.any_value - %p2 = num_associations %5 : (!transform.any_value) -> !transform.param - // 
expected-remark @below {{2}} - test_print_param %p2 : !transform.param + %5 = transform.merge_handles %2, %3 { deduplicate } : !transform.any_value + %p2 = transform.num_associations %5 : (!transform.any_value) -> !transform.param + // expected-remark @below {{2}} + transform.test_print_param %p2 : !transform.param - %6 = test_produce_value_handle_to_result %1, 0 : (!transform.any_op) -> !transform.any_value - %7 = transform.merge_handles %2, %6 { deduplicate } : !transform.any_value - %p3 = num_associations %6 : (!transform.any_value) -> !transform.param - // expected-remark @below {{1}} - test_print_param %p3 : !transform.param + %6 = transform.test_produce_value_handle_to_result %1, 0 : (!transform.any_op) -> !transform.any_value + %7 = transform.merge_handles %2, %6 { deduplicate } : !transform.any_value + %p3 = transform.num_associations %6 : (!transform.any_value) -> !transform.param + // expected-remark @below {{1}} + transform.test_print_param %p3 : !transform.param - %8 = transform.merge_handles %2, %2, %3, %4 : !transform.any_value - %p4 = num_associations %8 : (!transform.any_value) -> !transform.param - // expected-remark @below {{4}} - test_print_param %p4 : !transform.param + %8 = transform.merge_handles %2, %2, %3, %4 : !transform.any_value + %p4 = transform.num_associations %8 : (!transform.any_value) -> !transform.param + // expected-remark @below {{4}} + transform.test_print_param %p4 : !transform.param + transform.yield + } } // ----- @@ -1775,18 +1963,20 @@ func.func @test_annotation() { %2 = "test.annotate_me"() {new_attr = 0} : () -> (i1) } -transform.sequence failures(propagate) { -^bb1(%arg0: !transform.any_op): - %0 = transform.structured.match ops{["test.annotate_me"]} in %arg0 : (!transform.any_op) -> !transform.any_op - %1 = transform.test_produce_param_with_number_of_test_ops %0 : !transform.any_op - transform.annotate %0 "new_attr" = %1 : !transform.any_op, !transform.test_dialect_param +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %0 = transform.structured.match ops{["test.annotate_me"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %1 = transform.test_produce_param_with_number_of_test_ops %0 : !transform.any_op + transform.annotate %0 "new_attr" = %1 : !transform.any_op, !transform.test_dialect_param - %2 = transform.param.constant 2 -> !transform.param - transform.annotate %0 "broadcast_attr" = %2 : !transform.any_op, !transform.param - transform.annotate %0 "unit_attr" : !transform.any_op + %2 = transform.param.constant 2 -> !transform.param + transform.annotate %0 "broadcast_attr" = %2 : !transform.any_op, !transform.param + transform.annotate %0 "unit_attr" : !transform.any_op - %3 = transform.param.constant "example" -> !transform.any_param - transform.annotate %0 "any_attr" = %3 : !transform.any_op, !transform.any_param + %3 = transform.param.constant "example" -> !transform.any_param + transform.annotate %0 "any_attr" = %3 : !transform.any_op, !transform.any_param + transform.yield + } } // ----- @@ -1798,12 +1988,14 @@ func.func @notify_payload_op_replaced(%arg0: index, %arg1: index) { return } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %0 = transform.structured.match attributes{original} in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = transform.structured.match attributes{replacement} in %arg1 : (!transform.any_op) -> !transform.any_op - test_notify_payload_op_replaced %0, %1 : (!transform.any_op, !transform.any_op) 
-> () - test_print_remark_at_operand %0, "updated handle" : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %0 = transform.structured.match attributes{original} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.structured.match attributes{replacement} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.test_notify_payload_op_replaced %0, %1 : (!transform.any_op, !transform.any_op) -> () + transform.test_print_remark_at_operand %0, "updated handle" : !transform.any_op + transform.yield + } } // ----- @@ -1832,47 +2024,49 @@ func.func @test_apply_cse() -> (index, index, index) { return %0, %1, %3 : index, index, index } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %first = transform.structured.match attributes{first} in %0 : (!transform.any_op) -> !transform.any_op - %elim_first = transform.structured.match ops{["arith.constant"]} in %first : (!transform.any_op) -> !transform.any_op - %second = transform.structured.match attributes{first} in %0 : (!transform.any_op) -> !transform.any_op - %elim_second = transform.structured.match ops{["arith.constant"]} in %first : (!transform.any_op) -> !transform.any_op - - // There are 3 arith.constant ops. - %all = transform.structured.match ops{["arith.constant"]} in %0 : (!transform.any_op) -> !transform.any_op - %p = num_associations %all : (!transform.any_op) -> !transform.param - // expected-remark @below{{3}} - test_print_param %p : !transform.param - // "deduplicate" has no effect because these are 3 different ops. - %merged_before = transform.merge_handles deduplicate %all : !transform.any_op - %p2 = num_associations %merged_before : (!transform.any_op) -> !transform.param - // expected-remark @below{{3}} - test_print_param %p2 : !transform.param - - // Apply CSE. - transform.apply_cse to %0 : !transform.any_op - - // The handle is still mapped to 3 arith.constant ops. - %p3 = num_associations %all : (!transform.any_op) -> !transform.param - // expected-remark @below{{3}} - test_print_param %p3 : !transform.param - // But they are all the same op. - %merged_after = transform.merge_handles deduplicate %all : !transform.any_op - %p4 = num_associations %merged_after : (!transform.any_op) -> !transform.param - // expected-remark @below{{1}} - test_print_param %p4 : !transform.param - - // The other handles were also updated. 
- test_print_remark_at_operand %elim_first, "eliminated 1" : !transform.any_op - %p5 = num_associations %elim_first : (!transform.any_op) -> !transform.param - // expected-remark @below{{1}} - test_print_param %p5 : !transform.param - test_print_remark_at_operand %elim_second, "eliminated 2" : !transform.any_op - %p6 = num_associations %elim_second : (!transform.any_op) -> !transform.param - // expected-remark @below{{1}} - test_print_param %p6 : !transform.param +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %first = transform.structured.match attributes{first} in %0 : (!transform.any_op) -> !transform.any_op + %elim_first = transform.structured.match ops{["arith.constant"]} in %first : (!transform.any_op) -> !transform.any_op + %second = transform.structured.match attributes{first} in %0 : (!transform.any_op) -> !transform.any_op + %elim_second = transform.structured.match ops{["arith.constant"]} in %first : (!transform.any_op) -> !transform.any_op + + // There are 3 arith.constant ops. + %all = transform.structured.match ops{["arith.constant"]} in %0 : (!transform.any_op) -> !transform.any_op + %p = transform.num_associations %all : (!transform.any_op) -> !transform.param + // expected-remark @below{{3}} + transform.test_print_param %p : !transform.param + // "deduplicate" has no effect because these are 3 different ops. + %merged_before = transform.merge_handles deduplicate %all : !transform.any_op + %p2 = transform.num_associations %merged_before : (!transform.any_op) -> !transform.param + // expected-remark @below{{3}} + transform.test_print_param %p2 : !transform.param + + // Apply CSE. + transform.apply_cse to %0 : !transform.any_op + + // The handle is still mapped to 3 arith.constant ops. + %p3 = transform.num_associations %all : (!transform.any_op) -> !transform.param + // expected-remark @below{{3}} + transform.test_print_param %p3 : !transform.param + // But they are all the same op. + %merged_after = transform.merge_handles deduplicate %all : !transform.any_op + %p4 = transform.num_associations %merged_after : (!transform.any_op) -> !transform.param + // expected-remark @below{{1}} + transform.test_print_param %p4 : !transform.param + + // The other handles were also updated. 
+ transform.test_print_remark_at_operand %elim_first, "eliminated 1" : !transform.any_op + %p5 = transform.num_associations %elim_first : (!transform.any_op) -> !transform.param + // expected-remark @below{{1}} + transform.test_print_param %p5 : !transform.param + transform.test_print_remark_at_operand %elim_second, "eliminated 2" : !transform.any_op + %p6 = transform.num_associations %elim_second : (!transform.any_op) -> !transform.param + // expected-remark @below{{1}} + transform.test_print_param %p6 : !transform.param + transform.yield + } } // ----- @@ -1890,24 +2084,26 @@ func.func @test_licm(%arg0: index, %arg1: index, %arg2: index) { return } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %0 = transform.structured.match ops{["scf.for"]} in %arg1 : (!transform.any_op) -> !transform.any_op - transform.apply_licm to %0 : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %0 = transform.structured.match ops{["scf.for"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_licm to %0 : !transform.any_op + transform.yield + } } // ----- // expected-note @below{{when applied to this op}} -module { +module attributes {transform.with_named_sequence} { func.func @test_licm_invalid() { return } - transform.sequence failures(propagate) { - ^bb1(%arg1: !transform.any_op): + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { // expected-error @below{{transform applied to the wrong op kind}} transform.apply_licm to %arg1 : !transform.any_op + transform.yield } } @@ -1924,38 +2120,40 @@ func.func @get_parent_op() { }) : () -> () } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %0 = transform.structured.match ops{["test.qux"]} in %arg1 : (!transform.any_op) -> !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %0 = transform.structured.match ops{["test.qux"]} in %arg1 : (!transform.any_op) -> !transform.any_op - // Get parent by name. - %1 = transform.get_parent_op %0 {op_name = "test.foo"} : (!transform.any_op) -> !transform.any_op - test_print_remark_at_operand %1, "found test.foo parent" : !transform.any_op + // Get parent by name. + %1 = transform.get_parent_op %0 {op_name = "test.foo"} : (!transform.any_op) -> !transform.any_op + transform.test_print_remark_at_operand %1, "found test.foo parent" : !transform.any_op - // Get immediate parent. - %2 = transform.get_parent_op %0 : (!transform.any_op) -> !transform.any_op - test_print_remark_at_operand %2, "direct parent" : !transform.any_op - %p = num_associations %2 : (!transform.any_op) -> !transform.param - // expected-remark @below{{2}} - test_print_param %p : !transform.param + // Get immediate parent. + %2 = transform.get_parent_op %0 : (!transform.any_op) -> !transform.any_op + transform.test_print_remark_at_operand %2, "direct parent" : !transform.any_op + %p = transform.num_associations %2 : (!transform.any_op) -> !transform.param + // expected-remark @below{{2}} + transform.test_print_param %p : !transform.param - // Deduplicate results. 
- %3 = transform.structured.match ops{["test.qux"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %4 = transform.get_parent_op %3 {deduplicate} : (!transform.any_op) -> !transform.any_op - %p2 = num_associations %4 : (!transform.any_op) -> !transform.param - // expected-remark @below{{1}} - test_print_param %p2 : !transform.param + // Deduplicate results. + %3 = transform.structured.match ops{["test.qux"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %4 = transform.get_parent_op %3 {deduplicate} : (!transform.any_op) -> !transform.any_op + %p2 = transform.num_associations %4 : (!transform.any_op) -> !transform.param + // expected-remark @below{{1}} + transform.test_print_param %p2 : !transform.param + transform.yield + } } // ----- // expected-note @below {{target op}} -module { - transform.sequence failures(propagate) { - ^bb0(%arg0: !transform.any_op): +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { // expected-error @below{{could not find a parent op that matches all requirements}} - %3 = get_parent_op %arg0 {op_name = "builtin.module"} : (!transform.any_op) -> !transform.any_op + %3 = transform.get_parent_op %arg0 {op_name = "builtin.module"} : (!transform.any_op) -> !transform.any_op + transform.yield } } @@ -1967,29 +2165,34 @@ func.func @cast(%arg0: f32) -> f64 { return %0 : f64 } -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - %0 = transform.structured.match ops{["arith.extf"]} in %arg0 : (!transform.any_op) -> !transform.op<"arith.extf"> - %1 = transform.get_result %0[0] : (!transform.op<"arith.extf">) -> !transform.any_value - %2 = transform.get_type %1 : (!transform.any_value) -> !transform.type - transform.test_print_param %2 at %0 : !transform.type, !transform.op<"arith.extf"> - transform.yield +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %0 = transform.structured.match ops{["arith.extf"]} in %arg0 : (!transform.any_op) -> !transform.op<"arith.extf"> + %1 = transform.get_result %0[0] : (!transform.op<"arith.extf">) -> !transform.any_value + %2 = transform.get_type %1 : (!transform.any_value) -> !transform.type + transform.test_print_param %2 at %0 : !transform.type, !transform.op<"arith.extf"> + transform.yield + } } // ----- -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - // expected-error @below {{expected type attribute, got 0 : i32}} - transform.test_produce_param (0 : i32) : !transform.type +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + // expected-error @below {{expected type attribute, got 0 : i32}} + transform.test_produce_param (0 : i32) : !transform.type + transform.yield + } } // ----- -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - // expected-error @below {{expected affine map attribute, got 0 : i32}} - transform.test_produce_param (0 : i32) : !transform.affine_map +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + // expected-error @below {{expected affine map attribute, got 0 : i32}} + transform.test_produce_param (0 : i32) : !transform.affine_map + transform.yield + } } // ----- @@ -1997,10 +2200,12 @@ transform.sequence failures(propagate) { // CHECK-LABEL: @type_param_anchor func.func private @type_param_anchor() -transform.sequence 
failures(propagate) { -^bb0(%arg0: !transform.any_op): - // CHECK: test_produce_param(f32) : !transform.type - transform.test_produce_param(f32) : !transform.type +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + // CHECK: test_produce_param(f32) : !transform.type + transform.test_produce_param(f32) : !transform.type + transform.yield + } } // ----- @@ -2008,10 +2213,12 @@ transform.sequence failures(propagate) { // CHECK-LABEL: @affine_map_param_anchor func.func private @affine_map_param_anchor() -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - // CHECK: test_produce_param(#{{.*}}) : !transform.affine_map - transform.test_produce_param(affine_map<(d0) -> ()>) : !transform.affine_map +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + // CHECK: test_produce_param(#{{.*}}) : !transform.affine_map + transform.test_produce_param(affine_map<(d0) -> ()>) : !transform.affine_map + transform.yield + } } // ----- @@ -2020,10 +2227,12 @@ func.func @verify_success(%arg0: f64) -> f64 { return %arg0 : f64 } -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op - transform.verify %0 : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.verify %0 : !transform.any_op + transform.yield + } } // ----- @@ -2034,12 +2243,14 @@ func.func @verify_failure(%arg0: f64) -> f64 { return %arg0 : f64 } -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op - transform.test_produce_invalid_ir %0 : !transform.any_op - // expected-error @below{{failed to verify payload op}} - transform.verify %0 : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.test_produce_invalid_ir %0 : !transform.any_op + // expected-error @below{{failed to verify payload op}} + transform.verify %0 : !transform.any_op + transform.yield + } } // ----- @@ -2054,22 +2265,24 @@ func.func @select() { func.return } -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - // Match all ops inside the function (including the function itself). - %func_op = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op - %0 = transform.structured.match in %func_op : (!transform.any_op) -> !transform.any_op - %p = num_associations %0 : (!transform.any_op) -> !transform.param - // expected-remark @below{{5}} - test_print_param %p : !transform.param +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + // Match all ops inside the function (including the function itself). 
+ %func_op = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match in %func_op : (!transform.any_op) -> !transform.any_op + %p = transform.num_associations %0 : (!transform.any_op) -> !transform.param + // expected-remark @below{{5}} + transform.test_print_param %p : !transform.param - // Select "test.foo". - %foo = transform.select "test.foo" in %0 : (!transform.any_op) -> !transform.any_op - test_print_remark_at_operand %foo, "found foo" : !transform.any_op + // Select "test.foo". + %foo = transform.select "test.foo" in %0 : (!transform.any_op) -> !transform.any_op + transform.test_print_remark_at_operand %foo, "found foo" : !transform.any_op - // Select "test.bar". - %bar = transform.select "test.bar" in %0 : (!transform.any_op) -> !transform.any_op - test_print_remark_at_operand %bar, "found bar" : !transform.any_op + // Select "test.bar". + %bar = transform.select "test.bar" in %0 : (!transform.any_op) -> !transform.any_op + transform.test_print_remark_at_operand %bar, "found bar" : !transform.any_op + transform.yield + } } // ----- @@ -2085,15 +2298,17 @@ func.func @apply_dce(%f: f32, %m: memref<5xf32>, %idx: index) { return } -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - %func_op = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op - %empty_op = transform.structured.match ops{["tensor.empty"]} in %func_op : (!transform.any_op) -> !transform.any_op - transform.apply_dce to %func_op : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %func_op = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %empty_op = transform.structured.match ops{["tensor.empty"]} in %func_op : (!transform.any_op) -> !transform.any_op + transform.apply_dce to %func_op : !transform.any_op - %p = num_associations %empty_op : (!transform.any_op) -> !transform.param - // expected-remark @below{{0}} - test_print_param %p : !transform.param + %p = transform.num_associations %empty_op : (!transform.any_op) -> !transform.param + // expected-remark @below{{0}} + transform.test_print_param %p : !transform.param + transform.yield + } } @@ -2110,27 +2325,26 @@ module @named_inclusion attributes { transform.with_named_sequence } { // Match `arith.constant`s that are not nested under a `scf.for` and ensure // there are none in the program -transform.named_sequence @print(%root: !transform.any_op {transform.readonly}) { - transform.test_print_remark_at_operand %root, "matched func" : !transform.any_op - transform.yield -} + transform.named_sequence @print(%root: !transform.any_op {transform.readonly}) { + transform.test_print_remark_at_operand %root, "matched func" : !transform.any_op + transform.yield + } -transform.named_sequence @match_constant_not_under_scf_for(%root: !transform.any_op {transform.readonly}) - -> !transform.any_op { - transform.match.operation_name %root ["arith.constant"] : !transform.any_op - %for = transform.get_parent_op %root { op_name = "scf.for", allow_empty_results } - : (!transform.any_op) -> (!transform.any_op) - transform.match.operation_empty %for : !transform.any_op - transform.yield %root : !transform.any_op -} + transform.named_sequence @match_constant_not_under_scf_for(%root: !transform.any_op {transform.readonly}) + -> !transform.any_op { + transform.match.operation_name %root 
["arith.constant"] : !transform.any_op + %for = transform.get_parent_op %root { op_name = "scf.for", allow_empty_results } + : (!transform.any_op) -> (!transform.any_op) + transform.match.operation_empty %for : !transform.any_op + transform.yield %root : !transform.any_op + } -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - transform.foreach_match in %arg0 - @match_constant_not_under_scf_for -> @print - : (!transform.any_op) -> (!transform.any_op) - transform.yield -} + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + transform.foreach_match in %arg0 + @match_constant_not_under_scf_for -> @print + : (!transform.any_op) -> (!transform.any_op) + transform.yield + } } // ----- @@ -2142,28 +2356,27 @@ func.func @no_constant_under_loop(%lb: index, %ub: index, %step: index) { } module @named_inclusion attributes { transform.with_named_sequence } { -// Match `arith.constant`s that are not nested under a `scf.for` and ensure -// there are none in the program + // Match `arith.constant`s that are not nested under a `scf.for` and ensure + // there are none in the program -transform.named_sequence @print(%root: !transform.any_op {transform.readonly}) { - transform.test_print_remark_at_operand %root, "no parent scf.for" : !transform.any_op - transform.yield -} + transform.named_sequence @print(%root: !transform.any_op {transform.readonly}) { + transform.test_print_remark_at_operand %root, "no parent scf.for" : !transform.any_op + transform.yield + } -transform.named_sequence @match_constant_not_under_scf_for(%root: !transform.any_op {transform.readonly}) - -> !transform.any_op { - transform.match.operation_name %root ["arith.constant"] : !transform.any_op - %for = transform.get_parent_op %root { op_name = "scf.for", allow_empty_results } - : (!transform.any_op) -> (!transform.any_op) - transform.match.operation_empty %for : !transform.any_op - transform.yield %root : !transform.any_op -} + transform.named_sequence @match_constant_not_under_scf_for(%root: !transform.any_op {transform.readonly}) + -> !transform.any_op { + transform.match.operation_name %root ["arith.constant"] : !transform.any_op + %for = transform.get_parent_op %root { op_name = "scf.for", allow_empty_results } + : (!transform.any_op) -> (!transform.any_op) + transform.match.operation_empty %for : !transform.any_op + transform.yield %root : !transform.any_op + } -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - transform.foreach_match in %arg0 - @match_constant_not_under_scf_for -> @print - : (!transform.any_op) -> (!transform.any_op) - transform.yield -} + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + transform.foreach_match in %arg0 + @match_constant_not_under_scf_for -> @print + : (!transform.any_op) -> (!transform.any_op) + transform.yield + } } From 2ab5c47c8752b444885d6bfaf6f570a482fb4cdf Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 4 Jan 2024 20:39:44 +0000 Subject: [PATCH 278/313] [VPlan] Don't replace scalarizing recipe with VPWidenCastRecipe. Don't replace a scalarizing recipe with a VPWidenCastRecipe. This would introduce wide (vectorizing) recipes when interleaving only. 
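For reference, the offending pattern comes from the regression test added
below (interleave-and-scalarize-only.ll); the excerpt is illustrative only:

  %l = load i8, ptr %gep.src
  %sext = sext i8 %l to i32
  %trunc = trunc i32 %sext to i16
  %sdiv = sdiv i16 %trunc, %arg

When the loop is only interleaved (not vectorized), each interleaved lane
keeps its own scalar sext/trunc, as the CHECK lines in the added test show,
so the trunc(ext(A)) fold must be skipped when the truncate is produced by a
scalarizing recipe rather than rewritten into a VPWidenCastRecipe.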
Fixes https://github.com/llvm/llvm-project/issues/76986 --- .../Transforms/Vectorize/VPlanTransforms.cpp | 23 ++++++---- .../interleave-and-scalarize-only.ll | 45 +++++++++++++++++++ 2 files changed, 59 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 33132880d5a44..5c430620a2dcd 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -829,15 +829,20 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { Type *ATy = TypeInfo.inferScalarType(A); if (TruncTy == ATy) { Trunc->replaceAllUsesWith(A); - } else if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) { - auto *VPC = - new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy); - VPC->insertBefore(&R); - Trunc->replaceAllUsesWith(VPC); - } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) { - auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy); - VPC->insertBefore(&R); - Trunc->replaceAllUsesWith(VPC); + } else { + // Don't replace a scalarizing recipe with a widened cast. + if (isa(&R)) + break; + if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) { + auto *VPC = + new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy); + VPC->insertBefore(&R); + Trunc->replaceAllUsesWith(VPC); + } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) { + auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy); + VPC->insertBefore(&R); + Trunc->replaceAllUsesWith(VPC); + } } #ifndef NDEBUG // Verify that the cached type info is for both A and its users is still diff --git a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll index 7b9d7f7986396..297cd2a7c12f9 100644 --- a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll +++ b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll @@ -310,3 +310,48 @@ loop: exit: ret void } + +define void @pr76986_trunc_sext_interleaving_only(i16 %arg, ptr noalias %src, ptr noalias %dst) { +; CHECK-LABEL: define void @pr76986_trunc_sext_interleaving_only( +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr %src, i64 [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr %src, i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = sext i8 [[TMP4]] to i32 +; CHECK-NEXT: [[TMP7:%.*]] = sext i8 [[TMP5]] to i32 +; CHECK-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP6]] to i16 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP7]] to i16 +; CHECK-NEXT: [[TMP10:%.*]] = sdiv i16 [[TMP8]], %arg +; CHECK-NEXT: [[TMP11:%.*]] = sdiv i16 [[TMP9]], %arg +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr %dst, i64 [[TMP0]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr %dst, i64 [[TMP1]] +; CHECK-NEXT: store i16 [[TMP10]], ptr [[TMP12]], align 2 +; CHECK-NEXT: store i16 [[TMP11]], ptr [[TMP13]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 14934 +; CHECK-NEXT: br i1 [[TMP14]], label 
%middle.block, label %vector.body +; +bb: + br label %loop + +loop: + %iv = phi i64 [ 0, %bb ], [ %iv.next, %loop ] + %gep.src = getelementptr inbounds i8, ptr %src, i64 %iv + %l = load i8, ptr %gep.src + %sext = sext i8 %l to i32 + %trunc = trunc i32 %sext to i16 + %sdiv = sdiv i16 %trunc, %arg + %gep.dst = getelementptr inbounds i16, ptr %dst, i64 %iv + store i16 %sdiv, ptr %gep.dst + %iv.next = add i64 %iv, 1 + %icmp = icmp ult i64 %iv, 14933 + br i1 %icmp, label %loop, label %exit + +exit: + ret void +} + From 58f1640635feff282935153d295dfb4ea1818401 Mon Sep 17 00:00:00 2001 From: Michael Maitland Date: Thu, 4 Jan 2024 15:40:30 -0500 Subject: [PATCH 279/313] [RISCV][llvm-mca] Use correct LMUL and SEW for strided loads and stores (#76869) The pseudos for strided loads and stores use the SEW coming from the name. For example, vlse8 has SEW=8 and vlse16 has SEW=16. When llvm-mca tries to lookup (VLSE8_V, SEW=S, LMUL=L) in the inverse pseudo table, a result will only be found when S=8, where S was set from the previous vsetvli instruction. Instead, for a match to be found, we must lookup (VLSE8_V, SEW=8, LMUL=L') where L' is the EMUL which was calculated by scaling the LMUL and SEW from the previous vsetvli and the SEW=8. --- .../Target/RISCV/MCA/RISCVCustomBehaviour.cpp | 33 +- .../RISCV/SiFive7/strided-load-store.s | 361 ++++++++++++++++++ 2 files changed, 385 insertions(+), 9 deletions(-) create mode 100644 llvm/test/tools/llvm-mca/RISCV/SiFive7/strided-load-store.s diff --git a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp index aba2511959af0..8d97c5ffd20a0 100644 --- a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp +++ b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp @@ -186,30 +186,37 @@ RISCVInstrumentManager::createInstruments(const MCInst &Inst) { } static std::pair -getEEWAndEMULForUnitStrideLoadStore(unsigned Opcode, RISCVII::VLMUL LMUL, - uint8_t SEW) { +getEEWAndEMUL(unsigned Opcode, RISCVII::VLMUL LMUL, uint8_t SEW) { uint8_t EEW; switch (Opcode) { case RISCV::VLM_V: case RISCV::VSM_V: case RISCV::VLE8_V: case RISCV::VSE8_V: + case RISCV::VLSE8_V: + case RISCV::VSSE8_V: EEW = 8; break; case RISCV::VLE16_V: case RISCV::VSE16_V: + case RISCV::VLSE16_V: + case RISCV::VSSE16_V: EEW = 16; break; case RISCV::VLE32_V: case RISCV::VSE32_V: + case RISCV::VLSE32_V: + case RISCV::VSSE32_V: EEW = 32; break; case RISCV::VLE64_V: case RISCV::VSE64_V: + case RISCV::VLSE64_V: + case RISCV::VSSE64_V: EEW = 64; break; default: - llvm_unreachable("Opcode is not a vector unit stride load nor store"); + llvm_unreachable("Could not determine EEW from Opcode"); } auto EMUL = RISCVVType::getSameRatioLMUL(SEW, LMUL, EEW); @@ -218,6 +225,18 @@ getEEWAndEMULForUnitStrideLoadStore(unsigned Opcode, RISCVII::VLMUL LMUL, return std::make_pair(EEW, *EMUL); } +bool opcodeHasEEWAndEMULInfo(unsigned short Opcode) { + return Opcode == RISCV::VLM_V || Opcode == RISCV::VSM_V || + Opcode == RISCV::VLE8_V || Opcode == RISCV::VSE8_V || + Opcode == RISCV::VLE16_V || Opcode == RISCV::VSE16_V || + Opcode == RISCV::VLE32_V || Opcode == RISCV::VSE32_V || + Opcode == RISCV::VLE64_V || Opcode == RISCV::VSE64_V || + Opcode == RISCV::VLSE8_V || Opcode == RISCV::VSSE8_V || + Opcode == RISCV::VLSE16_V || Opcode == RISCV::VSSE16_V || + Opcode == RISCV::VLSE32_V || Opcode == RISCV::VSSE32_V || + Opcode == RISCV::VLSE64_V || Opcode == RISCV::VSSE64_V; +} + unsigned RISCVInstrumentManager::getSchedClassID( const MCInstrInfo &MCII, const MCInst &MCI, const 
llvm::SmallVector &IVec) const { @@ -249,13 +268,9 @@ unsigned RISCVInstrumentManager::getSchedClassID( uint8_t SEW = SI ? SI->getSEW() : 0; const RISCVVInversePseudosTable::PseudoInfo *RVV = nullptr; - if (Opcode == RISCV::VLM_V || Opcode == RISCV::VSM_V || - Opcode == RISCV::VLE8_V || Opcode == RISCV::VSE8_V || - Opcode == RISCV::VLE16_V || Opcode == RISCV::VSE16_V || - Opcode == RISCV::VLE32_V || Opcode == RISCV::VSE32_V || - Opcode == RISCV::VLE64_V || Opcode == RISCV::VSE64_V) { + if (opcodeHasEEWAndEMULInfo(Opcode)) { RISCVII::VLMUL VLMUL = static_cast(LMUL); - auto [EEW, EMUL] = getEEWAndEMULForUnitStrideLoadStore(Opcode, VLMUL, SEW); + auto [EEW, EMUL] = getEEWAndEMUL(Opcode, VLMUL, SEW); RVV = RISCVVInversePseudosTable::getBaseInfo(Opcode, EMUL, EEW); } else { // Check if it depends on LMUL and SEW diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFive7/strided-load-store.s b/llvm/test/tools/llvm-mca/RISCV/SiFive7/strided-load-store.s new file mode 100644 index 0000000000000..d60922b064627 --- /dev/null +++ b/llvm/test/tools/llvm-mca/RISCV/SiFive7/strided-load-store.s @@ -0,0 +1,361 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-x280 -iterations=1 < %s | FileCheck %s + +vsetvli zero, zero, e8, mf8, tu, mu +vlse8.v v1, (a1), a2 +vlse16.v v1, (a1), a2 +vlse32.v v1, (a1), a2 +vlse64.v v1, (a1), a2 + +vsetvli zero, zero, e8, mf4, tu, mu +vlse8.v v1, (a1), a2 +vlse16.v v1, (a1), a2 +vlse32.v v1, (a1), a2 +vlse64.v v1, (a1), a2 + +vsetvli zero, zero, e8, mf2, tu, mu +vlse8.v v1, (a1), a2 +vlse16.v v1, (a1), a2 +vlse32.v v1, (a1), a2 +vlse64.v v1, (a1), a2 + +vsetvli zero, zero, e8, m1, tu, mu +vlse8.v v1, (a1), a2 +vlse16.v v1, (a1), a2 +vlse32.v v1, (a1), a2 +vlse64.v v1, (a1), a2 + +vsetvli zero, zero, e8, m2, tu, mu +vlse8.v v1, (a1), a2 +vlse16.v v1, (a1), a2 +vlse32.v v1, (a1), a2 + +vsetvli zero, zero, e8, m4, tu, mu +vlse8.v v1, (a1), a2 +vlse16.v v1, (a1), a2 + +vsetvli zero, zero, e8, m8, tu, mu +vlse8.v v1, (a1), a2 + +vsetvli zero, zero, e16, mf4, tu, mu +vlse8.v v1, (a1), a2 +vlse16.v v1, (a1), a2 +vlse32.v v1, (a1), a2 +vlse64.v v1, (a1), a2 + +vsetvli zero, zero, e16, mf2, tu, mu +vlse8.v v1, (a1), a2 +vlse16.v v1, (a1), a2 +vlse32.v v1, (a1), a2 +vlse64.v v1, (a1), a2 + +vsetvli zero, zero, e16, m1, tu, mu +vlse8.v v1, (a1), a2 +vlse16.v v1, (a1), a2 +vlse32.v v1, (a1), a2 +vlse64.v v1, (a1), a2 + +vsetvli zero, zero, e16, m2, tu, mu +vlse8.v v1, (a1), a2 +vlse16.v v1, (a1), a2 +vlse32.v v1, (a1), a2 +vlse64.v v1, (a1), a2 + +vsetvli zero, zero, e16, m4, tu, mu +vlse8.v v1, (a1), a2 +vlse16.v v1, (a1), a2 +vlse32.v v1, (a1), a2 + +vsetvli zero, zero, e16, m8, tu, mu +vlse8.v v1, (a1), a2 +vlse16.v v1, (a1), a2 + +vsetvli zero, zero, e32, mf2, tu, mu +vlse8.v v1, (a1), a2 +vlse16.v v1, (a1), a2 +vlse32.v v1, (a1), a2 +vlse64.v v1, (a1), a2 + +vsetvli zero, zero, e32, m1, tu, mu +vlse8.v v1, (a1), a2 +vlse16.v v1, (a1), a2 +vlse32.v v1, (a1), a2 +vlse64.v v1, (a1), a2 + +vsetvli zero, zero, e32, m2, tu, mu +vlse8.v v1, (a1), a2 +vlse16.v v1, (a1), a2 +vlse32.v v1, (a1), a2 +vlse64.v v1, (a1), a2 + +vsetvli zero, zero, e32, m4, tu, mu +vlse8.v v1, (a1), a2 +vlse16.v v1, (a1), a2 +vlse32.v v1, (a1), a2 +vlse64.v v1, (a1), a2 + +vsetvli zero, zero, e32, m8, tu, mu +vlse8.v v1, (a1), a2 +vlse16.v v1, (a1), a2 +vlse32.v v1, (a1), a2 + +vsetvli zero, zero, e64, m1, tu, mu +vlse8.v v1, (a1), a2 +vlse16.v v1, (a1), a2 +vlse32.v v1, (a1), a2 +vlse64.v v1, (a1), a2 + +vsetvli zero, zero, e64, m2, tu, mu 
+vlse8.v v1, (a1), a2 +vlse16.v v1, (a1), a2 +vlse32.v v1, (a1), a2 +vlse64.v v1, (a1), a2 + +vsetvli zero, zero, e64, m4, tu, mu +vlse8.v v1, (a1), a2 +vlse16.v v1, (a1), a2 +vlse32.v v1, (a1), a2 +vlse64.v v1, (a1), a2 + +vsetvli zero, zero, e64, m8, tu, mu +vlse8.v v1, (a1), a2 +vlse16.v v1, (a1), a2 +vlse32.v v1, (a1), a2 +vlse64.v v1, (a1), a2 + +# CHECK: Iterations: 1 +# CHECK-NEXT: Instructions: 100 +# CHECK-NEXT: Total Cycles: 4734 +# CHECK-NEXT: Total uOps: 100 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.02 +# CHECK-NEXT: IPC: 0.02 +# CHECK-NEXT: Block RThroughput: 4686.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e8, mf8, tu, mu +# CHECK-NEXT: 1 11 9.00 * vlse8.v v1, (a1), a2 +# CHECK-NEXT: 1 11 9.00 * vlse16.v v1, (a1), a2 +# CHECK-NEXT: 1 11 9.00 * vlse32.v v1, (a1), a2 +# CHECK-NEXT: 1 11 9.00 * vlse64.v v1, (a1), a2 +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e8, mf4, tu, mu +# CHECK-NEXT: 1 19 17.00 * vlse8.v v1, (a1), a2 +# CHECK-NEXT: 1 19 17.00 * vlse16.v v1, (a1), a2 +# CHECK-NEXT: 1 19 17.00 * vlse32.v v1, (a1), a2 +# CHECK-NEXT: 1 19 17.00 * vlse64.v v1, (a1), a2 +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e8, mf2, tu, mu +# CHECK-NEXT: 1 35 33.00 * vlse8.v v1, (a1), a2 +# CHECK-NEXT: 1 35 33.00 * vlse16.v v1, (a1), a2 +# CHECK-NEXT: 1 35 33.00 * vlse32.v v1, (a1), a2 +# CHECK-NEXT: 1 35 33.00 * vlse64.v v1, (a1), a2 +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e8, m1, tu, mu +# CHECK-NEXT: 1 67 65.00 * vlse8.v v1, (a1), a2 +# CHECK-NEXT: 1 67 65.00 * vlse16.v v1, (a1), a2 +# CHECK-NEXT: 1 67 65.00 * vlse32.v v1, (a1), a2 +# CHECK-NEXT: 1 67 65.00 * vlse64.v v1, (a1), a2 +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e8, m2, tu, mu +# CHECK-NEXT: 1 131 129.00 * vlse8.v v1, (a1), a2 +# CHECK-NEXT: 1 131 129.00 * vlse16.v v1, (a1), a2 +# CHECK-NEXT: 1 131 129.00 * vlse32.v v1, (a1), a2 +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e8, m4, tu, mu +# CHECK-NEXT: 1 259 257.00 * vlse8.v v1, (a1), a2 +# CHECK-NEXT: 1 259 257.00 * vlse16.v v1, (a1), a2 +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e8, m8, tu, mu +# CHECK-NEXT: 1 515 513.00 * vlse8.v v1, (a1), a2 +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e16, mf4, tu, mu +# CHECK-NEXT: 1 11 9.00 * vlse8.v v1, (a1), a2 +# CHECK-NEXT: 1 11 9.00 * vlse16.v v1, (a1), a2 +# CHECK-NEXT: 1 11 9.00 * vlse32.v v1, (a1), a2 +# CHECK-NEXT: 1 11 9.00 * vlse64.v v1, (a1), a2 +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e16, mf2, tu, mu +# CHECK-NEXT: 1 19 17.00 * vlse8.v v1, (a1), a2 +# CHECK-NEXT: 1 19 17.00 * vlse16.v v1, (a1), a2 +# CHECK-NEXT: 1 19 17.00 * vlse32.v v1, (a1), a2 +# CHECK-NEXT: 1 19 17.00 * vlse64.v v1, (a1), a2 +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e16, m1, tu, mu +# CHECK-NEXT: 1 35 33.00 * vlse8.v v1, (a1), a2 +# CHECK-NEXT: 1 35 33.00 * vlse16.v v1, (a1), a2 +# CHECK-NEXT: 1 35 33.00 * vlse32.v v1, (a1), a2 +# CHECK-NEXT: 1 35 33.00 * vlse64.v v1, (a1), a2 +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e16, m2, tu, mu +# CHECK-NEXT: 1 67 65.00 * vlse8.v v1, (a1), a2 +# CHECK-NEXT: 1 67 65.00 * vlse16.v v1, (a1), a2 +# CHECK-NEXT: 1 67 65.00 * vlse32.v v1, (a1), a2 +# CHECK-NEXT: 1 67 65.00 * vlse64.v v1, (a1), a2 +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e16, m4, tu, mu +# CHECK-NEXT: 1 131 129.00 * vlse8.v v1, (a1), a2 
+# CHECK-NEXT: 1 131 129.00 * vlse16.v v1, (a1), a2 +# CHECK-NEXT: 1 131 129.00 * vlse32.v v1, (a1), a2 +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e16, m8, tu, mu +# CHECK-NEXT: 1 259 257.00 * vlse8.v v1, (a1), a2 +# CHECK-NEXT: 1 259 257.00 * vlse16.v v1, (a1), a2 +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e32, mf2, tu, mu +# CHECK-NEXT: 1 11 9.00 * vlse8.v v1, (a1), a2 +# CHECK-NEXT: 1 11 9.00 * vlse16.v v1, (a1), a2 +# CHECK-NEXT: 1 11 9.00 * vlse32.v v1, (a1), a2 +# CHECK-NEXT: 1 11 9.00 * vlse64.v v1, (a1), a2 +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e32, m1, tu, mu +# CHECK-NEXT: 1 19 17.00 * vlse8.v v1, (a1), a2 +# CHECK-NEXT: 1 19 17.00 * vlse16.v v1, (a1), a2 +# CHECK-NEXT: 1 19 17.00 * vlse32.v v1, (a1), a2 +# CHECK-NEXT: 1 19 17.00 * vlse64.v v1, (a1), a2 +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e32, m2, tu, mu +# CHECK-NEXT: 1 35 33.00 * vlse8.v v1, (a1), a2 +# CHECK-NEXT: 1 35 33.00 * vlse16.v v1, (a1), a2 +# CHECK-NEXT: 1 35 33.00 * vlse32.v v1, (a1), a2 +# CHECK-NEXT: 1 35 33.00 * vlse64.v v1, (a1), a2 +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e32, m4, tu, mu +# CHECK-NEXT: 1 67 65.00 * vlse8.v v1, (a1), a2 +# CHECK-NEXT: 1 67 65.00 * vlse16.v v1, (a1), a2 +# CHECK-NEXT: 1 67 65.00 * vlse32.v v1, (a1), a2 +# CHECK-NEXT: 1 67 65.00 * vlse64.v v1, (a1), a2 +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e32, m8, tu, mu +# CHECK-NEXT: 1 131 129.00 * vlse8.v v1, (a1), a2 +# CHECK-NEXT: 1 131 129.00 * vlse16.v v1, (a1), a2 +# CHECK-NEXT: 1 131 129.00 * vlse32.v v1, (a1), a2 +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e64, m1, tu, mu +# CHECK-NEXT: 1 11 9.00 * vlse8.v v1, (a1), a2 +# CHECK-NEXT: 1 11 9.00 * vlse16.v v1, (a1), a2 +# CHECK-NEXT: 1 11 9.00 * vlse32.v v1, (a1), a2 +# CHECK-NEXT: 1 11 9.00 * vlse64.v v1, (a1), a2 +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e64, m2, tu, mu +# CHECK-NEXT: 1 19 17.00 * vlse8.v v1, (a1), a2 +# CHECK-NEXT: 1 19 17.00 * vlse16.v v1, (a1), a2 +# CHECK-NEXT: 1 19 17.00 * vlse32.v v1, (a1), a2 +# CHECK-NEXT: 1 19 17.00 * vlse64.v v1, (a1), a2 +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e64, m4, tu, mu +# CHECK-NEXT: 1 35 33.00 * vlse8.v v1, (a1), a2 +# CHECK-NEXT: 1 35 33.00 * vlse16.v v1, (a1), a2 +# CHECK-NEXT: 1 35 33.00 * vlse32.v v1, (a1), a2 +# CHECK-NEXT: 1 35 33.00 * vlse64.v v1, (a1), a2 +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e64, m8, tu, mu +# CHECK-NEXT: 1 67 65.00 * vlse8.v v1, (a1), a2 +# CHECK-NEXT: 1 67 65.00 * vlse16.v v1, (a1), a2 +# CHECK-NEXT: 1 67 65.00 * vlse32.v v1, (a1), a2 +# CHECK-NEXT: 1 67 65.00 * vlse64.v v1, (a1), a2 + +# CHECK: Resources: +# CHECK-NEXT: [0] - SiFive7FDiv +# CHECK-NEXT: [1] - SiFive7IDiv +# CHECK-NEXT: [2] - SiFive7PipeA +# CHECK-NEXT: [3] - SiFive7PipeB +# CHECK-NEXT: [4] - SiFive7VA +# CHECK-NEXT: [5] - SiFive7VCQ +# CHECK-NEXT: [6] - SiFive7VL +# CHECK-NEXT: [7] - SiFive7VS + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] +# CHECK-NEXT: - - 22.00 - - 78.00 4686.00 - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] Instructions: +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e8, mf8, tu, mu +# CHECK-NEXT: - - - - - 1.00 9.00 - vlse8.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 9.00 - vlse16.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 9.00 - vlse32.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 9.00 - vlse64.v v1, (a1), a2 +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e8, mf4, tu, mu +# CHECK-NEXT: - - - - - 1.00 17.00 - vlse8.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 
1.00 17.00 - vlse16.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 17.00 - vlse32.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 17.00 - vlse64.v v1, (a1), a2 +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e8, mf2, tu, mu +# CHECK-NEXT: - - - - - 1.00 33.00 - vlse8.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 33.00 - vlse16.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 33.00 - vlse32.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 33.00 - vlse64.v v1, (a1), a2 +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e8, m1, tu, mu +# CHECK-NEXT: - - - - - 1.00 65.00 - vlse8.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 65.00 - vlse16.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 65.00 - vlse32.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 65.00 - vlse64.v v1, (a1), a2 +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e8, m2, tu, mu +# CHECK-NEXT: - - - - - 1.00 129.00 - vlse8.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 129.00 - vlse16.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 129.00 - vlse32.v v1, (a1), a2 +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e8, m4, tu, mu +# CHECK-NEXT: - - - - - 1.00 257.00 - vlse8.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 257.00 - vlse16.v v1, (a1), a2 +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e8, m8, tu, mu +# CHECK-NEXT: - - - - - 1.00 513.00 - vlse8.v v1, (a1), a2 +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e16, mf4, tu, mu +# CHECK-NEXT: - - - - - 1.00 9.00 - vlse8.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 9.00 - vlse16.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 9.00 - vlse32.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 9.00 - vlse64.v v1, (a1), a2 +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e16, mf2, tu, mu +# CHECK-NEXT: - - - - - 1.00 17.00 - vlse8.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 17.00 - vlse16.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 17.00 - vlse32.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 17.00 - vlse64.v v1, (a1), a2 +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e16, m1, tu, mu +# CHECK-NEXT: - - - - - 1.00 33.00 - vlse8.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 33.00 - vlse16.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 33.00 - vlse32.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 33.00 - vlse64.v v1, (a1), a2 +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e16, m2, tu, mu +# CHECK-NEXT: - - - - - 1.00 65.00 - vlse8.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 65.00 - vlse16.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 65.00 - vlse32.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 65.00 - vlse64.v v1, (a1), a2 +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e16, m4, tu, mu +# CHECK-NEXT: - - - - - 1.00 129.00 - vlse8.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 129.00 - vlse16.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 129.00 - vlse32.v v1, (a1), a2 +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e16, m8, tu, mu +# CHECK-NEXT: - - - - - 1.00 257.00 - vlse8.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 257.00 - vlse16.v v1, (a1), a2 +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e32, mf2, tu, mu +# CHECK-NEXT: - - - - - 1.00 9.00 - vlse8.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 9.00 - vlse16.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 9.00 - vlse32.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 9.00 - vlse64.v v1, (a1), a2 +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e32, m1, tu, mu +# CHECK-NEXT: - - - - - 1.00 17.00 - vlse8.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 17.00 - vlse16.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 17.00 - 
vlse32.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 17.00 - vlse64.v v1, (a1), a2 +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e32, m2, tu, mu +# CHECK-NEXT: - - - - - 1.00 33.00 - vlse8.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 33.00 - vlse16.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 33.00 - vlse32.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 33.00 - vlse64.v v1, (a1), a2 +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e32, m4, tu, mu +# CHECK-NEXT: - - - - - 1.00 65.00 - vlse8.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 65.00 - vlse16.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 65.00 - vlse32.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 65.00 - vlse64.v v1, (a1), a2 +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e32, m8, tu, mu +# CHECK-NEXT: - - - - - 1.00 129.00 - vlse8.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 129.00 - vlse16.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 129.00 - vlse32.v v1, (a1), a2 +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e64, m1, tu, mu +# CHECK-NEXT: - - - - - 1.00 9.00 - vlse8.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 9.00 - vlse16.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 9.00 - vlse32.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 9.00 - vlse64.v v1, (a1), a2 +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e64, m2, tu, mu +# CHECK-NEXT: - - - - - 1.00 17.00 - vlse8.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 17.00 - vlse16.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 17.00 - vlse32.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 17.00 - vlse64.v v1, (a1), a2 +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e64, m4, tu, mu +# CHECK-NEXT: - - - - - 1.00 33.00 - vlse8.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 33.00 - vlse16.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 33.00 - vlse32.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 33.00 - vlse64.v v1, (a1), a2 +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e64, m8, tu, mu +# CHECK-NEXT: - - - - - 1.00 65.00 - vlse8.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 65.00 - vlse16.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 65.00 - vlse32.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 65.00 - vlse64.v v1, (a1), a2 From a0c19bd45599c044a662752c2b513cc514cb645a Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Thu, 4 Jan 2024 14:43:52 -0600 Subject: [PATCH 280/313] [mlir][RegionBranchOpInterface] explicitly check for existance of block terminator (#76831) --- mlir/lib/Interfaces/ControlFlowInterfaces.cpp | 7 +-- .../IR/test-region-branch-op-verifier.mlir | 15 +++++- mlir/test/lib/Dialect/Test/TestDialect.cpp | 50 +++++++++++++++++++ mlir/test/lib/Dialect/Test/TestOps.td | 12 +++++ 4 files changed, 80 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Interfaces/ControlFlowInterfaces.cpp b/mlir/lib/Interfaces/ControlFlowInterfaces.cpp index 4ed024ddae247..a563ec5cb8db5 100644 --- a/mlir/lib/Interfaces/ControlFlowInterfaces.cpp +++ b/mlir/lib/Interfaces/ControlFlowInterfaces.cpp @@ -177,9 +177,10 @@ LogicalResult detail::verifyTypesAlongControlFlowEdges(Operation *op) { SmallVector regionReturnOps; for (Block &block : region) - if (auto terminator = dyn_cast( - block.getTerminator())) - regionReturnOps.push_back(terminator); + if (!block.empty()) + if (auto terminator = + dyn_cast(block.back())) + regionReturnOps.push_back(terminator); // If there is no return-like terminator, the op itself should verify // type consistency. 
diff --git a/mlir/test/IR/test-region-branch-op-verifier.mlir b/mlir/test/IR/test-region-branch-op-verifier.mlir index f5fb7fc2b25cb..b94f6beb9796f 100644 --- a/mlir/test/IR/test-region-branch-op-verifier.mlir +++ b/mlir/test/IR/test-region-branch-op-verifier.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s +// RUN: mlir-opt %s -split-input-file func.func @test_ops_verify(%arg: i32) -> f32 { %0 = "test.constant"() { value = 5.3 : f32 } : () -> f32 @@ -8,3 +8,16 @@ func.func @test_ops_verify(%arg: i32) -> f32 { } return %1 : f32 } + +// ----- + +func.func @test_no_terminator(%arg: index) { + test.switch_with_no_break %arg + case 0 { + ^bb: + } + case 1 { + ^bb: + } + return +} diff --git a/mlir/test/lib/Dialect/Test/TestDialect.cpp b/mlir/test/lib/Dialect/Test/TestDialect.cpp index a1b30705f16a9..cb5ee6014b611 100644 --- a/mlir/test/lib/Dialect/Test/TestDialect.cpp +++ b/mlir/test/lib/Dialect/Test/TestDialect.cpp @@ -53,6 +53,7 @@ using namespace test; Attribute MyPropStruct::asAttribute(MLIRContext *ctx) const { return StringAttr::get(ctx, content); } + LogicalResult MyPropStruct::setFromAttr(MyPropStruct &prop, Attribute attr, function_ref emitError) { @@ -64,6 +65,7 @@ MyPropStruct::setFromAttr(MyPropStruct &prop, Attribute attr, prop.content = strAttr.getValue(); return success(); } + llvm::hash_code MyPropStruct::hash() const { return hash_value(StringRef(content)); } @@ -127,6 +129,12 @@ static void customPrintProperties(OpAsmPrinter &p, const VersionedProperties &prop); static ParseResult customParseProperties(OpAsmParser &parser, VersionedProperties &prop); +static ParseResult +parseSwitchCases(OpAsmParser &p, DenseI64ArrayAttr &cases, + SmallVectorImpl> &caseRegions); + +static void printSwitchCases(OpAsmPrinter &p, Operation *op, + DenseI64ArrayAttr cases, RegionRange caseRegions); void test::registerTestDialect(DialectRegistry ®istry) { registry.insert(); @@ -230,6 +238,7 @@ void TestDialect::initialize() { // unregistered op. 
fallbackEffectOpInterfaces = new TestOpEffectInterfaceFallback; } + TestDialect::~TestDialect() { delete static_cast( fallbackEffectOpInterfaces); @@ -1013,6 +1022,13 @@ LoopBlockTerminatorOp::getMutableSuccessorOperands(RegionBranchPoint point) { return getNextIterArgMutable(); } +//===----------------------------------------------------------------------===// +// SwitchWithNoBreakOp +//===----------------------------------------------------------------------===// + +void TestNoTerminatorOp::getSuccessorRegions( + RegionBranchPoint point, SmallVectorImpl ®ions) {} + //===----------------------------------------------------------------------===// // SingleNoTerminatorCustomAsmOp //===----------------------------------------------------------------------===// @@ -1160,6 +1176,7 @@ setPropertiesFromAttribute(PropertiesWithCustomPrint &prop, Attribute attr, prop.value = valueAttr.getValue().getSExtValue(); return success(); } + static DictionaryAttr getPropertiesAsAttribute(MLIRContext *ctx, const PropertiesWithCustomPrint &prop) { @@ -1169,14 +1186,17 @@ getPropertiesAsAttribute(MLIRContext *ctx, attrs.push_back(b.getNamedAttr("value", b.getI32IntegerAttr(prop.value))); return b.getDictionaryAttr(attrs); } + static llvm::hash_code computeHash(const PropertiesWithCustomPrint &prop) { return llvm::hash_combine(prop.value, StringRef(*prop.label)); } + static void customPrintProperties(OpAsmPrinter &p, const PropertiesWithCustomPrint &prop) { p.printKeywordOrString(*prop.label); p << " is " << prop.value; } + static ParseResult customParseProperties(OpAsmParser &parser, PropertiesWithCustomPrint &prop) { std::string label; @@ -1186,6 +1206,31 @@ static ParseResult customParseProperties(OpAsmParser &parser, prop.label = std::make_shared(std::move(label)); return success(); } + +static ParseResult +parseSwitchCases(OpAsmParser &p, DenseI64ArrayAttr &cases, + SmallVectorImpl> &caseRegions) { + SmallVector caseValues; + while (succeeded(p.parseOptionalKeyword("case"))) { + int64_t value; + Region ®ion = *caseRegions.emplace_back(std::make_unique()); + if (p.parseInteger(value) || p.parseRegion(region, /*arguments=*/{})) + return failure(); + caseValues.push_back(value); + } + cases = p.getBuilder().getDenseI64ArrayAttr(caseValues); + return success(); +} + +static void printSwitchCases(OpAsmPrinter &p, Operation *op, + DenseI64ArrayAttr cases, RegionRange caseRegions) { + for (auto [value, region] : llvm::zip(cases.asArrayRef(), caseRegions)) { + p.printNewline(); + p << "case " << value << ' '; + p.printRegion(*region, /*printEntryBlockArgs=*/false); + } +} + static LogicalResult setPropertiesFromAttribute(VersionedProperties &prop, Attribute attr, function_ref emitError) { @@ -1209,6 +1254,7 @@ setPropertiesFromAttribute(VersionedProperties &prop, Attribute attr, prop.value2 = value2Attr.getValue().getSExtValue(); return success(); } + static DictionaryAttr getPropertiesAsAttribute(MLIRContext *ctx, const VersionedProperties &prop) { SmallVector attrs; @@ -1217,13 +1263,16 @@ getPropertiesAsAttribute(MLIRContext *ctx, const VersionedProperties &prop) { attrs.push_back(b.getNamedAttr("value2", b.getI32IntegerAttr(prop.value2))); return b.getDictionaryAttr(attrs); } + static llvm::hash_code computeHash(const VersionedProperties &prop) { return llvm::hash_combine(prop.value1, prop.value2); } + static void customPrintProperties(OpAsmPrinter &p, const VersionedProperties &prop) { p << prop.value1 << " | " << prop.value2; } + static ParseResult customParseProperties(OpAsmParser &parser, 
                                          VersionedProperties &prop) {
   if (parser.parseInteger(prop.value1) || parser.parseVerticalBar() ||
@@ -1393,6 +1442,7 @@ ::mlir::LogicalResult TestOpWithVersionedProperties::readFromMlirBytecode(
   prop.value2 = value2;
   return success();
 }
+
 void TestOpWithVersionedProperties::writeToMlirBytecode(
     ::mlir::DialectBytecodeWriter &writer,
     const test::VersionedProperties &prop) {
diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
index 48b41d8698762..61e181999ff2f 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.td
+++ b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -2213,6 +2213,18 @@ def LoopBlockTerminatorOp : TEST_Op<"loop_block_term",
   }];
 }
 
+def TestNoTerminatorOp : TEST_Op<"switch_with_no_break", [
+    NoTerminator,
+    DeclareOpInterfaceMethods<RegionBranchOpInterface>
+  ]> {
+  let arguments = (ins Index:$arg, DenseI64ArrayAttr:$cases);
+  let regions = (region VariadicRegion<SizedRegion<1>>:$caseRegions);
+
+  let assemblyFormat = [{
+    $arg attr-dict custom<SwitchCases>($cases, $caseRegions)
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // Test TableGen generated build() methods
 //===----------------------------------------------------------------------===//

From 03ef103235752830da7b9ce5e825c0e3ddf7f45a Mon Sep 17 00:00:00 2001
From: Chris Cotter
Date: Thu, 4 Jan 2024 15:47:14 -0500
Subject: [PATCH 281/313] [clang-tidy] Fix bug in modernize-use-emplace
 (#66169)

emplace_back cannot construct an aggregate with arguments used to
initialize the aggregate.

Closes #62387

Test plan: Added test from #62387 which contains code that should not be
replaced by the check.
---
 .../clang-tidy/modernize/UseEmplaceCheck.cpp | 16 +++++++++++-----
 clang-tools-extra/docs/ReleaseNotes.rst      |  4 ++++
 .../checkers/modernize/use-emplace.cpp       | 13 +++++++++++++
 3 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp
index 4438f0b22063f..07e296cc26528 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp
@@ -13,6 +13,10 @@ using namespace clang::ast_matchers;
 namespace clang::tidy::modernize {
 namespace {
 
+AST_MATCHER_P(InitListExpr, initCountLeq, unsigned, N) {
+  return Node.getNumInits() <= N;
+}
+
 // Identical to hasAnyName, except it does not take template specifiers into
 // account.
This is used to match the functions names as in
 // DefaultEmplacyFunctions below without caring about the template types of the
@@ -205,11 +209,13 @@ void UseEmplaceCheck::registerMatchers(MatchFinder *Finder) {
   auto HasConstructExpr = has(ignoringImplicit(SoughtConstructExpr));
 
   // allow for T{} to be replaced, even if no CTOR is declared
-  auto HasConstructInitListExpr = has(initListExpr(anyOf(
-      allOf(has(SoughtConstructExpr),
-            has(cxxConstructExpr(argumentCountIs(0)))),
-      has(cxxBindTemporaryExpr(has(SoughtConstructExpr),
-                               has(cxxConstructExpr(argumentCountIs(0))))))));
+  auto HasConstructInitListExpr =
+      has(initListExpr(initCountLeq(1),
+                       anyOf(allOf(has(SoughtConstructExpr),
+                                   has(cxxConstructExpr(argumentCountIs(0)))),
+                             has(cxxBindTemporaryExpr(
+                                 has(SoughtConstructExpr),
+                                 has(cxxConstructExpr(argumentCountIs(0))))))));
   auto HasBracedInitListExpr =
       anyOf(has(cxxBindTemporaryExpr(HasConstructInitListExpr)),
             HasConstructInitListExpr);
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 571808a51596a..4d25e2ebe85f5 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -394,6 +394,10 @@ Changes in existing checks
   false-positives when constructing the container with ``count`` copies of
   elements with value ``value``.
 
+- Improved :doc:`modernize-use-emplace
+  <clang-tidy/checks/modernize/use-emplace>` to not replace aggregates that
+  ``emplace`` cannot construct with aggregate initialization.
+
 - Improved :doc:`modernize-use-equals-delete
   <clang-tidy/checks/modernize/use-equals-delete>` check to ignore
   false-positives when special member function is actually used or implicit.
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-emplace.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-emplace.cpp
index fead2b6151d02..f7b1ad55f5df5 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-emplace.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-emplace.cpp
@@ -1183,6 +1183,11 @@ struct NonTrivialWithVector {
   std::vector<int> it;
 };
 
+struct NonTrivialWithIntAndVector {
+  int x;
+  std::vector<int> it;
+};
+
 struct NonTrivialWithCtor {
   NonTrivialWithCtor();
   NonTrivialWithCtor(std::vector<int> const&);
@@ -1332,6 +1337,14 @@ void testBracedInitTemporaries() {
   v3.push_back(NonTrivialWithCtor{{}});
   v3.push_back({{0}});
   v3.push_back({{}});
+
+  std::vector<NonTrivialWithIntAndVector> v4;
+
+  // These should not be noticed or fixed; after the correction, the code won't
+  // compile.
+  v4.push_back(NonTrivialWithIntAndVector{1, {}});
+  v4.push_back(NonTrivialWithIntAndVector{});
+  v4.push_back({});
 }
 
 void testWithPointerTypes() {

From d67c2d85548437ef8c3bb12a29ea330180b87913 Mon Sep 17 00:00:00 2001
From: Piotr Zegar
Date: Thu, 4 Jan 2024 18:20:40 +0000
Subject: [PATCH 282/313] [clang-tidy][NFC] Code format - leftover after
 #66169

Manually formatting code via clang-format after previous commit merge.
---
 clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp
index 07e296cc26528..430455a38f395 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp
@@ -209,9 +209,8 @@ void UseEmplaceCheck::registerMatchers(MatchFinder *Finder) {
   auto HasConstructExpr = has(ignoringImplicit(SoughtConstructExpr));
 
   // allow for T{} to be replaced, even if no CTOR is declared
-  auto HasConstructInitListExpr =
-      has(initListExpr(initCountLeq(1),
-                       anyOf(allOf(has(SoughtConstructExpr),
+  auto HasConstructInitListExpr = has(initListExpr(
+      initCountLeq(1), anyOf(allOf(has(SoughtConstructExpr),
                                    has(cxxConstructExpr(argumentCountIs(0)))),
                              has(cxxBindTemporaryExpr(
                                  has(SoughtConstructExpr),

From 2bc994456c5be2ab6d98b94de2349302577a9823 Mon Sep 17 00:00:00 2001
From: Schrodinger ZHU Yifan
Date: Thu, 4 Jan 2024 15:51:14 -0500
Subject: [PATCH 283/313] [libc] major refactor of startup library (#76092)

* separate initialization routines into _start and do_start for all
  architectures.
* lift do_start as a separate object library to avoid code duplication.
* (additionally) address the problem of building hermetic libc with
  -fstack-protector-*

The `crt1.o` is now a merged result of three components:
```
___
|___ x86_64
|    |_______ start.cpp.o   <- _start (loads process initial stack and aligns stack pointer)
|    |_______ tls.cpp.o     <- init_tls, cleanup_tls, set_thread_pointer (TLS related routines)
|___ do_start.cpp.o         <- do_start (sets up global variables and invokes the main function)
```
---
 libc/cmake/modules/LLVMLibCTestRules.cmake |   6 +
 libc/config/linux/app.h                    |   3 +
 libc/startup/linux/CMakeLists.txt          |  25 ++-
 libc/startup/linux/aarch64/CMakeLists.txt  |  19 +-
 libc/startup/linux/aarch64/start.cpp       | 215 +------------------
 libc/startup/linux/aarch64/tls.cpp         |  86 ++++++++
 libc/startup/linux/do_start.cpp            | 140 ++++++++++++
 libc/startup/linux/do_start.h              |  14 ++
 libc/startup/linux/riscv/CMakeLists.txt    |  20 +-
 libc/startup/linux/riscv/start.cpp         | 225 +------------------
 libc/startup/linux/riscv/tls.cpp           |  74 +++++++
 libc/startup/linux/x86_64/CMakeLists.txt   |  26 ++-
 libc/startup/linux/x86_64/start.cpp        | 237 ++-------------------
 libc/startup/linux/x86_64/tls.cpp          |  93 ++++++++
 14 files changed, 512 insertions(+), 671 deletions(-)
 create mode 100644 libc/startup/linux/aarch64/tls.cpp
 create mode 100644 libc/startup/linux/do_start.cpp
 create mode 100644 libc/startup/linux/do_start.h
 create mode 100644 libc/startup/linux/riscv/tls.cpp
 create mode 100644 libc/startup/linux/x86_64/tls.cpp

diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index 51d484b875aef..b69839afebf8a 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -498,6 +498,9 @@ function(add_integration_test test_name)
     libc.src.string.memcpy
     libc.src.string.memmove
     libc.src.string.memset
+    # __stack_chk_fail should always be included to allow building libc with
+    # stack protector.
+ libc.src.compiler.__stack_chk_fail ) if(TARGET libc.src.time.clock) diff --git a/libc/config/linux/app.h b/libc/config/linux/app.h index 548c141fd7053..1b3523deb1b23 100644 --- a/libc/config/linux/app.h +++ b/libc/config/linux/app.h @@ -119,6 +119,9 @@ void init_tls(TLSDescriptor &tls); // Cleanup the TLS area as described in |tls_descriptor|. void cleanup_tls(uintptr_t tls_addr, uintptr_t tls_size); +// Set the thread pointer for the current thread. +bool set_thread_ptr(uintptr_t val); + } // namespace LIBC_NAMESPACE #endif // LLVM_LIBC_CONFIG_LINUX_APP_H diff --git a/libc/startup/linux/CMakeLists.txt b/libc/startup/linux/CMakeLists.txt index 2d55a36566971..39bcca9cdba9f 100644 --- a/libc/startup/linux/CMakeLists.txt +++ b/libc/startup/linux/CMakeLists.txt @@ -84,10 +84,33 @@ endif() add_subdirectory(${LIBC_TARGET_ARCHITECTURE}) +add_object_library( + do_start + SRCS + do_start.cpp + HDRS + do_start.h + DEPENDS + libc.config.linux.app_h + libc.include.sys_mman + libc.include.sys_syscall + libc.src.__support.threads.thread + libc.src.__support.OSUtil.osutil + libc.src.stdlib.exit + libc.src.stdlib.atexit + libc.src.unistd.environ + COMPILE_OPTIONS + -ffreestanding # To avoid compiler warnings about calling the main function. + -fno-builtin # avoid emit unexpected calls + -fno-stack-protector # stack protect canary is not available yet. +) + # TODO: factor out crt1 into multiple objects merge_relocatable_object( crt1 - .${LIBC_TARGET_ARCHITECTURE}.crt1 + .${LIBC_TARGET_ARCHITECTURE}.start + .${LIBC_TARGET_ARCHITECTURE}.tls + .do_start ) add_startup_object( diff --git a/libc/startup/linux/aarch64/CMakeLists.txt b/libc/startup/linux/aarch64/CMakeLists.txt index b47db8eb5d23f..5ea6ae59abcb2 100644 --- a/libc/startup/linux/aarch64/CMakeLists.txt +++ b/libc/startup/linux/aarch64/CMakeLists.txt @@ -1,17 +1,24 @@ add_startup_object( - crt1 + tls SRC - start.cpp + tls.cpp DEPENDS libc.config.linux.app_h libc.include.sys_mman libc.include.sys_syscall - libc.src.__support.threads.thread libc.src.__support.OSUtil.osutil - libc.src.stdlib.exit - libc.src.stdlib.atexit libc.src.string.memory_utils.inline_memcpy - libc.src.unistd.environ + COMPILE_OPTIONS + -fno-omit-frame-pointer + -ffreestanding # To avoid compiler warnings about calling the main function. +) + +add_startup_object( + start + SRC + start.cpp + DEPENDS + libc.config.linux.app_h COMPILE_OPTIONS -fno-omit-frame-pointer -ffreestanding # To avoid compiler warnings about calling the main function. diff --git a/libc/startup/linux/aarch64/start.cpp b/libc/startup/linux/aarch64/start.cpp index bc01582aeb49c..d0a8526873390 100644 --- a/libc/startup/linux/aarch64/start.cpp +++ b/libc/startup/linux/aarch64/start.cpp @@ -1,4 +1,4 @@ -//===-- Implementation of crt for aarch64 ---------------------------------===// +//===-- Implementation of _start for aarch64 ------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -6,213 +6,8 @@ // //===----------------------------------------------------------------------===// -#include "config/linux/app.h" -#include "src/__support/OSUtil/syscall.h" -#include "src/__support/threads/thread.h" -#include "src/stdlib/atexit.h" -#include "src/stdlib/exit.h" -#include "src/string/memory_utils/inline_memcpy.h" - -#include - -#include -#include -#include -#include -#include -#include - -extern "C" int main(int, char **, char **); - -// Source documentation: -// https://github.com/ARM-software/abi-aa/tree/main/sysvabi64 - -namespace LIBC_NAMESPACE { - -#ifdef SYS_mmap2 -static constexpr long MMAP_SYSCALL_NUMBER = SYS_mmap2; -#elif SYS_mmap -static constexpr long MMAP_SYSCALL_NUMBER = SYS_mmap; -#else -#error "mmap and mmap2 syscalls not available." -#endif - -AppProperties app; - -static ThreadAttributes main_thread_attrib; - -void init_tls(TLSDescriptor &tls_descriptor) { - if (app.tls.size == 0) { - tls_descriptor.size = 0; - tls_descriptor.tp = 0; - return; - } - - // aarch64 follows the variant 1 TLS layout: - // - // 1. First entry is the dynamic thread vector pointer - // 2. Second entry is a 8-byte reserved word. - // 3. Padding for alignment. - // 4. The TLS data from the ELF image. - // - // The thread pointer points to the first entry. - - const uintptr_t size_of_pointers = 2 * sizeof(uintptr_t); - uintptr_t padding = 0; - const uintptr_t ALIGNMENT_MASK = app.tls.align - 1; - uintptr_t diff = size_of_pointers & ALIGNMENT_MASK; - if (diff != 0) - padding += (ALIGNMENT_MASK - diff) + 1; - - uintptr_t alloc_size = size_of_pointers + padding + app.tls.size; - - // We cannot call the mmap function here as the functions set errno on - // failure. Since errno is implemented via a thread local variable, we cannot - // use errno before TLS is setup. - long mmap_ret_val = LIBC_NAMESPACE::syscall_impl( - MMAP_SYSCALL_NUMBER, nullptr, alloc_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - // We cannot check the return value with MAP_FAILED as that is the return - // of the mmap function and not the mmap syscall. - if (mmap_ret_val < 0 && static_cast(mmap_ret_val) > -app.page_size) - LIBC_NAMESPACE::syscall_impl(SYS_exit, 1); - uintptr_t thread_ptr = uintptr_t(reinterpret_cast(mmap_ret_val)); - uintptr_t tls_addr = thread_ptr + size_of_pointers + padding; - LIBC_NAMESPACE::inline_memcpy(reinterpret_cast(tls_addr), - reinterpret_cast(app.tls.address), - app.tls.init_size); - tls_descriptor.size = alloc_size; - tls_descriptor.addr = thread_ptr; - tls_descriptor.tp = thread_ptr; -} - -void cleanup_tls(uintptr_t addr, uintptr_t size) { - if (size == 0) - return; - LIBC_NAMESPACE::syscall_impl(SYS_munmap, addr, size); -} - -static void set_thread_ptr(uintptr_t val) { __arm_wsr64("tpidr_el0", val); } - -using InitCallback = void(int, char **, char **); -using FiniCallback = void(void); - -extern "C" { -// These arrays are present in the .init_array and .fini_array sections. -// The symbols are inserted by linker when it sees references to them. 
-extern uintptr_t __preinit_array_start[]; -extern uintptr_t __preinit_array_end[]; -extern uintptr_t __init_array_start[]; -extern uintptr_t __init_array_end[]; -extern uintptr_t __fini_array_start[]; -extern uintptr_t __fini_array_end[]; -} - -static void call_init_array_callbacks(int argc, char **argv, char **env) { - size_t preinit_array_size = __preinit_array_end - __preinit_array_start; - for (size_t i = 0; i < preinit_array_size; ++i) - reinterpret_cast(__preinit_array_start[i])(argc, argv, env); - size_t init_array_size = __init_array_end - __init_array_start; - for (size_t i = 0; i < init_array_size; ++i) - reinterpret_cast(__init_array_start[i])(argc, argv, env); -} - -static void call_fini_array_callbacks() { - size_t fini_array_size = __fini_array_end - __fini_array_start; - for (size_t i = fini_array_size; i > 0; --i) - reinterpret_cast(__fini_array_start[i - 1])(); -} - -} // namespace LIBC_NAMESPACE - -using LIBC_NAMESPACE::app; -using LIBC_NAMESPACE::AuxEntry; - -__attribute__((noinline)) static void do_start() { - auto tid = LIBC_NAMESPACE::syscall_impl(SYS_gettid); - if (tid <= 0) - LIBC_NAMESPACE::syscall_impl(SYS_exit, 1); - LIBC_NAMESPACE::main_thread_attrib.tid = static_cast(tid); - - // After the argv array, is a 8-byte long NULL value before the array of env - // values. The end of the env values is marked by another 8-byte long NULL - // value. We step over it (the "+ 1" below) to get to the env values. - uint64_t *env_ptr = app.args->argv + app.args->argc + 1; - uint64_t *env_end_marker = env_ptr; - app.env_ptr = env_ptr; - while (*env_end_marker) - ++env_end_marker; - - // Initialize the POSIX global declared in unistd.h - environ = reinterpret_cast(env_ptr); - - // After the env array, is the aux-vector. The end of the aux-vector is - // denoted by an AT_NULL entry. - Elf64_Phdr *program_hdr_table = nullptr; - uintptr_t program_hdr_count; - app.auxv_ptr = reinterpret_cast(env_end_marker + 1); - for (auto *aux_entry = app.auxv_ptr; aux_entry->id != AT_NULL; ++aux_entry) { - switch (aux_entry->id) { - case AT_PHDR: - program_hdr_table = reinterpret_cast(aux_entry->value); - break; - case AT_PHNUM: - program_hdr_count = aux_entry->value; - break; - case AT_PAGESZ: - app.page_size = aux_entry->value; - break; - default: - break; // TODO: Read other useful entries from the aux vector. - } - } - - app.tls.size = 0; - for (uintptr_t i = 0; i < program_hdr_count; ++i) { - Elf64_Phdr *phdr = program_hdr_table + i; - if (phdr->p_type != PT_TLS) - continue; - // TODO: p_vaddr value has to be adjusted for static-pie executables. - app.tls.address = phdr->p_vaddr; - app.tls.size = phdr->p_memsz; - app.tls.init_size = phdr->p_filesz; - app.tls.align = phdr->p_align; - } - - // This descriptor has to be static since its cleanup function cannot - // capture the context. - static LIBC_NAMESPACE::TLSDescriptor tls; - LIBC_NAMESPACE::init_tls(tls); - if (tls.size != 0) - LIBC_NAMESPACE::set_thread_ptr(tls.tp); - - LIBC_NAMESPACE::self.attrib = &LIBC_NAMESPACE::main_thread_attrib; - LIBC_NAMESPACE::main_thread_attrib.atexit_callback_mgr = - LIBC_NAMESPACE::internal::get_thread_atexit_callback_mgr(); - // We register the cleanup_tls function to be the last atexit callback to be - // invoked. It will tear down the TLS. Other callbacks may depend on TLS (such - // as the stack protector canary). - LIBC_NAMESPACE::atexit( - []() { LIBC_NAMESPACE::cleanup_tls(tls.tp, tls.size); }); - // We want the fini array callbacks to be run after other atexit - // callbacks are run. 
So, we register them before running the init - // array callbacks as they can potentially register their own atexit - // callbacks. - LIBC_NAMESPACE::atexit(&LIBC_NAMESPACE::call_fini_array_callbacks); - - LIBC_NAMESPACE::call_init_array_callbacks( - static_cast(app.args->argc), - reinterpret_cast(app.args->argv), - reinterpret_cast(env_ptr)); - - int retval = main(static_cast(app.args->argc), - reinterpret_cast(app.args->argv), - reinterpret_cast(env_ptr)); - - LIBC_NAMESPACE::exit(retval); -} - -extern "C" void _start() { +#include "startup/linux/do_start.h" +extern "C" [[noreturn]] void _start() { // Skip the Frame Pointer and the Link Register // https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst // Section 6.2.3. Note that this only works if the current function @@ -223,7 +18,7 @@ extern "C" void _start() { // will take us to the previous stack pointer. That is the reason why the // actual business logic of the startup code is pushed into a non-inline // function do_start so that this function is free of any stack usage. - app.args = reinterpret_cast( + LIBC_NAMESPACE::app.args = reinterpret_cast( reinterpret_cast(__builtin_frame_address(0)) + 2); - do_start(); + LIBC_NAMESPACE::do_start(); } diff --git a/libc/startup/linux/aarch64/tls.cpp b/libc/startup/linux/aarch64/tls.cpp new file mode 100644 index 0000000000000..f2579e821b1bf --- /dev/null +++ b/libc/startup/linux/aarch64/tls.cpp @@ -0,0 +1,86 @@ +//===-- Implementation of tls for aarch64 ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/OSUtil/syscall.h" +#include "src/__support/threads/thread.h" +#include "src/string/memory_utils/inline_memcpy.h" +#include "startup/linux/do_start.h" + +#include +#include +#include + +// Source documentation: +// https://github.com/ARM-software/abi-aa/tree/main/sysvabi64 + +namespace LIBC_NAMESPACE { + +#ifdef SYS_mmap2 +static constexpr long MMAP_SYSCALL_NUMBER = SYS_mmap2; +#elif SYS_mmap +static constexpr long MMAP_SYSCALL_NUMBER = SYS_mmap; +#else +#error "mmap and mmap2 syscalls not available." +#endif + +void init_tls(TLSDescriptor &tls_descriptor) { + if (app.tls.size == 0) { + tls_descriptor.size = 0; + tls_descriptor.tp = 0; + return; + } + + // aarch64 follows the variant 1 TLS layout: + // + // 1. First entry is the dynamic thread vector pointer + // 2. Second entry is a 8-byte reserved word. + // 3. Padding for alignment. + // 4. The TLS data from the ELF image. + // + // The thread pointer points to the first entry. + + const uintptr_t size_of_pointers = 2 * sizeof(uintptr_t); + uintptr_t padding = 0; + const uintptr_t ALIGNMENT_MASK = app.tls.align - 1; + uintptr_t diff = size_of_pointers & ALIGNMENT_MASK; + if (diff != 0) + padding += (ALIGNMENT_MASK - diff) + 1; + + uintptr_t alloc_size = size_of_pointers + padding + app.tls.size; + + // We cannot call the mmap function here as the functions set errno on + // failure. Since errno is implemented via a thread local variable, we cannot + // use errno before TLS is setup. 
+ long mmap_ret_val = syscall_impl(MMAP_SYSCALL_NUMBER, nullptr, + alloc_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + // We cannot check the return value with MAP_FAILED as that is the return + // of the mmap function and not the mmap syscall. + if (mmap_ret_val < 0 && static_cast(mmap_ret_val) > -app.page_size) + syscall_impl(SYS_exit, 1); + uintptr_t thread_ptr = uintptr_t(reinterpret_cast(mmap_ret_val)); + uintptr_t tls_addr = thread_ptr + size_of_pointers + padding; + inline_memcpy(reinterpret_cast(tls_addr), + reinterpret_cast(app.tls.address), + app.tls.init_size); + tls_descriptor.size = alloc_size; + tls_descriptor.addr = thread_ptr; + tls_descriptor.tp = thread_ptr; +} + +void cleanup_tls(uintptr_t addr, uintptr_t size) { + if (size == 0) + return; + syscall_impl(SYS_munmap, addr, size); +} + +bool set_thread_ptr(uintptr_t val) { + __arm_wsr64("tpidr_el0", val); + return true; +} +} // namespace LIBC_NAMESPACE diff --git a/libc/startup/linux/do_start.cpp b/libc/startup/linux/do_start.cpp new file mode 100644 index 0000000000000..05dbd4488f588 --- /dev/null +++ b/libc/startup/linux/do_start.cpp @@ -0,0 +1,140 @@ +//===-- Implementation file of do_start -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "startup/linux/do_start.h" +#include "src/__support/OSUtil/syscall.h" +#include "src/__support/threads/thread.h" +#include "src/stdlib/atexit.h" +#include "src/stdlib/exit.h" +#include "src/unistd/environ.h" + +#include +#include +#include +#include +#include + +extern "C" int main(int argc, char **argv, char **envp); + +extern "C" { +// These arrays are present in the .init_array and .fini_array sections. +// The symbols are inserted by linker when it sees references to them. +extern uintptr_t __preinit_array_start[]; +extern uintptr_t __preinit_array_end[]; +extern uintptr_t __init_array_start[]; +extern uintptr_t __init_array_end[]; +extern uintptr_t __fini_array_start[]; +extern uintptr_t __fini_array_end[]; +} + +namespace LIBC_NAMESPACE { +// TODO: this symbol will be moved to config.linux.app +AppProperties app; + +using InitCallback = void(int, char **, char **); +using FiniCallback = void(void); + +static void call_init_array_callbacks(int argc, char **argv, char **env) { + size_t preinit_array_size = __preinit_array_end - __preinit_array_start; + for (size_t i = 0; i < preinit_array_size; ++i) + reinterpret_cast(__preinit_array_start[i])(argc, argv, env); + size_t init_array_size = __init_array_end - __init_array_start; + for (size_t i = 0; i < init_array_size; ++i) + reinterpret_cast(__init_array_start[i])(argc, argv, env); +} + +static void call_fini_array_callbacks() { + size_t fini_array_size = __fini_array_end - __fini_array_start; + for (size_t i = fini_array_size; i > 0; --i) + reinterpret_cast(__fini_array_start[i - 1])(); +} + +static ThreadAttributes main_thread_attrib; + +[[noreturn]] void do_start() { + auto tid = syscall_impl(SYS_gettid); + if (tid <= 0) + syscall_impl(SYS_exit, 1); + main_thread_attrib.tid = static_cast(tid); + + // After the argv array, is a 8-byte long NULL value before the array of env + // values. The end of the env values is marked by another 8-byte long NULL + // value. 
We step over it (the "+ 1" below) to get to the env values. + ArgVEntryType *env_ptr = app.args->argv + app.args->argc + 1; + ArgVEntryType *env_end_marker = env_ptr; + app.env_ptr = env_ptr; + while (*env_end_marker) + ++env_end_marker; + + // Initialize the POSIX global declared in unistd.h + environ = reinterpret_cast(env_ptr); + + // After the env array, is the aux-vector. The end of the aux-vector is + // denoted by an AT_NULL entry. + Elf64_Phdr *program_hdr_table = nullptr; + uintptr_t program_hdr_count = 0; + app.auxv_ptr = reinterpret_cast(env_end_marker + 1); + for (auto *aux_entry = app.auxv_ptr; aux_entry->id != AT_NULL; ++aux_entry) { + switch (aux_entry->id) { + case AT_PHDR: + program_hdr_table = reinterpret_cast(aux_entry->value); + break; + case AT_PHNUM: + program_hdr_count = aux_entry->value; + break; + case AT_PAGESZ: + app.page_size = aux_entry->value; + break; + default: + break; // TODO: Read other useful entries from the aux vector. + } + } + + app.tls.size = 0; + for (uintptr_t i = 0; i < program_hdr_count; ++i) { + Elf64_Phdr *phdr = program_hdr_table + i; + if (phdr->p_type != PT_TLS) + continue; + // TODO: p_vaddr value has to be adjusted for static-pie executables. + app.tls.address = phdr->p_vaddr; + app.tls.size = phdr->p_memsz; + app.tls.init_size = phdr->p_filesz; + app.tls.align = phdr->p_align; + } + + // This descriptor has to be static since its cleanup function cannot + // capture the context. + static TLSDescriptor tls; + init_tls(tls); + if (tls.size != 0 && !set_thread_ptr(tls.tp)) + syscall_impl(SYS_exit, 1); + + self.attrib = &main_thread_attrib; + main_thread_attrib.atexit_callback_mgr = + internal::get_thread_atexit_callback_mgr(); + // We register the cleanup_tls function to be the last atexit callback to be + // invoked. It will tear down the TLS. Other callbacks may depend on TLS (such + // as the stack protector canary). + atexit([]() { cleanup_tls(tls.tp, tls.size); }); + // We want the fini array callbacks to be run after other atexit + // callbacks are run. So, we register them before running the init + // array callbacks as they can potentially register their own atexit + // callbacks. + atexit(&call_fini_array_callbacks); + + call_init_array_callbacks(static_cast(app.args->argc), + reinterpret_cast(app.args->argv), + reinterpret_cast(env_ptr)); + + int retval = main(static_cast(app.args->argc), + reinterpret_cast(app.args->argv), + reinterpret_cast(env_ptr)); + + exit(retval); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/startup/linux/do_start.h b/libc/startup/linux/do_start.h new file mode 100644 index 0000000000000..a0e7a3cd69562 --- /dev/null +++ b/libc/startup/linux/do_start.h @@ -0,0 +1,14 @@ +//===-- Header file of do_start -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "config/linux/app.h" + +namespace LIBC_NAMESPACE { +// setup the libc runtime and invoke the main routine. 
+[[noreturn]] void do_start(); +} // namespace LIBC_NAMESPACE diff --git a/libc/startup/linux/riscv/CMakeLists.txt b/libc/startup/linux/riscv/CMakeLists.txt index b47db8eb5d23f..3717784233c15 100644 --- a/libc/startup/linux/riscv/CMakeLists.txt +++ b/libc/startup/linux/riscv/CMakeLists.txt @@ -1,17 +1,25 @@ add_startup_object( - crt1 + tls SRC - start.cpp + tls.cpp DEPENDS libc.config.linux.app_h libc.include.sys_mman libc.include.sys_syscall - libc.src.__support.threads.thread libc.src.__support.OSUtil.osutil - libc.src.stdlib.exit - libc.src.stdlib.atexit libc.src.string.memory_utils.inline_memcpy - libc.src.unistd.environ + COMPILE_OPTIONS + -fno-omit-frame-pointer + -ffreestanding # To avoid compiler warnings about calling the main function. +) + +add_startup_object( + start + SRC + start.cpp + DEPENDS + libc.config.linux.app_h + libc.src.__support.macros.attributes COMPILE_OPTIONS -fno-omit-frame-pointer -ffreestanding # To avoid compiler warnings about calling the main function. diff --git a/libc/startup/linux/riscv/start.cpp b/libc/startup/linux/riscv/start.cpp index 5b6e5bde8da81..389f71a66d30a 100644 --- a/libc/startup/linux/riscv/start.cpp +++ b/libc/startup/linux/riscv/start.cpp @@ -1,223 +1,20 @@ -//===-- Implementation of crt for riscv64 ---------------------------------===// +//===-- Implementation of _start for riscv --------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// - -#include "config/linux/app.h" -#include "src/__support/OSUtil/syscall.h" -#include "src/__support/threads/thread.h" -#include "src/stdlib/atexit.h" -#include "src/stdlib/exit.h" -#include "src/string/memory_utils/inline_memcpy.h" - -#include -#include -#include -#include -#include -#include - -extern "C" int main(int, char **, char **); - -namespace LIBC_NAMESPACE { - -#ifdef SYS_mmap2 -static constexpr long MMAP_SYSCALL_NUMBER = SYS_mmap2; -#elif SYS_mmap -static constexpr long MMAP_SYSCALL_NUMBER = SYS_mmap; -#else -#error "mmap and mmap2 syscalls not available." -#endif - -AppProperties app; - -static ThreadAttributes main_thread_attrib; - -void init_tls(TLSDescriptor &tls_descriptor) { - if (app.tls.size == 0) { - tls_descriptor.size = 0; - tls_descriptor.tp = 0; - return; - } - - // riscv64 follows the variant 1 TLS layout: - const uintptr_t size_of_pointers = 2 * sizeof(uintptr_t); - uintptr_t padding = 0; - const uintptr_t ALIGNMENT_MASK = app.tls.align - 1; - uintptr_t diff = size_of_pointers & ALIGNMENT_MASK; - if (diff != 0) - padding += (ALIGNMENT_MASK - diff) + 1; - - uintptr_t alloc_size = size_of_pointers + padding + app.tls.size; - - // We cannot call the mmap function here as the functions set errno on - // failure. Since errno is implemented via a thread local variable, we cannot - // use errno before TLS is setup. - long mmap_ret_val = LIBC_NAMESPACE::syscall_impl( - MMAP_SYSCALL_NUMBER, nullptr, alloc_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - // We cannot check the return value with MAP_FAILED as that is the return - // of the mmap function and not the mmap syscall. 
- if (mmap_ret_val < 0 && static_cast(mmap_ret_val) > -app.page_size) - LIBC_NAMESPACE::syscall_impl(SYS_exit, 1); - uintptr_t thread_ptr = uintptr_t(reinterpret_cast(mmap_ret_val)); - uintptr_t tls_addr = thread_ptr + size_of_pointers + padding; - LIBC_NAMESPACE::inline_memcpy(reinterpret_cast(tls_addr), - reinterpret_cast(app.tls.address), - app.tls.init_size); - tls_descriptor.size = alloc_size; - tls_descriptor.addr = thread_ptr; - tls_descriptor.tp = tls_addr; -} - -void cleanup_tls(uintptr_t addr, uintptr_t size) { - if (size == 0) - return; - LIBC_NAMESPACE::syscall_impl(SYS_munmap, addr, size); -} - -static void set_thread_ptr(uintptr_t val) { - LIBC_INLINE_ASM("mv tp, %0\n\t" : : "r"(val)); -} - -using InitCallback = void(int, char **, char **); -using FiniCallback = void(void); - -extern "C" { -// These arrays are present in the .init_array and .fini_array sections. -// The symbols are inserted by linker when it sees references to them. -extern uintptr_t __preinit_array_start[]; -extern uintptr_t __preinit_array_end[]; -extern uintptr_t __init_array_start[]; -extern uintptr_t __init_array_end[]; -extern uintptr_t __fini_array_start[]; -extern uintptr_t __fini_array_end[]; -} - -static void call_init_array_callbacks(int argc, char **argv, char **env) { - size_t preinit_array_size = __preinit_array_end - __preinit_array_start; - for (size_t i = 0; i < preinit_array_size; ++i) - reinterpret_cast(__preinit_array_start[i])(argc, argv, env); - size_t init_array_size = __init_array_end - __init_array_start; - for (size_t i = 0; i < init_array_size; ++i) - reinterpret_cast(__init_array_start[i])(argc, argv, env); -} - -static void call_fini_array_callbacks() { - size_t fini_array_size = __fini_array_end - __fini_array_start; - for (size_t i = fini_array_size; i > 0; --i) - reinterpret_cast(__fini_array_start[i - 1])(); -} - -} // namespace LIBC_NAMESPACE - -using LIBC_NAMESPACE::app; -using LIBC_NAMESPACE::AuxEntry; - -#if defined(LIBC_TARGET_ARCH_IS_X86_64) || \ - defined(LIBC_TARGET_ARCH_IS_AARCH64) || \ - defined(LIBC_TARGET_ARCH_IS_RISCV64) -typedef Elf64_Phdr PgrHdrTableType; -#elif defined(LIBC_TARGET_ARCH_IS_RISCV32) -typedef Elf32_Phdr PgrHdrTableType; -#else -#error "Program header table type is not defined for the target platform." -#endif - -__attribute__((noinline)) static void do_start() { - LIBC_INLINE_ASM(".option push\n\t" - ".option norelax\n\t" - "lla gp, __global_pointer$\n\t" - ".option pop\n\t"); - auto tid = LIBC_NAMESPACE::syscall_impl(SYS_gettid); - if (tid <= 0) - LIBC_NAMESPACE::syscall_impl(SYS_exit, 1); - LIBC_NAMESPACE::main_thread_attrib.tid = static_cast(tid); - - // After the argv array, is a 8-byte long NULL value before the array of env - // values. The end of the env values is marked by another 8-byte long NULL - // value. We step over it (the "+ 1" below) to get to the env values. - LIBC_NAMESPACE::ArgVEntryType *env_ptr = app.args->argv + app.args->argc + 1; - LIBC_NAMESPACE::ArgVEntryType *env_end_marker = env_ptr; - app.env_ptr = env_ptr; - while (*env_end_marker) - ++env_end_marker; - - // Initialize the POSIX global declared in unistd.h - environ = reinterpret_cast(env_ptr); - - // After the env array, is the aux-vector. The end of the aux-vector is - // denoted by an AT_NULL entry. 
- PgrHdrTableType *program_hdr_table = nullptr; - uintptr_t program_hdr_count; - app.auxv_ptr = reinterpret_cast(env_end_marker + 1); - for (auto *aux_entry = app.auxv_ptr; aux_entry->id != AT_NULL; ++aux_entry) { - switch (aux_entry->id) { - case AT_PHDR: - program_hdr_table = reinterpret_cast(aux_entry->value); - break; - case AT_PHNUM: - program_hdr_count = aux_entry->value; - break; - case AT_PAGESZ: - app.page_size = aux_entry->value; - break; - default: - break; // TODO: Read other useful entries from the aux vector. - } - } - - app.tls.size = 0; - for (uintptr_t i = 0; i < program_hdr_count; ++i) { - PgrHdrTableType *phdr = program_hdr_table + i; - if (phdr->p_type != PT_TLS) - continue; - // TODO: p_vaddr value has to be adjusted for static-pie executables. - app.tls.address = phdr->p_vaddr; - app.tls.size = phdr->p_memsz; - app.tls.init_size = phdr->p_filesz; - app.tls.align = phdr->p_align; - } - - // This descriptor has to be static since its cleanup function cannot - // capture the context. - static LIBC_NAMESPACE::TLSDescriptor tls; - LIBC_NAMESPACE::init_tls(tls); - if (tls.size != 0) - LIBC_NAMESPACE::set_thread_ptr(tls.tp); - - LIBC_NAMESPACE::self.attrib = &LIBC_NAMESPACE::main_thread_attrib; - LIBC_NAMESPACE::main_thread_attrib.atexit_callback_mgr = - LIBC_NAMESPACE::internal::get_thread_atexit_callback_mgr(); - // We register the cleanup_tls function to be the last atexit callback to be - // invoked. It will tear down the TLS. Other callbacks may depend on TLS (such - // as the stack protector canary). - LIBC_NAMESPACE::atexit( - []() { LIBC_NAMESPACE::cleanup_tls(tls.tp, tls.size); }); - // We want the fini array callbacks to be run after other atexit - // callbacks are run. So, we register them before running the init - // array callbacks as they can potentially register their own atexit - // callbacks. - LIBC_NAMESPACE::atexit(&LIBC_NAMESPACE::call_fini_array_callbacks); - - LIBC_NAMESPACE::call_init_array_callbacks( - static_cast(app.args->argc), - reinterpret_cast(app.args->argv), - reinterpret_cast(env_ptr)); - - int retval = main(static_cast(app.args->argc), - reinterpret_cast(app.args->argv), - reinterpret_cast(env_ptr)); - - LIBC_NAMESPACE::exit(retval); -} - -extern "C" void _start() { +#include "src/__support/macros/attributes.h" +#include "startup/linux/do_start.h" + +extern "C" [[noreturn]] void _start() { + asm volatile(".option push\n\t" + ".option norelax\n\t" + "lla gp, __global_pointer$\n\t" + ".option pop\n\t"); // Fetch the args using the frame pointer. - app.args = reinterpret_cast( + LIBC_NAMESPACE::app.args = reinterpret_cast( reinterpret_cast(__builtin_frame_address(0))); - do_start(); + LIBC_NAMESPACE::do_start(); } diff --git a/libc/startup/linux/riscv/tls.cpp b/libc/startup/linux/riscv/tls.cpp new file mode 100644 index 0000000000000..997912c77e737 --- /dev/null +++ b/libc/startup/linux/riscv/tls.cpp @@ -0,0 +1,74 @@ +//===-- Implementation of tls for riscv -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/OSUtil/syscall.h" +#include "src/__support/threads/thread.h" +#include "src/string/memory_utils/inline_memcpy.h" +#include "startup/linux/do_start.h" + +#include +#include + +namespace LIBC_NAMESPACE { + +#ifdef SYS_mmap2 +static constexpr long MMAP_SYSCALL_NUMBER = SYS_mmap2; +#elif SYS_mmap +static constexpr long MMAP_SYSCALL_NUMBER = SYS_mmap; +#else +#error "mmap and mmap2 syscalls not available." +#endif + +void init_tls(TLSDescriptor &tls_descriptor) { + if (app.tls.size == 0) { + tls_descriptor.size = 0; + tls_descriptor.tp = 0; + return; + } + + // riscv64 follows the variant 1 TLS layout: + const uintptr_t size_of_pointers = 2 * sizeof(uintptr_t); + uintptr_t padding = 0; + const uintptr_t ALIGNMENT_MASK = app.tls.align - 1; + uintptr_t diff = size_of_pointers & ALIGNMENT_MASK; + if (diff != 0) + padding += (ALIGNMENT_MASK - diff) + 1; + + uintptr_t alloc_size = size_of_pointers + padding + app.tls.size; + + // We cannot call the mmap function here as the functions set errno on + // failure. Since errno is implemented via a thread local variable, we cannot + // use errno before TLS is setup. + long mmap_ret_val = syscall_impl(MMAP_SYSCALL_NUMBER, nullptr, + alloc_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + // We cannot check the return value with MAP_FAILED as that is the return + // of the mmap function and not the mmap syscall. + if (mmap_ret_val < 0 && static_cast(mmap_ret_val) > -app.page_size) + syscall_impl(SYS_exit, 1); + uintptr_t thread_ptr = uintptr_t(reinterpret_cast(mmap_ret_val)); + uintptr_t tls_addr = thread_ptr + size_of_pointers + padding; + inline_memcpy(reinterpret_cast(tls_addr), + reinterpret_cast(app.tls.address), + app.tls.init_size); + tls_descriptor.size = alloc_size; + tls_descriptor.addr = thread_ptr; + tls_descriptor.tp = tls_addr; +} + +void cleanup_tls(uintptr_t addr, uintptr_t size) { + if (size == 0) + return; + syscall_impl(SYS_munmap, addr, size); +} + +bool set_thread_ptr(uintptr_t val) { + LIBC_INLINE_ASM("mv tp, %0\n\t" : : "r"(val)); + return true; +} +} // namespace LIBC_NAMESPACE diff --git a/libc/startup/linux/x86_64/CMakeLists.txt b/libc/startup/linux/x86_64/CMakeLists.txt index aac5a0626a176..30da7ab4e1ec3 100644 --- a/libc/startup/linux/x86_64/CMakeLists.txt +++ b/libc/startup/linux/x86_64/CMakeLists.txt @@ -1,22 +1,30 @@ add_startup_object( - crt1 + tls SRC - start.cpp + tls.cpp DEPENDS libc.config.linux.app_h libc.include.sys_mman libc.include.sys_syscall - libc.include.unistd - libc.src.__support.threads.thread libc.src.__support.OSUtil.osutil - libc.src.stdlib.exit - libc.src.stdlib.abort - libc.src.stdlib.atexit libc.src.string.memory_utils.inline_memcpy - libc.src.unistd.environ COMPILE_OPTIONS -fno-stack-protector -fno-omit-frame-pointer - -ffreestanding # To avoid compiler warnings about calling the main function. 
+ -ffreestanding + -fno-builtin +) + +add_startup_object( + start + SRC + start.cpp + DEPENDS + libc.config.linux.app_h + libc.src.__support.macros.attributes + COMPILE_OPTIONS + -fno-stack-protector + -fno-omit-frame-pointer + -ffreestanding -fno-builtin ) diff --git a/libc/startup/linux/x86_64/start.cpp b/libc/startup/linux/x86_64/start.cpp index bc03a3cb1de27..25da25a496daa 100644 --- a/libc/startup/linux/x86_64/start.cpp +++ b/libc/startup/linux/x86_64/start.cpp @@ -1,151 +1,18 @@ -//===-- Implementation of crt for x86_64 ----------------------------------===// +//===-- Implementation of _start for x86_64 -------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// - -#include "config/linux/app.h" -#include "src/__support/OSUtil/io.h" -#include "src/__support/OSUtil/syscall.h" -#include "src/__support/threads/thread.h" -#include "src/stdlib/abort.h" -#include "src/stdlib/atexit.h" -#include "src/stdlib/exit.h" -#include "src/string/memory_utils/inline_memcpy.h" - -#include -#include -#include -#include -#include -#include -#include - -extern "C" int main(int, char **, char **); - -namespace LIBC_NAMESPACE { - -#ifdef SYS_mmap2 -static constexpr long MMAP_SYSCALL_NUMBER = SYS_mmap2; -#elif SYS_mmap -static constexpr long MMAP_SYSCALL_NUMBER = SYS_mmap; -#else -#error "mmap and mmap2 syscalls not available." -#endif - -AppProperties app; - -static ThreadAttributes main_thread_attrib; - -// TODO: The function is x86_64 specific. Move it to config/linux/app.h -// and generalize it. Also, dynamic loading is not handled currently. -void init_tls(TLSDescriptor &tls_descriptor) { - if (app.tls.size == 0) { - tls_descriptor.size = 0; - tls_descriptor.tp = 0; - return; - } - - // We will assume the alignment is always a power of two. - uintptr_t tls_size = app.tls.size & -app.tls.align; - if (tls_size != app.tls.size) - tls_size += app.tls.align; - - // Per the x86_64 TLS ABI, the entry pointed to by the thread pointer is the - // address of the TLS block. So, we add more size to accomodate this address - // entry. - // We also need to include space for the stack canary. The canary is at - // offset 0x28 (40) and is of size uintptr_t. - uintptr_t tls_size_with_addr = tls_size + sizeof(uintptr_t) + 40; - - // We cannot call the mmap function here as the functions set errno on - // failure. Since errno is implemented via a thread local variable, we cannot - // use errno before TLS is setup. - long mmap_retval = LIBC_NAMESPACE::syscall_impl( - MMAP_SYSCALL_NUMBER, nullptr, tls_size_with_addr, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - // We cannot check the return value with MAP_FAILED as that is the return - // of the mmap function and not the mmap syscall. - if (mmap_retval < 0 && static_cast(mmap_retval) > -app.page_size) - LIBC_NAMESPACE::syscall_impl(SYS_exit, 1); - uintptr_t *tls_addr = reinterpret_cast(mmap_retval); - - // x86_64 TLS faces down from the thread pointer with the first entry - // pointing to the address of the first real TLS byte. 
- uintptr_t end_ptr = reinterpret_cast(tls_addr) + tls_size; - *reinterpret_cast(end_ptr) = end_ptr; - - LIBC_NAMESPACE::inline_memcpy(reinterpret_cast(tls_addr), - reinterpret_cast(app.tls.address), - app.tls.init_size); - uintptr_t *stack_guard_addr = reinterpret_cast(end_ptr + 40); - // Setting the stack guard to a random value. - // We cannot call the get_random function here as the function sets errno on - // failure. Since errno is implemented via a thread local variable, we cannot - // use errno before TLS is setup. - ssize_t stack_guard_retval = LIBC_NAMESPACE::syscall_impl( - SYS_getrandom, reinterpret_cast(stack_guard_addr), sizeof(uint64_t), - 0); - if (stack_guard_retval < 0) - LIBC_NAMESPACE::syscall_impl(SYS_exit, 1); - - tls_descriptor = {tls_size_with_addr, reinterpret_cast(tls_addr), - end_ptr}; - return; -} - -void cleanup_tls(uintptr_t addr, uintptr_t size) { - if (size == 0) - return; - LIBC_NAMESPACE::syscall_impl(SYS_munmap, addr, size); -} - -// Sets the thread pointer to |val|. Returns true on success, false on failure. -static bool set_thread_ptr(uintptr_t val) { - return LIBC_NAMESPACE::syscall_impl(SYS_arch_prctl, ARCH_SET_FS, val) != -1; -} - -using InitCallback = void(int, char **, char **); -using FiniCallback = void(void); - -extern "C" { -// These arrays are present in the .init_array and .fini_array sections. -// The symbols are inserted by linker when it sees references to them. -extern uintptr_t __preinit_array_start[]; -extern uintptr_t __preinit_array_end[]; -extern uintptr_t __init_array_start[]; -extern uintptr_t __init_array_end[]; -extern uintptr_t __fini_array_start[]; -extern uintptr_t __fini_array_end[]; -} - -static void call_init_array_callbacks(int argc, char **argv, char **env) { - size_t preinit_array_size = __preinit_array_end - __preinit_array_start; - for (size_t i = 0; i < preinit_array_size; ++i) - reinterpret_cast(__preinit_array_start[i])(argc, argv, env); - size_t init_array_size = __init_array_end - __init_array_start; - for (size_t i = 0; i < init_array_size; ++i) - reinterpret_cast(__init_array_start[i])(argc, argv, env); -} - -static void call_fini_array_callbacks() { - size_t fini_array_size = __fini_array_end - __fini_array_start; - for (size_t i = fini_array_size; i > 0; --i) - reinterpret_cast(__fini_array_start[i - 1])(); -} - -} // namespace LIBC_NAMESPACE - -using LIBC_NAMESPACE::app; -using LIBC_NAMESPACE::AuxEntry; - -extern "C" void _start() { - // This TU is compiled with -fno-omit-frame-pointer. Hence, the previous value - // of the base pointer is pushed on to the stack. So, we step over it (the - // "+ 1" below) to get to the args. - app.args = reinterpret_cast( +#include "src/__support/macros/attributes.h" +#include "startup/linux/do_start.h" + +extern "C" [[noreturn]] void _start() { + // This TU is compiled with -fno-omit-frame-pointer. Hence, the previous + // value of the base pointer is pushed on to the stack. So, we step over + // it (the "+ 1" below) to get to the args. + LIBC_NAMESPACE::app.args = reinterpret_cast( reinterpret_cast(__builtin_frame_address(0)) + 1); // The x86_64 ABI requires that the stack pointer is aligned to a 16-byte @@ -159,88 +26,8 @@ extern "C" void _start() { // compilers can generate code assuming the alignment as required by the ABI. // If the stack pointers as setup by the OS are already aligned, then the // following code is a NOP. 
- __asm__ __volatile__("andq $0xfffffffffffffff0, %rsp\n\t"); - __asm__ __volatile__("andq $0xfffffffffffffff0, %rbp\n\t"); - - auto tid = LIBC_NAMESPACE::syscall_impl(SYS_gettid); - if (tid <= 0) - LIBC_NAMESPACE::syscall_impl(SYS_exit, 1); - LIBC_NAMESPACE::main_thread_attrib.tid = static_cast(tid); - - // After the argv array, is a 8-byte long NULL value before the array of env - // values. The end of the env values is marked by another 8-byte long NULL - // value. We step over it (the "+ 1" below) to get to the env values. - uint64_t *env_ptr = app.args->argv + app.args->argc + 1; - uint64_t *env_end_marker = env_ptr; - app.env_ptr = env_ptr; - while (*env_end_marker) - ++env_end_marker; - - // Initialize the POSIX global declared in unistd.h - environ = reinterpret_cast(env_ptr); - - // After the env array, is the aux-vector. The end of the aux-vector is - // denoted by an AT_NULL entry. - Elf64_Phdr *program_hdr_table = nullptr; - uintptr_t program_hdr_count = 0; - app.auxv_ptr = reinterpret_cast(env_end_marker + 1); - for (auto *aux_entry = app.auxv_ptr; aux_entry->id != AT_NULL; ++aux_entry) { - switch (aux_entry->id) { - case AT_PHDR: - program_hdr_table = reinterpret_cast(aux_entry->value); - break; - case AT_PHNUM: - program_hdr_count = aux_entry->value; - break; - case AT_PAGESZ: - app.page_size = aux_entry->value; - break; - default: - break; // TODO: Read other useful entries from the aux vector. - } - } - - app.tls.size = 0; - for (uintptr_t i = 0; i < program_hdr_count; ++i) { - Elf64_Phdr *phdr = program_hdr_table + i; - if (phdr->p_type != PT_TLS) - continue; - // TODO: p_vaddr value has to be adjusted for static-pie executables. - app.tls.address = phdr->p_vaddr; - app.tls.size = phdr->p_memsz; - app.tls.init_size = phdr->p_filesz; - app.tls.align = phdr->p_align; - } - - // This descriptor has to be static since its cleanup function cannot - // capture the context. - static LIBC_NAMESPACE::TLSDescriptor tls; - LIBC_NAMESPACE::init_tls(tls); - if (tls.size != 0 && !LIBC_NAMESPACE::set_thread_ptr(tls.tp)) - LIBC_NAMESPACE::syscall_impl(SYS_exit, 1); - - LIBC_NAMESPACE::self.attrib = &LIBC_NAMESPACE::main_thread_attrib; - LIBC_NAMESPACE::main_thread_attrib.atexit_callback_mgr = - LIBC_NAMESPACE::internal::get_thread_atexit_callback_mgr(); - // We register the cleanup_tls function to be the last atexit callback to be - // invoked. It will tear down the TLS. Other callbacks may depend on TLS (such - // as the stack protector canary). - LIBC_NAMESPACE::atexit( - []() { LIBC_NAMESPACE::cleanup_tls(tls.tp, tls.size); }); - // We want the fini array callbacks to be run after other atexit - // callbacks are run. So, we register them before running the init - // array callbacks as they can potentially register their own atexit - // callbacks. 
- LIBC_NAMESPACE::atexit(&LIBC_NAMESPACE::call_fini_array_callbacks); - - LIBC_NAMESPACE::call_init_array_callbacks( - static_cast(app.args->argc), - reinterpret_cast(app.args->argv), - reinterpret_cast(env_ptr)); - - int retval = main(static_cast(app.args->argc), - reinterpret_cast(app.args->argv), - reinterpret_cast(env_ptr)); + asm volatile("andq $0xfffffffffffffff0, %rsp\n\t"); + asm volatile("andq $0xfffffffffffffff0, %rbp\n\t"); - LIBC_NAMESPACE::exit(retval); + LIBC_NAMESPACE::do_start(); } diff --git a/libc/startup/linux/x86_64/tls.cpp b/libc/startup/linux/x86_64/tls.cpp new file mode 100644 index 0000000000000..8b0fa98736244 --- /dev/null +++ b/libc/startup/linux/x86_64/tls.cpp @@ -0,0 +1,93 @@ +//===-- Implementation of tls for x86_64 ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/OSUtil/syscall.h" +#include "src/string/memory_utils/inline_memcpy.h" +#include "startup/linux/do_start.h" + +#include +#include +#include + +namespace LIBC_NAMESPACE { + +#ifdef SYS_mmap2 +static constexpr long MMAP_SYSCALL_NUMBER = SYS_mmap2; +#elif SYS_mmap +static constexpr long MMAP_SYSCALL_NUMBER = SYS_mmap; +#else +#error "mmap and mmap2 syscalls not available." +#endif + +// TODO: Also generalize this routine and handle dynamic loading properly. +void init_tls(TLSDescriptor &tls_descriptor) { + if (app.tls.size == 0) { + tls_descriptor.size = 0; + tls_descriptor.tp = 0; + return; + } + + // We will assume the alignment is always a power of two. + uintptr_t tls_size = app.tls.size & -app.tls.align; + if (tls_size != app.tls.size) + tls_size += app.tls.align; + + // Per the x86_64 TLS ABI, the entry pointed to by the thread pointer is the + // address of the TLS block. So, we add more size to accomodate this address + // entry. + // We also need to include space for the stack canary. The canary is at + // offset 0x28 (40) and is of size uintptr_t. + uintptr_t tls_size_with_addr = tls_size + sizeof(uintptr_t) + 40; + + // We cannot call the mmap function here as the functions set errno on + // failure. Since errno is implemented via a thread local variable, we cannot + // use errno before TLS is setup. + long mmap_retval = syscall_impl( + MMAP_SYSCALL_NUMBER, nullptr, tls_size_with_addr, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + // We cannot check the return value with MAP_FAILED as that is the return + // of the mmap function and not the mmap syscall. + if (mmap_retval < 0 && static_cast(mmap_retval) > -app.page_size) + syscall_impl(SYS_exit, 1); + uintptr_t *tls_addr = reinterpret_cast(mmap_retval); + + // x86_64 TLS faces down from the thread pointer with the first entry + // pointing to the address of the first real TLS byte. + uintptr_t end_ptr = reinterpret_cast(tls_addr) + tls_size; + *reinterpret_cast(end_ptr) = end_ptr; + + inline_memcpy(reinterpret_cast(tls_addr), + reinterpret_cast(app.tls.address), + app.tls.init_size); + uintptr_t *stack_guard_addr = reinterpret_cast(end_ptr + 40); + // Setting the stack guard to a random value. + // We cannot call the get_random function here as the function sets errno on + // failure. Since errno is implemented via a thread local variable, we cannot + // use errno before TLS is setup. 
+ long stack_guard_retval = + syscall_impl(SYS_getrandom, reinterpret_cast(stack_guard_addr), + sizeof(uint64_t), 0); + if (stack_guard_retval < 0) + syscall_impl(SYS_exit, 1); + + tls_descriptor = {tls_size_with_addr, reinterpret_cast(tls_addr), + end_ptr}; + return; +} + +void cleanup_tls(uintptr_t addr, uintptr_t size) { + if (size == 0) + return; + syscall_impl(SYS_munmap, addr, size); +} + +// Sets the thread pointer to |val|. Returns true on success, false on failure. +bool set_thread_ptr(uintptr_t val) { + return syscall_impl(SYS_arch_prctl, ARCH_SET_FS, val) != -1; +} +} // namespace LIBC_NAMESPACE From a8cb4f7273ac4ea6f9cc3c03ed65a32542e947fe Mon Sep 17 00:00:00 2001 From: Michael Maitland Date: Thu, 4 Jan 2024 12:59:31 -0800 Subject: [PATCH 284/313] [RISCV][llvm-mca] Fix failing strided-load-x0.s test 58f1640635feff282935153d295dfb4ea1818401 was committed but had a broken test that I did not add the updated version to the commit. This patch fixes the test. --- .../llvm-mca/RISCV/SiFive7/strided-load-x0.s | 58 +++++++++---------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFive7/strided-load-x0.s b/llvm/test/tools/llvm-mca/RISCV/SiFive7/strided-load-x0.s index eace2ad12d654..19864bed87220 100644 --- a/llvm/test/tools/llvm-mca/RISCV/SiFive7/strided-load-x0.s +++ b/llvm/test/tools/llvm-mca/RISCV/SiFive7/strided-load-x0.s @@ -37,13 +37,13 @@ vle64.v v1, (a1) # CHECK: Iterations: 1 # CHECK-NEXT: Instructions: 26 -# CHECK-NEXT: Total Cycles: 3546 +# CHECK-NEXT: Total Cycles: 234 # CHECK-NEXT: Total uOps: 26 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 0.01 -# CHECK-NEXT: IPC: 0.01 -# CHECK-NEXT: Block RThroughput: 3541.0 +# CHECK-NEXT: uOps Per Cycle: 0.11 +# CHECK-NEXT: IPC: 0.11 +# CHECK-NEXT: Block RThroughput: 229.0 # CHECK: Instruction Info: # CHECK-NEXT: [1]: #uOps @@ -55,26 +55,26 @@ vle64.v v1, (a1) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: # CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 515 513.00 * vlse8.v v1, (a1), a2 -# CHECK-NEXT: 1 259 257.00 * vlse16.v v1, (a1), a2 +# CHECK-NEXT: 1 19 17.00 * vlse8.v v1, (a1), a2 +# CHECK-NEXT: 1 19 17.00 * vlse16.v v1, (a1), a2 # CHECK-NEXT: 1 19 17.00 * vlse32.v v1, (a1), a2 -# CHECK-NEXT: 1 67 65.00 * vlse64.v v1, (a1), a2 -# CHECK-NEXT: 1 515 513.00 * vlse8.v v1, (a1), zero -# CHECK-NEXT: 1 259 257.00 * vlse16.v v1, (a1), zero +# CHECK-NEXT: 1 19 17.00 * vlse64.v v1, (a1), a2 +# CHECK-NEXT: 1 19 17.00 * vlse8.v v1, (a1), zero +# CHECK-NEXT: 1 19 17.00 * vlse16.v v1, (a1), zero # CHECK-NEXT: 1 19 17.00 * vlse32.v v1, (a1), zero -# CHECK-NEXT: 1 67 65.00 * vlse64.v v1, (a1), zero +# CHECK-NEXT: 1 19 17.00 * vlse64.v v1, (a1), zero # CHECK-NEXT: 1 4 2.00 * vle8.v v1, (a1) # CHECK-NEXT: 1 4 2.00 * vle16.v v1, (a1) # CHECK-NEXT: 1 4 3.00 * vle32.v v1, (a1) # CHECK-NEXT: 1 4 5.00 * vle64.v v1, (a1) # CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 515 513.00 * vlse8.v v1, (a1), a2 -# CHECK-NEXT: 1 259 257.00 * vlse16.v v1, (a1), a2 -# CHECK-NEXT: 1 131 129.00 * vlse32.v v1, (a1), a2 +# CHECK-NEXT: 1 11 9.00 * vlse8.v v1, (a1), a2 +# CHECK-NEXT: 1 11 9.00 * vlse16.v v1, (a1), a2 +# CHECK-NEXT: 1 11 9.00 * vlse32.v v1, (a1), a2 # CHECK-NEXT: 1 11 9.00 * vlse64.v v1, (a1), a2 -# CHECK-NEXT: 1 515 513.00 * vlse8.v v1, (a1), zero -# CHECK-NEXT: 1 259 257.00 * vlse16.v v1, (a1), zero -# CHECK-NEXT: 1 131 129.00 * vlse32.v v1, (a1), zero +# CHECK-NEXT: 1 11 9.00 * vlse8.v v1, (a1), zero +# CHECK-NEXT: 1 11 9.00 * 
vlse16.v v1, (a1), zero +# CHECK-NEXT: 1 11 9.00 * vlse32.v v1, (a1), zero # CHECK-NEXT: 1 11 9.00 * vlse64.v v1, (a1), zero # CHECK-NEXT: 1 4 2.00 * vle8.v v1, (a1) # CHECK-NEXT: 1 4 2.00 * vle16.v v1, (a1) @@ -93,31 +93,31 @@ vle64.v v1, (a1) # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] -# CHECK-NEXT: - - 2.00 - - 24.00 3541.00 - +# CHECK-NEXT: - - 2.00 - - 24.00 229.00 - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] Instructions: # CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 513.00 - vlse8.v v1, (a1), a2 -# CHECK-NEXT: - - - - - 1.00 257.00 - vlse16.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 17.00 - vlse8.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 17.00 - vlse16.v v1, (a1), a2 # CHECK-NEXT: - - - - - 1.00 17.00 - vlse32.v v1, (a1), a2 -# CHECK-NEXT: - - - - - 1.00 65.00 - vlse64.v v1, (a1), a2 -# CHECK-NEXT: - - - - - 1.00 513.00 - vlse8.v v1, (a1), zero -# CHECK-NEXT: - - - - - 1.00 257.00 - vlse16.v v1, (a1), zero +# CHECK-NEXT: - - - - - 1.00 17.00 - vlse64.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 17.00 - vlse8.v v1, (a1), zero +# CHECK-NEXT: - - - - - 1.00 17.00 - vlse16.v v1, (a1), zero # CHECK-NEXT: - - - - - 1.00 17.00 - vlse32.v v1, (a1), zero -# CHECK-NEXT: - - - - - 1.00 65.00 - vlse64.v v1, (a1), zero +# CHECK-NEXT: - - - - - 1.00 17.00 - vlse64.v v1, (a1), zero # CHECK-NEXT: - - - - - 1.00 2.00 - vle8.v v1, (a1) # CHECK-NEXT: - - - - - 1.00 2.00 - vle16.v v1, (a1) # CHECK-NEXT: - - - - - 1.00 3.00 - vle32.v v1, (a1) # CHECK-NEXT: - - - - - 1.00 5.00 - vle64.v v1, (a1) # CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e64, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 513.00 - vlse8.v v1, (a1), a2 -# CHECK-NEXT: - - - - - 1.00 257.00 - vlse16.v v1, (a1), a2 -# CHECK-NEXT: - - - - - 1.00 129.00 - vlse32.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 9.00 - vlse8.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 9.00 - vlse16.v v1, (a1), a2 +# CHECK-NEXT: - - - - - 1.00 9.00 - vlse32.v v1, (a1), a2 # CHECK-NEXT: - - - - - 1.00 9.00 - vlse64.v v1, (a1), a2 -# CHECK-NEXT: - - - - - 1.00 513.00 - vlse8.v v1, (a1), zero -# CHECK-NEXT: - - - - - 1.00 257.00 - vlse16.v v1, (a1), zero -# CHECK-NEXT: - - - - - 1.00 129.00 - vlse32.v v1, (a1), zero +# CHECK-NEXT: - - - - - 1.00 9.00 - vlse8.v v1, (a1), zero +# CHECK-NEXT: - - - - - 1.00 9.00 - vlse16.v v1, (a1), zero +# CHECK-NEXT: - - - - - 1.00 9.00 - vlse32.v v1, (a1), zero # CHECK-NEXT: - - - - - 1.00 9.00 - vlse64.v v1, (a1), zero # CHECK-NEXT: - - - - - 1.00 2.00 - vle8.v v1, (a1) # CHECK-NEXT: - - - - - 1.00 2.00 - vle16.v v1, (a1) From 40b4ac278e87e2b98e52094b72728936df3e92d9 Mon Sep 17 00:00:00 2001 From: Ben Langmuir Date: Thu, 4 Jan 2024 13:13:22 -0800 Subject: [PATCH 285/313] [ORC] Refactor executor symbol lookup to use ExecutorSymbolDef (NFC) (#76989) This migrates the dylib manager lookup and related APIs to replace ExecutorAddress with ExecutorSymbolDef so that in the future we can model JITSymbolFlags for these symbols. The current change should be NFC as we are only setting the Exported symbol flag. 
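
As a rough usage sketch (illustrative only, not part of this change; the helper name printSymbols is made up): callers that previously consumed a vector of addresses now read the address and flags out of each ExecutorSymbolDef, e.g.

```
// Hypothetical client code; the types and member functions follow the
// updated EPCGenericDylibManager / ExecutorSymbolDef headers in this patch.
#include "llvm/ExecutionEngine/Orc/EPCGenericDylibManager.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace llvm::orc;

static Error printSymbols(EPCGenericDylibManager &DylibMgr,
                          tpctypes::DylibHandle H,
                          const SymbolLookupSet &Lookup) {
  // Previously this returned Expected<std::vector<ExecutorAddr>>.
  Expected<std::vector<ExecutorSymbolDef>> Syms = DylibMgr.lookup(H, Lookup);
  if (!Syms)
    return Syms.takeError();
  for (const ExecutorSymbolDef &Def : *Syms)
    errs() << "addr=" << Def.getAddress().getValue()
           << " exported=" << Def.getFlags().isExported() << "\n";
  return Error::success();
}
```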
--- compiler-rt/lib/orc/executor_symbol_def.h | 151 ++++++++++++++++++ compiler-rt/lib/orc/tests/unit/CMakeLists.txt | 1 + .../tests/unit/executor_symbol_def_test.cpp | 19 +++ .../unit/simple_packed_serialization_test.cpp | 20 +-- .../unit/simple_packed_serialization_utils.h | 34 ++++ .../Orc/EPCGenericDylibManager.h | 7 +- .../Orc/Shared/ExecutorSymbolDef.h | 58 +++++++ .../ExecutionEngine/Orc/Shared/OrcRTBridge.h | 3 +- .../Orc/Shared/TargetProcessControlTypes.h | 3 +- .../SimpleExecutorDylibManager.h | 5 +- .../Orc/EPCDebugObjectRegistrar.cpp | 3 +- .../Orc/EPCDynamicLibrarySearchGenerator.cpp | 4 +- .../Orc/EPCGenericDylibManager.cpp | 10 +- .../Orc/ExecutorProcessControl.cpp | 6 +- .../Orc/LookupAndRecordAddrs.cpp | 2 +- .../SimpleExecutorDylibManager.cpp | 9 +- llvm/tools/lli/ForwardingMemoryManager.h | 9 +- 17 files changed, 300 insertions(+), 44 deletions(-) create mode 100644 compiler-rt/lib/orc/executor_symbol_def.h create mode 100644 compiler-rt/lib/orc/tests/unit/executor_symbol_def_test.cpp create mode 100644 compiler-rt/lib/orc/tests/unit/simple_packed_serialization_utils.h diff --git a/compiler-rt/lib/orc/executor_symbol_def.h b/compiler-rt/lib/orc/executor_symbol_def.h new file mode 100644 index 0000000000000..454cefe525cfa --- /dev/null +++ b/compiler-rt/lib/orc/executor_symbol_def.h @@ -0,0 +1,151 @@ +//===--------- ExecutorSymbolDef.h - (Addr, Flags) pair ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Represents a defining location for a symbol in the executing program. +// +// This file was derived from +// llvm/include/llvm/ExecutionEngine/Orc/Shared/ExecutorSymbolDef.h. +// +//===----------------------------------------------------------------------===// + +#ifndef ORC_RT_EXECUTOR_SYMBOL_DEF_H +#define ORC_RT_EXECUTOR_SYMBOL_DEF_H + +#include "bitmask_enum.h" +#include "executor_address.h" +#include "simple_packed_serialization.h" + +namespace __orc_rt { + +/// Flags for symbols in the JIT. +class JITSymbolFlags { +public: + using UnderlyingType = uint8_t; + using TargetFlagsType = uint8_t; + + /// These values must be kept in sync with \c JITSymbolFlags in the JIT. + enum FlagNames : UnderlyingType { + None = 0, + HasError = 1U << 0, + Weak = 1U << 1, + Common = 1U << 2, + Absolute = 1U << 3, + Exported = 1U << 4, + Callable = 1U << 5, + MaterializationSideEffectsOnly = 1U << 6, + ORC_RT_MARK_AS_BITMASK_ENUM( // LargestValue = + MaterializationSideEffectsOnly) + }; + + /// Default-construct a JITSymbolFlags instance. + JITSymbolFlags() = default; + + /// Construct a JITSymbolFlags instance from the given flags and target + /// flags. + JITSymbolFlags(FlagNames Flags, TargetFlagsType TargetFlags) + : TargetFlags(TargetFlags), Flags(Flags) {} + + bool operator==(const JITSymbolFlags &RHS) const { + return Flags == RHS.Flags && TargetFlags == RHS.TargetFlags; + } + + /// Get the underlying flags value as an integer. + UnderlyingType getRawFlagsValue() const { + return static_cast(Flags); + } + + /// Return a reference to the target-specific flags. + TargetFlagsType &getTargetFlags() { return TargetFlags; } + + /// Return a reference to the target-specific flags. 
+ const TargetFlagsType &getTargetFlags() const { return TargetFlags; } + +private: + TargetFlagsType TargetFlags = 0; + FlagNames Flags = None; +}; + +/// Represents a defining location for a JIT symbol. +class ExecutorSymbolDef { +public: + ExecutorSymbolDef() = default; + ExecutorSymbolDef(ExecutorAddr Addr, JITSymbolFlags Flags) + : Addr(Addr), Flags(Flags) {} + + const ExecutorAddr &getAddress() const { return Addr; } + + const JITSymbolFlags &getFlags() const { return Flags; } + + friend bool operator==(const ExecutorSymbolDef &LHS, + const ExecutorSymbolDef &RHS) { + return LHS.getAddress() == RHS.getAddress() && + LHS.getFlags() == RHS.getFlags(); + } + +private: + ExecutorAddr Addr; + JITSymbolFlags Flags; +}; + +using SPSJITSymbolFlags = + SPSTuple; + +/// SPS serializatior for JITSymbolFlags. +template <> class SPSSerializationTraits { + using FlagsArgList = SPSJITSymbolFlags::AsArgList; + +public: + static size_t size(const JITSymbolFlags &F) { + return FlagsArgList::size(F.getRawFlagsValue(), F.getTargetFlags()); + } + + static bool serialize(SPSOutputBuffer &BOB, const JITSymbolFlags &F) { + return FlagsArgList::serialize(BOB, F.getRawFlagsValue(), + F.getTargetFlags()); + } + + static bool deserialize(SPSInputBuffer &BIB, JITSymbolFlags &F) { + JITSymbolFlags::UnderlyingType RawFlags; + JITSymbolFlags::TargetFlagsType TargetFlags; + if (!FlagsArgList::deserialize(BIB, RawFlags, TargetFlags)) + return false; + F = JITSymbolFlags{static_cast(RawFlags), + TargetFlags}; + return true; + } +}; + +using SPSExecutorSymbolDef = SPSTuple; + +/// SPS serializatior for ExecutorSymbolDef. +template <> +class SPSSerializationTraits { + using DefArgList = SPSExecutorSymbolDef::AsArgList; + +public: + static size_t size(const ExecutorSymbolDef &ESD) { + return DefArgList::size(ESD.getAddress(), ESD.getFlags()); + } + + static bool serialize(SPSOutputBuffer &BOB, const ExecutorSymbolDef &ESD) { + return DefArgList::serialize(BOB, ESD.getAddress(), ESD.getFlags()); + } + + static bool deserialize(SPSInputBuffer &BIB, ExecutorSymbolDef &ESD) { + ExecutorAddr Addr; + JITSymbolFlags Flags; + if (!DefArgList::deserialize(BIB, Addr, Flags)) + return false; + ESD = ExecutorSymbolDef{Addr, Flags}; + return true; + } +}; + +} // End namespace __orc_rt + +#endif // ORC_RT_EXECUTOR_SYMBOL_DEF_H diff --git a/compiler-rt/lib/orc/tests/unit/CMakeLists.txt b/compiler-rt/lib/orc/tests/unit/CMakeLists.txt index 81df7e803bc58..aec689a407be6 100644 --- a/compiler-rt/lib/orc/tests/unit/CMakeLists.txt +++ b/compiler-rt/lib/orc/tests/unit/CMakeLists.txt @@ -5,6 +5,7 @@ set(UNITTEST_SOURCES endian_test.cpp error_test.cpp executor_address_test.cpp + executor_symbol_def_test.cpp extensible_rtti_test.cpp interval_map_test.cpp interval_set_test.cpp diff --git a/compiler-rt/lib/orc/tests/unit/executor_symbol_def_test.cpp b/compiler-rt/lib/orc/tests/unit/executor_symbol_def_test.cpp new file mode 100644 index 0000000000000..181091ca1e60c --- /dev/null +++ b/compiler-rt/lib/orc/tests/unit/executor_symbol_def_test.cpp @@ -0,0 +1,19 @@ +//===-- executor_symbol_def_test.cpp --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "executor_symbol_def.h" +#include "simple_packed_serialization_utils.h" +#include "gtest/gtest.h" + +using namespace __orc_rt; + +TEST(ExecutorSymbolDefTest, Serialization) { + blobSerializationRoundTrip(ExecutorSymbolDef{}); + blobSerializationRoundTrip( + ExecutorSymbolDef{ExecutorAddr{0x70}, {JITSymbolFlags::Callable, 9}}); +} \ No newline at end of file diff --git a/compiler-rt/lib/orc/tests/unit/simple_packed_serialization_test.cpp b/compiler-rt/lib/orc/tests/unit/simple_packed_serialization_test.cpp index e7a78062b210e..397114b4017e0 100644 --- a/compiler-rt/lib/orc/tests/unit/simple_packed_serialization_test.cpp +++ b/compiler-rt/lib/orc/tests/unit/simple_packed_serialization_test.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "simple_packed_serialization.h" +#include "simple_packed_serialization_utils.h" #include "gtest/gtest.h" using namespace __orc_rt; @@ -48,25 +49,6 @@ TEST(SimplePackedSerializationTest, SPSInputBuffer) { EXPECT_FALSE(IB.read(&C, 1)); } -template -static void blobSerializationRoundTrip(const T &Value) { - using BST = SPSSerializationTraits; - - size_t Size = BST::size(Value); - auto Buffer = std::make_unique(Size); - SPSOutputBuffer OB(Buffer.get(), Size); - - EXPECT_TRUE(BST::serialize(OB, Value)); - - SPSInputBuffer IB(Buffer.get(), Size); - - T DSValue; - EXPECT_TRUE(BST::deserialize(IB, DSValue)); - - EXPECT_EQ(Value, DSValue) - << "Incorrect value after serialization/deserialization round-trip"; -} - template static void testFixedIntegralTypeSerialization() { blobSerializationRoundTrip(0); blobSerializationRoundTrip(static_cast(1)); diff --git a/compiler-rt/lib/orc/tests/unit/simple_packed_serialization_utils.h b/compiler-rt/lib/orc/tests/unit/simple_packed_serialization_utils.h new file mode 100644 index 0000000000000..746be43d250bd --- /dev/null +++ b/compiler-rt/lib/orc/tests/unit/simple_packed_serialization_utils.h @@ -0,0 +1,34 @@ +//===-- simple_packed_serialization_utils.h -------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef ORC_RT_TEST_SIMPLE_PACKED_SERIALIZATION_UTILS_H +#define ORC_RT_TEST_SIMPLE_PACKED_SERIALIZATION_UTILS_H + +#include "simple_packed_serialization.h" +#include "gtest/gtest.h" + +template +static void blobSerializationRoundTrip(const T &Value) { + using BST = __orc_rt::SPSSerializationTraits; + + size_t Size = BST::size(Value); + auto Buffer = std::make_unique(Size); + __orc_rt::SPSOutputBuffer OB(Buffer.get(), Size); + + EXPECT_TRUE(BST::serialize(OB, Value)); + + __orc_rt::SPSInputBuffer IB(Buffer.get(), Size); + + T DSValue; + EXPECT_TRUE(BST::deserialize(IB, DSValue)); + + EXPECT_EQ(Value, DSValue) + << "Incorrect value after serialization/deserialization round-trip"; +} + +#endif // ORC_RT_TEST_SIMPLE_PACKED_SERIALIZATION_UTILS_H \ No newline at end of file diff --git a/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericDylibManager.h b/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericDylibManager.h index 02e580c86f548..6ee2deef04d09 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericDylibManager.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericDylibManager.h @@ -19,6 +19,7 @@ #define LLVM_EXECUTIONENGINE_ORC_EPCGENERICDYLIBMANAGER_H #include "llvm/ExecutionEngine/Orc/ExecutorProcessControl.h" +#include "llvm/ExecutionEngine/Orc/Shared/ExecutorSymbolDef.h" #include "llvm/ExecutionEngine/Orc/Shared/SimpleRemoteEPCUtils.h" namespace llvm { @@ -49,11 +50,11 @@ class EPCGenericDylibManager { Expected open(StringRef Path, uint64_t Mode); /// Looks up symbols within the given dylib. - Expected> lookup(tpctypes::DylibHandle H, - const SymbolLookupSet &Lookup); + Expected> + lookup(tpctypes::DylibHandle H, const SymbolLookupSet &Lookup); /// Looks up symbols within the given dylib. - Expected> + Expected> lookup(tpctypes::DylibHandle H, const RemoteSymbolLookupSet &Lookup); private: diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/ExecutorSymbolDef.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/ExecutorSymbolDef.h index 5c58a7255ebd8..68ccdf83bd120 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/ExecutorSymbolDef.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/ExecutorSymbolDef.h @@ -15,6 +15,7 @@ #include "llvm/ExecutionEngine/JITSymbol.h" #include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h" +#include "llvm/ExecutionEngine/Orc/Shared/SimplePackedSerialization.h" namespace llvm { namespace orc { @@ -48,6 +49,63 @@ class ExecutorSymbolDef { JITSymbolFlags Flags; }; +namespace shared { + +using SPSJITSymbolFlags = + SPSTuple; + +/// SPS serializatior for JITSymbolFlags. +template <> class SPSSerializationTraits { + using FlagsArgList = SPSJITSymbolFlags::AsArgList; + +public: + static size_t size(const JITSymbolFlags &F) { + return FlagsArgList::size(F.getRawFlagsValue(), F.getTargetFlags()); + } + + static bool serialize(SPSOutputBuffer &BOB, const JITSymbolFlags &F) { + return FlagsArgList::serialize(BOB, F.getRawFlagsValue(), + F.getTargetFlags()); + } + + static bool deserialize(SPSInputBuffer &BIB, JITSymbolFlags &F) { + JITSymbolFlags::UnderlyingType RawFlags; + JITSymbolFlags::TargetFlagsType TargetFlags; + if (!FlagsArgList::deserialize(BIB, RawFlags, TargetFlags)) + return false; + F = JITSymbolFlags{static_cast(RawFlags), + TargetFlags}; + return true; + } +}; + +using SPSExecutorSymbolDef = SPSTuple; + +/// SPS serializatior for ExecutorSymbolDef. 
+template <> +class SPSSerializationTraits { + using DefArgList = SPSExecutorSymbolDef::AsArgList; + +public: + static size_t size(const ExecutorSymbolDef &ESD) { + return DefArgList::size(ESD.getAddress(), ESD.getFlags()); + } + + static bool serialize(SPSOutputBuffer &BOB, const ExecutorSymbolDef &ESD) { + return DefArgList::serialize(BOB, ESD.getAddress(), ESD.getFlags()); + } + + static bool deserialize(SPSInputBuffer &BIB, ExecutorSymbolDef &ESD) { + ExecutorAddr Addr; + JITSymbolFlags Flags; + if (!DefArgList::deserialize(BIB, Addr, Flags)) + return false; + ESD = ExecutorSymbolDef{Addr, Flags}; + return true; + } +}; + +} // End namespace shared. } // End namespace orc. } // End namespace llvm. diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h index 8e336ca03eaf0..0c549bcbf0130 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h @@ -14,6 +14,7 @@ #define LLVM_EXECUTIONENGINE_ORC_SHARED_ORCRTBRIDGE_H #include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h" +#include "llvm/ExecutionEngine/Orc/Shared/ExecutorSymbolDef.h" #include "llvm/ExecutionEngine/Orc/Shared/SimpleRemoteEPCUtils.h" #include "llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h" @@ -54,7 +55,7 @@ using SPSSimpleExecutorDylibManagerOpenSignature = shared::SPSString, uint64_t); using SPSSimpleExecutorDylibManagerLookupSignature = - shared::SPSExpected>( + shared::SPSExpected>( shared::SPSExecutorAddr, shared::SPSExecutorAddr, shared::SPSRemoteSymbolLookupSet); diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h index 7322674559c9e..e91d8d926d88c 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h @@ -19,6 +19,7 @@ #include "llvm/ExecutionEngine/JITSymbol.h" #include "llvm/ExecutionEngine/Orc/Shared/AllocationActions.h" #include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h" +#include "llvm/ExecutionEngine/Orc/Shared/ExecutorSymbolDef.h" #include "llvm/ExecutionEngine/Orc/Shared/MemoryFlags.h" #include "llvm/ExecutionEngine/Orc/Shared/SimplePackedSerialization.h" #include "llvm/ExecutionEngine/Orc/Shared/WrapperFunctionUtils.h" @@ -113,7 +114,7 @@ struct PointerWrite { /// A handle used to represent a loaded dylib in the target process. 
using DylibHandle = ExecutorAddr; -using LookupResult = std::vector; +using LookupResult = std::vector; } // end namespace tpctypes diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorDylibManager.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorDylibManager.h index fd4504cfb7fb0..00fd84e3ec142 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorDylibManager.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorDylibManager.h @@ -18,6 +18,7 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h" +#include "llvm/ExecutionEngine/Orc/Shared/ExecutorSymbolDef.h" #include "llvm/ExecutionEngine/Orc/Shared/SimpleRemoteEPCUtils.h" #include "llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h" #include "llvm/ExecutionEngine/Orc/Shared/WrapperFunctionUtils.h" @@ -37,8 +38,8 @@ class SimpleExecutorDylibManager : public ExecutorBootstrapService { virtual ~SimpleExecutorDylibManager(); Expected open(const std::string &Path, uint64_t Mode); - Expected> lookup(tpctypes::DylibHandle H, - const RemoteSymbolLookupSet &L); + Expected> + lookup(tpctypes::DylibHandle H, const RemoteSymbolLookupSet &L); Error shutdown() override; void addBootstrapSymbols(StringMap &M) override; diff --git a/llvm/lib/ExecutionEngine/Orc/EPCDebugObjectRegistrar.cpp b/llvm/lib/ExecutionEngine/Orc/EPCDebugObjectRegistrar.cpp index b8969de549368..acd7e5a409fc5 100644 --- a/llvm/lib/ExecutionEngine/Orc/EPCDebugObjectRegistrar.cpp +++ b/llvm/lib/ExecutionEngine/Orc/EPCDebugObjectRegistrar.cpp @@ -45,7 +45,8 @@ Expected> createJITLoaderGDBRegistrar( assert((*Result)[0].size() == 1 && "Unexpected number of addresses in result"); - return std::make_unique(ES, (*Result)[0][0]); + ExecutorAddr RegisterAddr = (*Result)[0][0].getAddress(); + return std::make_unique(ES, RegisterAddr); } Error EPCDebugObjectRegistrar::registerDebugObject(ExecutorAddrRange TargetMem, diff --git a/llvm/lib/ExecutionEngine/Orc/EPCDynamicLibrarySearchGenerator.cpp b/llvm/lib/ExecutionEngine/Orc/EPCDynamicLibrarySearchGenerator.cpp index 46e16a55c7e1d..2e2e7a9c5f32e 100644 --- a/llvm/lib/ExecutionEngine/Orc/EPCDynamicLibrarySearchGenerator.cpp +++ b/llvm/lib/ExecutionEngine/Orc/EPCDynamicLibrarySearchGenerator.cpp @@ -52,8 +52,8 @@ Error EPCDynamicLibrarySearchGenerator::tryToGenerate( auto ResultI = Result->front().begin(); for (auto &KV : LookupSymbols) { - if (*ResultI) - NewSymbols[KV.first] = {*ResultI, JITSymbolFlags::Exported}; + if (ResultI->getAddress()) + NewSymbols[KV.first] = *ResultI; ++ResultI; } diff --git a/llvm/lib/ExecutionEngine/Orc/EPCGenericDylibManager.cpp b/llvm/lib/ExecutionEngine/Orc/EPCGenericDylibManager.cpp index e70749cdfab24..da185c80c6c7d 100644 --- a/llvm/lib/ExecutionEngine/Orc/EPCGenericDylibManager.cpp +++ b/llvm/lib/ExecutionEngine/Orc/EPCGenericDylibManager.cpp @@ -81,10 +81,11 @@ Expected EPCGenericDylibManager::open(StringRef Path, return H; } -Expected> +Expected> EPCGenericDylibManager::lookup(tpctypes::DylibHandle H, const SymbolLookupSet &Lookup) { - Expected> Result((std::vector())); + Expected> Result( + (std::vector())); if (auto Err = EPC.callSPSWrapper( SAs.Lookup, Result, SAs.Instance, H, Lookup)) @@ -92,10 +93,11 @@ EPCGenericDylibManager::lookup(tpctypes::DylibHandle H, return Result; } -Expected> +Expected> EPCGenericDylibManager::lookup(tpctypes::DylibHandle H, const RemoteSymbolLookupSet &Lookup) { - Expected> Result((std::vector())); + Expected> Result( + 
(std::vector())); if (auto Err = EPC.callSPSWrapper( SAs.Lookup, Result, SAs.Instance, H, Lookup)) diff --git a/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp b/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp index ad27deff38d9e..f0c551cd77804 100644 --- a/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp @@ -95,7 +95,7 @@ SelfExecutorProcessControl::lookupSymbols(ArrayRef Request) { for (auto &Elem : Request) { sys::DynamicLibrary Dylib(Elem.Handle.toPtr()); - R.push_back(std::vector()); + R.push_back(std::vector()); for (auto &KV : Elem.Symbols) { auto &Sym = KV.first; std::string Tmp((*Sym).data() + !!GlobalManglingPrefix, @@ -107,7 +107,9 @@ SelfExecutorProcessControl::lookupSymbols(ArrayRef Request) { MissingSymbols.push_back(Sym); return make_error(SSP, std::move(MissingSymbols)); } - R.back().push_back(ExecutorAddr::fromPtr(Addr)); + // FIXME: determine accurate JITSymbolFlags. + R.back().push_back( + {ExecutorAddr::fromPtr(Addr), JITSymbolFlags::Exported}); } } diff --git a/llvm/lib/ExecutionEngine/Orc/LookupAndRecordAddrs.cpp b/llvm/lib/ExecutionEngine/Orc/LookupAndRecordAddrs.cpp index 75075c5c2a224..a369e1b533827 100644 --- a/llvm/lib/ExecutionEngine/Orc/LookupAndRecordAddrs.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LookupAndRecordAddrs.cpp @@ -73,7 +73,7 @@ Error lookupAndRecordAddrs( inconvertibleErrorCode()); for (unsigned I = 0; I != Pairs.size(); ++I) - *Pairs[I].second = Result->front()[I]; + *Pairs[I].second = Result->front()[I].getAddress(); return Error::success(); } diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorDylibManager.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorDylibManager.cpp index cb11b68e27195..b7e256a826ca4 100644 --- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorDylibManager.cpp +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorDylibManager.cpp @@ -40,10 +40,10 @@ SimpleExecutorDylibManager::open(const std::string &Path, uint64_t Mode) { return H; } -Expected> +Expected> SimpleExecutorDylibManager::lookup(tpctypes::DylibHandle H, const RemoteSymbolLookupSet &L) { - std::vector Result; + std::vector Result; auto DL = sys::DynamicLibrary(H.toPtr()); for (const auto &E : L) { @@ -52,7 +52,7 @@ SimpleExecutorDylibManager::lookup(tpctypes::DylibHandle H, return make_error("Required address for empty symbol \"\"", inconvertibleErrorCode()); else - Result.push_back(ExecutorAddr()); + Result.push_back(ExecutorSymbolDef()); } else { const char *DemangledSymName = E.Name.c_str(); @@ -70,7 +70,8 @@ SimpleExecutorDylibManager::lookup(tpctypes::DylibHandle H, DemangledSymName, inconvertibleErrorCode()); - Result.push_back(ExecutorAddr::fromPtr(Addr)); + // FIXME: determine accurate JITSymbolFlags. 
+ Result.push_back({ExecutorAddr::fromPtr(Addr), JITSymbolFlags::Exported}); } } diff --git a/llvm/tools/lli/ForwardingMemoryManager.h b/llvm/tools/lli/ForwardingMemoryManager.h index f1de7a153a270..2cc669953ee69 100644 --- a/llvm/tools/lli/ForwardingMemoryManager.h +++ b/llvm/tools/lli/ForwardingMemoryManager.h @@ -105,13 +105,14 @@ class RemoteResolver : public LegacyJITSymbolResolver { JITSymbol findSymbol(const std::string &Name) override { orc::RemoteSymbolLookupSet R; R.push_back({std::move(Name), false}); - if (auto Addrs = DylibMgr.lookup(H, R)) { - if (Addrs->size() != 1) + if (auto Syms = DylibMgr.lookup(H, R)) { + if (Syms->size() != 1) return make_error("Unexpected remote lookup result", inconvertibleErrorCode()); - return JITSymbol(Addrs->front().getValue(), JITSymbolFlags::Exported); + return JITSymbol(Syms->front().getAddress().getValue(), + Syms->front().getFlags()); } else - return Addrs.takeError(); + return Syms.takeError(); } JITSymbol findSymbolInLogicalDylib(const std::string &Name) override { From 6af713ae170c34f0561f19e594266ce2a2af343b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 4 Jan 2024 23:18:53 +0200 Subject: [PATCH 286/313] [LLD] [MinGW] Remove an unnecessary include of unistd.h. NFC. (#76953) This was present since when the MinGW LLD frontend first was merged in 894dbbe8eb0e3d8fc7d8746a76d2380f0c2b6c0f, where it most probably was a leftover from earlier evolution of the patch. --- lld/MinGW/Driver.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lld/MinGW/Driver.cpp b/lld/MinGW/Driver.cpp index 94f0ae7993e62..5ba1bf0e4b4e8 100644 --- a/lld/MinGW/Driver.cpp +++ b/lld/MinGW/Driver.cpp @@ -46,10 +46,6 @@ #include "llvm/TargetParser/Triple.h" #include -#if !defined(_MSC_VER) && !defined(__MINGW32__) -#include -#endif - using namespace lld; using namespace llvm::opt; using namespace llvm; From dd047c5b64944bae830b9fecf53f8d11ff41386e Mon Sep 17 00:00:00 2001 From: madanial0 <118996571+madanial0@users.noreply.github.com> Date: Thu, 4 Jan 2024 16:23:01 -0500 Subject: [PATCH 287/313] [Flang] remove setting lowerbound for non existing dimension in shift runtime test case (NFC) (#76990) The shift2 array only has 1 dimension but the lower bound for a second dimension is being set causing a seg fault on AIX. Co-authored-by: Mark Danial --- flang/unittests/Runtime/Transformational.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/flang/unittests/Runtime/Transformational.cpp b/flang/unittests/Runtime/Transformational.cpp index 942a20647ded7..5678ea2515775 100644 --- a/flang/unittests/Runtime/Transformational.cpp +++ b/flang/unittests/Runtime/Transformational.cpp @@ -250,7 +250,6 @@ TEST(Transformational, Shifts) { auto shift2{MakeArray( std::vector{2}, std::vector{1, -1})}; shift2->GetDimension(0).SetLowerBound(-1); // shouldn't matter - shift2->GetDimension(1).SetLowerBound(2); RTNAME(Cshift)(result, *array, *shift2, 2, __FILE__, __LINE__); EXPECT_EQ(result.type(), array->type()); EXPECT_EQ(result.rank(), 2); From 7a05c0931f1a0ade9abc447c872a3cc7c8a37dd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 4 Jan 2024 16:03:21 +0200 Subject: [PATCH 288/313] [LLD] [COFF] Fix option name references in Config.h. NFC. These options have been named /lldltocache: and /lldltocachepolicy: since they were added in 052e855e2bea78dcfbb2807acee829b56d56a729; the comment was wrong from the original commit. 
--- lld/COFF/Config.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lld/COFF/Config.h b/lld/COFF/Config.h index 48c48cefe91d8..018f03b211e42 100644 --- a/lld/COFF/Config.h +++ b/lld/COFF/Config.h @@ -186,9 +186,9 @@ struct Configuration { // Used for /opt:lldltopartitions=N unsigned ltoPartitions = 1; - // Used for /opt:lldltocache=path + // Used for /lldltocache=path StringRef ltoCache; - // Used for /opt:lldltocachepolicy=policy + // Used for /lldltocachepolicy=policy llvm::CachePruningPolicy ltoCachePolicy; // Used for /opt:[no]ltodebugpassmanager From cb7fe9ad4c3103d90c20d55819a9e69ab66ab3d0 Mon Sep 17 00:00:00 2001 From: Valery Pykhtin Date: Thu, 4 Jan 2024 22:58:32 +0100 Subject: [PATCH 289/313] [ASAN][AMDGPU] Make address sanitizer checks more efficient for the divergent target. (#72247) Address sanitizer checks for AMDGPU target in non-recovery mode aren't quite efficient at the moment which can be illustrated with a program: ``` instr_before; load ptr1; instr_in_the_middle; load ptr2; instr_after; ``` ASAN generates the following instrumentation: ``` instr_before; if (sanity_check_passed(ptr1)) load ptr1; instr_in_the_middle; if (sanity_check_passed(ptr2)) load ptr2; instr_after; else // ASAN report block 2 __asan_report(ptr2); // wave terminates unreachable; else // ASAN report block 1 __asan_report(ptr1); // wave terminates unreachable; ``` Each sanitizer check is treated as a non-uniform condition (and this is true because some lanes may pass the check and some don't). This results in the program above: basically normal program flow is continued in _then_ blocks. This way it allows lanes that pass all sanity checks to complete the program and then the wave terminates at the first reporting _else_ block. For each _else_ block it has to keep execmask and pointer value to report error consuming tons (megatons!) of registers which are live till the program end. This patch changes the behavior on a failing sanity check: instead of waiting when passing lanes reach program end report error and terminate as soon as any lane has violated the sanity check. Sanity check condition is treated uniform with this approach and the resulting program looks much like ordinary CPU code: ``` instr_before; if (any_lane_violated(sanity_check_passed(ptr1))) // ASAN report block 1 __asan_report(ptr1); // abort the program unreachable; load ptr1; instr_in_the_middle; if (any_lane_violated(sanity_check_passed(ptr2))) // ASAN report block 2 __asan_report(ptr2); // abort the program unreachable; load ptr2; instr_after; ``` However it has to use a trick to pass structurizer and some later passes: ASAN check is generated like in recovery mode but reporting function aborts, that is standard _unreachable_ instruction isn't used: ``` ... if (any_lane_violated(sanity_check_passed(ptr1))) // ASAN report block 1 __asan_report(ptr1); // abort the program // pretend we're going to continue the program load ptr1; ... ``` This may create some undesirable effects: 1. Register allocator generates a lot of code for save/restore registers for asan_report call. This may potentially bloat the code since we have a report block for every accessed pointer. 2. Loop invariant code in report blocks is hoisted into a loop preheader. I'm not sure but probably this can be solved using block frequency information, but most likely this isn't a problem at all. These problems are to be addressed later. 
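
For reference, after this change a single instrumented 4-byte store looks roughly like the device-side C++ pseudocode below. This is only a sketch, not code from the patch: the pass emits the equivalent IR using llvm.amdgcn.ballot.i64 and llvm.amdgcn.unreachable, and kShadowOffset, kAccessSize and the __builtin_amdgcn_ballot_w64 spelling are stand-ins.

```
#include <cstdint>

extern "C" void __asan_report_store4(uint64_t); // ASan runtime, aborts
extern const uint64_t kShadowOffset;            // shadow base (assumed name)
constexpr uint64_t kAccessSize = 4;

void instrumented_store(int *p, int v) {
  uint64_t addr = (uint64_t)p;
  int8_t shadow = *(int8_t *)((addr >> 3) + kShadowOffset);
  // Flattened check (see the next section): both parts form one condition.
  bool lane_bad = shadow != 0 &&
                  (int8_t)((addr & 7) + kAccessSize - 1) >= shadow;
  // any_lane_violated(...): a wave-wide ballot makes the branch uniform.
  if (__builtin_amdgcn_ballot_w64(lane_bad) != 0) {
    if (lane_bad)
      __asan_report_store4(addr); // aborts; followed by amdgcn.unreachable
  }
  *p = v; // original access; control flow "pretends" to continue
}
```
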
### Flattening address sanitizer check In order to simplify divergent CFG this patch also changes the instrumentation code from: ``` uint64_t address = ptr; sbyte *shadow_address = MemToShadow(address); sbyte shadow_value = *shadow_address; if (shadow_value) { sbyte last_accessed_byte = (address & 7) + kAccessSize - 1; if (last_accessed_byte >= shadow_value) { ReportError(address, kAccessSize, kIsWrite); abort(); } } ``` to ``` uint64_t address = ptr; sbyte *shadow_address = MemToShadow(address); sbyte shadow_value = *shadow_address; sbyte last_accessed_byte = (address & 7) + kAccessSize - 1; if (shadow_value && last_accessed_byte >= shadow_value) { ReportError(address, kAccessSize, kIsWrite); abort(); } ``` It saves one _if_ which really avoids very few instructions and their latency can be hidden by the load from shadow memory. --- .../Instrumentation/AddressSanitizer.cpp | 38 +++- .../asan_instrument_constant_address_space.ll | 74 ++++---- .../asan_instrument_generic_address_space.ll | 172 ++++++++++-------- .../asan_instrument_global_address_space.ll | 148 ++++++++------- 4 files changed, 259 insertions(+), 173 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index afb0e6cd1548b..d4f5bf8c39356 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -174,6 +174,8 @@ const char kAsanAllocasUnpoison[] = "__asan_allocas_unpoison"; const char kAMDGPUAddressSharedName[] = "llvm.amdgcn.is.shared"; const char kAMDGPUAddressPrivateName[] = "llvm.amdgcn.is.private"; +const char kAMDGPUBallotName[] = "llvm.amdgcn.ballot.i64"; +const char kAMDGPUUnreachableName[] = "llvm.amdgcn.unreachable"; // Accesses sizes are powers of two: 1, 2, 4, 8, 16. 
static const size_t kNumberOfAccessSizes = 5; @@ -699,6 +701,8 @@ struct AddressSanitizer { Instruction *InsertBefore, Value *Addr, uint32_t TypeStoreSize, bool IsWrite, Value *SizeArgument); + Instruction *genAMDGPUReportBlock(IRBuilder<> &IRB, Value *Cond, + bool Recover); void instrumentUnusualSizeOrAlignment(Instruction *I, Instruction *InsertBefore, Value *Addr, TypeSize TypeStoreSize, bool IsWrite, @@ -1721,6 +1725,30 @@ Instruction *AddressSanitizer::instrumentAMDGPUAddress( return InsertBefore; } +Instruction *AddressSanitizer::genAMDGPUReportBlock(IRBuilder<> &IRB, + Value *Cond, bool Recover) { + Module &M = *IRB.GetInsertBlock()->getModule(); + Value *ReportCond = Cond; + if (!Recover) { + auto Ballot = M.getOrInsertFunction(kAMDGPUBallotName, IRB.getInt64Ty(), + IRB.getInt1Ty()); + ReportCond = IRB.CreateIsNotNull(IRB.CreateCall(Ballot, {Cond})); + } + + auto *Trm = + SplitBlockAndInsertIfThen(ReportCond, &*IRB.GetInsertPoint(), false, + MDBuilder(*C).createBranchWeights(1, 100000)); + Trm->getParent()->setName("asan.report"); + + if (Recover) + return Trm; + + Trm = SplitBlockAndInsertIfThen(Cond, Trm, false); + IRB.SetInsertPoint(Trm); + return IRB.CreateCall( + M.getOrInsertFunction(kAMDGPUUnreachableName, IRB.getVoidTy()), {}); +} + void AddressSanitizer::instrumentAddress(Instruction *OrigIns, Instruction *InsertBefore, Value *Addr, MaybeAlign Alignment, @@ -1772,7 +1800,15 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, size_t Granularity = 1ULL << Mapping.Scale; Instruction *CrashTerm = nullptr; - if (ClAlwaysSlowPath || (TypeStoreSize < 8 * Granularity)) { + bool GenSlowPath = (ClAlwaysSlowPath || (TypeStoreSize < 8 * Granularity)); + + if (TargetTriple.isAMDGCN()) { + if (GenSlowPath) { + auto *Cmp2 = createSlowPathCmp(IRB, AddrLong, ShadowValue, TypeStoreSize); + Cmp = IRB.CreateAnd(Cmp, Cmp2); + } + CrashTerm = genAMDGPUReportBlock(IRB, Cmp, Recover); + } else if (GenSlowPath) { // We use branch weights for the slow path check, to indicate that the slow // path is rarely taken. This seems to be the case for SPEC benchmarks. 
Instruction *CheckTerm = SplitBlockAndInsertIfThen( diff --git a/llvm/test/Instrumentation/AddressSanitizer/AMDGPU/asan_instrument_constant_address_space.ll b/llvm/test/Instrumentation/AddressSanitizer/AMDGPU/asan_instrument_constant_address_space.ll index 4ce337e2b68ef..cdead5b26c7e5 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/AMDGPU/asan_instrument_constant_address_space.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/AMDGPU/asan_instrument_constant_address_space.ll @@ -18,17 +18,23 @@ define protected amdgpu_kernel void @constant_load(i64 %i) sanitize_address { ; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP12:%.*]], !prof [[PROF2:![0-9]+]] -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = and i64 [[TMP0]], 7 -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 3 -; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i8 -; CHECK-NEXT: [[TMP10:%.*]] = icmp sge i8 [[TMP9]], [[TMP4]] -; CHECK-NEXT: br i1 [[TMP10]], label [[TMP11:%.*]], label [[TMP12]] -; CHECK: 11: -; CHECK-NEXT: call void @__asan_report_load4(i64 [[TMP0]]) #[[ATTR3:[0-9]+]] -; CHECK-NEXT: unreachable -; CHECK: 12: +; CHECK-NEXT: [[TMP6:%.*]] = and i64 [[TMP0]], 7 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i8 +; CHECK-NEXT: [[TMP9:%.*]] = icmp sge i8 [[TMP8]], [[TMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = and i1 [[TMP5]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP10]]) +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0 +; CHECK-NEXT: br i1 [[TMP12]], label [[ASAN_REPORT:%.*]], label [[TMP15:%.*]], !prof [[PROF2:![0-9]+]] +; CHECK: asan.report: +; CHECK-NEXT: br i1 [[TMP10]], label [[TMP13:%.*]], label [[TMP14:%.*]] +; CHECK: 13: +; CHECK-NEXT: call void @__asan_report_load4(i64 [[TMP0]]) #[[ATTR5:[0-9]+]] +; CHECK-NEXT: call void @llvm.amdgcn.unreachable() +; CHECK-NEXT: br label [[TMP14]] +; CHECK: 14: +; CHECK-NEXT: br label [[TMP15]] +; CHECK: 15: ; CHECK-NEXT: [[Q:%.*]] = load i32, ptr addrspace(4) [[A]], align 4 ; CHECK-NEXT: ret void ; @@ -42,19 +48,16 @@ define protected amdgpu_kernel void @constant_load(i64 %i) sanitize_address { ; RECOV-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr ; RECOV-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1 ; RECOV-NEXT: [[TMP5:%.*]] = icmp ne i8 [[TMP4]], 0 -; RECOV-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP13:%.*]], !prof [[PROF2:![0-9]+]] -; RECOV: 6: -; RECOV-NEXT: [[TMP7:%.*]] = and i64 [[TMP0]], 7 -; RECOV-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 3 -; RECOV-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i8 -; RECOV-NEXT: [[TMP10:%.*]] = icmp sge i8 [[TMP9]], [[TMP4]] -; RECOV-NEXT: br i1 [[TMP10]], label [[TMP11:%.*]], label [[TMP12:%.*]] -; RECOV: 11: +; RECOV-NEXT: [[TMP6:%.*]] = and i64 [[TMP0]], 7 +; RECOV-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 3 +; RECOV-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i8 +; RECOV-NEXT: [[TMP9:%.*]] = icmp sge i8 [[TMP8]], [[TMP4]] +; RECOV-NEXT: [[TMP10:%.*]] = and i1 [[TMP5]], [[TMP9]] +; RECOV-NEXT: br i1 [[TMP10]], label [[ASAN_REPORT:%.*]], label [[TMP11:%.*]], !prof [[PROF2:![0-9]+]] +; RECOV: asan.report: ; RECOV-NEXT: call void @__asan_report_load4_noabort(i64 [[TMP0]]) #[[ATTR3:[0-9]+]] -; RECOV-NEXT: br label [[TMP12]] -; RECOV: 12: -; RECOV-NEXT: br label [[TMP13]] -; RECOV: 13: +; RECOV-NEXT: br label [[TMP11]] +; RECOV: 11: ; RECOV-NEXT: 
[[Q:%.*]] = load i32, ptr addrspace(4) [[A]], align 4 ; RECOV-NEXT: ret void ; @@ -75,11 +78,18 @@ define protected amdgpu_kernel void @constant_load_8(i64 %i) sanitize_address { ; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]] -; CHECK: 6: -; CHECK-NEXT: call void @__asan_report_load8(i64 [[TMP0]]) #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 7: +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: br i1 [[TMP7]], label [[ASAN_REPORT:%.*]], label [[TMP10:%.*]], !prof [[PROF2]] +; CHECK: asan.report: +; CHECK-NEXT: br i1 [[TMP5]], label [[TMP8:%.*]], label [[TMP9:%.*]] +; CHECK: 8: +; CHECK-NEXT: call void @__asan_report_load8(i64 [[TMP0]]) #[[ATTR5]] +; CHECK-NEXT: call void @llvm.amdgcn.unreachable() +; CHECK-NEXT: br label [[TMP9]] +; CHECK: 9: +; CHECK-NEXT: br label [[TMP10]] +; CHECK: 10: ; CHECK-NEXT: [[Q:%.*]] = load i64, ptr addrspace(4) [[A]], align 8 ; CHECK-NEXT: ret void ; @@ -93,11 +103,11 @@ define protected amdgpu_kernel void @constant_load_8(i64 %i) sanitize_address { ; RECOV-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr ; RECOV-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1 ; RECOV-NEXT: [[TMP5:%.*]] = icmp ne i8 [[TMP4]], 0 -; RECOV-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]] -; RECOV: 6: +; RECOV-NEXT: br i1 [[TMP5]], label [[ASAN_REPORT:%.*]], label [[TMP6:%.*]], !prof [[PROF2]] +; RECOV: asan.report: ; RECOV-NEXT: call void @__asan_report_load8_noabort(i64 [[TMP0]]) #[[ATTR3]] -; RECOV-NEXT: br label [[TMP7]] -; RECOV: 7: +; RECOV-NEXT: br label [[TMP6]] +; RECOV: 6: ; RECOV-NEXT: [[Q:%.*]] = load i64, ptr addrspace(4) [[A]], align 8 ; RECOV-NEXT: ret void ; diff --git a/llvm/test/Instrumentation/AddressSanitizer/AMDGPU/asan_instrument_generic_address_space.ll b/llvm/test/Instrumentation/AddressSanitizer/AMDGPU/asan_instrument_generic_address_space.ll index f86a5722c0006..cb37ba24f1c74 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/AMDGPU/asan_instrument_generic_address_space.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/AMDGPU/asan_instrument_generic_address_space.ll @@ -13,7 +13,7 @@ define protected amdgpu_kernel void @generic_store(ptr addrspace(1) %p, i32 %i) ; CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[Q]]) ; CHECK-NEXT: [[TMP2:%.*]] = or i1 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = xor i1 [[TMP2]], true -; CHECK-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP18:%.*]] +; CHECK-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP21:%.*]] ; CHECK: 4: ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[Q]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 3 @@ -21,19 +21,25 @@ define protected amdgpu_kernel void @generic_store(ptr addrspace(1) %p, i32 %i) ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 1 ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP9]], 0 -; CHECK-NEXT: br i1 [[TMP10]], label [[TMP11:%.*]], label [[TMP17:%.*]], !prof [[PROF0:![0-9]+]] -; CHECK: 11: -; CHECK-NEXT: [[TMP12:%.*]] = and i64 [[TMP5]], 7 -; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 3 -; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i8 -; CHECK-NEXT: [[TMP15:%.*]] = icmp sge i8 [[TMP14]], [[TMP9]] -; CHECK-NEXT: br i1 [[TMP15]], label [[TMP16:%.*]], label 
[[TMP17]] -; CHECK: 16: -; CHECK-NEXT: call void @__asan_report_store4(i64 [[TMP5]]) #[[ATTR3:[0-9]+]] -; CHECK-NEXT: unreachable -; CHECK: 17: -; CHECK-NEXT: br label [[TMP18]] +; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP5]], 7 +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 +; CHECK-NEXT: [[TMP14:%.*]] = icmp sge i8 [[TMP13]], [[TMP9]] +; CHECK-NEXT: [[TMP15:%.*]] = and i1 [[TMP10]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP15]]) +; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP16]], 0 +; CHECK-NEXT: br i1 [[TMP17]], label [[ASAN_REPORT:%.*]], label [[TMP20:%.*]], !prof [[PROF0:![0-9]+]] +; CHECK: asan.report: +; CHECK-NEXT: br i1 [[TMP15]], label [[TMP18:%.*]], label [[TMP19:%.*]] ; CHECK: 18: +; CHECK-NEXT: call void @__asan_report_store4(i64 [[TMP5]]) #[[ATTR5:[0-9]+]] +; CHECK-NEXT: call void @llvm.amdgcn.unreachable() +; CHECK-NEXT: br label [[TMP19]] +; CHECK: 19: +; CHECK-NEXT: br label [[TMP20]] +; CHECK: 20: +; CHECK-NEXT: br label [[TMP21]] +; CHECK: 21: ; CHECK-NEXT: store i32 0, ptr [[Q]], align 4 ; CHECK-NEXT: ret void ; @@ -45,7 +51,7 @@ define protected amdgpu_kernel void @generic_store(ptr addrspace(1) %p, i32 %i) ; RECOV-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[Q]]) ; RECOV-NEXT: [[TMP2:%.*]] = or i1 [[TMP0]], [[TMP1]] ; RECOV-NEXT: [[TMP3:%.*]] = xor i1 [[TMP2]], true -; RECOV-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP19:%.*]] +; RECOV-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP17:%.*]] ; RECOV: 4: ; RECOV-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[Q]] to i64 ; RECOV-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 3 @@ -53,21 +59,18 @@ define protected amdgpu_kernel void @generic_store(ptr addrspace(1) %p, i32 %i) ; RECOV-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; RECOV-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 1 ; RECOV-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP9]], 0 -; RECOV-NEXT: br i1 [[TMP10]], label [[TMP11:%.*]], label [[TMP18:%.*]], !prof [[PROF0:![0-9]+]] -; RECOV: 11: -; RECOV-NEXT: [[TMP12:%.*]] = and i64 [[TMP5]], 7 -; RECOV-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 3 -; RECOV-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i8 -; RECOV-NEXT: [[TMP15:%.*]] = icmp sge i8 [[TMP14]], [[TMP9]] -; RECOV-NEXT: br i1 [[TMP15]], label [[TMP16:%.*]], label [[TMP17:%.*]] -; RECOV: 16: +; RECOV-NEXT: [[TMP11:%.*]] = and i64 [[TMP5]], 7 +; RECOV-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 3 +; RECOV-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 +; RECOV-NEXT: [[TMP14:%.*]] = icmp sge i8 [[TMP13]], [[TMP9]] +; RECOV-NEXT: [[TMP15:%.*]] = and i1 [[TMP10]], [[TMP14]] +; RECOV-NEXT: br i1 [[TMP15]], label [[ASAN_REPORT:%.*]], label [[TMP16:%.*]], !prof [[PROF0:![0-9]+]] +; RECOV: asan.report: ; RECOV-NEXT: call void @__asan_report_store4_noabort(i64 [[TMP5]]) #[[ATTR3:[0-9]+]] +; RECOV-NEXT: br label [[TMP16]] +; RECOV: 16: ; RECOV-NEXT: br label [[TMP17]] ; RECOV: 17: -; RECOV-NEXT: br label [[TMP18]] -; RECOV: 18: -; RECOV-NEXT: br label [[TMP19]] -; RECOV: 19: ; RECOV-NEXT: store i32 0, ptr [[Q]], align 4 ; RECOV-NEXT: ret void ; @@ -87,7 +90,7 @@ define protected amdgpu_kernel void @generic_load(ptr addrspace(1) %p, i32 %i) s ; CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[Q]]) ; CHECK-NEXT: [[TMP2:%.*]] = or i1 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = xor i1 [[TMP2]], true -; CHECK-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP18:%.*]] +; CHECK-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP21:%.*]] 
; CHECK: 4: ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[Q]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 3 @@ -95,19 +98,25 @@ define protected amdgpu_kernel void @generic_load(ptr addrspace(1) %p, i32 %i) s ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 1 ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP9]], 0 -; CHECK-NEXT: br i1 [[TMP10]], label [[TMP11:%.*]], label [[TMP17:%.*]], !prof [[PROF0]] -; CHECK: 11: -; CHECK-NEXT: [[TMP12:%.*]] = and i64 [[TMP5]], 7 -; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 3 -; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i8 -; CHECK-NEXT: [[TMP15:%.*]] = icmp sge i8 [[TMP14]], [[TMP9]] -; CHECK-NEXT: br i1 [[TMP15]], label [[TMP16:%.*]], label [[TMP17]] -; CHECK: 16: -; CHECK-NEXT: call void @__asan_report_load4(i64 [[TMP5]]) #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 17: -; CHECK-NEXT: br label [[TMP18]] +; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP5]], 7 +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 +; CHECK-NEXT: [[TMP14:%.*]] = icmp sge i8 [[TMP13]], [[TMP9]] +; CHECK-NEXT: [[TMP15:%.*]] = and i1 [[TMP10]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP15]]) +; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP16]], 0 +; CHECK-NEXT: br i1 [[TMP17]], label [[ASAN_REPORT:%.*]], label [[TMP20:%.*]], !prof [[PROF0]] +; CHECK: asan.report: +; CHECK-NEXT: br i1 [[TMP15]], label [[TMP18:%.*]], label [[TMP19:%.*]] ; CHECK: 18: +; CHECK-NEXT: call void @__asan_report_load4(i64 [[TMP5]]) #[[ATTR5]] +; CHECK-NEXT: call void @llvm.amdgcn.unreachable() +; CHECK-NEXT: br label [[TMP19]] +; CHECK: 19: +; CHECK-NEXT: br label [[TMP20]] +; CHECK: 20: +; CHECK-NEXT: br label [[TMP21]] +; CHECK: 21: ; CHECK-NEXT: [[R:%.*]] = load i32, ptr [[Q]], align 4 ; CHECK-NEXT: ret void ; @@ -119,7 +128,7 @@ define protected amdgpu_kernel void @generic_load(ptr addrspace(1) %p, i32 %i) s ; RECOV-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[Q]]) ; RECOV-NEXT: [[TMP2:%.*]] = or i1 [[TMP0]], [[TMP1]] ; RECOV-NEXT: [[TMP3:%.*]] = xor i1 [[TMP2]], true -; RECOV-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP19:%.*]] +; RECOV-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP17:%.*]] ; RECOV: 4: ; RECOV-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[Q]] to i64 ; RECOV-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 3 @@ -127,21 +136,18 @@ define protected amdgpu_kernel void @generic_load(ptr addrspace(1) %p, i32 %i) s ; RECOV-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; RECOV-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 1 ; RECOV-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP9]], 0 -; RECOV-NEXT: br i1 [[TMP10]], label [[TMP11:%.*]], label [[TMP18:%.*]], !prof [[PROF0]] -; RECOV: 11: -; RECOV-NEXT: [[TMP12:%.*]] = and i64 [[TMP5]], 7 -; RECOV-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 3 -; RECOV-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i8 -; RECOV-NEXT: [[TMP15:%.*]] = icmp sge i8 [[TMP14]], [[TMP9]] -; RECOV-NEXT: br i1 [[TMP15]], label [[TMP16:%.*]], label [[TMP17:%.*]] -; RECOV: 16: +; RECOV-NEXT: [[TMP11:%.*]] = and i64 [[TMP5]], 7 +; RECOV-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 3 +; RECOV-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 +; RECOV-NEXT: [[TMP14:%.*]] = icmp sge i8 [[TMP13]], [[TMP9]] +; RECOV-NEXT: [[TMP15:%.*]] = and i1 [[TMP10]], [[TMP14]] +; RECOV-NEXT: br i1 [[TMP15]], label [[ASAN_REPORT:%.*]], label [[TMP16:%.*]], !prof [[PROF0]] +; RECOV: asan.report: ; RECOV-NEXT: 
call void @__asan_report_load4_noabort(i64 [[TMP5]]) #[[ATTR3]] +; RECOV-NEXT: br label [[TMP16]] +; RECOV: 16: ; RECOV-NEXT: br label [[TMP17]] ; RECOV: 17: -; RECOV-NEXT: br label [[TMP18]] -; RECOV: 18: -; RECOV-NEXT: br label [[TMP19]] -; RECOV: 19: ; RECOV-NEXT: [[R:%.*]] = load i32, ptr [[Q]], align 4 ; RECOV-NEXT: ret void ; @@ -161,7 +167,7 @@ define protected amdgpu_kernel void @generic_store_8(ptr addrspace(1) %p) saniti ; CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[Q]]) ; CHECK-NEXT: [[TMP2:%.*]] = or i1 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = xor i1 [[TMP2]], true -; CHECK-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP13:%.*]] +; CHECK-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP16:%.*]] ; CHECK: 4: ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[Q]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 3 @@ -169,13 +175,20 @@ define protected amdgpu_kernel void @generic_store_8(ptr addrspace(1) %p) saniti ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 1 ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP9]], 0 -; CHECK-NEXT: br i1 [[TMP10]], label [[TMP11:%.*]], label [[TMP12:%.*]] -; CHECK: 11: -; CHECK-NEXT: call void @__asan_report_store8(i64 [[TMP5]]) #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 12: -; CHECK-NEXT: br label [[TMP13]] +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP10]]) +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0 +; CHECK-NEXT: br i1 [[TMP12]], label [[ASAN_REPORT:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] +; CHECK: asan.report: +; CHECK-NEXT: br i1 [[TMP10]], label [[TMP13:%.*]], label [[TMP14:%.*]] ; CHECK: 13: +; CHECK-NEXT: call void @__asan_report_store8(i64 [[TMP5]]) #[[ATTR5]] +; CHECK-NEXT: call void @llvm.amdgcn.unreachable() +; CHECK-NEXT: br label [[TMP14]] +; CHECK: 14: +; CHECK-NEXT: br label [[TMP15]] +; CHECK: 15: +; CHECK-NEXT: br label [[TMP16]] +; CHECK: 16: ; CHECK-NEXT: store i64 0, ptr [[Q]], align 8 ; CHECK-NEXT: ret void ; @@ -187,7 +200,7 @@ define protected amdgpu_kernel void @generic_store_8(ptr addrspace(1) %p) saniti ; RECOV-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[Q]]) ; RECOV-NEXT: [[TMP2:%.*]] = or i1 [[TMP0]], [[TMP1]] ; RECOV-NEXT: [[TMP3:%.*]] = xor i1 [[TMP2]], true -; RECOV-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP13:%.*]] +; RECOV-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP12:%.*]] ; RECOV: 4: ; RECOV-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[Q]] to i64 ; RECOV-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 3 @@ -195,13 +208,13 @@ define protected amdgpu_kernel void @generic_store_8(ptr addrspace(1) %p) saniti ; RECOV-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; RECOV-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 1 ; RECOV-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP9]], 0 -; RECOV-NEXT: br i1 [[TMP10]], label [[TMP11:%.*]], label [[TMP12:%.*]] -; RECOV: 11: +; RECOV-NEXT: br i1 [[TMP10]], label [[ASAN_REPORT:%.*]], label [[TMP11:%.*]], !prof [[PROF0]] +; RECOV: asan.report: ; RECOV-NEXT: call void @__asan_report_store8_noabort(i64 [[TMP5]]) #[[ATTR3]] +; RECOV-NEXT: br label [[TMP11]] +; RECOV: 11: ; RECOV-NEXT: br label [[TMP12]] ; RECOV: 12: -; RECOV-NEXT: br label [[TMP13]] -; RECOV: 13: ; RECOV-NEXT: store i64 0, ptr [[Q]], align 8 ; RECOV-NEXT: ret void ; @@ -220,7 +233,7 @@ define protected amdgpu_kernel void @generic_load_8(ptr addrspace(1) %p) sanitiz ; CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[Q]]) ; CHECK-NEXT: 
[[TMP2:%.*]] = or i1 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = xor i1 [[TMP2]], true -; CHECK-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP13:%.*]] +; CHECK-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP16:%.*]] ; CHECK: 4: ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[Q]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 3 @@ -228,13 +241,20 @@ define protected amdgpu_kernel void @generic_load_8(ptr addrspace(1) %p) sanitiz ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 1 ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP9]], 0 -; CHECK-NEXT: br i1 [[TMP10]], label [[TMP11:%.*]], label [[TMP12:%.*]] -; CHECK: 11: -; CHECK-NEXT: call void @__asan_report_load8(i64 [[TMP5]]) #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 12: -; CHECK-NEXT: br label [[TMP13]] +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP10]]) +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0 +; CHECK-NEXT: br i1 [[TMP12]], label [[ASAN_REPORT:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] +; CHECK: asan.report: +; CHECK-NEXT: br i1 [[TMP10]], label [[TMP13:%.*]], label [[TMP14:%.*]] ; CHECK: 13: +; CHECK-NEXT: call void @__asan_report_load8(i64 [[TMP5]]) #[[ATTR5]] +; CHECK-NEXT: call void @llvm.amdgcn.unreachable() +; CHECK-NEXT: br label [[TMP14]] +; CHECK: 14: +; CHECK-NEXT: br label [[TMP15]] +; CHECK: 15: +; CHECK-NEXT: br label [[TMP16]] +; CHECK: 16: ; CHECK-NEXT: [[R:%.*]] = load i64, ptr [[Q]], align 8 ; CHECK-NEXT: ret void ; @@ -246,7 +266,7 @@ define protected amdgpu_kernel void @generic_load_8(ptr addrspace(1) %p) sanitiz ; RECOV-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[Q]]) ; RECOV-NEXT: [[TMP2:%.*]] = or i1 [[TMP0]], [[TMP1]] ; RECOV-NEXT: [[TMP3:%.*]] = xor i1 [[TMP2]], true -; RECOV-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP13:%.*]] +; RECOV-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP12:%.*]] ; RECOV: 4: ; RECOV-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[Q]] to i64 ; RECOV-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 3 @@ -254,13 +274,13 @@ define protected amdgpu_kernel void @generic_load_8(ptr addrspace(1) %p) sanitiz ; RECOV-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; RECOV-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 1 ; RECOV-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP9]], 0 -; RECOV-NEXT: br i1 [[TMP10]], label [[TMP11:%.*]], label [[TMP12:%.*]] -; RECOV: 11: +; RECOV-NEXT: br i1 [[TMP10]], label [[ASAN_REPORT:%.*]], label [[TMP11:%.*]], !prof [[PROF0]] +; RECOV: asan.report: ; RECOV-NEXT: call void @__asan_report_load8_noabort(i64 [[TMP5]]) #[[ATTR3]] +; RECOV-NEXT: br label [[TMP11]] +; RECOV: 11: ; RECOV-NEXT: br label [[TMP12]] ; RECOV: 12: -; RECOV-NEXT: br label [[TMP13]] -; RECOV: 13: ; RECOV-NEXT: [[R:%.*]] = load i64, ptr [[Q]], align 8 ; RECOV-NEXT: ret void ; diff --git a/llvm/test/Instrumentation/AddressSanitizer/AMDGPU/asan_instrument_global_address_space.ll b/llvm/test/Instrumentation/AddressSanitizer/AMDGPU/asan_instrument_global_address_space.ll index 6b39ff6ff8462..a954b173eb2aa 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/AMDGPU/asan_instrument_global_address_space.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/AMDGPU/asan_instrument_global_address_space.ll @@ -14,17 +14,23 @@ define protected amdgpu_kernel void @global_store(ptr addrspace(1) %p, i32 %i) s ; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i8 
[[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP12:%.*]], !prof [[PROF0:![0-9]+]] -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = and i64 [[TMP0]], 7 -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 3 -; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i8 -; CHECK-NEXT: [[TMP10:%.*]] = icmp sge i8 [[TMP9]], [[TMP4]] -; CHECK-NEXT: br i1 [[TMP10]], label [[TMP11:%.*]], label [[TMP12]] -; CHECK: 11: -; CHECK-NEXT: call void @__asan_report_store4(i64 [[TMP0]]) #[[ATTR3:[0-9]+]] -; CHECK-NEXT: unreachable -; CHECK: 12: +; CHECK-NEXT: [[TMP6:%.*]] = and i64 [[TMP0]], 7 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i8 +; CHECK-NEXT: [[TMP9:%.*]] = icmp sge i8 [[TMP8]], [[TMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = and i1 [[TMP5]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP10]]) +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0 +; CHECK-NEXT: br i1 [[TMP12]], label [[ASAN_REPORT:%.*]], label [[TMP15:%.*]], !prof [[PROF0:![0-9]+]] +; CHECK: asan.report: +; CHECK-NEXT: br i1 [[TMP10]], label [[TMP13:%.*]], label [[TMP14:%.*]] +; CHECK: 13: +; CHECK-NEXT: call void @__asan_report_store4(i64 [[TMP0]]) #[[ATTR5:[0-9]+]] +; CHECK-NEXT: call void @llvm.amdgcn.unreachable() +; CHECK-NEXT: br label [[TMP14]] +; CHECK: 14: +; CHECK-NEXT: br label [[TMP15]] +; CHECK: 15: ; CHECK-NEXT: store i32 0, ptr addrspace(1) [[P]], align 4 ; CHECK-NEXT: ret void ; @@ -37,19 +43,16 @@ define protected amdgpu_kernel void @global_store(ptr addrspace(1) %p, i32 %i) s ; RECOV-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr ; RECOV-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1 ; RECOV-NEXT: [[TMP5:%.*]] = icmp ne i8 [[TMP4]], 0 -; RECOV-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP13:%.*]], !prof [[PROF0:![0-9]+]] -; RECOV: 6: -; RECOV-NEXT: [[TMP7:%.*]] = and i64 [[TMP0]], 7 -; RECOV-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 3 -; RECOV-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i8 -; RECOV-NEXT: [[TMP10:%.*]] = icmp sge i8 [[TMP9]], [[TMP4]] -; RECOV-NEXT: br i1 [[TMP10]], label [[TMP11:%.*]], label [[TMP12:%.*]] -; RECOV: 11: +; RECOV-NEXT: [[TMP6:%.*]] = and i64 [[TMP0]], 7 +; RECOV-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 3 +; RECOV-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i8 +; RECOV-NEXT: [[TMP9:%.*]] = icmp sge i8 [[TMP8]], [[TMP4]] +; RECOV-NEXT: [[TMP10:%.*]] = and i1 [[TMP5]], [[TMP9]] +; RECOV-NEXT: br i1 [[TMP10]], label [[ASAN_REPORT:%.*]], label [[TMP11:%.*]], !prof [[PROF0:![0-9]+]] +; RECOV: asan.report: ; RECOV-NEXT: call void @__asan_report_store4_noabort(i64 [[TMP0]]) #[[ATTR3:[0-9]+]] -; RECOV-NEXT: br label [[TMP12]] -; RECOV: 12: -; RECOV-NEXT: br label [[TMP13]] -; RECOV: 13: +; RECOV-NEXT: br label [[TMP11]] +; RECOV: 11: ; RECOV-NEXT: store i32 0, ptr addrspace(1) [[P]], align 4 ; RECOV-NEXT: ret void ; @@ -69,17 +72,23 @@ define protected amdgpu_kernel void @global_load(ptr addrspace(1) %p, i32 %i) sa ; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP12:%.*]], !prof [[PROF0]] -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = and i64 [[TMP0]], 7 -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 3 -; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i8 -; CHECK-NEXT: [[TMP10:%.*]] = icmp sge i8 [[TMP9]], [[TMP4]] -; CHECK-NEXT: br i1 [[TMP10]], label [[TMP11:%.*]], label [[TMP12]] -; CHECK: 11: -; 
CHECK-NEXT: call void @__asan_report_load4(i64 [[TMP0]]) #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 12: +; CHECK-NEXT: [[TMP6:%.*]] = and i64 [[TMP0]], 7 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i8 +; CHECK-NEXT: [[TMP9:%.*]] = icmp sge i8 [[TMP8]], [[TMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = and i1 [[TMP5]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP10]]) +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0 +; CHECK-NEXT: br i1 [[TMP12]], label [[ASAN_REPORT:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] +; CHECK: asan.report: +; CHECK-NEXT: br i1 [[TMP10]], label [[TMP13:%.*]], label [[TMP14:%.*]] +; CHECK: 13: +; CHECK-NEXT: call void @__asan_report_load4(i64 [[TMP0]]) #[[ATTR5]] +; CHECK-NEXT: call void @llvm.amdgcn.unreachable() +; CHECK-NEXT: br label [[TMP14]] +; CHECK: 14: +; CHECK-NEXT: br label [[TMP15]] +; CHECK: 15: ; CHECK-NEXT: [[Q:%.*]] = load i32, ptr addrspace(1) [[P]], align 4 ; CHECK-NEXT: ret void ; @@ -92,19 +101,16 @@ define protected amdgpu_kernel void @global_load(ptr addrspace(1) %p, i32 %i) sa ; RECOV-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr ; RECOV-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1 ; RECOV-NEXT: [[TMP5:%.*]] = icmp ne i8 [[TMP4]], 0 -; RECOV-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP13:%.*]], !prof [[PROF0]] -; RECOV: 6: -; RECOV-NEXT: [[TMP7:%.*]] = and i64 [[TMP0]], 7 -; RECOV-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 3 -; RECOV-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i8 -; RECOV-NEXT: [[TMP10:%.*]] = icmp sge i8 [[TMP9]], [[TMP4]] -; RECOV-NEXT: br i1 [[TMP10]], label [[TMP11:%.*]], label [[TMP12:%.*]] -; RECOV: 11: +; RECOV-NEXT: [[TMP6:%.*]] = and i64 [[TMP0]], 7 +; RECOV-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 3 +; RECOV-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i8 +; RECOV-NEXT: [[TMP9:%.*]] = icmp sge i8 [[TMP8]], [[TMP4]] +; RECOV-NEXT: [[TMP10:%.*]] = and i1 [[TMP5]], [[TMP9]] +; RECOV-NEXT: br i1 [[TMP10]], label [[ASAN_REPORT:%.*]], label [[TMP11:%.*]], !prof [[PROF0]] +; RECOV: asan.report: ; RECOV-NEXT: call void @__asan_report_load4_noabort(i64 [[TMP0]]) #[[ATTR3]] -; RECOV-NEXT: br label [[TMP12]] -; RECOV: 12: -; RECOV-NEXT: br label [[TMP13]] -; RECOV: 13: +; RECOV-NEXT: br label [[TMP11]] +; RECOV: 11: ; RECOV-NEXT: [[Q:%.*]] = load i32, ptr addrspace(1) [[P]], align 4 ; RECOV-NEXT: ret void ; @@ -124,11 +130,18 @@ define protected amdgpu_kernel void @global_store_8(ptr addrspace(1) %p) sanitiz ; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]] -; CHECK: 6: -; CHECK-NEXT: call void @__asan_report_store8(i64 [[TMP0]]) #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 7: +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: br i1 [[TMP7]], label [[ASAN_REPORT:%.*]], label [[TMP10:%.*]], !prof [[PROF0]] +; CHECK: asan.report: +; CHECK-NEXT: br i1 [[TMP5]], label [[TMP8:%.*]], label [[TMP9:%.*]] +; CHECK: 8: +; CHECK-NEXT: call void @__asan_report_store8(i64 [[TMP0]]) #[[ATTR5]] +; CHECK-NEXT: call void @llvm.amdgcn.unreachable() +; CHECK-NEXT: br label [[TMP9]] +; CHECK: 9: +; CHECK-NEXT: br label [[TMP10]] +; CHECK: 10: ; CHECK-NEXT: store i64 0, ptr addrspace(1) [[P]], align 8 ; CHECK-NEXT: ret void ; @@ -141,11 +154,11 @@ define 
protected amdgpu_kernel void @global_store_8(ptr addrspace(1) %p) sanitiz ; RECOV-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr ; RECOV-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1 ; RECOV-NEXT: [[TMP5:%.*]] = icmp ne i8 [[TMP4]], 0 -; RECOV-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]] -; RECOV: 6: +; RECOV-NEXT: br i1 [[TMP5]], label [[ASAN_REPORT:%.*]], label [[TMP6:%.*]], !prof [[PROF0]] +; RECOV: asan.report: ; RECOV-NEXT: call void @__asan_report_store8_noabort(i64 [[TMP0]]) #[[ATTR3]] -; RECOV-NEXT: br label [[TMP7]] -; RECOV: 7: +; RECOV-NEXT: br label [[TMP6]] +; RECOV: 6: ; RECOV-NEXT: store i64 0, ptr addrspace(1) [[P]], align 8 ; RECOV-NEXT: ret void ; @@ -164,11 +177,18 @@ define protected amdgpu_kernel void @global_load_8(ptr addrspace(1) %p) sanitize ; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]] -; CHECK: 6: -; CHECK-NEXT: call void @__asan_report_load8(i64 [[TMP0]]) #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 7: +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: br i1 [[TMP7]], label [[ASAN_REPORT:%.*]], label [[TMP10:%.*]], !prof [[PROF0]] +; CHECK: asan.report: +; CHECK-NEXT: br i1 [[TMP5]], label [[TMP8:%.*]], label [[TMP9:%.*]] +; CHECK: 8: +; CHECK-NEXT: call void @__asan_report_load8(i64 [[TMP0]]) #[[ATTR5]] +; CHECK-NEXT: call void @llvm.amdgcn.unreachable() +; CHECK-NEXT: br label [[TMP9]] +; CHECK: 9: +; CHECK-NEXT: br label [[TMP10]] +; CHECK: 10: ; CHECK-NEXT: [[Q:%.*]] = load i64, ptr addrspace(1) [[P]], align 8 ; CHECK-NEXT: ret void ; @@ -181,11 +201,11 @@ define protected amdgpu_kernel void @global_load_8(ptr addrspace(1) %p) sanitize ; RECOV-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr ; RECOV-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1 ; RECOV-NEXT: [[TMP5:%.*]] = icmp ne i8 [[TMP4]], 0 -; RECOV-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]] -; RECOV: 6: +; RECOV-NEXT: br i1 [[TMP5]], label [[ASAN_REPORT:%.*]], label [[TMP6:%.*]], !prof [[PROF0]] +; RECOV: asan.report: ; RECOV-NEXT: call void @__asan_report_load8_noabort(i64 [[TMP0]]) #[[ATTR3]] -; RECOV-NEXT: br label [[TMP7]] -; RECOV: 7: +; RECOV-NEXT: br label [[TMP6]] +; RECOV: 6: ; RECOV-NEXT: [[Q:%.*]] = load i64, ptr addrspace(1) [[P]], align 8 ; RECOV-NEXT: ret void ; From 4241e847072cb436bde0abc74f2ef446a01f29aa Mon Sep 17 00:00:00 2001 From: Aart Bik <39774503+aartbik@users.noreply.github.com> Date: Thu, 4 Jan 2024 14:09:31 -0800 Subject: [PATCH 290/313] [mlir][sparse] minor comment edits in sparsifier pipeline (#77000) --- .../SparseTensor/Pipelines/SparseTensorPipelines.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp index ec74c1c2d54af..66362d5be4f19 100644 --- a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp +++ b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp @@ -32,7 +32,6 @@ void mlir::sparse_tensor::buildSparsifier(OpPassManager &pm, const SparsifierOptions &options) { // Rewrite named linalg ops into generic ops. - pm.addNestedPass(createLinalgGeneralizationPass()); // Sparsification and bufferization mini-pipeline. 
@@ -65,6 +64,8 @@ void mlir::sparse_tensor::buildSparsifier(OpPassManager &pm, pm.addNestedPass(createConvertGpuOpsToNVVMOps()); } + // Progressively lower to LLVM. Note that the convert-vector-to-llvm + // pass is repeated on purpose. // TODO(springerm): Add sparse support to the BufferDeallocation pass and add // it to this pipeline. pm.addNestedPass(createConvertLinalgToLoopsPass()); @@ -80,10 +81,7 @@ void mlir::sparse_tensor::buildSparsifier(OpPassManager &pm, pm.addNestedPass(createConvertMathToLLVMPass()); pm.addPass(createConvertMathToLibmPass()); pm.addPass(createConvertComplexToLibmPass()); - - // Repeat convert-vector-to-llvm. pm.addPass(createConvertVectorToLLVMPass(options.lowerVectorToLLVMOptions())); - pm.addPass(createConvertComplexToLLVMPass()); pm.addPass(createConvertVectorToLLVMPass(options.lowerVectorToLLVMOptions())); pm.addPass(createConvertFuncToLLVMPass()); @@ -101,6 +99,7 @@ void mlir::sparse_tensor::buildSparsifier(OpPassManager &pm, pm.addPass(createGpuModuleToBinaryPass(gpuModuleToBinaryPassOptions)); } + // Ensure all casts are realized. pm.addPass(createReconcileUnrealizedCastsPass()); } From 0414cf09f02239254c449b7c5a37428fecb943f7 Mon Sep 17 00:00:00 2001 From: ZijunZhaoCCK <88353225+ZijunZhaoCCK@users.noreply.github.com> Date: Thu, 4 Jan 2024 14:25:13 -0800 Subject: [PATCH 291/313] Move nondiscard tests of ranges::contains() to the right place. (#76887) nondiscard tests of ranges::contains() should be in ranges.nodiscard_extensions files rather than nodiscard_extensions files --- .../diagnostics/nodiscard_extensions.compile.pass.cpp | 3 --- .../test/libcxx/diagnostics/nodiscard_extensions.verify.cpp | 5 ----- .../ranges.nodiscard_extensions.compile.pass.cpp | 6 ++++++ .../diagnostics/ranges.nodiscard_extensions.verify.cpp | 4 ++++ 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/libcxx/test/libcxx/diagnostics/nodiscard_extensions.compile.pass.cpp b/libcxx/test/libcxx/diagnostics/nodiscard_extensions.compile.pass.cpp index 641fcd9233bc2..e9fab0c75a98e 100644 --- a/libcxx/test/libcxx/diagnostics/nodiscard_extensions.compile.pass.cpp +++ b/libcxx/test/libcxx/diagnostics/nodiscard_extensions.compile.pass.cpp @@ -45,9 +45,6 @@ void test_algorithms() { #if TEST_STD_VER >= 17 std::clamp(2, 1, 3); std::clamp(2, 3, 1, std::greater()); -#endif -#if TEST_STD_VER >= 23 - std::ranges::contains(arr, arr + 1, 1); #endif std::count_if(std::begin(arr), std::end(arr), P()); std::count(std::begin(arr), std::end(arr), 1); diff --git a/libcxx/test/libcxx/diagnostics/nodiscard_extensions.verify.cpp b/libcxx/test/libcxx/diagnostics/nodiscard_extensions.verify.cpp index 1e3f537f01ed6..d7a26d99e5223 100644 --- a/libcxx/test/libcxx/diagnostics/nodiscard_extensions.verify.cpp +++ b/libcxx/test/libcxx/diagnostics/nodiscard_extensions.verify.cpp @@ -60,11 +60,6 @@ void test_algorithms() { std::clamp(2, 1, 3, std::greater()); #endif -#if TEST_STD_VER >= 23 - // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} - std::ranges::contains(arr, arr + 1, 1); -#endif - // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} std::count_if(std::begin(arr), std::end(arr), P()); diff --git a/libcxx/test/libcxx/diagnostics/ranges.nodiscard_extensions.compile.pass.cpp b/libcxx/test/libcxx/diagnostics/ranges.nodiscard_extensions.compile.pass.cpp index 7e2d64b8c9b61..19e07b83079c7 100644 --- a/libcxx/test/libcxx/diagnostics/ranges.nodiscard_extensions.compile.pass.cpp +++ 
b/libcxx/test/libcxx/diagnostics/ranges.nodiscard_extensions.compile.pass.cpp @@ -15,6 +15,8 @@ #include +#include "test_macros.h" + void test() { int range[1]; int* iter = range; @@ -28,6 +30,10 @@ void test() { std::ranges::binary_search(range, 1); std::ranges::binary_search(iter, iter, 1); std::ranges::clamp(1, 2, 3); +#if TEST_STD_VER >= 23 + std::ranges::contains(range, 1); + std::ranges::contains(iter, iter, 1); +#endif std::ranges::count_if(range, pred); std::ranges::count_if(iter, iter, pred); std::ranges::count(range, 1); diff --git a/libcxx/test/libcxx/diagnostics/ranges.nodiscard_extensions.verify.cpp b/libcxx/test/libcxx/diagnostics/ranges.nodiscard_extensions.verify.cpp index f0a0e4889a760..5e45ad086cbd0 100644 --- a/libcxx/test/libcxx/diagnostics/ranges.nodiscard_extensions.verify.cpp +++ b/libcxx/test/libcxx/diagnostics/ranges.nodiscard_extensions.verify.cpp @@ -91,6 +91,10 @@ void test() { std::ranges::upper_bound(iter, iter, 1); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} #if TEST_STD_VER >= 23 + std::ranges::contains(range, 1); + // expected-warning@-1{{ignoring return value of function declared with 'nodiscard' attribute}} + std::ranges::contains(iter, iter, 1); + // expected-warning@-1{{ignoring return value of function declared with 'nodiscard' attribute}} std::ranges::fold_left(range, 0, std::plus()); // expected-warning@-1{{ignoring return value of function declared with 'nodiscard' attribute}} std::ranges::fold_left(iter, iter, 0, std::plus()); From 241fe83704476f81e3438e32b6d988ea123e624d Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 4 Jan 2024 22:53:18 +0000 Subject: [PATCH 292/313] [VPlan] Introduce ComputeReductionResult VPInstruction opcode. (#70253) This patch introduces a new ComputeReductionResult opcode to compute the final reduction result in the middle block. The code from fixReduction has been moved to ComputeReductionResult, after some earlier cleanup changes to model parts of fixReduction explicitly elsewhere as needed. The recipe may be broken down further in the future. Note that the phi nodes to merge the reduction result from the trip count check and the middle block, to be used as resume value for the scalar remainder loop are also generated based on ComputeReductionResult. Once we have a VPValue for the reduction result, this can also be modeled explicitly and moved out of the recipe. 
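For illustration only (not part of this change, and simplified from the tests updated here): with a plain
sum reduction, the vectorized loop conceptually keeps one partial sum per lane, the middle block collapses
those lanes into the final reduction value (the step ComputeReductionResult now models; on the IR level the
updated tests show this as an llvm.vector.reduce.* call in middle.block), and the scalar remainder loop
resumes from the bc.merge.rdx phi. A scalar C++ sketch of that shape, with a made-up lane count and names,
not the vectorizer's actual output:

    // Hypothetical sketch; 4 "lanes" and all names are illustrative only.
    float reduce(const float *a, int n) {
      float part[4] = {0.f, 0.f, 0.f, 0.f};
      int i = 0;
      for (; i + 4 <= n; i += 4)       // vector loop: one partial sum per lane
        for (int l = 0; l < 4; ++l)
          part[l] += a[i + l];
      // "middle block": compute the final reduction result from the lane
      // partial sums (what ComputeReductionResult emits via createTargetReduction).
      float rdx = (part[0] + part[1]) + (part[2] + part[3]);
      float sum = rdx;                 // "bc.merge.rdx" resume value
      for (; i < n; ++i)               // scalar remainder loop
        sum += a[i];
      return sum;
    }
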
--- .../Vectorize/LoopVectorizationPlanner.h | 16 +- .../Transforms/Vectorize/LoopVectorize.cpp | 351 +++++++----------- llvm/lib/Transforms/Vectorize/VPlan.cpp | 1 + llvm/lib/Transforms/Vectorize/VPlan.h | 5 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 82 ++++ .../AArch64/scalable-strict-fadd.ll | 2 +- .../LoopVectorize/AArch64/strict-fadd.ll | 2 +- .../AArch64/sve-interleaved-accesses.ll | 2 +- .../ARM/tail-fold-multiple-icmps.ll | 2 +- .../epilog-vectorization-reductions.ll | 4 +- ...-order-recurrence-sink-replicate-region.ll | 3 +- .../LoopVectorize/interleaved-accesses.ll | 2 +- .../reduction-with-invariant-store.ll | 16 +- .../LoopVectorize/vplan-printing.ll | 7 +- 14 files changed, 247 insertions(+), 248 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 150ab301e7fd2..cff72ae263d84 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -346,16 +346,20 @@ class LoopVectorizationPlanner { /// Return the best VPlan for \p VF. VPlan &getBestPlanFor(ElementCount VF) const; - /// Generate the IR code for the body of the vectorized loop according to the - /// best selected \p VF, \p UF and VPlan \p BestPlan. + /// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan + /// according to the best selected \p VF and \p UF. + /// /// TODO: \p IsEpilogueVectorization is needed to avoid issues due to epilogue /// vectorization re-using plans for both the main and epilogue vector loops. /// It should be removed once the re-use issue has been fixed. /// \p ExpandedSCEVs is passed during execution of the plan for epilogue loop - /// to re-use expansion results generated during main plan execution. Returns - /// a mapping of SCEVs to their expanded IR values. Note that this is a - /// temporary workaround needed due to the current epilogue handling. - DenseMap + /// to re-use expansion results generated during main plan execution. + /// + /// Returns a mapping of SCEVs to their expanded IR values and a mapping for + /// the reduction resume values. Note that this is a temporary workaround + /// needed due to the current epilogue handling. + std::pair, + DenseMap> executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, InnerLoopVectorizer &LB, DominatorTree *DT, bool IsEpilogueVectorization, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 961d3d3bb1931..10c068e3b5895 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -584,10 +584,6 @@ class InnerLoopVectorizer { /// able to vectorize with strict in-order reductions for the given RdxDesc. bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc); - // Returns the resume value (bc.merge.rdx) for a reduction as - // generated by fixReduction. - PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc); - /// Create a new phi node for the induction variable \p OrigPhi to resume /// iteration count in the scalar epilogue, from where the vectorized loop /// left off. \p Step is the SCEV-expanded induction step to use. In cases @@ -626,9 +622,6 @@ class InnerLoopVectorizer { BasicBlock *MiddleBlock, BasicBlock *VectorHeader, VPlan &Plan, VPTransformState &State); - /// Handle all cross-iteration phis in the header. 
- void fixCrossIterationPHIs(VPTransformState &State); - /// Create the exit value of first order recurrences in the middle block and /// update their users. void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR, @@ -1166,14 +1159,6 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes( } } -PHINode *InnerLoopVectorizer::getReductionResumeValue( - const RecurrenceDescriptor &RdxDesc) { - auto It = ReductionResumeValues.find(&RdxDesc); - assert(It != ReductionResumeValues.end() && - "Expected to find a resume value for the reduction."); - return It->second; -} - namespace llvm { // Loop vectorization cost-model hints how the scalar epilogue loop should be @@ -3434,8 +3419,15 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, // At this point every instruction in the original loop is widened to a // vector form. Now we need to fix the recurrences in the loop. These PHI // nodes are currently empty because we did not want to introduce cycles. - // This is the second stage of vectorizing recurrences. - fixCrossIterationPHIs(State); + // This is the second stage of vectorizing recurrences. Note that fixing + // reduction phis are already modeled in VPlan. + // TODO: Also model fixing fixed-order recurrence phis in VPlan. + VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion(); + VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock(); + for (VPRecipeBase &R : HeaderVPBB->phis()) { + if (auto *FOR = dyn_cast(&R)) + fixFixedOrderRecurrence(FOR, State); + } // Forget the original basic block. PSE.getSE()->forgetLoop(OrigLoop); @@ -3450,7 +3442,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, for (PHINode &PN : Exit->phis()) PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN); - VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock(); + VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock(); Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]); if (Cost->requiresScalarEpilogue(VF.isVector())) { // No edge from the middle block to the unique exit block has been inserted @@ -3503,27 +3495,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, VF.getKnownMinValue() * UF); } -void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { - // In order to support recurrences we need to be able to vectorize Phi nodes. - // Phi nodes have cycles, so we need to vectorize them in two stages. This is - // stage #2: We now need to fix the recurrences by adding incoming edges to - // the currently empty PHI nodes. At this point every instruction in the - // original loop is widened to a vector form so we can use them to construct - // the incoming edges. - VPBasicBlock *Header = - State.Plan->getVectorLoopRegion()->getEntryBasicBlock(); - - for (VPRecipeBase &R : Header->phis()) { - if (auto *ReductionPhi = dyn_cast(&R)) - fixReduction(ReductionPhi, State); - } - - for (VPRecipeBase &R : Header->phis()) { - if (auto *FOR = dyn_cast(&R)) - fixFixedOrderRecurrence(FOR, State); - } -} - void InnerLoopVectorizer::fixFixedOrderRecurrence( VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { // This is the second phase of vectorizing first-order recurrences. 
An @@ -3643,169 +3614,6 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence( Phi->setName("scalar.recur"); } -void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, - VPTransformState &State) { - PHINode *OrigPhi = cast(PhiR->getUnderlyingValue()); - // Get it's reduction variable descriptor. - assert(Legal->isReductionVariable(OrigPhi) && - "Unable to find the reduction variable"); - const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); - - RecurKind RK = RdxDesc.getRecurrenceKind(); - TrackingVH ReductionStartValue = RdxDesc.getRecurrenceStartValue(); - Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); - if (auto *I = dyn_cast(&*ReductionStartValue)) - State.setDebugLocFrom(I->getDebugLoc()); - - VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); - - // Before each round, move the insertion point right between - // the PHIs and the values we are going to write. - // This allows us to write both PHINodes and the extractelement - // instructions. - Builder.SetInsertPoint(LoopMiddleBlock, - LoopMiddleBlock->getFirstInsertionPt()); - - State.setDebugLocFrom(LoopExitInst->getDebugLoc()); - - Type *PhiTy = OrigPhi->getType(); - // If tail is folded by masking, the vector value to leave the loop should be - // a Select choosing between the vectorized LoopExitInst and vectorized Phi, - // instead of the former. For an inloop reduction the reduction will already - // be predicated, and does not need to be handled here. - if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { - VPValue *Def = nullptr; - for (VPUser *U : LoopExitInstDef->users()) { - auto *S = dyn_cast(U); - if (S && S->getOpcode() == Instruction::Select) { - Def = S; - break; - } - } - if (Def) - LoopExitInstDef = Def; - } - - VectorParts RdxParts(UF); - for (unsigned Part = 0; Part < UF; ++Part) - RdxParts[Part] = State.get(LoopExitInstDef, Part); - - // If the vector reduction can be performed in a smaller type, we truncate - // then extend the loop exit value to enable InstCombine to evaluate the - // entire expression in the smaller type. - if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { - Builder.SetInsertPoint(LoopMiddleBlock, - LoopMiddleBlock->getFirstInsertionPt()); - Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); - for (unsigned Part = 0; Part < UF; ++Part) { - RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); - } - } - - // Reduce all of the unrolled parts into a single vector. - Value *ReducedPartRdx = RdxParts[0]; - unsigned Op = RecurrenceDescriptor::getOpcode(RK); - - // The middle block terminator has already been assigned a DebugLoc here (the - // OrigLoop's single latch terminator). We want the whole middle block to - // appear to execute on this line because: (a) it is all compiler generated, - // (b) these instructions are always executed after evaluating the latch - // conditional branch, and (c) other passes may add new predecessors which - // terminate on this line. This is the easiest way to ensure we don't - // accidentally cause an extra step back into the loop while debugging. - State.setDebugLocFrom(LoopMiddleBlock->getTerminator()->getDebugLoc()); - if (PhiR->isOrdered()) - ReducedPartRdx = RdxParts[UF - 1]; - else { - // Floating-point operations should have some FMF to enable the reduction. 
- IRBuilderBase::FastMathFlagGuard FMFG(Builder); - Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); - for (unsigned Part = 1; Part < UF; ++Part) { - Value *RdxPart = RdxParts[Part]; - if (Op != Instruction::ICmp && Op != Instruction::FCmp) - ReducedPartRdx = Builder.CreateBinOp( - (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); - else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) - ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK, - ReducedPartRdx, RdxPart); - else - ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); - } - } - - // Create the reduction after the loop. Note that inloop reductions create the - // target reduction in the loop using a Reduction recipe. - if (VF.isVector() && !PhiR->isInLoop()) { - ReducedPartRdx = - createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi); - // If the reduction can be performed in a smaller type, we need to extend - // the reduction to the wider type before we branch to the original loop. - if (PhiTy != RdxDesc.getRecurrenceType()) - ReducedPartRdx = RdxDesc.isSigned() - ? Builder.CreateSExt(ReducedPartRdx, PhiTy) - : Builder.CreateZExt(ReducedPartRdx, PhiTy); - } - - PHINode *ResumePhi = - dyn_cast(PhiR->getStartValue()->getUnderlyingValue()); - - // Create a phi node that merges control-flow from the backedge-taken check - // block and the middle block. - PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", - LoopScalarPreHeader->getTerminator()); - - // If we are fixing reductions in the epilogue loop then we should already - // have created a bc.merge.rdx Phi after the main vector body. Ensure that - // we carry over the incoming values correctly. - for (auto *Incoming : predecessors(LoopScalarPreHeader)) { - if (Incoming == LoopMiddleBlock) - BCBlockPhi->addIncoming(ReducedPartRdx, Incoming); - else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming)) - BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), - Incoming); - else - BCBlockPhi->addIncoming(ReductionStartValue, Incoming); - } - - // Set the resume value for this reduction - ReductionResumeValues.insert({&RdxDesc, BCBlockPhi}); - - // If there were stores of the reduction value to a uniform memory address - // inside the loop, create the final store here. - if (StoreInst *SI = RdxDesc.IntermediateStore) { - StoreInst *NewSI = - Builder.CreateAlignedStore(ReducedPartRdx, SI->getPointerOperand(), - SI->getAlign()); - propagateMetadata(NewSI, SI); - - // If the reduction value is used in other places, - // then let the code below create PHI's for that. - } - - // Now, we need to fix the users of the reduction variable - // inside and outside of the scalar remainder loop. - - // We know that the loop is in LCSSA form. We need to update the PHI nodes - // in the exit blocks. See comment on analogous loop in - // fixFixedOrderRecurrence for a more complete explaination of the logic. - if (!Cost->requiresScalarEpilogue(VF.isVector())) - for (PHINode &LCSSAPhi : LoopExitBlock->phis()) - if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) { - LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); - State.Plan->removeLiveOut(&LCSSAPhi); - } - - // Fix the scalar loop reduction variable with the incoming reduction sum - // from the vector body and from the backedge value. - int IncomingEdgeBlockIdx = - OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); - assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); - // Pick the other block. 
- int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); - OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); - OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); -} - void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { // The basic block and loop containing the predicated instruction. auto *PredBB = PredInst->getParent(); @@ -7609,7 +7417,65 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) { } } -SCEV2ValueTy LoopVectorizationPlanner::executePlan( +// Check if \p RedResult is a ComputeReductionResult instruction, and if it is +// create a merge phi node for it and add it to \p ReductionResumeValues. +static void createAndCollectMergePhiForReduction( + VPInstruction *RedResult, + DenseMap &ReductionResumeValues, + VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock) { + if (!RedResult || + RedResult->getOpcode() != VPInstruction::ComputeReductionResult) + return; + + auto *PhiR = cast(RedResult->getOperand(0)); + const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); + + TrackingVH ReductionStartValue = RdxDesc.getRecurrenceStartValue(); + Value *FinalValue = + State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane())); + auto *ResumePhi = + dyn_cast(PhiR->getStartValue()->getUnderlyingValue()); + + // TODO: bc.merge.rdx should not be created here, instead it should be + // modeled in VPlan. + BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader(); + // Create a phi node that merges control-flow from the backedge-taken check + // block and the middle block. + auto *BCBlockPhi = PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx", + LoopScalarPreHeader->getTerminator()); + + // If we are fixing reductions in the epilogue loop then we should already + // have created a bc.merge.rdx Phi after the main vector body. Ensure that + // we carry over the incoming values correctly. + for (auto *Incoming : predecessors(LoopScalarPreHeader)) { + if (Incoming == LoopMiddleBlock) + BCBlockPhi->addIncoming(FinalValue, Incoming); + else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming)) + BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), + Incoming); + else + BCBlockPhi->addIncoming(ReductionStartValue, Incoming); + } + + auto *OrigPhi = cast(PhiR->getUnderlyingValue()); + // TODO: This fixup should instead be modeled in VPlan. + // Fix the scalar loop reduction variable with the incoming reduction sum + // from the vector body and from the backedge value. + int IncomingEdgeBlockIdx = + OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); + assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); + // Pick the other block. + int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); + OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); + Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); + OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); + + ReductionResumeValues[&RdxDesc] = BCBlockPhi; +} + +std::pair, + DenseMap> +LoopVectorizationPlanner::executePlan( ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization, const DenseMap *ExpandedSCEVs) { @@ -7688,6 +7554,17 @@ SCEV2ValueTy LoopVectorizationPlanner::executePlan( BestVPlan.execute(&State); + // 2.5 Collect reduction resume values. 
+ DenseMap ReductionResumeValues; + auto *ExitVPBB = + cast(BestVPlan.getVectorLoopRegion()->getSingleSuccessor()); + for (VPRecipeBase &R : *ExitVPBB) { + createAndCollectMergePhiForReduction(dyn_cast(&R), + ReductionResumeValues, State, OrigLoop, + State.CFG.VPBB2IRBB[ExitVPBB]); + } + + // 2.6. Maintain Loop Hints // Keep all loop hints from the original loop on the vector loop (we'll // replace the vectorizer-specific hints below). MDNode *OrigLoopID = OrigLoop->getLoopID(); @@ -7721,7 +7598,7 @@ SCEV2ValueTy LoopVectorizationPlanner::executePlan( ILV.printDebugTracesAtEnd(); - return State.ExpandedSCEVs; + return {State.ExpandedSCEVs, ReductionResumeValues}; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -9001,10 +8878,15 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { // to reductions, with one operand being vector and the other being the scalar // reduction chain. For other reductions, a select is introduced between the phi // and live-out recipes when folding the tail. +// +// A ComputeReductionResult recipe is added to the middle block, also for +// in-loop reductions which compute their result in-loop, because generating +// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes. void LoopVectorizationPlanner::adjustRecipesForReductions( VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) { - VPBasicBlock *Header = Plan->getVectorLoopRegion()->getEntryBasicBlock(); + VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion(); + VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock(); // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores // sank outside of the loop would keep the same order as they had in the // original loop. @@ -9044,15 +8926,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( for (VPRecipeBase *R : ReductionPHIList) R->moveBefore(*Header, Header->getFirstNonPhi()); - SmallVector InLoopReductionPhis; for (VPRecipeBase &R : Header->phis()) { auto *PhiR = dyn_cast(&R); if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered())) continue; - InLoopReductionPhis.push_back(PhiR); - } - for (VPReductionPHIRecipe *PhiR : InLoopReductionPhis) { const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); RecurKind Kind = RdxDesc.getRecurrenceKind(); assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && @@ -9161,28 +9039,31 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( for (VPRecipeBase &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { VPReductionPHIRecipe *PhiR = dyn_cast(&R); - if (!PhiR || PhiR->isInLoop()) + if (!PhiR) continue; const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); - auto *NewExitingVPV = PhiR->getBackedgeValue(); // If tail is folded by masking, introduce selects between the phi // and the live-out instruction of each reduction, at the beginning of the // dedicated latch block. 
- if (CM.foldTailByMasking()) { + auto *OrigExitingVPV = PhiR->getBackedgeValue(); + auto *NewExitingVPV = PhiR->getBackedgeValue(); + if (!PhiR->isInLoop() && CM.foldTailByMasking()) { VPValue *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), *Plan); - VPValue *Red = PhiR->getBackedgeValue(); - assert(Red->getDefiningRecipe()->getParent() != LatchVPBB && + assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB && "reduction recipe must be defined before latch"); Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType(); std::optional FMFs = PhiTy->isFloatingPointTy() ? std::make_optional(RdxDesc.getFastMathFlags()) : std::nullopt; - NewExitingVPV = Builder.createSelect(Cond, Red, PhiR, {}, "", FMFs); - Red->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) { - return isa(&U); + NewExitingVPV = + Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs); + OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) { + return isa(&U) && + cast(&U)->getOpcode() == + VPInstruction::ComputeReductionResult; }); if (PreferPredicatedReductionSelect || TTI.preferPredicatedReductionSelect( @@ -9190,6 +9071,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( TargetTransformInfo::ReductionFlags())) PhiR->setOperand(1, NewExitingVPV); } + // If the vector reduction can be performed in a smaller type, we truncate // then extend the loop exit value to enable InstCombine to evaluate the // entire expression in the smaller type. @@ -9206,9 +9088,31 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( Trunc->insertAfter(NewExitingVPV->getDefiningRecipe()); Extnd->insertAfter(Trunc); - NewExitingVPV->replaceAllUsesWith(Extnd); - Trunc->setOperand(0, NewExitingVPV); + if (PhiR->getOperand(1) == NewExitingVPV) + PhiR->setOperand(1, Extnd->getVPSingleValue()); + NewExitingVPV = Extnd; } + + // We want code in the middle block to appear to execute on the location of + // the scalar loop's latch terminator because: (a) it is all compiler + // generated, (b) these instructions are always executed after evaluating + // the latch conditional branch, and (c) other passes may add new + // predecessors which terminate on this line. This is the easiest way to + // ensure we don't accidentally cause an extra step back into the loop while + // debugging. + DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc(); + + // TODO: At the moment ComputeReductionResult also drives creation of the + // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here + // even for in-loop reductions, until the reduction resume value handling is + // also modeled in VPlan. 
+ auto *FinalReductionResult = new VPInstruction( + VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL); + cast(VectorLoopRegion->getSingleSuccessor()) + ->appendRecipe(FinalReductionResult); + OrigExitingVPV->replaceUsesWithIf( + FinalReductionResult, + [](VPUser &User, unsigned) { return isa(&User); }); } VPlanTransforms::clearReductionWrapFlags(*Plan); @@ -10175,8 +10079,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { EPI, &LVL, &CM, BFI, PSI, Checks); VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); - auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, - BestMainPlan, MainILV, DT, true); + const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan( + EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT, true); ++LoopsVectorized; // Second pass vectorizes the epilogue and adjusts the control flow @@ -10217,8 +10121,9 @@ bool LoopVectorizePass::processLoop(Loop *L) { Value *ResumeV = nullptr; // TODO: Move setting of resume values to prepareToExecute. if (auto *ReductionPhi = dyn_cast(&R)) { - ResumeV = MainILV.getReductionResumeValue( - ReductionPhi->getRecurrenceDescriptor()); + ResumeV = ReductionResumeValues + .find(&ReductionPhi->getRecurrenceDescriptor()) + ->second; } else { // Create induction resume values for both widened pointer and // integer/fp inductions and update the start value of the induction diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 1d7df9c9575af..b6e56c47c227f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -446,6 +446,7 @@ void VPBasicBlock::execute(VPTransformState *State) { // ExitBB can be re-used for the exit block of the Plan. NewBB = State->CFG.ExitBB; State->CFG.PrevBB = NewBB; + State->Builder.SetInsertPoint(NewBB->getFirstNonPHI()); // Update the branch instruction in the predecessor to branch to ExitBB. VPBlockBase *PredVPB = getSingleHierarchicalPredecessor(); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 7d33baac52c9e..9d279da758ec0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1061,7 +1061,8 @@ class VPInstruction : public VPRecipeWithIRFlags, public VPValue { // Increment the canonical IV separately for each unrolled part. 
CanonicalIVIncrementForPart, BranchOnCount, - BranchOnCond + BranchOnCond, + ComputeReductionResult, }; private: @@ -3132,6 +3133,8 @@ inline bool isUniformAfterVectorization(VPValue *VPV) { return Rep->isUniform(); if (auto *GEP = dyn_cast(Def)) return all_of(GEP->operands(), isUniformAfterVectorization); + if (auto *VPI = dyn_cast(Def)) + return VPI->getOpcode() == VPInstruction::ComputeReductionResult; return false; } } // end namespace vputils diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 76961629aeceb..1e5273bcd748e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -28,6 +28,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include @@ -401,6 +402,84 @@ Value *VPInstruction::generateInstruction(VPTransformState &State, Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); return CondBr; } + case VPInstruction::ComputeReductionResult: { + if (Part != 0) + return State.get(this, 0); + + // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary + // and will be removed by breaking up the recipe further. + auto *PhiR = cast(getOperand(0)); + auto *OrigPhi = cast(PhiR->getUnderlyingValue()); + // Get its reduction variable descriptor. + const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); + + RecurKind RK = RdxDesc.getRecurrenceKind(); + + State.setDebugLocFrom(getDebugLoc()); + + VPValue *LoopExitingDef = getOperand(1); + Type *PhiTy = OrigPhi->getType(); + VectorParts RdxParts(State.UF); + for (unsigned Part = 0; Part < State.UF; ++Part) + RdxParts[Part] = State.get(LoopExitingDef, Part); + + // If the vector reduction can be performed in a smaller type, we truncate + // then extend the loop exit value to enable InstCombine to evaluate the + // entire expression in the smaller type. + // TODO: Handle this in truncateToMinBW. + if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { + Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF); + for (unsigned Part = 0; Part < State.UF; ++Part) + RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); + } + // Reduce all of the unrolled parts into a single vector. + Value *ReducedPartRdx = RdxParts[0]; + unsigned Op = RecurrenceDescriptor::getOpcode(RK); + + if (PhiR->isOrdered()) { + ReducedPartRdx = RdxParts[State.UF - 1]; + } else { + // Floating-point operations should have some FMF to enable the reduction. + IRBuilderBase::FastMathFlagGuard FMFG(Builder); + Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); + for (unsigned Part = 1; Part < State.UF; ++Part) { + Value *RdxPart = RdxParts[Part]; + if (Op != Instruction::ICmp && Op != Instruction::FCmp) + ReducedPartRdx = Builder.CreateBinOp( + (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); + else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) { + TrackingVH ReductionStartValue = + RdxDesc.getRecurrenceStartValue(); + ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK, + ReducedPartRdx, RdxPart); + } else + ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); + } + } + + // Create the reduction after the loop. Note that inloop reductions create + // the target reduction in the loop using a Reduction recipe. 
+ if (State.VF.isVector() && !PhiR->isInLoop()) { + ReducedPartRdx = + createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi); + // If the reduction can be performed in a smaller type, we need to extend + // the reduction to the wider type before we branch to the original loop. + if (PhiTy != RdxDesc.getRecurrenceType()) + ReducedPartRdx = RdxDesc.isSigned() + ? Builder.CreateSExt(ReducedPartRdx, PhiTy) + : Builder.CreateZExt(ReducedPartRdx, PhiTy); + } + + // If there were stores of the reduction value to a uniform memory address + // inside the loop, create the final store here. + if (StoreInst *SI = RdxDesc.IntermediateStore) { + auto *NewSI = Builder.CreateAlignedStore( + ReducedPartRdx, SI->getPointerOperand(), SI->getAlign()); + propagateMetadata(NewSI, SI); + } + + return ReducedPartRdx; + } default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -477,6 +556,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::BranchOnCount: O << "branch-on-count"; break; + case VPInstruction::ComputeReductionResult: + O << "compute-reduction-result"; + break; default: O << Instruction::getOpcodeName(getOpcode()); } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll index acbb6e86fe584..e51190bae6124 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll @@ -612,8 +612,8 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali ; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP14]]) ; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP15]]) +; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP14]]) ; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-UNORDERED: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll index 33b5273217cdf..7c1247e9ebc8f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll @@ -285,8 +285,8 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali ; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <4 x float> %[[STRIDED2:.*]], %[[VEC_PHI2]] ; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd ; CHECK-UNORDERED: middle.block -; CHECK-UNORDERED: %[[RDX1:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD1]]) ; CHECK-UNORDERED: %[[RDX2:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD2]]) +; CHECK-UNORDERED: %[[RDX1:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD1]]) ; CHECK-UNORDERED: for.body ; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, ptr ; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %[[LOAD1]], {{.*}} diff --git 
a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index f600a0d5877d3..fa8e35f4697e9 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -821,8 +821,8 @@ define void @int_float_struct(%struct.IntFloat* nocapture readonly %p) #0 { ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP4]]) ; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP5]]) +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP4]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-fold-multiple-icmps.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-fold-multiple-icmps.ll index 038725db279bb..f58d864e1e147 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/tail-fold-multiple-icmps.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-fold-multiple-icmps.ll @@ -26,8 +26,8 @@ define arm_aapcs_vfpcc i32 @minmaxval4(ptr nocapture readonly %x, ptr nocapture ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]]) ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll index 03903d80cfd6e..4df5332a47d4c 100644 --- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll @@ -313,8 +313,8 @@ define float @multiple_fp_rdx(ptr %A, i64 %N) { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]]) ; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: @@ -344,8 +344,8 @@ define float @multiple_fp_rdx(ptr %A, i64 %N) { ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC5]] ; CHECK-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: 
vec.epilog.middle.block: -; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP13]]) ; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP14]]) +; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP13]]) ; CHECK-NEXT: [[CMP_N6:%.*]] = icmp eq i64 [[N]], [[N_VEC5]] ; CHECK-NEXT: br i1 [[CMP_N6]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll index 060d28f030f72..833d55f09294e 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -215,9 +215,10 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%and.red>, vp<[[SEL]]> ; CHECK-NEXT: No successors ; CHECK-EMPTY: -; CHECK-NEXT: Live-out i32 %res = vp<[[SEL]]> +; CHECK-NEXT: Live-out i32 %res = vp<[[RED_RES]]> ; CHECK-NEXT: } ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll index 187eefbe9b595..3b9edb11c5da2 100644 --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll @@ -837,8 +837,8 @@ define void @int_float_struct(ptr nocapture readonly %A) #0 { ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]]) ; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll index 2a2d55fb75c53..5584aa969367a 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll @@ -562,10 +562,10 @@ exit: ; preds = %for.body define void @reduc_add_mul_store_same_ptr(ptr %dst, ptr readonly %src) { ; CHECK-LABEL: define void @reduc_add_mul_store_same_ptr ; CHECK: middle.block: -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1:%.*]]) -; CHECK-NEXT: store i32 [[TMP2]], ptr %dst, align 4 ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]]) ; CHECK-NEXT: store i32 [[TMP4]], ptr %dst, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1:%.*]]) +; CHECK-NEXT: store i32 [[TMP2]], ptr %dst, align 4 ; entry: br label %for.body @@ -591,10 +591,10 @@ exit: define void @reduc_mul_add_store_same_ptr(ptr %dst, ptr readonly %src) { ; CHECK-LABEL: 
define void @reduc_mul_add_store_same_ptr ; CHECK: middle.block: -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]]) -; CHECK-NEXT: store i32 [[TMP2]], ptr %dst, align 4 ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]]) ; CHECK-NEXT: store i32 [[TMP4]], ptr %dst, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]]) +; CHECK-NEXT: store i32 [[TMP2]], ptr %dst, align 4 ; entry: br label %for.body @@ -621,10 +621,10 @@ exit: define void @reduc_add_mul_store_different_ptr(ptr %dst1, ptr %dst2, ptr readonly %src) { ; CHECK-LABEL: define void @reduc_add_mul_store_different_ptr ; CHECK: middle.block: -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1:%.*]]) -; CHECK-NEXT: store i32 [[TMP2]], ptr %dst1, align 4 ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]]) ; CHECK-NEXT: store i32 [[TMP4]], ptr %dst2, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1:%.*]]) +; CHECK-NEXT: store i32 [[TMP2]], ptr %dst1, align 4 ; entry: br label %for.body @@ -650,10 +650,10 @@ exit: define void @reduc_mul_add_store_different_ptr(ptr %dst1, ptr %dst2, ptr readonly %src) { ; CHECK-LABEL: define void @reduc_mul_add_store_different_ptr ; CHECK: middle.block: -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]]) -; CHECK-NEXT: store i32 [[TMP2]], ptr %dst1, align 4 ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]]) ; CHECK-NEXT: store i32 [[TMP4]], ptr %dst2, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]]) +; CHECK-NEXT: store i32 [[TMP2]], ptr %dst1, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index 52a18d7b365c0..79ed71a2685fb 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -137,9 +137,10 @@ define float @print_reduction(i64 %n, ptr noalias %y) { ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%red>, ir<%red.next> ; CHECK-NEXT: No successors ; CHECK-EMPTY: -; CHECK-NEXT: Live-out float %red.next.lcssa = ir<%red.next> +; CHECK-NEXT: Live-out float %red.next.lcssa = vp<[[RED_RES]]> ; CHECK-NEXT: } ; entry: @@ -185,6 +186,7 @@ define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr no ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<[[RED_RES:.+]]> = compute-reduction-result ir<%red>, ir<%red.next> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -385,9 +387,10 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) { ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%sum.07>, ir<[[MULADD]]> ; CHECK-NEXT: No successors ; CHECK-EMPTY: -; CHECK-NEXT: Live-out float %muladd.lcssa = ir<%muladd> +; CHECK-NEXT: Live-out float %muladd.lcssa = vp<[[RED_RES]]> ; CHECK-NEXT:} entry: From 665e46c2689cc4212345213db7d7e968b91dcc8b Mon Sep 17 00:00:00 2001 From: Mingming Liu Date: Thu, 4 Jan 2024 15:03:18 -0800 Subject: [PATCH 293/313] [llvm-profdata] Use semicolon as the 
delimiter for supplementary profiles. (#75080) When merging instrFDO profiles with afdo profile as supplementary, instrFDO counters for static functions are stored with function's PGO name (with filename.cpp; prefix). - This pull request fixes the delimiter used when a PGO function name is 'normalized' for AFDO look-up. --- .../profile/Linux/instrprof-instr-suppl.test | 86 +++++++++++++++++++ .../llvm-profdata/Inputs/FUnique.proftext | 2 +- .../llvm-profdata/Inputs/NoFUnique.proftext | 2 +- .../Inputs/flatten_instr.proftext | 2 +- .../suppl-instr-with-sample-flatten.test | 6 +- llvm/tools/llvm-profdata/llvm-profdata.cpp | 17 ++-- 6 files changed, 101 insertions(+), 14 deletions(-) create mode 100644 compiler-rt/test/profile/Linux/instrprof-instr-suppl.test diff --git a/compiler-rt/test/profile/Linux/instrprof-instr-suppl.test b/compiler-rt/test/profile/Linux/instrprof-instr-suppl.test new file mode 100644 index 0000000000000..10650a345ab4b --- /dev/null +++ b/compiler-rt/test/profile/Linux/instrprof-instr-suppl.test @@ -0,0 +1,86 @@ +// This is a regression test for supplementary profiles. + +// What the test does: +// - Generate raw profiles from an executable and convert it to indexed profiles. +// - Merge indexed profiles with supplementary sample-pgo profiles +// - Check that the block counters for function foo is scaled up. + +// REQUIRES: lld-available + +// Building the instrumented binary will fail because lld doesn't support +// big-endian ELF for PPC (aka ABI 1). +// ld.lld: error: /lib/../lib64/Scrt1.o: ABI version 1 is not supported +// UNSUPPORTED: ppc && host-byteorder-big-endian + +// This compiler-rt test aims to have test coverage for the IRPGO name format +// of local-linkage functions during raw profile generation. The C++ functions +// are simple with little optimization space so test outputs are more stable. +// On the other hand, LLVM tests (like tools/llvm-profdata/suppl-instr-with-sample-static-func.test +// or other suppl* test under tools/llvm-profdata dir) are more suitable for +// more sophisticated cases (e.g., pseudo hot functions or profiles with discriminiators, etc). + +// RUN: rm -rf %t && split-file %s %t && cd %t + +// Use clangxx_pgogen for IR level instrumentation for C++. +// The test case is constructed such that `-funique-internal-linkage-names` is +// not used in instrPGO but used in static function names in SamplePGO. +// RUN: %clangxx_pgogen -fuse-ld=lld -O2 main.cpp -o main +// RUN: env LLVM_PROFILE_FILE=main.profraw %run ./main +// RUN: llvm-profdata merge main.profraw -o main.profdata + +// The function counters are not scaled up. +// RUN: llvm-profdata show -all-functions -counts main.profdata | FileCheck %s --check-prefix=INSTR + +// The instrPGO profile counter of function foo should be scaled up. Note the +// scaling factor of a function is computed based on instrPGO profiles and +// invariant to samplePGO profile counters. 
+// RUN: llvm-profdata merge -supplement-instr-with-sample=sampleprof.proftext \ +// RUN: -suppl-min-size-threshold=0 -instr-prof-cold-threshold=1 \ +// RUN: main.profdata -o merge.profdata +// RUN: llvm-profdata show -all-functions -counts merge.profdata | FileCheck %s --check-prefix=SUPPL + +// INSTR: Counters: +// INSTR: main: +// INSTR: Counters: 1 +// INSTR: Block counts: [1] +// INSTR: _Z3barv: +// INSTR: Counters: 1 +// INSTR: Block counts: [2] +// INSTR: main.cpp;_ZL3foov: +// INSTR: Counters: 1 +// INSTR: Block counts: [1] + +// INSTR: Functions shown: 3 +// INSTR: Total functions: 3 + +// SUPPL: Counters: +// SUPPL: main: +// SUPPL: Counters: 1 +// SUPPL: Block counts: [1] +// SUPPL: _Z3barv: +// SUPPL: Counters: 1 +// SUPPL: Block counts: [2] +// SUPPL: main.cpp;_ZL3foov: +// SUPPL: Counters: 1 +// SUPPL: Block counts: [3] + +//--- main.cpp + +// mark foo and bar as noinline so preinliner won't inlined them into main +// before the instrumentation pass. +__attribute__((noinline)) static void foo() { +} + +__attribute__((noinline)) void bar() { +} + +int main() { + foo(); + bar(); + bar(); + return 0; +} + +//--- sampleprof.proftext +_ZL3foov.__uniq.23343505234642233139497840575431302970:5:5 + 1: 5 diff --git a/llvm/test/tools/llvm-profdata/Inputs/FUnique.proftext b/llvm/test/tools/llvm-profdata/Inputs/FUnique.proftext index be2c27df9746c..da169b18201a2 100644 --- a/llvm/test/tools/llvm-profdata/Inputs/FUnique.proftext +++ b/llvm/test/tools/llvm-profdata/Inputs/FUnique.proftext @@ -18,7 +18,7 @@ main 1 0 -test.c:_ZL3foom.__uniq.276699478366846449772231447066107882794 +test.c;_ZL3foom.__uniq.276699478366846449772231447066107882794 # Func Hash: 1124680652115249575 # Num Counters: diff --git a/llvm/test/tools/llvm-profdata/Inputs/NoFUnique.proftext b/llvm/test/tools/llvm-profdata/Inputs/NoFUnique.proftext index 7485a33875e11..a3df42f6c3d27 100644 --- a/llvm/test/tools/llvm-profdata/Inputs/NoFUnique.proftext +++ b/llvm/test/tools/llvm-profdata/Inputs/NoFUnique.proftext @@ -18,7 +18,7 @@ main 1 0 -test.c:_ZL3foom +test.c;_ZL3foom # Func Hash: 1124680652115249575 # Num Counters: diff --git a/llvm/test/tools/llvm-profdata/Inputs/flatten_instr.proftext b/llvm/test/tools/llvm-profdata/Inputs/flatten_instr.proftext index 8afee8f67b3b2..e180099731512 100644 --- a/llvm/test/tools/llvm-profdata/Inputs/flatten_instr.proftext +++ b/llvm/test/tools/llvm-profdata/Inputs/flatten_instr.proftext @@ -14,7 +14,7 @@ foo 40 6000 -bar.cc:bar +bar.cc;bar # Func Hash: 2222 # Num Counters: diff --git a/llvm/test/tools/llvm-profdata/suppl-instr-with-sample-flatten.test b/llvm/test/tools/llvm-profdata/suppl-instr-with-sample-flatten.test index 2c569b5cf3b3f..4a394d7bce5c5 100644 --- a/llvm/test/tools/llvm-profdata/suppl-instr-with-sample-flatten.test +++ b/llvm/test/tools/llvm-profdata/suppl-instr-with-sample-flatten.test @@ -7,11 +7,11 @@ RUN: -supplement-instr-with-sample=%p/Inputs/flatten_sample.proftext \ RUN: %p/Inputs/flatten_instr.proftext -o %t RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FLATTEN -FLATTEN: bar.cc:bar: -FLATTEN-NEXT: Hash: 0x00000000000008ae -FLATTEN-NEXT: Counters: 10 FLATTEN: foo: FLATTEN-NEXT: Hash: 0x0000000000000457 FLATTEN-NEXT: Counters: 5 FLATTEN-NEXT: Block counts: [10000, 50, 2000, 40, 6000] +FLATTEN: bar.cc;bar: +FLATTEN-NEXT: Hash: 0x00000000000008ae +FLATTEN-NEXT: Counters: 10 FLATTEN-NOT: goo: diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp index 12b81d411cfa9..77197d3bf9196 100644 --- 
a/llvm/tools/llvm-profdata/llvm-profdata.cpp +++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp @@ -998,13 +998,14 @@ adjustInstrProfile(std::unique_ptr &WC, auto buildStaticFuncMap = [&StaticFuncMap, SampleProfileHasFUnique](const StringRef Name) { - std::string Prefixes[] = {".cpp:", "cc:", ".c:", ".hpp:", ".h:"}; + std::string FilePrefixes[] = {".cpp", "cc", ".c", ".hpp", ".h"}; size_t PrefixPos = StringRef::npos; - for (auto &Prefix : Prefixes) { - PrefixPos = Name.find_insensitive(Prefix); + for (auto &FilePrefix : FilePrefixes) { + std::string NamePrefix = FilePrefix + kGlobalIdentifierDelimiter; + PrefixPos = Name.find_insensitive(NamePrefix); if (PrefixPos == StringRef::npos) continue; - PrefixPos += Prefix.size(); + PrefixPos += NamePrefix.size(); break; } @@ -1088,17 +1089,17 @@ adjustInstrProfile(std::unique_ptr &WC, // // InstrProfile has two entries: // foo - // bar.cc:bar + // bar.cc;bar // // After BuildMaxSampleMap, we should have the following in FlattenSampleMap: // {"foo", {1000, 5000}} - // {"bar.cc:bar", {11000, 30000}} + // {"bar.cc;bar", {11000, 30000}} // // foo's has an entry count of 1000, and max body count of 5000. - // bar.cc:bar has an entry count of 11000 (sum two callsites of 1000 and + // bar.cc;bar has an entry count of 11000 (sum two callsites of 1000 and // 10000), and max count of 30000 (from the callsite in line 8). // - // Note that goo's count will remain in bar.cc:bar() as it does not have an + // Note that goo's count will remain in bar.cc;bar() as it does not have an // entry in InstrProfile. llvm::StringMap> FlattenSampleMap; auto BuildMaxSampleMap = [&FlattenSampleMap, &StaticFuncMap, From 786cf76f434d2691878067dedb8b1eb99472810b Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Thu, 4 Jan 2024 15:30:23 -0800 Subject: [PATCH 294/313] [dsymutil] Add support for inline DWARF source files. (#77016) There was a strange and seemingly unncessary empty string optimization in NonRelocatableStringPool that I had to remove in order to support empty strings in the line_str string table, without unconditionally forcing an empty string to be added to every debug_line_str table. 
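To make the cost of that shortcut concrete, here is a minimal stand-alone mock of the old `getEntry` behavior (simplified types; this is not the real `NonRelocatableStringpool`/`DwarfStringPoolEntryRef` API): once the pool holds any string, a lookup of "" silently returns the cached `EmptyString` member, which is only ever initialized when the pool was constructed with `PutEmptyString = true`.

```cpp
#include <cassert>
#include <cstdint>
#include <map>
#include <string>

// Stand-in for DwarfStringPoolEntryRef: an offset plus a validity flag so the
// pitfall is observable.
struct Entry {
  uint64_t Offset = 0;
  bool Valid = false;
};

class MockStringPool {
  std::map<std::string, Entry> Strings;
  uint64_t CurrentEndOffset = 0;
  Entry EmptyString; // Only meaningful if the constructor interned "".

public:
  explicit MockStringPool(bool PutEmptyString = false) {
    if (PutEmptyString)
      EmptyString = getEntry("");
  }

  Entry getEntry(const std::string &S) {
    // The shortcut this patch removes: once the pool is non-empty, "" maps to
    // the cached member instead of being interned.
    if (S.empty() && !Strings.empty())
      return EmptyString;
    auto It = Strings.insert({S, Entry()});
    if (It.second) {
      It.first->second.Offset = CurrentEndOffset;
      It.first->second.Valid = true;
      CurrentEndOffset += S.size() + 1;
    }
    return It.first->second;
  }
};

int main() {
  MockStringPool Pool(/*PutEmptyString=*/false);
  Pool.getEntry("inlined.c");
  Entry E = Pool.getEntry(""); // e.g. an empty per-file source string
  assert(!E.Valid && "old shortcut hands back an uninitialized entry");
  return 0;
}
```

With the shortcut gone, "" is interned like any other string and gets a real offset, so files without inline source can emit an empty source string without forcing "" into every table.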
--- .../llvm/CodeGen/NonRelocatableStringpool.h | 3 +- llvm/lib/CodeGen/NonRelocatableStringpool.cpp | 3 -- llvm/lib/DWARFLinker/DWARFStreamer.cpp | 16 +++++++-- .../tools/dsymutil/ARM/inline-source.test | 20 +++++++++++ llvm/test/tools/dsymutil/Inputs/inline.ll | 34 +++++++++++++++++++ 5 files changed, 69 insertions(+), 7 deletions(-) create mode 100644 llvm/test/tools/dsymutil/ARM/inline-source.test create mode 100644 llvm/test/tools/dsymutil/Inputs/inline.ll diff --git a/llvm/include/llvm/CodeGen/NonRelocatableStringpool.h b/llvm/include/llvm/CodeGen/NonRelocatableStringpool.h index fe07c70d85c59..3dc0731f5a04e 100644 --- a/llvm/include/llvm/CodeGen/NonRelocatableStringpool.h +++ b/llvm/include/llvm/CodeGen/NonRelocatableStringpool.h @@ -32,7 +32,7 @@ class NonRelocatableStringpool { bool PutEmptyString = false) : Translator(Translator) { if (PutEmptyString) - EmptyString = getEntry(""); + getEntry(""); } DwarfStringPoolEntryRef getEntry(StringRef S); @@ -59,7 +59,6 @@ class NonRelocatableStringpool { MapTy Strings; uint64_t CurrentEndOffset = 0; unsigned NumEntries = 0; - DwarfStringPoolEntryRef EmptyString; std::function Translator; }; diff --git a/llvm/lib/CodeGen/NonRelocatableStringpool.cpp b/llvm/lib/CodeGen/NonRelocatableStringpool.cpp index 7304bfef55cb7..e8391afb8e3f8 100644 --- a/llvm/lib/CodeGen/NonRelocatableStringpool.cpp +++ b/llvm/lib/CodeGen/NonRelocatableStringpool.cpp @@ -12,9 +12,6 @@ namespace llvm { DwarfStringPoolEntryRef NonRelocatableStringpool::getEntry(StringRef S) { - if (S.empty() && !Strings.empty()) - return EmptyString; - if (Translator) S = Translator(S); auto I = Strings.insert({S, DwarfStringPoolEntry()}); diff --git a/llvm/lib/DWARFLinker/DWARFStreamer.cpp b/llvm/lib/DWARFLinker/DWARFStreamer.cpp index cd649c328ed93..faa91364ec9c3 100644 --- a/llvm/lib/DWARFLinker/DWARFStreamer.cpp +++ b/llvm/lib/DWARFLinker/DWARFStreamer.cpp @@ -859,21 +859,31 @@ void DwarfStreamer::emitLineTablePrologueV5IncludeAndFileTable( for (auto Include : P.IncludeDirectories) emitLineTableString(P, Include, DebugStrPool, DebugLineStrPool); + bool InlineSources = any_of(P.FileNames, [](auto &File) { + auto s = dwarf::toString(File.Source); + return s && !**s; + }); + if (P.FileNames.empty()) { // file_name_entry_format_count (ubyte). MS->emitInt8(0); LineSectionSize += 1; } else { // file_name_entry_format_count (ubyte). - MS->emitInt8(2); + MS->emitInt8(2 + (InlineSources ? 1 : 0)); LineSectionSize += 1; // file_name_entry_format (sequence of ULEB128 pairs). + auto StrForm = P.FileNames[0].Name.getForm(); LineSectionSize += MS->emitULEB128IntValue(dwarf::DW_LNCT_path); - LineSectionSize += MS->emitULEB128IntValue(P.FileNames[0].Name.getForm()); + LineSectionSize += MS->emitULEB128IntValue(StrForm); LineSectionSize += MS->emitULEB128IntValue(dwarf::DW_LNCT_directory_index); LineSectionSize += MS->emitULEB128IntValue(dwarf::DW_FORM_data1); + if (InlineSources) { + LineSectionSize += MS->emitULEB128IntValue(dwarf::DW_LNCT_LLVM_source); + LineSectionSize += MS->emitULEB128IntValue(StrForm); + } } // file_names_count (ULEB128). 
@@ -884,6 +894,8 @@ void DwarfStreamer::emitLineTablePrologueV5IncludeAndFileTable( emitLineTableString(P, File.Name, DebugStrPool, DebugLineStrPool); MS->emitInt8(File.DirIdx); LineSectionSize += 1; + if (InlineSources) + emitLineTableString(P, File.Source, DebugStrPool, DebugLineStrPool); } } diff --git a/llvm/test/tools/dsymutil/ARM/inline-source.test b/llvm/test/tools/dsymutil/ARM/inline-source.test new file mode 100644 index 0000000000000..ec437e3de9008 --- /dev/null +++ b/llvm/test/tools/dsymutil/ARM/inline-source.test @@ -0,0 +1,20 @@ +# RUN: rm -rf %t +# RUN: mkdir -p %t +# RUN: llc -filetype=obj -mtriple arm64-apple-darwin %p/../Inputs/inline.ll -o %t/inline.o +# RUN: dsymutil -f -oso-prepend-path=%t -y %s -o - | llvm-dwarfdump -debug-line - | FileCheck %s + +# Test inline source files. + +--- +triple: 'arm64-apple-darwin' +objects: + - filename: inline.o + symbols: + - { sym: _f, objAddr: 0x0, binAddr: 0x1000, size: 0x12 } +... + +# CHECK: .debug_line contents: +# CHECK: file_names[ 1]: +# CHECK-NEXT: name: "inlined.c" +# CHECK-NEXT: dir_index: 1 +# CHECK-NEXT: source: "{{.*}}This is inline source code. \ No newline at end of file diff --git a/llvm/test/tools/dsymutil/Inputs/inline.ll b/llvm/test/tools/dsymutil/Inputs/inline.ll new file mode 100644 index 0000000000000..e7c9b7096ee5b --- /dev/null +++ b/llvm/test/tools/dsymutil/Inputs/inline.ll @@ -0,0 +1,34 @@ +; ModuleID = '/tmp/t.c' +source_filename = "/tmp/t.c" +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + +; Function Attrs: noinline nounwind optnone ssp uwtable(sync) +define void @f() #0 !dbg !9 { +entry: + ret void, !dbg !14 +} + +attributes #0 = { noinline nounwind optnone ssp uwtable(sync) } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5, !6, !7} +!llvm.ident = !{!8} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 18.0.0git (git@github.com:llvm/llvm-project.git 29ee66f4a0967e43a035f147c960743c7b640f2f)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: Apple, sysroot: "/") +!1 = !DIFile(filename: "/INLINE/inlined.c", directory: "/Volumes/Data/llvm-project", checksumkind: CSK_MD5, checksum: "3183154a5cb31debe9a8e27ca500bc3c") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 8, !"PIC Level", i32 2} +!6 = !{i32 7, !"uwtable", i32 1} +!7 = !{i32 7, !"frame-pointer", i32 1} +!8 = !{!"clang version 18.0.0git (git@github.com:llvm/llvm-project.git 29ee66f4a0967e43a035f147c960743c7b640f2f)"} +!9 = distinct !DISubprogram(name: "f", scope: !10, file: !10, line: 2, type: !11, scopeLine: 2, spFlags: DISPFlagDefinition, unit: !0) +!10 = !DIFile(filename: "/INLINE/inlined.c", directory: "", source: "void stop(); +void f() { + // This is inline source code. +} +") +!11 = !DISubroutineType(types: !12) +!12 = !{null} +!14 = !DILocation(line: 4, column: 1, scope: !9) From 9a2df55f47e4ec02a1efbf8efa776cfeed527df2 Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Thu, 4 Jan 2024 16:13:57 -0800 Subject: [PATCH 295/313] [InstrProf] No linkage prefixes in IRPGO names (#76994) Change the format of IRPGO counter names to `[;]` which is computed by `GlobalValue::getGlobalIdentifier()` to fix #74565. 
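As a small illustration of the naming scheme, the sketch below composes the `[<filepath>;]<linkage-name>` form described above. `makeIRPGOName` is a hypothetical helper, not the LLVM API; the sample names mirror the `main.cpp;_ZL3foov` and `_Z3barv` counters checked by the new compiler-rt test added earlier in this series.

```cpp
#include <iostream>
#include <string>

// Hypothetical helper mirroring what GlobalValue::getGlobalIdentifier() is
// described as doing here: only local-linkage symbols get a "<filepath>;"
// prefix, and the name itself is the plain IR name (F.getName()), with no
// Mach-O "_" prefix.
std::string makeIRPGOName(const std::string &LinkageName, bool IsLocalLinkage,
                          const std::string &FileName) {
  if (!IsLocalLinkage)
    return LinkageName;
  return FileName + ";" + LinkageName;
}

int main() {
  // External function: just the IR name.
  std::cout << makeIRPGOName("_Z3barv", /*IsLocalLinkage=*/false, "main.cpp")
            << "\n"; // _Z3barv
  // Internal (static) function: disambiguated by its source file.
  std::cout << makeIRPGOName("_ZL3foov", /*IsLocalLinkage=*/true, "main.cpp")
            << "\n"; // main.cpp;_ZL3foov
  return 0;
}
```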
In fe051934cbb0aaf25d960d7d45305135635d650b (https://reviews.llvm.org/D156569)
the format of IRPGO counter names was changed to be
`[<filepath>;]<mangled-name>` where `<mangled-name>` is basically
`F.getName()` with some prefix, e.g., `_` or `l_` on Mach-O (yes, it is
confusing that `<mangled-name>` is computed with
`Mangler().getNameWithPrefix()` while `<linkage-name>` is just
`F.getName()`).

We discovered in #74565 that this causes some missed import issues on some
targets and #74008 is a partial fix. Since `<linkage-name>` may not match the
`<mangled-name>` on some targets like Mach-O, we will need to post-process the
output of `llvm-profdata order` before passing to the linker via `-order_file`.

Profiles generated after fe051934cbb0aaf25d960d7d45305135635d650b will become
stale after this diff, but I think this is acceptable since that patch landed
after the LLVM 18 cut which hasn't been released yet.
---
 ...trprof-thinlto-indirect-call-promotion.cpp | 14 ----------
 lld/test/MachO/pgo-warn-mismatch.ll           |  2 +-
 llvm/lib/ProfileData/InstrProf.cpp            | 27 ++++++-------------
 llvm/tools/llvm-profdata/llvm-profdata.cpp    |  6 ++++-
 llvm/unittests/ProfileData/InstrProfTest.cpp  | 18 ++-----------
 5 files changed, 16 insertions(+), 51 deletions(-)

diff --git a/compiler-rt/test/profile/instrprof-thinlto-indirect-call-promotion.cpp b/compiler-rt/test/profile/instrprof-thinlto-indirect-call-promotion.cpp
index 860c054f69e1a..d361186cf26ac 100644
--- a/compiler-rt/test/profile/instrprof-thinlto-indirect-call-promotion.cpp
+++ b/compiler-rt/test/profile/instrprof-thinlto-indirect-call-promotion.cpp
@@ -28,20 +28,6 @@
 // LTO if default linker is GNU ld or gold anyway.
 // REQUIRES: lld-available
 
-// Test should fail where linkage-name and mangled-name diverges, see issue https://github.com/llvm/llvm-project/issues/74565).
-// Currently, this name divergence happens on Mach-O object file format, or on
-// many (but not all) 32-bit Windows systems.
-//
-// XFAIL: system-darwin
-//
-// Mark 32-bit Windows as UNSUPPORTED for now as opposed to XFAIL. This test
-// should fail on many (but not all) 32-bit Windows systems and succeed on the
-// rest. The flexibility in triple string parsing makes it tricky to capture
-// both sets accurately. i[3-9]86 specifies arch as Triple::ArchType::x86, (win32|windows)
-// specifies OS as Triple::OS::Win32
-//
-// UNSUPPORTED: target={{i.86.*windows.*}}
-
 // RUN: rm -rf %t && split-file %s %t && cd %t
 
 // Do setup work for all below tests.
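The comments removed above described the linkage-name/mangled-name divergence that motivated this change. The sketch below shows why a symbol order derived from the new counter names can need rewriting before it is passed to the linker via `-order_file`; `machOSymbolName` is illustrative only, and real prefixing rules depend on target and linkage.

```cpp
#include <iostream>
#include <string>

// Illustrative only: on Mach-O the object-file symbol usually carries an
// extra "_" prefix (roughly what Mangler().getNameWithPrefix() adds), while
// the IRPGO counter name now uses the bare IR name.
std::string machOSymbolName(const std::string &IRName) { return "_" + IRName; }

int main() {
  const std::string ProfileName = "_Z3barv";                    // name in the profile
  const std::string LinkerName = machOSymbolName(ProfileName);  // "__Z3barv"
  if (ProfileName != LinkerName)
    std::cout << "order-file entry needs rewriting: " << ProfileName << " -> "
              << LinkerName << "\n";
  return 0;
}
```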
diff --git a/lld/test/MachO/pgo-warn-mismatch.ll b/lld/test/MachO/pgo-warn-mismatch.ll index 632adffb01fb3..365eeaccb4b40 100644 --- a/lld/test/MachO/pgo-warn-mismatch.ll +++ b/lld/test/MachO/pgo-warn-mismatch.ll @@ -24,7 +24,7 @@ entry: ;--- cs.proftext :csir -_foo +foo # Func Hash: 2277602155505015273 # Num Counters: diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index 134a400e639c4..4264da8ad7514 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -14,7 +14,6 @@ #include "llvm/ProfileData/InstrProf.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" @@ -27,7 +26,6 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" -#include "llvm/IR/Mangler.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" @@ -297,29 +295,20 @@ static StringRef getStrippedSourceFileName(const GlobalObject &GO) { return FileName; } -// The PGO name has the format [;] where ; is -// provided if linkage is local and is the mangled function -// name. The filepath is used to discriminate possibly identical function names. -// ; is used because it is unlikely to be found in either or -// . +// The PGO name has the format [;] where ; is +// provided if linkage is local and is used to discriminate possibly identical +// mangled names. ";" is used because it is unlikely to be found in either +// or . // // Older compilers used getPGOFuncName() which has the format -// [:]. is used to discriminate between -// possibly identical function names when linkage is local and -// simply comes from F.getName(). This caused trouble for Objective-C functions -// which commonly have :'s in their names. Also, since is not -// mangled, they cannot be passed to Mach-O linkers via -order_file. We still -// need to compute this name to lookup functions from profiles built by older -// compilers. +// [:]. This caused trouble for Objective-C functions +// which commonly have :'s in their names. We still need to compute this name to +// lookup functions from profiles built by older compilers. static std::string getIRPGONameForGlobalObject(const GlobalObject &GO, GlobalValue::LinkageTypes Linkage, StringRef FileName) { - SmallString<64> Name; - // FIXME: Mangler's handling is kept outside of `getGlobalIdentifier` for now. - // For more details please check issue #74565. - Mangler().getNameWithPrefix(Name, &GO, /*CannotUsePrivateLabel=*/true); - return GlobalValue::getGlobalIdentifier(Name, Linkage, FileName); + return GlobalValue::getGlobalIdentifier(GO.getName(), Linkage, FileName); } static std::optional lookupPGONameFromMetadata(MDNode *MD) { diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp index 77197d3bf9196..05e96f48cf124 100644 --- a/llvm/tools/llvm-profdata/llvm-profdata.cpp +++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp @@ -3158,7 +3158,11 @@ static int order_main(int argc, const char *argv[]) { BalancedPartitioning BP(Config); BP.run(Nodes); - WithColor::note() << "# Ordered " << Nodes.size() << " functions\n"; + OS << "# Ordered " << Nodes.size() << " functions\n"; + OS << "# Warning: Mach-O may prefix symbols with \"_\" depending on the " + "linkage and this output does not take that into account. 
Some " + "post-processing may be required before passing to the linker via " + "-order_file.\n"; for (auto &N : Nodes) { auto [Filename, ParsedFuncName] = getParsedIRPGOFuncName(Reader->getSymtab().getFuncOrVarName(N.Id)); diff --git a/llvm/unittests/ProfileData/InstrProfTest.cpp b/llvm/unittests/ProfileData/InstrProfTest.cpp index 6a71a975fbb12..8ffb68de7a2d2 100644 --- a/llvm/unittests/ProfileData/InstrProfTest.cpp +++ b/llvm/unittests/ProfileData/InstrProfTest.cpp @@ -542,22 +542,14 @@ TEST_F(InstrProfTest, test_memprof_merge) { TEST_F(InstrProfTest, test_irpgo_function_name) { LLVMContext Ctx; auto M = std::make_unique("MyModule.cpp", Ctx); - // Use Mach-O mangling so that non-private symbols get a `_` prefix. - M->setDataLayout(DataLayout("m:o")); auto *FTy = FunctionType::get(Type::getVoidTy(Ctx), /*isVarArg=*/false); std::vector> Data; - Data.emplace_back("ExternalFoo", Function::ExternalLinkage, "_ExternalFoo"); + Data.emplace_back("ExternalFoo", Function::ExternalLinkage, "ExternalFoo"); Data.emplace_back("InternalFoo", Function::InternalLinkage, - "MyModule.cpp;_InternalFoo"); - Data.emplace_back("PrivateFoo", Function::PrivateLinkage, - "MyModule.cpp;l_PrivateFoo"); - Data.emplace_back("WeakODRFoo", Function::WeakODRLinkage, "_WeakODRFoo"); - // Test Objective-C symbols + "MyModule.cpp;InternalFoo"); Data.emplace_back("\01-[C dynamicFoo:]", Function::ExternalLinkage, "-[C dynamicFoo:]"); - Data.emplace_back("-", Function::ExternalLinkage, - "_-"); Data.emplace_back("\01-[C internalFoo:]", Function::InternalLinkage, "MyModule.cpp;-[C internalFoo:]"); @@ -590,10 +582,6 @@ TEST_F(InstrProfTest, test_pgo_function_name) { Data.emplace_back("ExternalFoo", Function::ExternalLinkage, "ExternalFoo"); Data.emplace_back("InternalFoo", Function::InternalLinkage, "MyModule.cpp:InternalFoo"); - Data.emplace_back("PrivateFoo", Function::PrivateLinkage, - "MyModule.cpp:PrivateFoo"); - Data.emplace_back("WeakODRFoo", Function::WeakODRLinkage, "WeakODRFoo"); - // Test Objective-C symbols Data.emplace_back("\01-[C externalFoo:]", Function::ExternalLinkage, "-[C externalFoo:]"); Data.emplace_back("\01-[C internalFoo:]", Function::InternalLinkage, @@ -611,8 +599,6 @@ TEST_F(InstrProfTest, test_pgo_function_name) { TEST_F(InstrProfTest, test_irpgo_read_deprecated_names) { LLVMContext Ctx; auto M = std::make_unique("MyModule.cpp", Ctx); - // Use Mach-O mangling so that non-private symbols get a `_` prefix. - M->setDataLayout(DataLayout("m:o")); auto *FTy = FunctionType::get(Type::getVoidTy(Ctx), /*isVarArg=*/false); auto *InternalFooF = Function::Create(FTy, Function::InternalLinkage, "InternalFoo", M.get()); From 3096353477a6802de9d4c74018c28f13e8ce1310 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Thu, 4 Jan 2024 16:33:06 -0800 Subject: [PATCH 296/313] test-release.sh: Add a CMake cache file for 3-stage release builds (#75903) You can now pass the -use-cmake-cache option to test-release.sh and it will use a predefined cache file for building the release. This will make it easier to reproduce the builds and add other enhancements like PGO or bolt optimizations. 
--------- Co-authored-by: Konrad Kleine --- .github/workflows/release-binaries.yml | 5 +- clang/cmake/caches/Release.cmake | 41 ++++++++++++++++ llvm/utils/release/test-release.sh | 67 +++++++++++++++++++++++--- 3 files changed, 104 insertions(+), 9 deletions(-) create mode 100644 clang/cmake/caches/Release.cmake diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml index 4e3eaff97a878..4a4ba9dcc5ca3 100644 --- a/.github/workflows/release-binaries.yml +++ b/.github/workflows/release-binaries.yml @@ -93,8 +93,8 @@ jobs: - name: Build Clang run: | - cmake -G Ninja -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_BUILD_TYPE=Release -DCMAKE_ENABLE_ASSERTIONS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DLLVM_ENABLE_PROJECTS=clang -S llvm -B build - ninja -v -C build + cmake -G Ninja -C clang/cmake/caches/Release.cmake -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_POSITION_INDEPENDENT_CODE=ON -S llvm -B build + ninja -v -C build clang build-binaries: @@ -152,6 +152,7 @@ jobs: -triple ${{ matrix.target.triple }} \ -use-ninja \ -no-checkout \ + -use-cmake-cache \ -no-test-suite \ -configure-flags "-DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache" diff --git a/clang/cmake/caches/Release.cmake b/clang/cmake/caches/Release.cmake new file mode 100644 index 0000000000000..3ea65ce26296c --- /dev/null +++ b/clang/cmake/caches/Release.cmake @@ -0,0 +1,41 @@ +# Plain options configure the first build. +# BOOTSTRAP_* options configure the second build. +# BOOTSTRAP_BOOTSTRAP_* options configure the third build. + +set(CMAKE_BUILD_TYPE RELEASE CACHE STRING "") + +# Stage 1 Bootstrap Setup +set(CLANG_ENABLE_BOOTSTRAP ON CACHE BOOL "") +set(CLANG_BOOTSTRAP_TARGETS + clang + check-all + check-llvm + check-clang + test-suite + stage3 + stage3-clang + stage3-check-all + stage3-check-llvm + stage3-check-clang + stage3-install + stage3-test-suite CACHE STRING "") + +# Stage 1 Options +set(LLVM_ENABLE_PROJECTS "clang" CACHE STRING "") +set(LLVM_TARGETS_TO_BUILD Native CACHE STRING "") + +# Stage 2 Bootstrap Setup +set(BOOTSTRAP_CLANG_ENABLE_BOOTSTRAP ON CACHE STRING "") +set(BOOTSTRAP_CLANG_BOOTSTRAP_TARGETS + clang + check-all + check-llvm + check-clang CACHE STRING "") + +# Stage 2 Options +set(BOOTSTRAP_LLVM_ENABLE_PROJECTS "clang" CACHE STRING "") +set(BOOTSTRAP_LLVM_TARGETS_TO_BUILD Native CACHE STRING "") + +# Stage 3 Options +set(BOOTSTRAP_BOOTSTRAP_LLVM_ENABLE_RUNTIMES "compiler-rt;libcxx;libcxxabi;libunwind" CACHE STRING "") +set(BOOTSTRAP_BOOTSTRAP_LLVM_ENABLE_PROJECTS "clang;lld;lldb;clang-tools-extra;bolt;polly;mlir;flang" CACHE STRING "") diff --git a/llvm/utils/release/test-release.sh b/llvm/utils/release/test-release.sh index f38134e3d22d8..544d4bfdd799c 100755 --- a/llvm/utils/release/test-release.sh +++ b/llvm/utils/release/test-release.sh @@ -46,7 +46,7 @@ BuildDir="`pwd`" ExtraConfigureFlags="" ExportBranch="" git_ref="" - +do_cmake_cache="no" do_bolt="no" if [ "$System" = "Linux" ]; then case $Machine in @@ -87,6 +87,7 @@ function usage() { echo " -no-mlir Disable check-out & build MLIR" echo " -no-flang Disable check-out & build Flang" echo " -silent-log Don't output build logs to stdout" + echo " -use-cmake-cache Build using a CMake cache file" } while [ $# -gt 0 ]; do @@ -200,6 +201,9 @@ while [ $# -gt 0 ]; do -silent-log ) do_silent_log="yes" ;; + -use-cmake-cache | --use-cmake-cache ) + do_cmake_cache="yes" + ;; -help | --help | -h | --h | -\? 
) usage exit 0 @@ -328,6 +332,55 @@ Package=$Package-$Triple # Errors to be highlighted at the end are written to this file. echo -n > $LogDir/deferred_errors.log +redir="/dev/stdout" +if [ $do_silent_log == "yes" ]; then + echo "# Silencing build logs because of -silent-log flag..." + redir="/dev/null" +fi + + +function build_with_cmake_cache() { +( + CMakeBuildDir=$BuildDir/build + SrcDir=$BuildDir/llvm-project/ + InstallDir=$BuildDir/install + + rm -rf $CMakeBuildDir + + # FIXME: Would be nice if the commands were echoed to the log file too. + set -x + + env CC="$c_compiler" CXX="$cxx_compiler" \ + cmake -G "$generator" -B $CMakeBuildDir -S $SrcDir/llvm \ + -C $SrcDir/clang/cmake/caches/Release.cmake \ + -DCLANG_BOOTSTRAP_PASSTHROUGH="CMAKE_POSITION_INDEPENDENT_CODE;LLVM_LIT_ARGS" \ + -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ + -DLLVM_LIT_ARGS="-j $NumJobs $LitVerbose" \ + $ExtraConfigureFlags + 2>&1 | tee $LogDir/llvm.configure-$Flavor.log + + ${MAKE} $J_ARG $Verbose -C $CMakeBuildDir stage3-check-all \ + 2>&1 | tee $LogDir/llvm.make-$Flavor.log > $redir + + DESTDIR="${InstallDir}" \ + ${MAKE} -C $CMakeBuildDir stage3-install \ + 2>&1 | tee $LogDir/llvm.install-$Flavor.log > $redir + + mkdir -p $BuildDir/Release + pushd $BuildDir/Release + mv $InstallDir/usr/local $Package + if [ "$use_gzip" = "yes" ]; then + tar cf - $Package | gzip -9c > $BuildDir/$Package.tar.gz + else + tar cf - $Package | xz -9ce -T $NumJobs > $BuildDir/$Package.tar.xz + fi + mv $Package $InstallDir/usr/local + popd +) 2>&1 | tee $LogDir/testing.$Release-$RC.log + + exit 0 +} + function deferred_error() { Phase="$1" Flavor="$2" @@ -485,12 +538,6 @@ function build_llvmCore() { fi fi - redir="/dev/stdout" - if [ $do_silent_log == "yes" ]; then - echo "# Silencing build logs because of -silent-log flag..." - redir="/dev/null" - fi - cd $ObjDir echo "# Compiling llvm $Release-$RC $Flavor" echo "# ${MAKE} $J_ARG $Verbose" @@ -600,7 +647,13 @@ if [ $do_test_suite = "yes" ]; then mkdir -p $TestSuiteBuildDir fi +if [ "$do_cmake_cache" = "yes" ]; then + build_with_cmake_cache + exit 0 +fi + ( + Flavors="Release" if [ "$do_debug" = "yes" ]; then Flavors="Debug $Flavors" From 71ec30132b74ba6974cece53f2a00f2749cf95f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 4 Jan 2024 16:33:20 -0800 Subject: [PATCH 297/313] [mlir][openacc] Add device_type support for data operation (#76126) Following #75864, this patch adds device_type support to the data operation on the async and wait operands and attributes. 
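For orientation, here is a sketch of how a dialect consumer might query the new per-device_type information this patch attaches to `acc.data`. Only the `hasAsyncOnly`/`getAsyncValue`/`hasWaitOnly`/`getWaitValues` accessors come from the patch below; the surrounding function and its comments are assumptions about how a lowering might use them.

```cpp
#include "mlir/Dialect/OpenACC/OpenACC.h"

using namespace mlir;

void handleDataAsyncAndWait(acc::DataOp op, acc::DeviceType dt) {
  // async: either present with no value (attribute only) or carrying a value
  // specific to the requested device_type.
  if (op.hasAsyncOnly(dt)) {
    // e.g. enqueue on the default async queue for this device type.
  } else if (Value asyncQueue = op.getAsyncValue(dt)) {
    (void)asyncQueue; // e.g. enqueue on the queue identified by this value.
  }

  // wait: same pattern, but the clause carries a list of values.
  if (op.hasWaitOnly(dt)) {
    // e.g. wait on all outstanding work before entering the region.
  } else {
    for (Value w : op.getWaitValues(dt))
      (void)w; // e.g. wait on each listed queue/event.
  }
}
```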
--- flang/lib/Lower/OpenACC.cpp | 125 ++++++++++++------ flang/test/Lower/OpenACC/acc-data.f90 | 8 +- .../mlir/Dialect/OpenACC/OpenACCOps.td | 47 +++++-- mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp | 43 +++++- mlir/test/Dialect/OpenACC/ops.mlir | 8 +- 5 files changed, 176 insertions(+), 55 deletions(-) diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index ecf70818c4ac0..d10e56e5d1177 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -1464,6 +1464,24 @@ static void genAsyncClause(Fortran::lower::AbstractConverter &converter, } } +static void +genAsyncClause(Fortran::lower::AbstractConverter &converter, + const Fortran::parser::AccClause::Async *asyncClause, + llvm::SmallVector &async, + llvm::SmallVector &asyncDeviceTypes, + llvm::SmallVector &asyncOnlyDeviceTypes, + mlir::acc::DeviceTypeAttr deviceTypeAttr, + Fortran::lower::StatementContext &stmtCtx) { + const auto &asyncClauseValue = asyncClause->v; + if (asyncClauseValue) { // async has a value. + async.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(*asyncClauseValue), stmtCtx))); + asyncDeviceTypes.push_back(deviceTypeAttr); + } else { + asyncOnlyDeviceTypes.push_back(deviceTypeAttr); + } +} + static mlir::acc::DeviceType getDeviceType(Fortran::parser::AccDeviceTypeExpr::Device device) { switch (device) { @@ -1533,6 +1551,39 @@ static void genWaitClause(Fortran::lower::AbstractConverter &converter, } } +static void +genWaitClause(Fortran::lower::AbstractConverter &converter, + const Fortran::parser::AccClause::Wait *waitClause, + llvm::SmallVector &waitOperands, + llvm::SmallVector &waitOperandsDeviceTypes, + llvm::SmallVector &waitOnlyDeviceTypes, + llvm::SmallVector &waitOperandsSegments, + mlir::Value &waitDevnum, mlir::acc::DeviceTypeAttr deviceTypeAttr, + Fortran::lower::StatementContext &stmtCtx) { + const auto &waitClauseValue = waitClause->v; + if (waitClauseValue) { // wait has a value. + const Fortran::parser::AccWaitArgument &waitArg = *waitClauseValue; + const auto &waitList = + std::get>(waitArg.t); + auto crtWaitOperands = waitOperands.size(); + for (const Fortran::parser::ScalarIntExpr &value : waitList) { + waitOperands.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(value), stmtCtx))); + } + waitOperandsDeviceTypes.push_back(deviceTypeAttr); + waitOperandsSegments.push_back(waitOperands.size() - crtWaitOperands); + + // TODO: move to device_type model. + const auto &waitDevnumValue = + std::get>(waitArg.t); + if (waitDevnumValue) + waitDevnum = fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(*waitDevnumValue), stmtCtx)); + } else { + waitOnlyDeviceTypes.push_back(deviceTypeAttr); + } +} + static mlir::acc::LoopOp createLoopOp(Fortran::lower::AbstractConverter &converter, mlir::Location currentLocation, @@ -1795,6 +1846,7 @@ createComputeOp(Fortran::lower::AbstractConverter &converter, firstprivateOperands; llvm::SmallVector privatizations, firstPrivatizations, reductionRecipes; + mlir::Value waitDevnum; // TODO not yet implemented on compute op. // Self clause has optional values but can be present with // no value as well. When there is no value, the op has an attribute to @@ -1818,31 +1870,14 @@ createComputeOp(Fortran::lower::AbstractConverter &converter, mlir::Location clauseLocation = converter.genLocation(clause.source); if (const auto *asyncClause = std::get_if(&clause.u)) { - const auto &asyncClauseValue = asyncClause->v; - if (asyncClauseValue) { // async has a value. 
- async.push_back(fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(*asyncClauseValue), stmtCtx))); - asyncDeviceTypes.push_back(crtDeviceTypeAttr); - } else { - asyncOnlyDeviceTypes.push_back(crtDeviceTypeAttr); - } + genAsyncClause(converter, asyncClause, async, asyncDeviceTypes, + asyncOnlyDeviceTypes, crtDeviceTypeAttr, stmtCtx); } else if (const auto *waitClause = std::get_if(&clause.u)) { - const auto &waitClauseValue = waitClause->v; - if (waitClauseValue) { // wait has a value. - const Fortran::parser::AccWaitArgument &waitArg = *waitClauseValue; - const auto &waitList = - std::get>(waitArg.t); - auto crtWaitOperands = waitOperands.size(); - for (const Fortran::parser::ScalarIntExpr &value : waitList) { - waitOperands.push_back(fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(value), stmtCtx))); - } - waitOperandsDeviceTypes.push_back(crtDeviceTypeAttr); - waitOperandsSegments.push_back(waitOperands.size() - crtWaitOperands); - } else { - waitOnlyDeviceTypes.push_back(crtDeviceTypeAttr); - } + genWaitClause(converter, waitClause, waitOperands, + waitOperandsDeviceTypes, waitOnlyDeviceTypes, + waitOperandsSegments, waitDevnum, crtDeviceTypeAttr, + stmtCtx); } else if (const auto *numGangsClause = std::get_if( &clause.u)) { @@ -2126,21 +2161,24 @@ static void genACCDataOp(Fortran::lower::AbstractConverter &converter, Fortran::semantics::SemanticsContext &semanticsContext, Fortran::lower::StatementContext &stmtCtx, const Fortran::parser::AccClauseList &accClauseList) { - mlir::Value ifCond, async, waitDevnum; + mlir::Value ifCond, waitDevnum; llvm::SmallVector attachEntryOperands, createEntryOperands, - copyEntryOperands, copyoutEntryOperands, dataClauseOperands, waitOperands; - - // Async and wait have an optional value but can be present with - // no value as well. When there is no value, the op has an attribute to - // represent the clause. - bool addAsyncAttr = false; - bool addWaitAttr = false; + copyEntryOperands, copyoutEntryOperands, dataClauseOperands, waitOperands, + async; + llvm::SmallVector asyncDeviceTypes, asyncOnlyDeviceTypes, + waitOperandsDeviceTypes, waitOnlyDeviceTypes; + llvm::SmallVector waitOperandsSegments; bool hasDefaultNone = false; bool hasDefaultPresent = false; fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + // device_type attribute is set to `none` until a device_type clause is + // encountered. + auto crtDeviceTypeAttr = mlir::acc::DeviceTypeAttr::get( + builder.getContext(), mlir::acc::DeviceType::None); + // Lower clauses values mapped to operands. // Keep track of each group of operands separately as clauses can appear // more than once. 
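The hunks above thread three parallel lists through the lowering: a flattened operand list, a per-entry segment size, and a per-entry device_type attribute. Below is a minimal plain-C++ sketch of that bookkeeping; the types and `DeviceType` values are stand-ins, not the FIR/MLIR ones.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

enum class DeviceType { None, Host, Nvidia, Radeon };

struct SegmentedOperands {
  std::vector<int> operands;           // all values, flattened
  std::vector<int32_t> segments;       // number of values per entry
  std::vector<DeviceType> deviceTypes; // one tag per entry

  void addSegment(DeviceType dt, const std::vector<int> &values) {
    operands.insert(operands.end(), values.begin(), values.end());
    segments.push_back(static_cast<int32_t>(values.size()));
    deviceTypes.push_back(dt);
  }

  // Recover the values attached to a given device_type, if any.
  std::vector<int> valuesFor(DeviceType dt) const {
    size_t offset = 0;
    for (size_t i = 0; i < deviceTypes.size(); ++i) {
      if (deviceTypes[i] == dt)
        return {operands.begin() + offset,
                operands.begin() + offset + segments[i]};
      offset += segments[i];
    }
    return {};
  }
};

int main() {
  SegmentedOperands wait;
  wait.addSegment(DeviceType::None, {1});      // wait(1)
  wait.addSegment(DeviceType::Nvidia, {1, 2}); // device_type(nvidia) wait(1, 2)
  assert(wait.valuesFor(DeviceType::Nvidia).size() == 2);
  assert(wait.valuesFor(DeviceType::Radeon).empty());
  return 0;
}
```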
@@ -2221,11 +2259,14 @@ static void genACCDataOp(Fortran::lower::AbstractConverter &converter, dataClauseOperands.end()); } else if (const auto *asyncClause = std::get_if(&clause.u)) { - genAsyncClause(converter, asyncClause, async, addAsyncAttr, stmtCtx); + genAsyncClause(converter, asyncClause, async, asyncDeviceTypes, + asyncOnlyDeviceTypes, crtDeviceTypeAttr, stmtCtx); } else if (const auto *waitClause = std::get_if(&clause.u)) { - genWaitClause(converter, waitClause, waitOperands, waitDevnum, - addWaitAttr, stmtCtx); + genWaitClause(converter, waitClause, waitOperands, + waitOperandsDeviceTypes, waitOnlyDeviceTypes, + waitOperandsSegments, waitDevnum, crtDeviceTypeAttr, + stmtCtx); } else if(const auto *defaultClause = std::get_if(&clause.u)) { if ((defaultClause->v).v == llvm::acc::DefaultValue::ACC_Default_none) @@ -2239,7 +2280,7 @@ static void genACCDataOp(Fortran::lower::AbstractConverter &converter, llvm::SmallVector operands; llvm::SmallVector operandSegments; addOperand(operands, operandSegments, ifCond); - addOperand(operands, operandSegments, async); + addOperands(operands, operandSegments, async); addOperand(operands, operandSegments, waitDevnum); addOperands(operands, operandSegments, waitOperands); addOperands(operands, operandSegments, dataClauseOperands); @@ -2250,8 +2291,18 @@ static void genACCDataOp(Fortran::lower::AbstractConverter &converter, auto dataOp = createRegionOp( builder, currentLocation, eval, operands, operandSegments); - dataOp.setAsyncAttr(addAsyncAttr); - dataOp.setWaitAttr(addWaitAttr); + if (!asyncDeviceTypes.empty()) + dataOp.setAsyncDeviceTypeAttr(builder.getArrayAttr(asyncDeviceTypes)); + if (!asyncOnlyDeviceTypes.empty()) + dataOp.setAsyncOnlyAttr(builder.getArrayAttr(asyncOnlyDeviceTypes)); + if (!waitOperandsDeviceTypes.empty()) + dataOp.setWaitOperandsDeviceTypeAttr( + builder.getArrayAttr(waitOperandsDeviceTypes)); + if (!waitOperandsSegments.empty()) + dataOp.setWaitOperandsSegmentsAttr( + builder.getDenseI32ArrayAttr(waitOperandsSegments)); + if (!waitOnlyDeviceTypes.empty()) + dataOp.setWaitOnlyAttr(builder.getArrayAttr(waitOnlyDeviceTypes)); if (hasDefaultNone) dataOp.setDefaultAttr(mlir::acc::ClauseDefaultValue::None); diff --git a/flang/test/Lower/OpenACC/acc-data.f90 b/flang/test/Lower/OpenACC/acc-data.f90 index a6572e1470760..75ffd1fc3fcab 100644 --- a/flang/test/Lower/OpenACC/acc-data.f90 +++ b/flang/test/Lower/OpenACC/acc-data.f90 @@ -153,7 +153,7 @@ subroutine acc_data !$acc end data ! CHECK: acc.data dataOperands(%{{.*}}) { -! CHECK: } attributes {asyncAttr} +! CHECK: } attributes {asyncOnly = [#acc.device_type]} !$acc data present(a) async(1) !$acc end data @@ -165,18 +165,18 @@ subroutine acc_data !$acc end data ! CHECK: acc.data dataOperands(%{{.*}}) { -! CHECK: } attributes {waitAttr} +! CHECK: } attributes {waitOnly = [#acc.device_type]} !$acc data present(a) wait(1) !$acc end data -! CHECK: acc.data dataOperands(%{{.*}}) wait(%{{.*}} : i32) { +! CHECK: acc.data dataOperands(%{{.*}}) wait({%{{.*}} : i32}) { ! CHECK: }{{$}} !$acc data present(a) wait(devnum: 0: 1) !$acc end data -! CHECK: acc.data dataOperands(%{{.*}}) wait_devnum(%{{.*}} : i32) wait(%{{.*}} : i32) { +! CHECK: acc.data dataOperands(%{{.*}}) wait_devnum(%{{.*}} : i32) wait({%{{.*}} : i32}) { ! 
CHECK: }{{$}} !$acc data default(none) diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td index 4312bd4de1bd4..1dd83e933034a 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td @@ -1236,13 +1236,16 @@ def OpenACC_DataOp : OpenACC_Op<"data", let arguments = (ins Optional:$ifCond, - Optional:$async, - UnitAttr:$asyncAttr, - Optional:$waitDevnum, - Variadic:$waitOperands, - UnitAttr:$waitAttr, - Variadic:$dataClauseOperands, - OptionalAttr:$defaultAttr); + Variadic:$async, + OptionalAttr:$asyncDeviceType, + OptionalAttr:$asyncOnly, + Optional:$waitDevnum, + Variadic:$waitOperands, + OptionalAttr:$waitOperandsSegments, + OptionalAttr:$waitOperandsDeviceType, + OptionalAttr:$waitOnly, + Variadic:$dataClauseOperands, + OptionalAttr:$defaultAttr); let regions = (region AnyRegion:$region); @@ -1252,15 +1255,41 @@ def OpenACC_DataOp : OpenACC_Op<"data", /// The i-th data operand passed. Value getDataOperand(unsigned i); + + /// Return true if the op has the async attribute for the + /// mlir::acc::DeviceType::None device_type. + bool hasAsyncOnly(); + /// Return true if the op has the async attribute for the given device_type. + bool hasAsyncOnly(mlir::acc::DeviceType deviceType); + /// Return the value of the async clause if present. + mlir::Value getAsyncValue(); + /// Return the value of the async clause for the given device_type if + /// present. + mlir::Value getAsyncValue(mlir::acc::DeviceType deviceType); + + /// Return true if the op has the wait attribute for the + /// mlir::acc::DeviceType::None device_type. + bool hasWaitOnly(); + /// Return true if the op has the wait attribute for the given device_type. + bool hasWaitOnly(mlir::acc::DeviceType deviceType); + /// Return the values of the wait clause if present. + mlir::Operation::operand_range getWaitValues(); + /// Return the values of the wait clause for the given device_type if + /// present. + mlir::Operation::operand_range + getWaitValues(mlir::acc::DeviceType deviceType); }]; let assemblyFormat = [{ oilist( `if` `(` $ifCond `)` - | `async` `(` $async `:` type($async) `)` + | `async` `(` custom($async, + type($async), $asyncDeviceType) `)` | `dataOperands` `(` $dataClauseOperands `:` type($dataClauseOperands) `)` | `wait_devnum` `(` $waitDevnum `:` type($waitDevnum) `)` - | `wait` `(` $waitOperands `:` type($waitOperands) `)` + | `wait` `(` custom($waitOperands, + type($waitOperands), $waitOperandsDeviceType, + $waitOperandsSegments) `)` ) $region attr-dict-with-keyword }]; diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index e299b67b10a9c..66605ead0529d 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -1417,11 +1417,52 @@ unsigned DataOp::getNumDataOperands() { return getDataClauseOperands().size(); } Value DataOp::getDataOperand(unsigned i) { unsigned numOptional = getIfCond() ? 1 : 0; - numOptional += getAsync() ? 1 : 0; + numOptional += getAsync().size() ? 
1 : 0; numOptional += getWaitOperands().size(); return getOperand(numOptional + i); } +bool acc::DataOp::hasAsyncOnly() { + return hasAsyncOnly(mlir::acc::DeviceType::None); +} + +bool acc::DataOp::hasAsyncOnly(mlir::acc::DeviceType deviceType) { + if (auto arrayAttr = getAsyncOnly()) { + if (findSegment(*arrayAttr, deviceType)) + return true; + } + return false; +} + +mlir::Value DataOp::getAsyncValue() { + return getAsyncValue(mlir::acc::DeviceType::None); +} + +mlir::Value DataOp::getAsyncValue(mlir::acc::DeviceType deviceType) { + return getValueInDeviceTypeSegment(getAsyncDeviceType(), getAsync(), + deviceType); +} + +bool DataOp::hasWaitOnly() { return hasWaitOnly(mlir::acc::DeviceType::None); } + +bool DataOp::hasWaitOnly(mlir::acc::DeviceType deviceType) { + if (auto arrayAttr = getWaitOnly()) { + if (findSegment(*arrayAttr, deviceType)) + return true; + } + return false; +} + +mlir::Operation::operand_range DataOp::getWaitValues() { + return getWaitValues(mlir::acc::DeviceType::None); +} + +mlir::Operation::operand_range +DataOp::getWaitValues(mlir::acc::DeviceType deviceType) { + return getValuesFromSegments(getWaitOperandsDeviceType(), getWaitOperands(), + getWaitOperandsSegments(), deviceType); +} + //===----------------------------------------------------------------------===// // ExitDataOp //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/OpenACC/ops.mlir b/mlir/test/Dialect/OpenACC/ops.mlir index 5a95811685f84..52375b1af3141 100644 --- a/mlir/test/Dialect/OpenACC/ops.mlir +++ b/mlir/test/Dialect/OpenACC/ops.mlir @@ -836,11 +836,11 @@ func.func @testdataop(%a: memref, %b: memref, %c: memref) -> () { } attributes { defaultAttr = #acc, wait } %w1 = arith.constant 1 : i64 - acc.data wait(%w1 : i64) { + acc.data wait({%w1 : i64}) { } attributes { defaultAttr = #acc, wait } %wd1 = arith.constant 1 : i64 - acc.data wait_devnum(%wd1 : i64) wait(%w1 : i64) { + acc.data wait_devnum(%wd1 : i64) wait({%w1 : i64}) { } attributes { defaultAttr = #acc, wait } return @@ -951,10 +951,10 @@ func.func @testdataop(%a: memref, %b: memref, %c: memref) -> () { // CHECK: acc.data { // CHECK-NEXT: } attributes {defaultAttr = #acc, wait} -// CHECK: acc.data wait(%{{.*}} : i64) { +// CHECK: acc.data wait({%{{.*}} : i64}) { // CHECK-NEXT: } attributes {defaultAttr = #acc, wait} -// CHECK: acc.data wait_devnum(%{{.*}} : i64) wait(%{{.*}} : i64) { +// CHECK: acc.data wait_devnum(%{{.*}} : i64) wait({%{{.*}} : i64}) { // CHECK-NEXT: } attributes {defaultAttr = #acc, wait} // ----- From e456689fb3d6dd785202cd25f89e9443e5ad7d1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 4 Jan 2024 16:33:33 -0800 Subject: [PATCH 298/313] [mlir][flang][openacc] Support device_type on loop construct (#76892) This is adding support for `device_type` clause representation in the OpenACC MLIR dialect on the acc.loop operation and adjust flang to lower correctly to the new representation. Each "value" that can be impacted by a `device_type` clause is now associated with an array attribute that carry this information. This includes: - `worker` clause information - `gang` clause information - `vector` clause information - `collapse` clause information - `tile` clause information The representation of the `gang` clause information has been updated and all values are now carried in a single operand segment. 
This segment is then subdivided by `device_type`. Each value in a segment is also associated with a `GangArgType` so it can be differentiated (num/dim/static). This simplify the handling of gang values an limit the number of new attributes needed. When the clause can be associated with the operation without any value (`gang`, `vector`, `worker`). These are represented by a dedicated attributes with device_type information. Extra getter functions are provided to make it easier to retrieve a value based on a device_type. --- flang/lib/Lower/OpenACC.cpp | 183 +++++--- flang/test/Lower/OpenACC/acc-kernels-loop.f90 | 36 +- flang/test/Lower/OpenACC/acc-loop.f90 | 41 +- .../test/Lower/OpenACC/acc-parallel-loop.f90 | 36 +- flang/test/Lower/OpenACC/acc-reduction.f90 | 6 +- flang/test/Lower/OpenACC/acc-serial-loop.f90 | 36 +- .../mlir/Dialect/OpenACC/OpenACCOps.td | 141 +++++- mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp | 427 ++++++++++++++---- mlir/test/Dialect/OpenACC/invalid.mlir | 57 ++- mlir/test/Dialect/OpenACC/ops.mlir | 80 ++-- 10 files changed, 737 insertions(+), 306 deletions(-) diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index d10e56e5d1177..d24c369d81bed 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -1593,67 +1593,89 @@ createLoopOp(Fortran::lower::AbstractConverter &converter, const Fortran::parser::AccClauseList &accClauseList, bool needEarlyReturnHandling = false) { fir::FirOpBuilder &builder = converter.getFirOpBuilder(); - - mlir::Value workerNum; - mlir::Value vectorNum; - mlir::Value gangNum; - mlir::Value gangDim; - mlir::Value gangStatic; llvm::SmallVector tileOperands, privateOperands, - reductionOperands, cacheOperands; + reductionOperands, cacheOperands, vectorOperands, workerNumOperands, + gangOperands; llvm::SmallVector privatizations, reductionRecipes; - bool hasGang = false, hasVector = false, hasWorker = false; + llvm::SmallVector tileOperandsSegments, gangOperandsSegments; + llvm::SmallVector collapseValues; + + llvm::SmallVector gangArgTypes; + llvm::SmallVector seqDeviceTypes, independentDeviceTypes, + autoDeviceTypes, vectorOperandsDeviceTypes, workerNumOperandsDeviceTypes, + vectorDeviceTypes, workerNumDeviceTypes, tileOperandsDeviceTypes, + collapseDeviceTypes, gangDeviceTypes, gangOperandsDeviceTypes; + + // device_type attribute is set to `none` until a device_type clause is + // encountered. 
+ auto crtDeviceTypeAttr = mlir::acc::DeviceTypeAttr::get( + builder.getContext(), mlir::acc::DeviceType::None); for (const Fortran::parser::AccClause &clause : accClauseList.v) { mlir::Location clauseLocation = converter.genLocation(clause.source); if (const auto *gangClause = std::get_if(&clause.u)) { if (gangClause->v) { + auto crtGangOperands = gangOperands.size(); const Fortran::parser::AccGangArgList &x = *gangClause->v; for (const Fortran::parser::AccGangArg &gangArg : x.v) { if (const auto *num = std::get_if(&gangArg.u)) { - gangNum = fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(num->v), stmtCtx)); + gangOperands.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(num->v), stmtCtx))); + gangArgTypes.push_back(mlir::acc::GangArgTypeAttr::get( + builder.getContext(), mlir::acc::GangArgType::Num)); } else if (const auto *staticArg = std::get_if( &gangArg.u)) { const Fortran::parser::AccSizeExpr &sizeExpr = staticArg->v; if (sizeExpr.v) { - gangStatic = fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(*sizeExpr.v), stmtCtx)); + gangOperands.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(*sizeExpr.v), stmtCtx))); } else { // * was passed as value and will be represented as a special // constant. - gangStatic = builder.createIntegerConstant( - clauseLocation, builder.getIndexType(), starCst); + gangOperands.push_back(builder.createIntegerConstant( + clauseLocation, builder.getIndexType(), starCst)); } + gangArgTypes.push_back(mlir::acc::GangArgTypeAttr::get( + builder.getContext(), mlir::acc::GangArgType::Static)); } else if (const auto *dim = std::get_if( &gangArg.u)) { - gangDim = fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(dim->v), stmtCtx)); + gangOperands.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(dim->v), stmtCtx))); + gangArgTypes.push_back(mlir::acc::GangArgTypeAttr::get( + builder.getContext(), mlir::acc::GangArgType::Dim)); } } + gangOperandsSegments.push_back(gangOperands.size() - crtGangOperands); + gangOperandsDeviceTypes.push_back(crtDeviceTypeAttr); + } else { + gangDeviceTypes.push_back(crtDeviceTypeAttr); } - hasGang = true; } else if (const auto *workerClause = std::get_if(&clause.u)) { if (workerClause->v) { - workerNum = fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(*workerClause->v), stmtCtx)); + workerNumOperands.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(*workerClause->v), stmtCtx))); + workerNumOperandsDeviceTypes.push_back(crtDeviceTypeAttr); + } else { + workerNumDeviceTypes.push_back(crtDeviceTypeAttr); } - hasWorker = true; } else if (const auto *vectorClause = std::get_if(&clause.u)) { if (vectorClause->v) { - vectorNum = fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(*vectorClause->v), stmtCtx)); + vectorOperands.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(*vectorClause->v), stmtCtx))); + vectorOperandsDeviceTypes.push_back(crtDeviceTypeAttr); + } else { + vectorDeviceTypes.push_back(crtDeviceTypeAttr); } - hasVector = true; } else if (const auto *tileClause = std::get_if(&clause.u)) { const Fortran::parser::AccTileExprList &accTileExprList = tileClause->v; + auto crtTileOperands = tileOperands.size(); for (const auto &accTileExpr : accTileExprList.v) { const auto &expr = std::get>( @@ -1669,6 +1691,8 @@ createLoopOp(Fortran::lower::AbstractConverter &converter, 
tileOperands.push_back(tileStar); } } + tileOperandsDeviceTypes.push_back(crtDeviceTypeAttr); + tileOperandsSegments.push_back(tileOperands.size() - crtTileOperands); } else if (const auto *privateClause = std::get_if( &clause.u)) { @@ -1680,17 +1704,46 @@ createLoopOp(Fortran::lower::AbstractConverter &converter, &clause.u)) { genReductions(reductionClause->v, converter, semanticsContext, stmtCtx, reductionOperands, reductionRecipes); + } else if (std::get_if(&clause.u)) { + seqDeviceTypes.push_back(crtDeviceTypeAttr); + } else if (std::get_if( + &clause.u)) { + independentDeviceTypes.push_back(crtDeviceTypeAttr); + } else if (std::get_if(&clause.u)) { + autoDeviceTypes.push_back(crtDeviceTypeAttr); + } else if (const auto *deviceTypeClause = + std::get_if( + &clause.u)) { + const Fortran::parser::AccDeviceTypeExprList &deviceTypeExprList = + deviceTypeClause->v; + assert(deviceTypeExprList.v.size() == 1 && + "expect only one device_type expr"); + crtDeviceTypeAttr = mlir::acc::DeviceTypeAttr::get( + builder.getContext(), getDeviceType(deviceTypeExprList.v.front().v)); + } else if (const auto *collapseClause = + std::get_if( + &clause.u)) { + const Fortran::parser::AccCollapseArg &arg = collapseClause->v; + const auto &force = std::get(arg.t); + if (force) + TODO(clauseLocation, "OpenACC collapse force modifier"); + const auto &intExpr = + std::get(arg.t); + const auto *expr = Fortran::semantics::GetExpr(intExpr); + const std::optional collapseValue = + Fortran::evaluate::ToInt64(*expr); + assert(collapseValue && "expect integer value for the collapse clause"); + collapseValues.push_back(*collapseValue); + collapseDeviceTypes.push_back(crtDeviceTypeAttr); } } // Prepare the operand segment size attribute and the operands value range. llvm::SmallVector operands; llvm::SmallVector operandSegments; - addOperand(operands, operandSegments, gangNum); - addOperand(operands, operandSegments, gangDim); - addOperand(operands, operandSegments, gangStatic); - addOperand(operands, operandSegments, workerNum); - addOperand(operands, operandSegments, vectorNum); + addOperands(operands, operandSegments, gangOperands); + addOperands(operands, operandSegments, workerNumOperands); + addOperands(operands, operandSegments, vectorOperands); addOperands(operands, operandSegments, tileOperands); addOperands(operands, operandSegments, cacheOperands); addOperands(operands, operandSegments, privateOperands); @@ -1708,12 +1761,42 @@ createLoopOp(Fortran::lower::AbstractConverter &converter, builder, currentLocation, eval, operands, operandSegments, /*outerCombined=*/false, retTy, yieldValue); - if (hasGang) - loopOp.setHasGangAttr(builder.getUnitAttr()); - if (hasWorker) - loopOp.setHasWorkerAttr(builder.getUnitAttr()); - if (hasVector) - loopOp.setHasVectorAttr(builder.getUnitAttr()); + if (!gangDeviceTypes.empty()) + loopOp.setGangAttr(builder.getArrayAttr(gangDeviceTypes)); + if (!gangArgTypes.empty()) + loopOp.setGangOperandsArgTypeAttr(builder.getArrayAttr(gangArgTypes)); + if (!gangOperandsSegments.empty()) + loopOp.setGangOperandsSegmentsAttr( + builder.getDenseI32ArrayAttr(gangOperandsSegments)); + if (!gangOperandsDeviceTypes.empty()) + loopOp.setGangOperandsDeviceTypeAttr( + builder.getArrayAttr(gangOperandsDeviceTypes)); + + if (!workerNumDeviceTypes.empty()) + loopOp.setWorkerAttr(builder.getArrayAttr(workerNumDeviceTypes)); + if (!workerNumOperandsDeviceTypes.empty()) + loopOp.setWorkerNumOperandsDeviceTypeAttr( + builder.getArrayAttr(workerNumOperandsDeviceTypes)); + + if (!vectorDeviceTypes.empty()) + 
loopOp.setVectorAttr(builder.getArrayAttr(vectorDeviceTypes)); + if (!vectorOperandsDeviceTypes.empty()) + loopOp.setVectorOperandsDeviceTypeAttr( + builder.getArrayAttr(vectorOperandsDeviceTypes)); + + if (!tileOperandsDeviceTypes.empty()) + loopOp.setTileOperandsDeviceTypeAttr( + builder.getArrayAttr(tileOperandsDeviceTypes)); + if (!tileOperandsSegments.empty()) + loopOp.setTileOperandsSegmentsAttr( + builder.getDenseI32ArrayAttr(tileOperandsSegments)); + + if (!seqDeviceTypes.empty()) + loopOp.setSeqAttr(builder.getArrayAttr(seqDeviceTypes)); + if (!independentDeviceTypes.empty()) + loopOp.setIndependentAttr(builder.getArrayAttr(independentDeviceTypes)); + if (!autoDeviceTypes.empty()) + loopOp.setAuto_Attr(builder.getArrayAttr(autoDeviceTypes)); if (!privatizations.empty()) loopOp.setPrivatizationsAttr( @@ -1723,33 +1806,11 @@ createLoopOp(Fortran::lower::AbstractConverter &converter, loopOp.setReductionRecipesAttr( mlir::ArrayAttr::get(builder.getContext(), reductionRecipes)); - // Lower clauses mapped to attributes - for (const Fortran::parser::AccClause &clause : accClauseList.v) { - mlir::Location clauseLocation = converter.genLocation(clause.source); - if (const auto *collapseClause = - std::get_if(&clause.u)) { - const Fortran::parser::AccCollapseArg &arg = collapseClause->v; - const auto &force = std::get(arg.t); - if (force) - TODO(clauseLocation, "OpenACC collapse force modifier"); - const auto &intExpr = - std::get(arg.t); - const auto *expr = Fortran::semantics::GetExpr(intExpr); - const std::optional collapseValue = - Fortran::evaluate::ToInt64(*expr); - if (collapseValue) { - loopOp.setCollapseAttr(builder.getI64IntegerAttr(*collapseValue)); - } - } else if (std::get_if(&clause.u)) { - loopOp.setSeqAttr(builder.getUnitAttr()); - } else if (std::get_if( - &clause.u)) { - loopOp.setIndependentAttr(builder.getUnitAttr()); - } else if (std::get_if(&clause.u)) { - loopOp->setAttr(mlir::acc::LoopOp::getAutoAttrStrName(), - builder.getUnitAttr()); - } - } + if (!collapseValues.empty()) + loopOp.setCollapseAttr(builder.getI64ArrayAttr(collapseValues)); + if (!collapseDeviceTypes.empty()) + loopOp.setCollapseDeviceTypeAttr(builder.getArrayAttr(collapseDeviceTypes)); + return loopOp; } diff --git a/flang/test/Lower/OpenACC/acc-kernels-loop.f90 b/flang/test/Lower/OpenACC/acc-kernels-loop.f90 index 93bc699031d55..b17f2e2c80b20 100644 --- a/flang/test/Lower/OpenACC/acc-kernels-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-kernels-loop.f90 @@ -461,7 +461,7 @@ subroutine acc_kernels_loop ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield -! CHECK-NEXT: } attributes {seq} +! CHECK-NEXT: } attributes {seq = [#acc.device_type]} ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -474,7 +474,7 @@ subroutine acc_kernels_loop ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield -! CHECK-NEXT: } attributes {auto} +! CHECK-NEXT: } attributes {auto_ = [#acc.device_type]} ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -487,7 +487,7 @@ subroutine acc_kernels_loop ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield -! CHECK-NEXT: } attributes {independent} +! CHECK-NEXT: } attributes {independent = [#acc.device_type]} ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -497,10 +497,10 @@ subroutine acc_kernels_loop END DO ! CHECK: acc.kernels { -! CHECK: acc.loop gang { +! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {gang = [#acc.device_type]}{{$}} ! CHECK: acc.terminator ! 
CHECK-NEXT: }{{$}} @@ -511,7 +511,7 @@ subroutine acc_kernels_loop ! CHECK: acc.kernels { ! CHECK: [[GANGNUM1:%.*]] = arith.constant 8 : i32 -! CHECK-NEXT: acc.loop gang(num=[[GANGNUM1]] : i32) { +! CHECK-NEXT: acc.loop gang({num=[[GANGNUM1]] : i32}) { ! CHECK: fir.do_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -525,7 +525,7 @@ subroutine acc_kernels_loop ! CHECK: acc.kernels { ! CHECK: [[GANGNUM2:%.*]] = fir.load %{{.*}} : !fir.ref -! CHECK-NEXT: acc.loop gang(num=[[GANGNUM2]] : i32) { +! CHECK-NEXT: acc.loop gang({num=[[GANGNUM2]] : i32}) { ! CHECK: fir.do_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -538,7 +538,7 @@ subroutine acc_kernels_loop END DO ! CHECK: acc.kernels { -! CHECK: acc.loop gang(num=%{{.*}} : i32, static=%{{.*}} : i32) { +! CHECK: acc.loop gang({num=%{{.*}} : i32, static=%{{.*}} : i32}) { ! CHECK: fir.do_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -550,10 +550,10 @@ subroutine acc_kernels_loop a(i) = b(i) END DO ! CHECK: acc.kernels { -! CHECK: acc.loop vector { +! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {vector = [#acc.device_type]}{{$}} ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -591,10 +591,10 @@ subroutine acc_kernels_loop END DO ! CHECK: acc.kernels { -! CHECK: acc.loop worker { +! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {worker = [#acc.device_type]}{{$}} ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -624,7 +624,7 @@ subroutine acc_kernels_loop ! CHECK: fir.do_loop ! CHECK: fir.do_loop ! CHECK: acc.yield -! CHECK-NEXT: } attributes {collapse = 2 : i64} +! CHECK-NEXT: } attributes {collapse = [2], collapseDeviceType = [#acc.device_type]} ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -655,7 +655,7 @@ subroutine acc_kernels_loop ! CHECK: acc.kernels { ! CHECK: [[TILESIZE:%.*]] = arith.constant 2 : i32 -! CHECK: acc.loop tile([[TILESIZE]] : i32) { +! CHECK: acc.loop tile({[[TILESIZE]] : i32}) { ! CHECK: fir.do_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -669,7 +669,7 @@ subroutine acc_kernels_loop ! CHECK: acc.kernels { ! CHECK: [[TILESIZEM1:%.*]] = arith.constant -1 : i32 -! CHECK: acc.loop tile([[TILESIZEM1]] : i32) { +! CHECK: acc.loop tile({[[TILESIZEM1]] : i32}) { ! CHECK: fir.do_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -686,7 +686,7 @@ subroutine acc_kernels_loop ! CHECK: acc.kernels { ! CHECK: [[TILESIZE1:%.*]] = arith.constant 2 : i32 ! CHECK: [[TILESIZE2:%.*]] = arith.constant 2 : i32 -! CHECK: acc.loop tile([[TILESIZE1]], [[TILESIZE2]] : i32, i32) { +! CHECK: acc.loop tile({[[TILESIZE1]] : i32, [[TILESIZE2]] : i32}) { ! CHECK: fir.do_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -699,7 +699,7 @@ subroutine acc_kernels_loop END DO ! CHECK: acc.kernels { -! CHECK: acc.loop tile(%{{.*}} : i32) { +! CHECK: acc.loop tile({%{{.*}} : i32}) { ! CHECK: fir.do_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -714,7 +714,7 @@ subroutine acc_kernels_loop END DO ! CHECK: acc.kernels { -! CHECK: acc.loop tile(%{{.*}}, %{{.*}} : i32, i32) { +! CHECK: acc.loop tile({%{{.*}} : i32, %{{.*}} : i32}) { ! CHECK: fir.do_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} diff --git a/flang/test/Lower/OpenACC/acc-loop.f90 b/flang/test/Lower/OpenACC/acc-loop.f90 index 924574512da4c..e7f65770498fe 100644 --- a/flang/test/Lower/OpenACC/acc-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-loop.f90 @@ -1,6 +1,5 @@ ! This test checks lowering of OpenACC loop directive. -! 
RUN: bbc -fopenacc -emit-fir -hlfir=false %s -o - | FileCheck %s ! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s ! CHECK-LABEL: acc.private.recipe @privatization_ref_10x10xf32 : !fir.ref> init { @@ -41,7 +40,7 @@ program acc_loop !CHECK: acc.loop { !CHECK: fir.do_loop !CHECK: acc.yield -!CHECK-NEXT: } attributes {seq} +!CHECK-NEXT: } attributes {seq = [#acc.device_type]} !$acc loop auto DO i = 1, n @@ -51,7 +50,7 @@ program acc_loop !CHECK: acc.loop { !CHECK: fir.do_loop !CHECK: acc.yield -!CHECK-NEXT: } attributes {auto} +!CHECK-NEXT: } attributes {auto_ = [#acc.device_type]} !$acc loop independent DO i = 1, n @@ -61,17 +60,17 @@ program acc_loop !CHECK: acc.loop { !CHECK: fir.do_loop !CHECK: acc.yield -!CHECK-NEXT: } attributes {independent} +!CHECK-NEXT: } attributes {independent = [#acc.device_type]} !$acc loop gang DO i = 1, n a(i) = b(i) END DO -!CHECK: acc.loop gang { +!CHECK: acc.loop { !CHECK: fir.do_loop !CHECK: acc.yield -!CHECK-NEXT: }{{$}} +!CHECK-NEXT: } attributes {gang = [#acc.device_type]}{{$}} !$acc loop gang(num: 8) DO i = 1, n @@ -79,7 +78,7 @@ program acc_loop END DO !CHECK: [[GANGNUM1:%.*]] = arith.constant 8 : i32 -!CHECK-NEXT: acc.loop gang(num=[[GANGNUM1]] : i32) { +!CHECK-NEXT: acc.loop gang({num=[[GANGNUM1]] : i32}) { !CHECK: fir.do_loop !CHECK: acc.yield !CHECK-NEXT: }{{$}} @@ -90,7 +89,7 @@ program acc_loop END DO !CHECK: [[GANGNUM2:%.*]] = fir.load %{{.*}} : !fir.ref -!CHECK-NEXT: acc.loop gang(num=[[GANGNUM2]] : i32) { +!CHECK-NEXT: acc.loop gang({num=[[GANGNUM2]] : i32}) { !CHECK: fir.do_loop !CHECK: acc.yield !CHECK-NEXT: }{{$}} @@ -100,7 +99,7 @@ program acc_loop a(i) = b(i) END DO -!CHECK: acc.loop gang(num=%{{.*}} : i32, static=%{{.*}} : i32) { +!CHECK: acc.loop gang({num=%{{.*}} : i32, static=%{{.*}} : i32}) { !CHECK: fir.do_loop !CHECK: acc.yield !CHECK-NEXT: }{{$}} @@ -110,10 +109,10 @@ program acc_loop a(i) = b(i) END DO -!CHECK: acc.loop vector { +!CHECK: acc.loop { !CHECK: fir.do_loop !CHECK: acc.yield -!CHECK-NEXT: }{{$}} +!CHECK-NEXT: } attributes {vector = [#acc.device_type]}{{$}} !$acc loop vector(128) DO i = 1, n @@ -142,10 +141,10 @@ program acc_loop a(i) = b(i) END DO -!CHECK: acc.loop worker { +!CHECK: acc.loop { !CHECK: fir.do_loop !CHECK: acc.yield -!CHECK-NEXT: }{{$}} +!CHECK-NEXT: } attributes {worker = [#acc.device_type]}{{$}} !$acc loop worker(128) DO i = 1, n @@ -193,7 +192,7 @@ program acc_loop a(i) = b(i) END DO !CHECK: [[TILESIZE:%.*]] = arith.constant 2 : i32 -!CHECK: acc.loop tile([[TILESIZE]] : i32) { +!CHECK: acc.loop tile({[[TILESIZE]] : i32}) { !CHECK: fir.do_loop !CHECK: acc.yield !CHECK-NEXT: }{{$}} @@ -203,7 +202,7 @@ program acc_loop a(i) = b(i) END DO !CHECK: [[TILESIZEM1:%.*]] = arith.constant -1 : i32 -!CHECK: acc.loop tile([[TILESIZEM1]] : i32) { +!CHECK: acc.loop tile({[[TILESIZEM1]] : i32}) { !CHECK: fir.do_loop !CHECK: acc.yield !CHECK-NEXT: }{{$}} @@ -217,7 +216,7 @@ program acc_loop !CHECK: [[TILESIZE1:%.*]] = arith.constant 2 : i32 !CHECK: [[TILESIZE2:%.*]] = arith.constant 2 : i32 -!CHECK: acc.loop tile([[TILESIZE1]], [[TILESIZE2]] : i32, i32) { +!CHECK: acc.loop tile({[[TILESIZE1]] : i32, [[TILESIZE2]] : i32}) { !CHECK: fir.do_loop !CHECK: acc.yield !CHECK-NEXT: }{{$}} @@ -227,7 +226,7 @@ program acc_loop a(i) = b(i) END DO -!CHECK: acc.loop tile(%{{.*}} : i32) { +!CHECK: acc.loop tile({%{{.*}} : i32}) { !CHECK: fir.do_loop !CHECK: acc.yield !CHECK-NEXT: }{{$}} @@ -239,7 +238,7 @@ program acc_loop END DO END DO -!CHECK: acc.loop tile(%{{.*}}, %{{.*}} : i32, i32) { +!CHECK: acc.loop tile({%{{.*}} : 
i32, %{{.*}} : i32}) { !CHECK: fir.do_loop !CHECK: acc.yield !CHECK-NEXT: }{{$}} @@ -255,7 +254,7 @@ program acc_loop !CHECK: fir.do_loop !CHECK: fir.do_loop !CHECK: acc.yield -!CHECK-NEXT: } attributes {collapse = 2 : i64} +!CHECK-NEXT: } attributes {collapse = [2], collapseDeviceType = [#acc.device_type]} !$acc loop DO i = 1, n @@ -290,7 +289,7 @@ program acc_loop a(i) = b(i) END DO -!CHECK: acc.loop gang(dim=%{{.*}}, static=%{{.*}} : i32) { +!CHECK: acc.loop gang({dim=%{{.*}}, static=%{{.*}} : i32}) { !CHECK: fir.do_loop !CHECK: acc.yield !CHECK-NEXT: }{{$}} @@ -301,7 +300,7 @@ program acc_loop END DO !CHECK: [[GANGDIM1:%.*]] = arith.constant 1 : i32 -!CHECK-NEXT: acc.loop gang(dim=[[GANGDIM1]] : i32) { +!CHECK-NEXT: acc.loop gang({dim=[[GANGDIM1]] : i32}) { !CHECK: fir.do_loop !CHECK: acc.yield !CHECK-NEXT: }{{$}} diff --git a/flang/test/Lower/OpenACC/acc-parallel-loop.f90 b/flang/test/Lower/OpenACC/acc-parallel-loop.f90 index deee7089033ea..e9150a71f3826 100644 --- a/flang/test/Lower/OpenACC/acc-parallel-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-parallel-loop.f90 @@ -476,7 +476,7 @@ subroutine acc_parallel_loop ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield -! CHECK-NEXT: } attributes {seq} +! CHECK-NEXT: } attributes {seq = [#acc.device_type]} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -489,7 +489,7 @@ subroutine acc_parallel_loop ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield -! CHECK-NEXT: } attributes {auto} +! CHECK-NEXT: } attributes {auto_ = [#acc.device_type]} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -502,7 +502,7 @@ subroutine acc_parallel_loop ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield -! CHECK-NEXT: } attributes {independent} +! CHECK-NEXT: } attributes {independent = [#acc.device_type]} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -512,10 +512,10 @@ subroutine acc_parallel_loop END DO ! CHECK: acc.parallel { -! CHECK: acc.loop gang { +! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {gang = [#acc.device_type]}{{$}} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -526,7 +526,7 @@ subroutine acc_parallel_loop ! CHECK: acc.parallel { ! CHECK: [[GANGNUM1:%.*]] = arith.constant 8 : i32 -! CHECK-NEXT: acc.loop gang(num=[[GANGNUM1]] : i32) { +! CHECK-NEXT: acc.loop gang({num=[[GANGNUM1]] : i32}) { ! CHECK: fir.do_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -540,7 +540,7 @@ subroutine acc_parallel_loop ! CHECK: acc.parallel { ! CHECK: [[GANGNUM2:%.*]] = fir.load %{{.*}} : !fir.ref -! CHECK-NEXT: acc.loop gang(num=[[GANGNUM2]] : i32) { +! CHECK-NEXT: acc.loop gang({num=[[GANGNUM2]] : i32}) { ! CHECK: fir.do_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -553,7 +553,7 @@ subroutine acc_parallel_loop END DO ! CHECK: acc.parallel { -! CHECK: acc.loop gang(num=%{{.*}} : i32, static=%{{.*}} : i32) { +! CHECK: acc.loop gang({num=%{{.*}} : i32, static=%{{.*}} : i32}) { ! CHECK: fir.do_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -565,10 +565,10 @@ subroutine acc_parallel_loop a(i) = b(i) END DO ! CHECK: acc.parallel { -! CHECK: acc.loop vector { +! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {vector = [#acc.device_type]}{{$}} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -606,10 +606,10 @@ subroutine acc_parallel_loop END DO ! CHECK: acc.parallel { -! CHECK: acc.loop worker { +! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {worker = [#acc.device_type]}{{$}} ! 
CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -639,7 +639,7 @@ subroutine acc_parallel_loop ! CHECK: fir.do_loop ! CHECK: fir.do_loop ! CHECK: acc.yield -! CHECK-NEXT: } attributes {collapse = 2 : i64} +! CHECK-NEXT: } attributes {collapse = [2], collapseDeviceType = [#acc.device_type]} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -670,7 +670,7 @@ subroutine acc_parallel_loop ! CHECK: acc.parallel { ! CHECK: [[TILESIZE:%.*]] = arith.constant 2 : i32 -! CHECK: acc.loop tile([[TILESIZE]] : i32) { +! CHECK: acc.loop tile({[[TILESIZE]] : i32}) { ! CHECK: fir.do_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -684,7 +684,7 @@ subroutine acc_parallel_loop ! CHECK: acc.parallel { ! CHECK: [[TILESIZEM1:%.*]] = arith.constant -1 : i32 -! CHECK: acc.loop tile([[TILESIZEM1]] : i32) { +! CHECK: acc.loop tile({[[TILESIZEM1]] : i32}) { ! CHECK: fir.do_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -701,7 +701,7 @@ subroutine acc_parallel_loop ! CHECK: acc.parallel { ! CHECK: [[TILESIZE1:%.*]] = arith.constant 2 : i32 ! CHECK: [[TILESIZE2:%.*]] = arith.constant 2 : i32 -! CHECK: acc.loop tile([[TILESIZE1]], [[TILESIZE2]] : i32, i32) { +! CHECK: acc.loop tile({[[TILESIZE1]] : i32, [[TILESIZE2]] : i32}) { ! CHECK: fir.do_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -714,7 +714,7 @@ subroutine acc_parallel_loop END DO ! CHECK: acc.parallel { -! CHECK: acc.loop tile(%{{.*}} : i32) { +! CHECK: acc.loop tile({%{{.*}} : i32}) { ! CHECK: fir.do_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -729,7 +729,7 @@ subroutine acc_parallel_loop END DO ! CHECK: acc.parallel { -! CHECK: acc.loop tile(%{{.*}}, %{{.*}} : i32, i32) { +! CHECK: acc.loop tile({%{{.*}} : i32, %{{.*}} : i32}) { ! CHECK: fir.do_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} diff --git a/flang/test/Lower/OpenACC/acc-reduction.f90 b/flang/test/Lower/OpenACC/acc-reduction.f90 index a8f7e1fa81ef7..dcfa77c9f97db 100644 --- a/flang/test/Lower/OpenACC/acc-reduction.f90 +++ b/flang/test/Lower/OpenACC/acc-reduction.f90 @@ -743,7 +743,7 @@ subroutine acc_reduction_add_int_array_2d(a, b) ! FIR: %[[RED_ARG1:.*]] = acc.reduction varPtr(%[[ARG1]] : !fir.ref>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref> {name = "b"} ! HLFIR: %[[RED_ARG1:.*]] = acc.reduction varPtr(%[[DECLARG1]]#1 : !fir.ref>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref> {name = "b"} ! CHECK: acc.loop reduction(@reduction_add_section_ext100xext10_ref_100x10xi32 -> %[[RED_ARG1]] : !fir.ref>) { -! CHECK: } attributes {collapse = 2 : i64} +! CHECK: } attributes {collapse = [2], collapseDeviceType = [#acc.device_type]} subroutine acc_reduction_add_int_array_3d(a, b) integer :: a(100, 10, 2), b(100, 10, 2) @@ -765,7 +765,7 @@ subroutine acc_reduction_add_int_array_3d(a, b) ! FIR: %[[RED_ARG1:.*]] = acc.reduction varPtr(%[[ARG1]] : !fir.ref>) bounds(%{{.*}}, %{{.*}}, %{{.*}}) -> !fir.ref> {name = "b"} ! HLFIR: %[[RED_ARG1:.*]] = acc.reduction varPtr(%[[DECLARG1]]#1 : !fir.ref>) bounds(%{{.*}}, %{{.*}}, %{{.*}}) -> !fir.ref> {name = "b"} ! CHECK: acc.loop reduction(@reduction_add_section_ext100xext10xext2_ref_100x10x2xi32 -> %[[RED_ARG1]] : !fir.ref>) -! CHECK: } attributes {collapse = 3 : i64} +! CHECK: } attributes {collapse = [3], collapseDeviceType = [#acc.device_type]} subroutine acc_reduction_add_float(a, b) real :: a(100), b @@ -938,7 +938,7 @@ subroutine acc_reduction_min_float_array2d(a, b) ! FIR: %[[RED_ARG1:.*]] = acc.reduction varPtr(%[[ARG1]] : !fir.ref>) bounds(%3, %5) -> !fir.ref> {name = "b"} ! 
HLFIR: %[[RED_ARG1:.*]] = acc.reduction varPtr(%[[DECLARG1]]#1 : !fir.ref>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref> {name = "b"} ! CHECK: acc.loop reduction(@reduction_min_section_ext100xext10_ref_100x10xf32 -> %[[RED_ARG1]] : !fir.ref>) -! CHECK: attributes {collapse = 2 : i64} +! CHECK: attributes {collapse = [2], collapseDeviceType = [#acc.device_type]} subroutine acc_reduction_max_int(a, b) integer :: a(100) diff --git a/flang/test/Lower/OpenACC/acc-serial-loop.f90 b/flang/test/Lower/OpenACC/acc-serial-loop.f90 index 712bfc80ce387..6041e7fb1b490 100644 --- a/flang/test/Lower/OpenACC/acc-serial-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-serial-loop.f90 @@ -411,7 +411,7 @@ subroutine acc_serial_loop ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield -! CHECK-NEXT: } attributes {seq} +! CHECK-NEXT: } attributes {seq = [#acc.device_type]} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -424,7 +424,7 @@ subroutine acc_serial_loop ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield -! CHECK-NEXT: } attributes {auto} +! CHECK-NEXT: } attributes {auto_ = [#acc.device_type]} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -437,7 +437,7 @@ subroutine acc_serial_loop ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield -! CHECK-NEXT: } attributes {independent} +! CHECK-NEXT: } attributes {independent = [#acc.device_type]} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -447,10 +447,10 @@ subroutine acc_serial_loop END DO ! CHECK: acc.serial { -! CHECK: acc.loop gang { +! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {gang = [#acc.device_type]}{{$}} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -461,7 +461,7 @@ subroutine acc_serial_loop ! CHECK: acc.serial { ! CHECK: [[GANGNUM1:%.*]] = arith.constant 8 : i32 -! CHECK-NEXT: acc.loop gang(num=[[GANGNUM1]] : i32) { +! CHECK-NEXT: acc.loop gang({num=[[GANGNUM1]] : i32}) { ! CHECK: fir.do_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -475,7 +475,7 @@ subroutine acc_serial_loop ! CHECK: acc.serial { ! CHECK: [[GANGNUM2:%.*]] = fir.load %{{.*}} : !fir.ref -! CHECK-NEXT: acc.loop gang(num=[[GANGNUM2]] : i32) { +! CHECK-NEXT: acc.loop gang({num=[[GANGNUM2]] : i32}) { ! CHECK: fir.do_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -488,7 +488,7 @@ subroutine acc_serial_loop END DO ! CHECK: acc.serial { -! CHECK: acc.loop gang(num=%{{.*}} : i32, static=%{{.*}} : i32) { +! CHECK: acc.loop gang({num=%{{.*}} : i32, static=%{{.*}} : i32}) { ! CHECK: fir.do_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -500,10 +500,10 @@ subroutine acc_serial_loop a(i) = b(i) END DO ! CHECK: acc.serial { -! CHECK: acc.loop vector { +! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {vector = [#acc.device_type]}{{$}} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -541,10 +541,10 @@ subroutine acc_serial_loop END DO ! CHECK: acc.serial { -! CHECK: acc.loop worker { +! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {worker = [#acc.device_type]}{{$}} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -574,7 +574,7 @@ subroutine acc_serial_loop ! CHECK: fir.do_loop ! CHECK: fir.do_loop ! CHECK: acc.yield -! CHECK-NEXT: } attributes {collapse = 2 : i64} +! CHECK-NEXT: } attributes {collapse = [2], collapseDeviceType = [#acc.device_type]} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -605,7 +605,7 @@ subroutine acc_serial_loop ! CHECK: acc.serial { ! CHECK: [[TILESIZE:%.*]] = arith.constant 2 : i32 -! 
CHECK: acc.loop tile([[TILESIZE]] : i32) { +! CHECK: acc.loop tile({[[TILESIZE]] : i32}) { ! CHECK: fir.do_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -619,7 +619,7 @@ subroutine acc_serial_loop ! CHECK: acc.serial { ! CHECK: [[TILESIZEM1:%.*]] = arith.constant -1 : i32 -! CHECK: acc.loop tile([[TILESIZEM1]] : i32) { +! CHECK: acc.loop tile({[[TILESIZEM1]] : i32}) { ! CHECK: fir.do_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -636,7 +636,7 @@ subroutine acc_serial_loop ! CHECK: acc.serial { ! CHECK: [[TILESIZE1:%.*]] = arith.constant 2 : i32 ! CHECK: [[TILESIZE2:%.*]] = arith.constant 2 : i32 -! CHECK: acc.loop tile([[TILESIZE1]], [[TILESIZE2]] : i32, i32) { +! CHECK: acc.loop tile({[[TILESIZE1]] : i32, [[TILESIZE2]] : i32}) { ! CHECK: fir.do_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -649,7 +649,7 @@ subroutine acc_serial_loop END DO ! CHECK: acc.serial { -! CHECK: acc.loop tile(%{{.*}} : i32) { +! CHECK: acc.loop tile({%{{.*}} : i32}) { ! CHECK: fir.do_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -664,7 +664,7 @@ subroutine acc_serial_loop END DO ! CHECK: acc.serial { -! CHECK: acc.loop tile(%{{.*}}, %{{.*}} : i32, i32) { +! CHECK: acc.loop tile({%{{.*}} : i32, %{{.*}} : i32}) { ! CHECK: fir.do_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td index 1dd83e933034a..e6954062a50e0 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td @@ -196,6 +196,27 @@ def DeviceTypeArrayAttr : let constBuilderCall = ?; } +// Gang arg type enumeration +def OpenACC_GangArgNum : I32EnumAttrCase<"Num", 0, "Num">; +def OpenACC_GangArgDim : I32EnumAttrCase<"Dim", 1, "Dim">; +def OpenACC_GangArgStatic : I32EnumAttrCase<"Static", 2, "Static">; + +def OpenACC_GangArgType : I32EnumAttr<"GangArgType", + "Differentiate the different gang arg values", + [OpenACC_GangArgNum, OpenACC_GangArgDim, OpenACC_GangArgStatic]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::acc"; +} +def OpenACC_GangArgTypeAttr : EnumAttr { + let assemblyFormat = [{ ```<` $value `>` }]; +} +def GangArgTypeArrayAttr : + TypedArrayAttrBase { + let constBuilderCall = ?; +} + // Define a resource for the OpenACC runtime counters. 
def OpenACC_RuntimeCounters : Resource<"::mlir::acc::RuntimeCounters">; @@ -1462,7 +1483,7 @@ def OpenACC_LoopOp : OpenACC_Op<"loop", Example: ```mlir - acc.loop gang vector { + acc.loop { scf.for %arg3 = %c0 to %c10 step %c1 { scf.for %arg4 = %c0 to %c10 step %c1 { scf.for %arg5 = %c0 to %c10 step %c1 { @@ -1471,23 +1492,33 @@ def OpenACC_LoopOp : OpenACC_Op<"loop", } } acc.yield - } attributes { collapse = 3 } + } attributes { + collapse = [3], gang = [#acc.device_type], + vector = [#acc.device_type] + } ``` }]; - let arguments = (ins OptionalAttr:$collapse, - Optional:$gangNum, - Optional:$gangDim, - Optional:$gangStatic, - Optional:$workerNum, - Optional:$vectorLength, - UnitAttr:$seq, - UnitAttr:$independent, - UnitAttr:$auto_, - UnitAttr:$hasGang, - UnitAttr:$hasWorker, - UnitAttr:$hasVector, + let arguments = (ins + OptionalAttr:$collapse, + OptionalAttr:$collapseDeviceType, + Variadic:$gangOperands, + OptionalAttr:$gangOperandsArgType, + OptionalAttr:$gangOperandsSegments, + OptionalAttr:$gangOperandsDeviceType, + Variadic:$workerNumOperands, + OptionalAttr:$workerNumOperandsDeviceType, + Variadic:$vectorOperands, + OptionalAttr:$vectorOperandsDeviceType, + OptionalAttr:$seq, + OptionalAttr:$independent, + OptionalAttr:$auto_, + OptionalAttr:$gang, + OptionalAttr:$worker, + OptionalAttr:$vector, Variadic:$tileOperands, + OptionalAttr:$tileOperandsSegments, + OptionalAttr:$tileOperandsDeviceType, Variadic:$cacheOperands, Variadic:$privateOperands, OptionalAttr:$privatizations, @@ -1510,18 +1541,90 @@ def OpenACC_LoopOp : OpenACC_Op<"loop", /// The i-th data operand passed. Value getDataOperand(unsigned i); + + /// Return true if the op has the auto attribute for the + /// mlir::acc::DeviceType::None device_type. + bool hasAuto(); + /// Return true if the op has the auto attribute for the given device_type. + bool hasAuto(mlir::acc::DeviceType deviceType); + /// Return true if the op has the independent attribute for the + /// mlir::acc::DeviceType::None device_type. + bool hasIndependent(); + /// Return true if the op has the independent attribute for the given + /// device_type. + bool hasIndependent(mlir::acc::DeviceType deviceType); + /// Return true if the op has the seq attribute for the + /// mlir::acc::DeviceType::None device_type. + bool hasSeq(); + /// Return true if the op has the seq attribute for the given device_type. + bool hasSeq(mlir::acc::DeviceType deviceType); + + /// Return the value of the vector clause if present. + mlir::Value getVectorValue(); + /// Return the value of the vector clause for the given device_type + /// if present. + mlir::Value getVectorValue(mlir::acc::DeviceType deviceType); + /// Return true if the op has the vector attribute for the + /// mlir::acc::DeviceType::None device_type. + bool hasVector(); + /// Return true if the op has the vector attribute for the given + /// device_type. + bool hasVector(mlir::acc::DeviceType deviceType); + + /// Return the value of the worker clause if present. + mlir::Value getWorkerValue(); + /// Return the value of the worker clause for the given device_type + /// if present. + mlir::Value getWorkerValue(mlir::acc::DeviceType deviceType); + /// Return true if the op has the worker attribute for the + /// mlir::acc::DeviceType::None device_type. + bool hasWorker(); + /// Return true if the op has the worker attribute for the given + /// device_type. + bool hasWorker(mlir::acc::DeviceType deviceType); + + /// Return the values of the tile clause if present. 
+ mlir::Operation::operand_range getTileValues(); + /// Return the values of the tile clause for the given device_type if + /// present. + mlir::Operation::operand_range + getTileValues(mlir::acc::DeviceType deviceType); + + /// Return the value of the collapse clause if present. + std::optional getCollapseValue(); + /// Return the value of the collapse clause for the given device_type + /// if present. + std::optional getCollapseValue(mlir::acc::DeviceType deviceType); + + /// Return true if the op has the gang attribute for the + /// mlir::acc::DeviceType::None device_type. + bool hasGang(); + /// Return true if the op has the gang attribute for the given + /// device_type. + bool hasGang(mlir::acc::DeviceType deviceType); + + /// Return the value of the worker clause if present. + mlir::Value getGangValue(mlir::acc::GangArgType gangArgType); + /// Return the value of the worker clause for the given device_type + /// if present. + mlir::Value getGangValue(mlir::acc::GangArgType gangArgType, mlir::acc::DeviceType deviceType); }]; let hasCustomAssemblyFormat = 1; let assemblyFormat = [{ oilist( - `gang` `` custom($gangNum, type($gangNum), $gangDim, type($gangDim), $gangStatic, type($gangStatic), $hasGang) - | `worker` `` custom($workerNum, type($workerNum), $hasWorker) - | `vector` `` custom($vectorLength, type($vectorLength), $hasVector) + `gang` `` `(` custom($gangOperands, type($gangOperands), + $gangOperandsArgType, $gangOperandsDeviceType, + $gangOperandsSegments) `)` + | `worker` `` `(` custom($workerNumOperands, + type($workerNumOperands), $workerNumOperandsDeviceType) `)` + | `vector` `` `(` custom($vectorOperands, + type($vectorOperands), $vectorOperandsDeviceType) `)` | `private` `(` custom( - $privateOperands, type($privateOperands), $privatizations) + $privateOperands, type($privateOperands), $privatizations) `)` + | `tile` `(` custom($tileOperands, + type($tileOperands), $tileOperandsDeviceType, $tileOperandsSegments) `)` - | `tile` `(` $tileOperands `:` type($tileOperands) `)` | `reduction` `(` custom( $reductionOperands, type($reductionOperands), $reductionRecipes) `)` diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index 66605ead0529d..c53673fa42603 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -16,6 +16,7 @@ #include "mlir/IR/Matchers.h" #include "mlir/IR/OpImplementation.h" #include "mlir/Transforms/DialectConversion.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/TypeSwitch.h" using namespace mlir; @@ -886,6 +887,8 @@ static ParseResult parseDeviceTypeOperandsWithSegment( if (failed(parser.parseLBrace())) return failure(); + int32_t crtOperandsSize = operands.size(); + if (failed(parser.parseCommaSeparatedList( mlir::AsmParser::Delimiter::None, [&]() { if (parser.parseOperand(operands.emplace_back()) || @@ -895,7 +898,7 @@ static ParseResult parseDeviceTypeOperandsWithSegment( }))) return failure(); - seg.push_back(operands.size()); + seg.push_back(operands.size() - crtOperandsSize); if (failed(parser.parseRBrace())) return failure(); @@ -1207,16 +1210,19 @@ void acc::HostDataOp::getCanonicalizationPatterns(RewritePatternSet &results, // LoopOp //===----------------------------------------------------------------------===// -static ParseResult -parseGangValue(OpAsmParser &parser, llvm::StringRef keyword, - std::optional &value, - Type &valueType, bool &needComa, bool &newValue) { +static ParseResult parseGangValue( + OpAsmParser &parser, llvm::StringRef keyword, + 
llvm::SmallVectorImpl &operands, + llvm::SmallVectorImpl &types, + llvm::SmallVector &attributes, GangArgTypeAttr gangArgType, + bool &needComa, bool &newValue) { if (succeeded(parser.parseOptionalKeyword(keyword))) { if (parser.parseEqual()) return failure(); - value = OpAsmParser::UnresolvedOperand{}; - if (parser.parseOperand(*value) || parser.parseColonType(valueType)) + if (parser.parseOperand(operands.emplace_back()) || + parser.parseColonType(types.emplace_back())) return failure(); + attributes.push_back(gangArgType); needComa = true; newValue = true; } @@ -1224,19 +1230,27 @@ parseGangValue(OpAsmParser &parser, llvm::StringRef keyword, } static ParseResult parseGangClause( - OpAsmParser &parser, std::optional &gangNum, - Type &gangNumType, std::optional &gangDim, - Type &gangDimType, - std::optional &gangStatic, - Type &gangStaticType, UnitAttr &hasGang) { - hasGang = UnitAttr::get(parser.getBuilder().getContext()); - gangNum = std::nullopt; - gangDim = std::nullopt; - gangStatic = std::nullopt; + OpAsmParser &parser, + llvm::SmallVectorImpl &gangOperands, + llvm::SmallVectorImpl &gangOperandsType, mlir::ArrayAttr &gangArgType, + mlir::ArrayAttr &deviceType, mlir::DenseI32ArrayAttr &segments) { + llvm::SmallVector attributes; + llvm::SmallVector deviceTypeAttributes; + llvm::SmallVector seg; bool needComa = false; - // optional gang operands - if (succeeded(parser.parseOptionalLParen())) { + auto argNum = mlir::acc::GangArgTypeAttr::get(parser.getContext(), + mlir::acc::GangArgType::Num); + auto argDim = mlir::acc::GangArgTypeAttr::get(parser.getContext(), + mlir::acc::GangArgType::Dim); + auto argStatic = mlir::acc::GangArgTypeAttr::get( + parser.getContext(), mlir::acc::GangArgType::Static); + + do { + if (failed(parser.parseLBrace())) + return failure(); + + int32_t crtOperandsSize = gangOperands.size(); while (true) { bool newValue = false; bool needValue = false; @@ -1247,15 +1261,17 @@ static ParseResult parseGangClause( break; } - if (failed(parseGangValue(parser, LoopOp::getGangNumKeyword(), gangNum, - gangNumType, needComa, newValue))) + if (failed(parseGangValue(parser, LoopOp::getGangNumKeyword(), + gangOperands, gangOperandsType, attributes, + argNum, needComa, newValue))) return failure(); - if (failed(parseGangValue(parser, LoopOp::getGangDimKeyword(), gangDim, - gangDimType, needComa, newValue))) + if (failed(parseGangValue(parser, LoopOp::getGangDimKeyword(), + gangOperands, gangOperandsType, attributes, + argDim, needComa, newValue))) return failure(); if (failed(parseGangValue(parser, LoopOp::getGangStaticKeyword(), - gangStatic, gangStaticType, needComa, - newValue))) + gangOperands, gangOperandsType, attributes, + argStatic, needComa, newValue))) return failure(); if (!newValue && needValue) { @@ -1268,86 +1284,168 @@ static ParseResult parseGangClause( break; } - if (!gangNum && !gangDim && !gangStatic) { - parser.emitError(parser.getCurrentLocation(), - "expect at least one of num, dim or static values"); + if (gangOperands.empty()) + return parser.emitError( + parser.getCurrentLocation(), + "expect at least one of num, dim or static values"); + + if (failed(parser.parseRBrace())) return failure(); + + if (succeeded(parser.parseOptionalLSquare())) { + if (parser.parseAttribute(deviceTypeAttributes.emplace_back()) || + parser.parseRSquare()) + return failure(); + } else { + deviceTypeAttributes.push_back(mlir::acc::DeviceTypeAttr::get( + parser.getContext(), mlir::acc::DeviceType::None)); } - if (failed(parser.parseRParen())) - return failure(); - } + 
seg.push_back(gangOperands.size() - crtOperandsSize); + + } while (succeeded(parser.parseOptionalComma())); + + llvm::SmallVector arrayAttr(attributes.begin(), + attributes.end()); + gangArgType = ArrayAttr::get(parser.getContext(), arrayAttr); + + llvm::SmallVector deviceTypeAttr( + deviceTypeAttributes.begin(), deviceTypeAttributes.end()); + deviceType = ArrayAttr::get(parser.getContext(), deviceTypeAttr); + segments = DenseI32ArrayAttr::get(parser.getContext(), seg); return success(); } -void printGangClause(OpAsmPrinter &p, Operation *op, Value gangNum, - Type gangNumType, Value gangDim, Type gangDimType, - Value gangStatic, Type gangStaticType, UnitAttr hasGang) { - if (gangNum || gangStatic || gangDim) { - p << "("; - if (gangNum) { - p << LoopOp::getGangNumKeyword() << "=" << gangNum << " : " - << gangNumType; - if (gangStatic || gangDim) - p << ", "; - } - if (gangDim) { - p << LoopOp::getGangDimKeyword() << "=" << gangDim << " : " - << gangDimType; - if (gangStatic) +void printGangClause(OpAsmPrinter &p, Operation *op, + mlir::OperandRange operands, mlir::TypeRange types, + std::optional gangArgTypes, + std::optional deviceTypes, + std::optional segments) { + unsigned opIdx = 0; + for (unsigned i = 0; i < deviceTypes->size(); ++i) { + if (i != 0) + p << ", "; + p << "{"; + for (int32_t j = 0; j < (*segments)[i]; ++j) { + if (j != 0) p << ", "; + auto gangArgTypeAttr = + mlir::dyn_cast((*gangArgTypes)[opIdx]); + if (gangArgTypeAttr.getValue() == mlir::acc::GangArgType::Num) + p << LoopOp::getGangNumKeyword(); + else if (gangArgTypeAttr.getValue() == mlir::acc::GangArgType::Dim) + p << LoopOp::getGangDimKeyword(); + else if (gangArgTypeAttr.getValue() == mlir::acc::GangArgType::Static) + p << LoopOp::getGangStaticKeyword(); + p << "=" << operands[opIdx] << " : " << operands[opIdx].getType(); + ++opIdx; } - if (gangStatic) - p << LoopOp::getGangStaticKeyword() << "=" << gangStatic << " : " - << gangStaticType; - p << ")"; - } -} -static ParseResult -parseWorkerClause(OpAsmParser &parser, - std::optional &workerNum, - Type &workerNumType, UnitAttr &hasWorker) { - hasWorker = UnitAttr::get(parser.getBuilder().getContext()); - if (succeeded(parser.parseOptionalLParen())) { - workerNum = OpAsmParser::UnresolvedOperand{}; - if (parser.parseOperand(*workerNum) || - parser.parseColonType(workerNumType) || parser.parseRParen()) - return failure(); + p << "}"; + auto deviceTypeAttr = + mlir::dyn_cast((*deviceTypes)[i]); + if (deviceTypeAttr.getValue() != mlir::acc::DeviceType::None) + p << " [" << (*deviceTypes)[i] << "]"; } - return success(); } -void printWorkerClause(OpAsmPrinter &p, Operation *op, Value workerNum, - Type workerNumType, UnitAttr hasWorker) { - if (workerNum) - p << "(" << workerNum << " : " << workerNumType << ")"; +bool hasDuplicateDeviceTypes( + std::optional segments, + llvm::SmallSet &deviceTypes) { + if (!segments) + return false; + for (auto attr : *segments) { + auto deviceTypeAttr = mlir::dyn_cast(attr); + if (deviceTypes.contains(deviceTypeAttr.getValue())) + return true; + deviceTypes.insert(deviceTypeAttr.getValue()); + } + return false; } -static ParseResult -parseVectorClause(OpAsmParser &parser, - std::optional &vectorLength, - Type &vectorLengthType, UnitAttr &hasVector) { - hasVector = UnitAttr::get(parser.getBuilder().getContext()); - if (succeeded(parser.parseOptionalLParen())) { - vectorLength = OpAsmParser::UnresolvedOperand{}; - if (parser.parseOperand(*vectorLength) || - parser.parseColonType(vectorLengthType) || parser.parseRParen()) +/// Check for 
duplicates in the DeviceType array attribute. +LogicalResult checkDeviceTypes(mlir::ArrayAttr deviceTypes) { + llvm::SmallSet crtDeviceTypes; + if (!deviceTypes) + return success(); + for (auto attr : deviceTypes) { + auto deviceTypeAttr = + mlir::dyn_cast_or_null(attr); + if (!deviceTypeAttr) + return failure(); + if (crtDeviceTypes.contains(deviceTypeAttr.getValue())) return failure(); + crtDeviceTypes.insert(deviceTypeAttr.getValue()); } return success(); } -void printVectorClause(OpAsmPrinter &p, Operation *op, Value vectorLength, - Type vectorLengthType, UnitAttr hasVector) { - if (vectorLength) - p << "(" << vectorLength << " : " << vectorLengthType << ")"; -} - LogicalResult acc::LoopOp::verify() { + // Check collapse + if (getCollapseAttr() && !getCollapseDeviceTypeAttr()) + return emitOpError() << "collapse device_type attr must be define when" + << " collapse attr is present"; + + if (getCollapseAttr() && getCollapseDeviceTypeAttr() && + getCollapseAttr().getValue().size() != + getCollapseDeviceTypeAttr().getValue().size()) + return emitOpError() << "collapse attribute count must match collapse" + << " device_type count"; + if (failed(checkDeviceTypes(getCollapseDeviceTypeAttr()))) + return emitOpError() + << "duplicate device_type found in collapseDeviceType attribute"; + + // Check gang + if (!getGangOperands().empty()) { + if (!getGangOperandsArgType()) + return emitOpError() << "gangOperandsArgType attribute must be defined" + << " when gang operands are present"; + + if (getGangOperands().size() != + getGangOperandsArgTypeAttr().getValue().size()) + return emitOpError() << "gangOperandsArgType attribute count must match" + << " gangOperands count"; + } + if (getGangAttr() && failed(checkDeviceTypes(getGangAttr()))) + return emitOpError() << "duplicate device_type found in gang attribute"; + + if (failed(verifyDeviceTypeAndSegmentCountMatch( + *this, getGangOperands(), getGangOperandsSegmentsAttr(), + getGangOperandsDeviceTypeAttr(), "gang"))) + return failure(); + + // Check worker + if (failed(checkDeviceTypes(getWorkerAttr()))) + return emitOpError() << "duplicate device_type found in worker attribute"; + if (failed(checkDeviceTypes(getWorkerNumOperandsDeviceTypeAttr()))) + return emitOpError() << "duplicate device_type found in " + "workerNumOperandsDeviceType attribute"; + if (failed(verifyDeviceTypeCountMatch(*this, getWorkerNumOperands(), + getWorkerNumOperandsDeviceTypeAttr(), + "worker"))) + return failure(); + + // Check vector + if (failed(checkDeviceTypes(getVectorAttr()))) + return emitOpError() << "duplicate device_type found in vector attribute"; + if (failed(checkDeviceTypes(getVectorOperandsDeviceTypeAttr()))) + return emitOpError() << "duplicate device_type found in " + "vectorOperandsDeviceType attribute"; + if (failed(verifyDeviceTypeCountMatch(*this, getVectorOperands(), + getVectorOperandsDeviceTypeAttr(), + "vector"))) + return failure(); + + if (failed(verifyDeviceTypeAndSegmentCountMatch( + *this, getTileOperands(), getTileOperandsSegmentsAttr(), + getTileOperandsDeviceTypeAttr(), "tile"))) + return failure(); + // auto, independent and seq attribute are mutually exclusive. 
- if ((getAuto_() && (getIndependent() || getSeq())) || - (getIndependent() && getSeq())) { + llvm::SmallSet deviceTypes; + if (hasDuplicateDeviceTypes(getAuto_(), deviceTypes) || + hasDuplicateDeviceTypes(getIndependent(), deviceTypes) || + hasDuplicateDeviceTypes(getSeq(), deviceTypes)) { return emitError() << "only one of \"" << acc::LoopOp::getAutoAttrStrName() << "\", " << getIndependentAttrName() << ", " << getSeqAttrName() @@ -1355,8 +1453,24 @@ LogicalResult acc::LoopOp::verify() { } // Gang, worker and vector are incompatible with seq. - if (getSeq() && (getHasGang() || getHasWorker() || getHasVector())) - return emitError("gang, worker or vector cannot appear with the seq attr"); + if (getSeqAttr()) { + for (auto attr : getSeqAttr()) { + auto deviceTypeAttr = mlir::dyn_cast(attr); + if (hasVector(deviceTypeAttr.getValue()) || + getVectorValue(deviceTypeAttr.getValue()) || + hasWorker(deviceTypeAttr.getValue()) || + getWorkerValue(deviceTypeAttr.getValue()) || + hasGang(deviceTypeAttr.getValue()) || + getGangValue(mlir::acc::GangArgType::Num, + deviceTypeAttr.getValue()) || + getGangValue(mlir::acc::GangArgType::Dim, + deviceTypeAttr.getValue()) || + getGangValue(mlir::acc::GangArgType::Static, + deviceTypeAttr.getValue())) + return emitError() + << "gang, worker or vector cannot appear with the seq attr"; + } + } if (failed(checkSymOperandList( *this, getPrivatizations(), getPrivateOperands(), "private", @@ -1380,16 +1494,149 @@ unsigned LoopOp::getNumDataOperands() { } Value LoopOp::getDataOperand(unsigned i) { - unsigned numOptional = getGangNum() ? 1 : 0; - numOptional += getGangDim() ? 1 : 0; - numOptional += getGangStatic() ? 1 : 0; - numOptional += getVectorLength() ? 1 : 0; - numOptional += getWorkerNum() ? 1 : 0; + unsigned numOptional = getGangOperands().size(); + numOptional += getVectorOperands().size(); + numOptional += getWorkerNumOperands().size(); numOptional += getTileOperands().size(); numOptional += getCacheOperands().size(); return getOperand(numOptional + i); } +bool LoopOp::hasAuto() { return hasAuto(mlir::acc::DeviceType::None); } + +bool LoopOp::hasAuto(mlir::acc::DeviceType deviceType) { + if (auto arrayAttr = getAuto_()) { + if (findSegment(*arrayAttr, deviceType)) + return true; + } + return false; +} + +bool LoopOp::hasIndependent() { + return hasIndependent(mlir::acc::DeviceType::None); +} + +bool LoopOp::hasIndependent(mlir::acc::DeviceType deviceType) { + if (auto arrayAttr = getIndependent()) { + if (findSegment(*arrayAttr, deviceType)) + return true; + } + return false; +} + +bool LoopOp::hasSeq() { return hasSeq(mlir::acc::DeviceType::None); } + +bool LoopOp::hasSeq(mlir::acc::DeviceType deviceType) { + if (auto arrayAttr = getSeq()) { + if (findSegment(*arrayAttr, deviceType)) + return true; + } + return false; +} + +mlir::Value LoopOp::getVectorValue() { + return getVectorValue(mlir::acc::DeviceType::None); +} + +mlir::Value LoopOp::getVectorValue(mlir::acc::DeviceType deviceType) { + return getValueInDeviceTypeSegment(getVectorOperandsDeviceType(), + getVectorOperands(), deviceType); +} + +bool LoopOp::hasVector() { return hasVector(mlir::acc::DeviceType::None); } + +bool LoopOp::hasVector(mlir::acc::DeviceType deviceType) { + if (auto arrayAttr = getVector()) { + if (findSegment(*arrayAttr, deviceType)) + return true; + } + return false; +} + +mlir::Value LoopOp::getWorkerValue() { + return getWorkerValue(mlir::acc::DeviceType::None); +} + +mlir::Value LoopOp::getWorkerValue(mlir::acc::DeviceType deviceType) { + return 
getValueInDeviceTypeSegment(getWorkerNumOperandsDeviceType(), + getWorkerNumOperands(), deviceType); +} + +bool LoopOp::hasWorker() { return hasWorker(mlir::acc::DeviceType::None); } + +bool LoopOp::hasWorker(mlir::acc::DeviceType deviceType) { + if (auto arrayAttr = getWorker()) { + if (findSegment(*arrayAttr, deviceType)) + return true; + } + return false; +} + +mlir::Operation::operand_range LoopOp::getTileValues() { + return getTileValues(mlir::acc::DeviceType::None); +} + +mlir::Operation::operand_range +LoopOp::getTileValues(mlir::acc::DeviceType deviceType) { + return getValuesFromSegments(getTileOperandsDeviceType(), getTileOperands(), + getTileOperandsSegments(), deviceType); +} + +std::optional LoopOp::getCollapseValue() { + return getCollapseValue(mlir::acc::DeviceType::None); +} + +std::optional +LoopOp::getCollapseValue(mlir::acc::DeviceType deviceType) { + if (!getCollapseAttr()) + return std::nullopt; + if (auto pos = findSegment(getCollapseDeviceTypeAttr(), deviceType)) { + auto intAttr = + mlir::dyn_cast(getCollapseAttr().getValue()[*pos]); + return intAttr.getValue().getZExtValue(); + } + return std::nullopt; +} + +mlir::Value LoopOp::getGangValue(mlir::acc::GangArgType gangArgType) { + return getGangValue(gangArgType, mlir::acc::DeviceType::None); +} + +mlir::Value LoopOp::getGangValue(mlir::acc::GangArgType gangArgType, + mlir::acc::DeviceType deviceType) { + if (getGangOperands().empty()) + return {}; + if (auto pos = findSegment(*getGangOperandsDeviceType(), deviceType)) { + int32_t nbOperandsBefore = 0; + for (unsigned i = 0; i < *pos; ++i) + nbOperandsBefore += (*getGangOperandsSegments())[i]; + mlir::Operation::operand_range values = + getGangOperands() + .drop_front(nbOperandsBefore) + .take_front((*getGangOperandsSegments())[*pos]); + + int32_t argTypeIdx = nbOperandsBefore; + for (auto value : values) { + auto gangArgTypeAttr = mlir::dyn_cast( + (*getGangOperandsArgType())[argTypeIdx]); + if (gangArgTypeAttr.getValue() == gangArgType) + return value; + ++argTypeIdx; + } + } + return {}; +} + +bool LoopOp::hasGang() { return hasGang(mlir::acc::DeviceType::None); } + +bool LoopOp::hasGang(mlir::acc::DeviceType deviceType) { + if (auto arrayAttr = getGang()) { + if (findSegment(*arrayAttr, deviceType)) + return true; + } + return false; +} + //===----------------------------------------------------------------------===// // DataOp //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/OpenACC/invalid.mlir b/mlir/test/Dialect/OpenACC/invalid.mlir index c18d964b370f2..5dcdb3a37e4e3 100644 --- a/mlir/test/Dialect/OpenACC/invalid.mlir +++ b/mlir/test/Dialect/OpenACC/invalid.mlir @@ -1,58 +1,58 @@ // RUN: mlir-opt -split-input-file -verify-diagnostics %s // expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} -acc.loop gang { +acc.loop { "test.openacc_dummy_op"() : () -> () acc.yield -} attributes {seq} +} attributes {seq = [#acc.device_type], gang = [#acc.device_type]} // ----- // expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} -acc.loop worker { +acc.loop { "test.openacc_dummy_op"() : () -> () acc.yield -} attributes {seq} +} attributes {seq = [#acc.device_type], worker = [#acc.device_type]} // ----- // expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} -acc.loop vector { +acc.loop { "test.openacc_dummy_op"() : () -> () acc.yield -} attributes {seq} +} attributes {seq = [#acc.device_type], vector = [#acc.device_type]} // ----- // 
expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} -acc.loop gang worker { +acc.loop { "test.openacc_dummy_op"() : () -> () acc.yield -} attributes {seq} +} attributes {seq = [#acc.device_type], worker = [#acc.device_type], gang = [#acc.device_type]} // ----- // expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} -acc.loop gang vector { +acc.loop { "test.openacc_dummy_op"() : () -> () acc.yield -} attributes {seq} +} attributes {seq = [#acc.device_type], vector = [#acc.device_type], gang = [#acc.device_type]} // ----- // expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} -acc.loop worker vector { +acc.loop { "test.openacc_dummy_op"() : () -> () acc.yield -} attributes {seq} +} attributes {seq = [#acc.device_type], vector = [#acc.device_type], worker = [#acc.device_type]} // ----- // expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} -acc.loop gang worker vector { +acc.loop { "test.openacc_dummy_op"() : () -> () acc.yield -} attributes {seq} +} attributes {seq = [#acc.device_type], vector = [#acc.device_type], worker = [#acc.device_type], gang = [#acc.device_type]} // ----- @@ -62,10 +62,31 @@ acc.loop { // ----- +// expected-error@+1 {{'acc.loop' op duplicate device_type found in gang attribute}} +acc.loop { + acc.yield +} attributes {gang = [#acc.device_type, #acc.device_type]} + +// ----- + +// expected-error@+1 {{'acc.loop' op duplicate device_type found in worker attribute}} +acc.loop { + acc.yield +} attributes {worker = [#acc.device_type, #acc.device_type]} + +// ----- + +// expected-error@+1 {{'acc.loop' op duplicate device_type found in vector attribute}} +acc.loop { + acc.yield +} attributes {vector = [#acc.device_type, #acc.device_type]} + +// ----- + // expected-error@+1 {{only one of "auto", "independent", "seq" can be present at the same time}} acc.loop { acc.yield -} attributes {auto_, seq} +} attributes {auto_ = [#acc.device_type], seq = [#acc.device_type]} // ----- @@ -368,7 +389,7 @@ acc.firstprivate.recipe @privatization_i32 : i32 init { // ----- // expected-error@+1 {{expected ')'}} -acc.loop gang(static=%i64Value: i64, num=%i64Value: i64 { +acc.loop gang({static=%i64Value: i64, num=%i64Value: i64} { "test.openacc_dummy_op"() : () -> () acc.yield } @@ -437,7 +458,7 @@ acc.reduction.recipe @reduction_i64 : i64 reduction_operator init { // ----- // expected-error@+1 {{new value expected after comma}} -acc.loop gang(static=%i64Value: i64, ) { +acc.loop gang({static=%i64Value: i64, ) { "test.openacc_dummy_op"() : () -> () acc.yield } @@ -454,7 +475,7 @@ func.func @fct1(%0 : !llvm.ptr) -> () { // ----- // expected-error@+1 {{expect at least one of num, dim or static values}} -acc.loop gang() { +acc.loop gang({}) { "test.openacc_dummy_op"() : () -> () acc.yield } diff --git a/mlir/test/Dialect/OpenACC/ops.mlir b/mlir/test/Dialect/OpenACC/ops.mlir index 52375b1af3141..ce5bfa490013e 100644 --- a/mlir/test/Dialect/OpenACC/ops.mlir +++ b/mlir/test/Dialect/OpenACC/ops.mlir @@ -11,7 +11,7 @@ func.func @compute1(%A: memref<10x10xf32>, %B: memref<10x10xf32>, %C: memref<10x %async = arith.constant 1 : i64 acc.parallel async(%async: i64) { - acc.loop gang vector { + acc.loop { scf.for %arg3 = %c0 to %c10 step %c1 { scf.for %arg4 = %c0 to %c10 step %c1 { scf.for %arg5 = %c0 to %c10 step %c1 { @@ -25,7 +25,7 @@ func.func @compute1(%A: memref<10x10xf32>, %B: memref<10x10xf32>, %C: memref<10x } } acc.yield - } attributes { collapse = 3 } + } attributes { collapse = [3], collapseDeviceType 
= [#acc.device_type], vector = [#acc.device_type], gang = [#acc.device_type]} acc.yield } @@ -38,7 +38,7 @@ func.func @compute1(%A: memref<10x10xf32>, %B: memref<10x10xf32>, %C: memref<10x // CHECK-NEXT: %{{.*}} = arith.constant 1 : index // CHECK-NEXT: [[ASYNC:%.*]] = arith.constant 1 : i64 // CHECK-NEXT: acc.parallel async([[ASYNC]] : i64) { -// CHECK-NEXT: acc.loop gang vector { +// CHECK-NEXT: acc.loop { // CHECK-NEXT: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} { // CHECK-NEXT: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} { // CHECK-NEXT: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} { @@ -52,7 +52,7 @@ func.func @compute1(%A: memref<10x10xf32>, %B: memref<10x10xf32>, %C: memref<10x // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: acc.yield -// CHECK-NEXT: } attributes {collapse = 3 : i64} +// CHECK-NEXT: } attributes {collapse = [3], collapseDeviceType = [#acc.device_type], gang = [#acc.device_type], vector = [#acc.device_type]} // CHECK-NEXT: acc.yield // CHECK-NEXT: } // CHECK-NEXT: return %{{.*}} : memref<10x10xf32> @@ -80,7 +80,7 @@ func.func @compute2(%A: memref<10x10xf32>, %B: memref<10x10xf32>, %C: memref<10x } } acc.yield - } attributes {seq} + } attributes {seq = [#acc.device_type]} acc.yield } @@ -106,7 +106,7 @@ func.func @compute2(%A: memref<10x10xf32>, %B: memref<10x10xf32>, %C: memref<10x // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: acc.yield -// CHECK-NEXT: } attributes {seq} +// CHECK-NEXT: } attributes {seq = [#acc.device_type]} // CHECK-NEXT: acc.yield // CHECK-NEXT: } // CHECK-NEXT: return %{{.*}} : memref<10x10xf32> @@ -138,9 +138,9 @@ func.func @compute3(%a: memref<10x10xf32>, %b: memref<10x10xf32>, %c: memref<10x acc.data dataOperands(%pa, %pb, %pc, %pd: memref<10x10xf32>, memref<10x10xf32>, memref<10xf32>, memref<10xf32>) { %private = acc.private varPtr(%c : memref<10xf32>) -> memref<10xf32> acc.parallel num_gangs({%numGangs: i64}) num_workers(%numWorkers: i64 [#acc.device_type]) private(@privatization_memref_10_f32 -> %private : memref<10xf32>) { - acc.loop gang { + acc.loop { scf.for %x = %lb to %c10 step %st { - acc.loop worker { + acc.loop { scf.for %y = %lb to %c10 step %st { %axy = memref.load %a[%x, %y] : memref<10x10xf32> %bxy = memref.load %b[%x, %y] : memref<10x10xf32> @@ -148,7 +148,7 @@ func.func @compute3(%a: memref<10x10xf32>, %b: memref<10x10xf32>, %c: memref<10x memref.store %tmp, %c[%y] : memref<10xf32> } acc.yield - } + } attributes {worker = [#acc.device_type]} acc.loop { // for i = 0 to 10 step 1 @@ -160,10 +160,10 @@ func.func @compute3(%a: memref<10x10xf32>, %b: memref<10x10xf32>, %c: memref<10x memref.store %z, %d[%x] : memref<10xf32> } acc.yield - } attributes {seq} + } attributes {seq = [#acc.device_type]} } acc.yield - } + } attributes {gang = [#acc.device_type]} acc.yield } acc.terminator @@ -181,9 +181,9 @@ func.func @compute3(%a: memref<10x10xf32>, %b: memref<10x10xf32>, %c: memref<10x // CHECK: acc.data dataOperands(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<10x10xf32>, memref<10x10xf32>, memref<10xf32>, memref<10xf32>) { // CHECK-NEXT: %[[P_ARG2:.*]] = acc.private varPtr([[ARG2]] : memref<10xf32>) -> memref<10xf32> // CHECK-NEXT: acc.parallel num_gangs({[[NUMGANG]] : i64}) num_workers([[NUMWORKERS]] : i64 [#acc.device_type]) private(@privatization_memref_10_f32 -> %[[P_ARG2]] : memref<10xf32>) { -// CHECK-NEXT: acc.loop gang { +// CHECK-NEXT: acc.loop { // CHECK-NEXT: scf.for %{{.*}} = [[C0]] to [[C10]] step [[C1]] { -// CHECK-NEXT: acc.loop worker { +// CHECK-NEXT: acc.loop { // CHECK-NEXT: scf.for %{{.*}} = 
[[C0]] to [[C10]] step [[C1]] { // CHECK-NEXT: %{{.*}} = memref.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32> // CHECK-NEXT: %{{.*}} = memref.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32> @@ -191,7 +191,7 @@ func.func @compute3(%a: memref<10x10xf32>, %b: memref<10x10xf32>, %c: memref<10x // CHECK-NEXT: memref.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32> // CHECK-NEXT: } // CHECK-NEXT: acc.yield -// CHECK-NEXT: } +// CHECK-NEXT: } attributes {worker = [#acc.device_type]} // CHECK-NEXT: acc.loop { // CHECK-NEXT: scf.for %{{.*}} = [[C0]] to [[C10]] step [[C1]] { // CHECK-NEXT: %{{.*}} = memref.load %{{.*}}[%{{.*}}] : memref<10xf32> @@ -200,10 +200,10 @@ func.func @compute3(%a: memref<10x10xf32>, %b: memref<10x10xf32>, %c: memref<10x // CHECK-NEXT: memref.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32> // CHECK-NEXT: } // CHECK-NEXT: acc.yield -// CHECK-NEXT: } attributes {seq} +// CHECK-NEXT: } attributes {seq = [#acc.device_type]} // CHECK-NEXT: } // CHECK-NEXT: acc.yield -// CHECK-NEXT: } +// CHECK-NEXT: } attributes {gang = [#acc.device_type]} // CHECK-NEXT: acc.yield // CHECK-NEXT: } // CHECK-NEXT: acc.terminator @@ -218,15 +218,15 @@ func.func @testloopop(%a : memref<10xf32>) -> () { %i32Value = arith.constant 128 : i32 %idxValue = arith.constant 8 : index - acc.loop gang worker vector { + acc.loop { "test.openacc_dummy_op"() : () -> () acc.yield - } - acc.loop gang(num=%i64Value: i64) { + } attributes {vector = [#acc.device_type], worker = [#acc.device_type], gang = [#acc.device_type]} + acc.loop gang({num=%i64Value: i64}) { "test.openacc_dummy_op"() : () -> () acc.yield } - acc.loop gang(static=%i64Value: i64) { + acc.loop gang({static=%i64Value: i64}) { "test.openacc_dummy_op"() : () -> () acc.yield } @@ -254,31 +254,31 @@ func.func @testloopop(%a : memref<10xf32>) -> () { "test.openacc_dummy_op"() : () -> () acc.yield } - acc.loop gang(num=%i64Value: i64) worker vector { + acc.loop gang({num=%i64Value: i64}) { "test.openacc_dummy_op"() : () -> () acc.yield - } - acc.loop gang(num=%i64Value: i64, static=%i64Value: i64) worker(%i64Value: i64) vector(%i64Value: i64) { + } attributes {vector = [#acc.device_type], worker = [#acc.device_type]} + acc.loop gang({num=%i64Value: i64, static=%i64Value: i64}) worker(%i64Value: i64) vector(%i64Value: i64) { "test.openacc_dummy_op"() : () -> () acc.yield } - acc.loop gang(num=%i32Value: i32, static=%idxValue: index) { + acc.loop gang({num=%i32Value: i32, static=%idxValue: index}) { "test.openacc_dummy_op"() : () -> () acc.yield } - acc.loop tile(%i64Value, %i64Value : i64, i64) { + acc.loop tile({%i64Value : i64, %i64Value : i64}) { "test.openacc_dummy_op"() : () -> () acc.yield } - acc.loop tile(%i32Value, %i32Value : i32, i32) { + acc.loop tile({%i32Value : i32, %i32Value : i32}) { "test.openacc_dummy_op"() : () -> () acc.yield } - acc.loop gang(static=%i64Value: i64, num=%i64Value: i64) { + acc.loop gang({static=%i64Value: i64, num=%i64Value: i64}) { "test.openacc_dummy_op"() : () -> () acc.yield } - acc.loop gang(dim=%i64Value : i64, static=%i64Value: i64) { + acc.loop gang({dim=%i64Value : i64, static=%i64Value: i64}) { "test.openacc_dummy_op"() : () -> () acc.yield } @@ -293,15 +293,15 @@ func.func @testloopop(%a : memref<10xf32>) -> () { // CHECK: [[I64VALUE:%.*]] = arith.constant 1 : i64 // CHECK-NEXT: [[I32VALUE:%.*]] = arith.constant 128 : i32 // CHECK-NEXT: [[IDXVALUE:%.*]] = arith.constant 8 : index -// CHECK: acc.loop gang worker vector { +// CHECK: acc.loop { // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // 
CHECK-NEXT: acc.yield -// CHECK-NEXT: } -// CHECK: acc.loop gang(num=[[I64VALUE]] : i64) { +// CHECK-NEXT: } attributes {gang = [#acc.device_type], vector = [#acc.device_type], worker = [#acc.device_type]} +// CHECK: acc.loop gang({num=[[I64VALUE]] : i64}) { // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // CHECK-NEXT: acc.yield // CHECK-NEXT: } -// CHECK: acc.loop gang(static=[[I64VALUE]] : i64) { +// CHECK: acc.loop gang({static=[[I64VALUE]] : i64}) { // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -329,31 +329,31 @@ func.func @testloopop(%a : memref<10xf32>) -> () { // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // CHECK-NEXT: acc.yield // CHECK-NEXT: } -// CHECK: acc.loop gang(num=[[I64VALUE]] : i64) worker vector { +// CHECK: acc.loop gang({num=[[I64VALUE]] : i64}) { // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // CHECK-NEXT: acc.yield -// CHECK-NEXT: } -// CHECK: acc.loop gang(num=[[I64VALUE]] : i64, static=[[I64VALUE]] : i64) worker([[I64VALUE]] : i64) vector([[I64VALUE]] : i64) { +// CHECK-NEXT: } attributes {vector = [#acc.device_type], worker = [#acc.device_type]} +// CHECK: acc.loop gang({num=[[I64VALUE]] : i64, static=[[I64VALUE]] : i64}) worker([[I64VALUE]] : i64) vector([[I64VALUE]] : i64) { // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // CHECK-NEXT: acc.yield // CHECK-NEXT: } -// CHECK: acc.loop gang(num=[[I32VALUE]] : i32, static=[[IDXVALUE]] : index) { +// CHECK: acc.loop gang({num=[[I32VALUE]] : i32, static=[[IDXVALUE]] : index}) { // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // CHECK-NEXT: acc.yield // CHECK-NEXT: } -// CHECK: acc.loop tile([[I64VALUE]], [[I64VALUE]] : i64, i64) { +// CHECK: acc.loop tile({[[I64VALUE]] : i64, [[I64VALUE]] : i64}) { // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // CHECK-NEXT: acc.yield // CHECK-NEXT: } -// CHECK: acc.loop tile([[I32VALUE]], [[I32VALUE]] : i32, i32) { +// CHECK: acc.loop tile({[[I32VALUE]] : i32, [[I32VALUE]] : i32}) { // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // CHECK-NEXT: acc.yield // CHECK-NEXT: } -// CHECK: acc.loop gang(num=[[I64VALUE]] : i64, static=[[I64VALUE]] : i64) { +// CHECK: acc.loop gang({static=[[I64VALUE]] : i64, num=[[I64VALUE]] : i64}) { // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // CHECK-NEXT: acc.yield // CHECK-NEXT: } -// CHECK: acc.loop gang(dim=[[I64VALUE]] : i64, static=[[I64VALUE]] : i64) { +// CHECK: acc.loop gang({dim=[[I64VALUE]] : i64, static=[[I64VALUE]] : i64}) { // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // CHECK-NEXT: acc.yield // CHECK-NEXT: } From 5fd18bdef9e1f18d6069a542551f046f1a179b38 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 4 Jan 2024 16:47:09 -0800 Subject: [PATCH 299/313] Revert "XFAIL test with dsymutil" This reverts commit c041fa1093c3ad7be040fb362a10ca3900c698a4 as Adrian added support to dsymutil. 
--- .../functionalities/inline-sourcefile/TestInlineSourceFiles.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/lldb/test/API/functionalities/inline-sourcefile/TestInlineSourceFiles.py b/lldb/test/API/functionalities/inline-sourcefile/TestInlineSourceFiles.py index ce7ac6fc503ed..20ed0ce00661f 100644 --- a/lldb/test/API/functionalities/inline-sourcefile/TestInlineSourceFiles.py +++ b/lldb/test/API/functionalities/inline-sourcefile/TestInlineSourceFiles.py @@ -8,8 +8,6 @@ class InlineSourceFilesTestCase(TestBase): @skipIf(compiler="gcc") @skipIf(compiler="clang", compiler_version=["<", "18.0"]) - # dsymutil doesn't yet copy the sources - @expectedFailureDarwin(debug_info=["dsym"]) def test(self): """Test DWARF inline source files.""" self.build() From c1eef483b2c1ab2564e0ee1e4d1a30db11f8049f Mon Sep 17 00:00:00 2001 From: Uday Bondhugula Date: Fri, 5 Jan 2024 06:35:22 +0530 Subject: [PATCH 300/313] [MLIR] Support interrupting AffineExpr walks (#74792) Support WalkResult for AffineExpr walk and support interrupting walks along the lines of Operation::walk. This allows interrupted walks when a condition is met. Also, switch from std::function to llvm::function_ref for the walk function. --- mlir/include/mlir/IR/AffineExpr.h | 19 +++++++- mlir/include/mlir/IR/AffineExprVisitor.h | 56 ++++++++++++++++++---- mlir/lib/Dialect/Affine/Utils/Utils.cpp | 60 ++++++++++++------------ mlir/lib/IR/AffineExpr.cpp | 41 +++++++++++----- mlir/test/IR/affine-walk.mlir | 9 ++++ mlir/test/lib/IR/CMakeLists.txt | 1 + mlir/test/lib/IR/TestAffineWalk.cpp | 57 ++++++++++++++++++++++ mlir/tools/mlir-opt/mlir-opt.cpp | 12 +++-- 8 files changed, 196 insertions(+), 59 deletions(-) create mode 100644 mlir/test/IR/affine-walk.mlir create mode 100644 mlir/test/lib/IR/TestAffineWalk.cpp diff --git a/mlir/include/mlir/IR/AffineExpr.h b/mlir/include/mlir/IR/AffineExpr.h index 40e9d28ce5d3a..63314cc756355 100644 --- a/mlir/include/mlir/IR/AffineExpr.h +++ b/mlir/include/mlir/IR/AffineExpr.h @@ -14,6 +14,7 @@ #ifndef MLIR_IR_AFFINEEXPR_H #define MLIR_IR_AFFINEEXPR_H +#include "mlir/IR/Visitors.h" #include "mlir/Support/LLVM.h" #include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/Hashing.h" @@ -123,8 +124,13 @@ class AffineExpr { /// Return true if the affine expression involves AffineSymbolExpr `position`. bool isFunctionOfSymbol(unsigned position) const; - /// Walk all of the AffineExpr's in this expression in postorder. - void walk(std::function callback) const; + /// Walk all of the AffineExpr's in this expression in postorder. This allows + /// a lambda walk function that can either return `void` or a WalkResult. With + /// a WalkResult, interrupting is supported. + template > + RetT walk(FnT &&callback) const { + return walk(*this, callback); + } /// This method substitutes any uses of dimensions and symbols (e.g. /// dim#0 with dimReplacements[0]) and returns the modified expression tree. @@ -202,6 +208,15 @@ class AffineExpr { protected: ImplType *expr{nullptr}; + +private: + /// A trampoline for the templated non-static AffineExpr::walk method to + /// dispatch lambda `callback`'s of either a void result type or a + /// WalkResult type. Walk all of the AffineExprs in `e` in postorder. Users + /// should use the regular (non-static) `walk` method. + template + static WalkRetTy walk(AffineExpr e, + function_ref callback); }; /// Affine binary operation expression. 
An affine binary operation could be an diff --git a/mlir/include/mlir/IR/AffineExprVisitor.h b/mlir/include/mlir/IR/AffineExprVisitor.h index 2860e73c8f428..3e1bbb4b3fa0e 100644 --- a/mlir/include/mlir/IR/AffineExprVisitor.h +++ b/mlir/include/mlir/IR/AffineExprVisitor.h @@ -30,6 +30,9 @@ namespace mlir { /// functions in your class. This class is defined in terms of statically /// resolved overloading, not virtual functions. /// +/// The visitor is templated on its return type (`RetTy`). With a WalkResult +/// return type, the visitor supports interrupting walks. +/// /// For example, here is a visitor that counts the number of for AffineDimExprs /// in an AffineExpr. /// @@ -65,7 +68,6 @@ namespace mlir { /// virtual function call overhead. Defining and using a AffineExprVisitor is /// just as efficient as having your own switch instruction over the instruction /// opcode. - template class AffineExprVisitorBase { public: @@ -136,6 +138,8 @@ class AffineExprVisitorBase { RetTy visitSymbolExpr(AffineSymbolExpr expr) { return RetTy(); } }; +/// See documentation for AffineExprVisitorBase. This visitor supports +/// interrupting walks when a `WalkResult` is used for `RetTy`. template class AffineExprVisitor : public AffineExprVisitorBase { //===--------------------------------------------------------------------===// @@ -150,27 +154,52 @@ class AffineExprVisitor : public AffineExprVisitorBase { switch (expr.getKind()) { case AffineExprKind::Add: { auto binOpExpr = cast(expr); - walkOperandsPostOrder(binOpExpr); + if constexpr (std::is_same::value) { + if (walkOperandsPostOrder(binOpExpr).wasInterrupted()) + return WalkResult::interrupt(); + } else { + walkOperandsPostOrder(binOpExpr); + } return self->visitAddExpr(binOpExpr); } case AffineExprKind::Mul: { auto binOpExpr = cast(expr); - walkOperandsPostOrder(binOpExpr); + if constexpr (std::is_same::value) { + if (walkOperandsPostOrder(binOpExpr).wasInterrupted()) + return WalkResult::interrupt(); + } else { + walkOperandsPostOrder(binOpExpr); + } return self->visitMulExpr(binOpExpr); } case AffineExprKind::Mod: { auto binOpExpr = cast(expr); - walkOperandsPostOrder(binOpExpr); + if constexpr (std::is_same::value) { + if (walkOperandsPostOrder(binOpExpr).wasInterrupted()) + return WalkResult::interrupt(); + } else { + walkOperandsPostOrder(binOpExpr); + } return self->visitModExpr(binOpExpr); } case AffineExprKind::FloorDiv: { auto binOpExpr = cast(expr); - walkOperandsPostOrder(binOpExpr); + if constexpr (std::is_same::value) { + if (walkOperandsPostOrder(binOpExpr).wasInterrupted()) + return WalkResult::interrupt(); + } else { + walkOperandsPostOrder(binOpExpr); + } return self->visitFloorDivExpr(binOpExpr); } case AffineExprKind::CeilDiv: { auto binOpExpr = cast(expr); - walkOperandsPostOrder(binOpExpr); + if constexpr (std::is_same::value) { + if (walkOperandsPostOrder(binOpExpr).wasInterrupted()) + return WalkResult::interrupt(); + } else { + walkOperandsPostOrder(binOpExpr); + } return self->visitCeilDivExpr(binOpExpr); } case AffineExprKind::Constant: @@ -186,8 +215,19 @@ class AffineExprVisitor : public AffineExprVisitorBase { private: // Walk the operands - each operand is itself walked in post order. 
RetTy walkOperandsPostOrder(AffineBinaryOpExpr expr) { - walkPostOrder(expr.getLHS()); - walkPostOrder(expr.getRHS()); + if constexpr (std::is_same::value) { + if (walkPostOrder(expr.getLHS()).wasInterrupted()) + return WalkResult::interrupt(); + } else { + walkPostOrder(expr.getLHS()); + } + if constexpr (std::is_same::value) { + if (walkPostOrder(expr.getLHS()).wasInterrupted()) + return WalkResult::interrupt(); + return WalkResult::advance(); + } else { + return walkPostOrder(expr.getRHS()); + } } }; diff --git a/mlir/lib/Dialect/Affine/Utils/Utils.cpp b/mlir/lib/Dialect/Affine/Utils/Utils.cpp index 50a052fb8b74e..578d03c629285 100644 --- a/mlir/lib/Dialect/Affine/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/Utils.cpp @@ -1561,22 +1561,21 @@ static LogicalResult getTileSizePos( /// memref<4x?xf32, #map0> ==> memref<4x?x?xf32> static bool isNormalizedMemRefDynamicDim(unsigned dim, AffineMap layoutMap, - SmallVectorImpl &inMemrefTypeDynDims, - MLIRContext *context) { - bool isDynamicDim = false; + SmallVectorImpl &inMemrefTypeDynDims) { AffineExpr expr = layoutMap.getResults()[dim]; // Check if affine expr of the dimension includes dynamic dimension of input // memrefType. - expr.walk([&inMemrefTypeDynDims, &isDynamicDim, &context](AffineExpr e) { - if (isa(e)) { - for (unsigned dm : inMemrefTypeDynDims) { - if (e == getAffineDimExpr(dm, context)) { - isDynamicDim = true; - } - } - } - }); - return isDynamicDim; + MLIRContext *context = layoutMap.getContext(); + return expr + .walk([&](AffineExpr e) { + if (isa(e) && + llvm::any_of(inMemrefTypeDynDims, [&](unsigned dim) { + return e == getAffineDimExpr(dim, context); + })) + return WalkResult::interrupt(); + return WalkResult::advance(); + }) + .wasInterrupted(); } /// Create affine expr to calculate dimension size for a tiled-layout map. @@ -1792,29 +1791,28 @@ MemRefType mlir::affine::normalizeMemRefType(MemRefType memrefType) { MLIRContext *context = memrefType.getContext(); for (unsigned d = 0; d < newRank; ++d) { // Check if this dimension is dynamic. - bool isDynDim = - isNormalizedMemRefDynamicDim(d, layoutMap, memrefTypeDynDims, context); - if (isDynDim) { + if (bool isDynDim = + isNormalizedMemRefDynamicDim(d, layoutMap, memrefTypeDynDims)) { newShape[d] = ShapedType::kDynamic; - } else { - // The lower bound for the shape is always zero. - std::optional ubConst = fac.getConstantBound64(BoundType::UB, d); - // For a static memref and an affine map with no symbols, this is - // always bounded. However, when we have symbols, we may not be able to - // obtain a constant upper bound. Also, mapping to a negative space is - // invalid for normalization. - if (!ubConst.has_value() || *ubConst < 0) { - LLVM_DEBUG(llvm::dbgs() - << "can't normalize map due to unknown/invalid upper bound"); - return memrefType; - } - // If dimension of new memrefType is dynamic, the value is -1. - newShape[d] = *ubConst + 1; + continue; + } + // The lower bound for the shape is always zero. + std::optional ubConst = fac.getConstantBound64(BoundType::UB, d); + // For a static memref and an affine map with no symbols, this is + // always bounded. However, when we have symbols, we may not be able to + // obtain a constant upper bound. Also, mapping to a negative space is + // invalid for normalization. + if (!ubConst.has_value() || *ubConst < 0) { + LLVM_DEBUG(llvm::dbgs() + << "can't normalize map due to unknown/invalid upper bound"); + return memrefType; } + // If dimension of new memrefType is dynamic, the value is -1. 
+ newShape[d] = *ubConst + 1; } // Create the new memref type after trivializing the old layout map. - MemRefType newMemRefType = + auto newMemRefType = MemRefType::Builder(memrefType) .setShape(newShape) .setLayout(AffineMapAttr::get( diff --git a/mlir/lib/IR/AffineExpr.cpp b/mlir/lib/IR/AffineExpr.cpp index 038ceea286a36..a90b264a8edd2 100644 --- a/mlir/lib/IR/AffineExpr.cpp +++ b/mlir/lib/IR/AffineExpr.cpp @@ -26,22 +26,37 @@ MLIRContext *AffineExpr::getContext() const { return expr->context; } AffineExprKind AffineExpr::getKind() const { return expr->kind; } -/// Walk all of the AffineExprs in this subgraph in postorder. -void AffineExpr::walk(std::function callback) const { - struct AffineExprWalker : public AffineExprVisitor { - std::function callback; - - AffineExprWalker(std::function callback) - : callback(std::move(callback)) {} - - void visitAffineBinaryOpExpr(AffineBinaryOpExpr expr) { callback(expr); } - void visitConstantExpr(AffineConstantExpr expr) { callback(expr); } - void visitDimExpr(AffineDimExpr expr) { callback(expr); } - void visitSymbolExpr(AffineSymbolExpr expr) { callback(expr); } +/// Walk all of the AffineExprs in `e` in postorder. This is a private factory +/// method to help handle lambda walk functions. Users should use the regular +/// (non-static) `walk` method. +template +WalkRetTy mlir::AffineExpr::walk(AffineExpr e, + function_ref callback) { + struct AffineExprWalker + : public AffineExprVisitor { + function_ref callback; + + AffineExprWalker(function_ref callback) + : callback(callback) {} + + WalkRetTy visitAffineBinaryOpExpr(AffineBinaryOpExpr expr) { + return callback(expr); + } + WalkRetTy visitConstantExpr(AffineConstantExpr expr) { + return callback(expr); + } + WalkRetTy visitDimExpr(AffineDimExpr expr) { return callback(expr); } + WalkRetTy visitSymbolExpr(AffineSymbolExpr expr) { return callback(expr); } }; - AffineExprWalker(std::move(callback)).walkPostOrder(*this); + return AffineExprWalker(callback).walkPostOrder(e); } +// Explicitly instantiate for the two supported return types. +template void mlir::AffineExpr::walk(AffineExpr e, + function_ref callback); +template WalkResult +mlir::AffineExpr::walk(AffineExpr e, + function_ref callback); // Dispatch affine expression construction based on kind. AffineExpr mlir::getAffineBinaryOpExpr(AffineExprKind kind, AffineExpr lhs, diff --git a/mlir/test/IR/affine-walk.mlir b/mlir/test/IR/affine-walk.mlir new file mode 100644 index 0000000000000..1de675ac70be2 --- /dev/null +++ b/mlir/test/IR/affine-walk.mlir @@ -0,0 +1,9 @@ +// RUN: mlir-opt -test-affine-walk -verify-diagnostics %s + +// Test affine walk interrupt. A remark should be printed only for the first mod +// expression encountered in post order. 
+ +#map = affine_map<(i, j) -> ((i mod 4) mod 2, j)> + +"test.check_first_mod"() {"map" = #map} : () -> () +// expected-remark@-1 {{mod expression}} diff --git a/mlir/test/lib/IR/CMakeLists.txt b/mlir/test/lib/IR/CMakeLists.txt index 69c63fd7e524b..faaa3bb8db24c 100644 --- a/mlir/test/lib/IR/CMakeLists.txt +++ b/mlir/test/lib/IR/CMakeLists.txt @@ -1,5 +1,6 @@ # Exclude tests from libMLIR.so add_mlir_library(MLIRTestIR + TestAffineWalk.cpp TestBytecodeRoundtrip.cpp TestBuiltinAttributeInterfaces.cpp TestBuiltinDistinctAttributes.cpp diff --git a/mlir/test/lib/IR/TestAffineWalk.cpp b/mlir/test/lib/IR/TestAffineWalk.cpp new file mode 100644 index 0000000000000..8361b48ce4285 --- /dev/null +++ b/mlir/test/lib/IR/TestAffineWalk.cpp @@ -0,0 +1,57 @@ +//===- TestAffineWalk.cpp - Pass to test affine walks +//----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Pass/Pass.h" + +#include "mlir/IR/BuiltinOps.h" + +using namespace mlir; + +namespace { +/// A test pass for verifying walk interrupts. +struct TestAffineWalk + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestAffineWalk) + + void runOnOperation() override; + StringRef getArgument() const final { return "test-affine-walk"; } + StringRef getDescription() const final { return "Test affine walk method."; } +}; +} // namespace + +/// Emits a remark for the first `map`'s result expression that contains a +/// mod expression. +static void checkMod(AffineMap map, Location loc) { + for (AffineExpr e : map.getResults()) { + e.walk([&](AffineExpr s) { + if (s.getKind() == mlir::AffineExprKind::Mod) { + emitRemark(loc, "mod expression: "); + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }); + } +} + +void TestAffineWalk::runOnOperation() { + auto m = getOperation(); + // Test whether the walk is being correctly interrupted. 
+ m.walk([](Operation *op) { + for (NamedAttribute attr : op->getAttrs()) { + auto mapAttr = attr.getValue().dyn_cast(); + if (!mapAttr) + return; + checkMod(mapAttr.getAffineMap(), op->getLoc()); + } + }); +} + +namespace mlir { +void registerTestAffineWalk() { PassRegistration(); } +} // namespace mlir diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp index 3fce7a7eea9ec..09ff66e07957a 100644 --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -44,11 +44,12 @@ void registerSymbolTestPasses(); void registerRegionTestPasses(); void registerTestAffineDataCopyPass(); void registerTestAffineReifyValueBoundsPass(); +void registerTestAffineLoopUnswitchingPass(); +void registerTestAffineWalk(); void registerTestBytecodeRoundtripPasses(); void registerTestDecomposeAffineOpPass(); -void registerTestAffineLoopUnswitchingPass(); -void registerTestGpuLoweringPasses(); void registerTestFunc(); +void registerTestGpuLoweringPasses(); void registerTestGpuMemoryPromotionPass(); void registerTestLoopPermutationPass(); void registerTestMatchers(); @@ -167,12 +168,13 @@ void registerTestPasses() { registerSymbolTestPasses(); registerRegionTestPasses(); registerTestAffineDataCopyPass(); - registerTestAffineReifyValueBoundsPass(); - registerTestDecomposeAffineOpPass(); registerTestAffineLoopUnswitchingPass(); - registerTestGpuLoweringPasses(); + registerTestAffineReifyValueBoundsPass(); + registerTestAffineWalk(); registerTestBytecodeRoundtripPasses(); + registerTestDecomposeAffineOpPass(); registerTestFunc(); + registerTestGpuLoweringPasses(); registerTestGpuMemoryPromotionPass(); registerTestLoopPermutationPass(); registerTestMatchers(); From 47685633a7dc74451acbc551b111929166d4d0bd Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 5 Jan 2024 08:35:07 +0700 Subject: [PATCH 301/313] AMDGPU: Make v4bf16 a legal type (#76217) Gets a few code quality improvements. A few cases are worse from losing load narrowing. 
Depends #76213 #76214 #76215 --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 19 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 25 +- llvm/lib/Target/AMDGPU/SIInstructions.td | 13 + llvm/test/CodeGen/AMDGPU/bf16.ll | 11487 +++++++--------- llvm/test/CodeGen/AMDGPU/function-args.ll | 61 +- .../CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll | 101 +- llvm/test/CodeGen/AMDGPU/select-undef.ll | 3 +- .../CodeGen/AMDGPU/vector_shuffle.packed.ll | 399 +- 8 files changed, 5678 insertions(+), 6430 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index b70d33d58b748..2f663571a8f97 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -389,15 +389,16 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, Custom); setOperationAction( ISD::EXTRACT_SUBVECTOR, - {MVT::v2f16, MVT::v2i16, MVT::v4f16, MVT::v4i16, MVT::v2f32, - MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32, MVT::v4i32, - MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32, MVT::v7f32, - MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32, MVT::v9i32, - MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, - MVT::v12f32, MVT::v16f16, MVT::v16i16, MVT::v16f32, MVT::v16i32, - MVT::v32f32, MVT::v32i32, MVT::v2f64, MVT::v2i64, MVT::v3f64, - MVT::v3i64, MVT::v4f64, MVT::v4i64, MVT::v8f64, MVT::v8i64, - MVT::v16f64, MVT::v16i64, MVT::v32i16, MVT::v32f16}, + {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v4f16, MVT::v4i16, + MVT::v4bf16, MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, + MVT::v4f32, MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, + MVT::v6i32, MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, + MVT::v9f32, MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, + MVT::v11f32, MVT::v12i32, MVT::v12f32, MVT::v16f16, MVT::v16i16, + MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32, MVT::v2f64, + MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64, MVT::v4i64, + MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64, MVT::v32i16, + MVT::v32f16}, Custom); setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index a89ef658734b2..c84a0934ca813 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -164,6 +164,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass); addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass); + addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass); @@ -312,10 +313,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16, - MVT::v4f16, MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32, - MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, MVT::v8i16, - MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v16i64, MVT::v16f64, - MVT::v32i32, MVT::v32f32, MVT::v32i16, MVT::v32f16}) { + MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32, + MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, + MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v16i64, + 
MVT::v16f64, MVT::v32i32, MVT::v32f32, MVT::v32i16, MVT::v32f16}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -421,13 +422,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}, Expand); - setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16}, Custom); + setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16}, + Custom); // Avoid stack access for these. // TODO: Generalize to more vector types. setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT}, {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8, - MVT::v8i8, MVT::v4i16, MVT::v4f16}, + MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16}, Custom); // Deal with vec3 vector operations when widened to vec4. @@ -667,11 +669,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32); setOperationAction(ISD::LOAD, MVT::v4f16, Promote); AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32); + setOperationAction(ISD::LOAD, MVT::v4bf16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32); setOperationAction(ISD::STORE, MVT::v4i16, Promote); AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32); setOperationAction(ISD::STORE, MVT::v4f16, Promote); AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32); + setOperationAction(ISD::STORE, MVT::v4bf16, Promote); + AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32); setOperationAction(ISD::LOAD, MVT::v8i16, Promote); AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32); @@ -781,7 +787,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, Custom); setOperationAction(ISD::FEXP, MVT::v2f16, Custom); - setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16}, Custom); + setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16}, + Custom); if (Subtarget->hasPackedFP32Ops()) { setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG}, @@ -6805,7 +6812,7 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, SDLoc SL(Op); EVT VT = Op.getValueType(); - if (VT == MVT::v4i16 || VT == MVT::v4f16 || + if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 || VT == MVT::v8i16 || VT == MVT::v8f16) { EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), VT.getVectorNumElements() / 2); @@ -6871,7 +6878,7 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, return DAG.getNode(ISD::BITCAST, SL, VT, Blend); } - assert(VT == MVT::v2f16 || VT == MVT::v2i16); + assert(VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16); assert(!Subtarget->hasVOP3PInsts() && "this should be legal"); SDValue Lo = Op.getOperand(0); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index c299206079670..e8c4d805dbba9 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1548,6 +1548,19 @@ def : BitConvert ; def : BitConvert ; def : BitConvert ; def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; + // FIXME: Make SGPR def : BitConvert ; diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 5243262117e72..2a3417e241855 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -4133,9 
+4133,7 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: v_writelane_b32 v3, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: scratch_store_b32 v2, v1, off offset:4 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b32 v2, v0, off dlc +; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_readlane_b32 s31, v3, 1 ; GFX11-NEXT: v_readlane_b32 s30, v3, 0 @@ -4397,18 +4395,12 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: v_writelane_b32 v5, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: v_add_nc_u32_e32 v6, 12, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_readlane_b32 s31, v5, 1 -; GFX11-NEXT: v_readlane_b32 s30, v5, 0 -; GFX11-NEXT: scratch_store_b32 v6, v3, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b32 v4, v2, off offset:8 dlc +; GFX11-NEXT: scratch_store_b64 v4, v[2:3], off offset:8 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b32 v4, v1, off offset:4 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b32 v4, v0, off dlc +; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_readlane_b32 s31, v5, 1 +; GFX11-NEXT: v_readlane_b32 s30, v5, 0 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v5, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 @@ -4759,28 +4751,18 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: v_writelane_b32 v9, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: v_add_nc_u32_e32 v10, 28, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 24, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v12, 20, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 12, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 24, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_readlane_b32 s31, v9, 1 -; GFX11-NEXT: scratch_store_b32 v10, v7, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b32 v11, v6, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b32 v12, v5, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b32 v8, v4, off offset:16 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b32 v13, v3, off dlc +; GFX11-NEXT: v_readlane_b32 s30, v9, 0 +; GFX11-NEXT: scratch_store_b64 v10, v[6:7], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b32 v8, v2, off offset:8 dlc +; GFX11-NEXT: scratch_store_b64 v8, v[4:5], off offset:16 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b32 v8, v1, off offset:4 dlc +; GFX11-NEXT: scratch_store_b64 v8, v[2:3], off offset:8 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b32 v8, v0, off dlc +; GFX11-NEXT: scratch_store_b64 v8, v[0:1], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_readlane_b32 s30, v9, 0 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v9, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 @@ -8691,83 +8673,52 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-LABEL: v_fadd_v3bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v4, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fadd_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_add_f32_e32 v2, v4, v2 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_add_f32_e32 v4, v5, v4 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_f32_e32 v2, v7, v6 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 +; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_fadd_v3bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v1, v1, v3 
:: v_dual_and_b32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v4, v5, v4 -; GFX11-NEXT: v_add_f32_e32 v2, v7, v6 -; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 -; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fadd <3 x bfloat> %a, %b ret <3 x bfloat> %op } @@ -8818,82 +8769,81 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-LABEL: v_fadd_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_f32_e32 v4, v5, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v4, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fadd_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_add_f32_e32 v3, v5, v3 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_add_f32_e32 v2, v4, v2 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, 
v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_f32_e32 v2, v7, v6 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 +; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fadd_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v4, v5, v4 -; GFX11-NEXT: v_add_f32_e32 v2, v7, v6 -; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_dual_add_f32 v4, v5, v4 :: v_dual_and_b32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fadd <4 x bfloat> %a, %b ret <4 x bfloat> %op @@ -8977,138 +8927,138 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-LABEL: v_fadd_v8bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_add_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v8, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_add_f32_e32 v7, v9, v7 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1 ; 
GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_add_f32_e32 v6, v9, v6 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX8-NEXT: v_add_f32_e32 v4, v8, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_add_f32_e32 v4, v5, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v4, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GFX8-NEXT: v_add_f32_e32 v4, v5, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v9, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fadd_v8bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v7, v9, v7 +; GFX9-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v6, v9, v6 +; GFX9-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_add_f32_e32 v5, v9, v5 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v8, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_add_f32_e32 v4, v8, v4 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX9-NEXT: v_perm_b32 v2, v2, v4, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; 
GFX9-NEXT: v_add_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX9-NEXT: v_perm_b32 v3, v3, v4, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4 +; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_v8bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v9, v11, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GFX10-NEXT: v_add_f32_e32 v8, v9, v8 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_add_f32_e32 v4, v11, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v10, v11, v10 +; GFX10-NEXT: v_add_f32_e32 v11, v13, v12 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX10-NEXT: v_add_f32_e32 v5, v10, v9 ; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-NEXT: v_add_f32_e32 v6, v12, v11 ; GFX10-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 -; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302 -; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302 +; GFX10-NEXT: v_perm_b32 v0, v0, v11, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v10, 0x7060302 +; GFX10-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 +; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fadd_v8bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_dual_add_f32 v8, v9, v8 :: v_dual_add_f32 v9, v11, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1 
-; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_and_b32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX11-NEXT: v_add_f32_e32 v4, v11, v10 -; GFX11-NEXT: v_dual_add_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v7 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX11-NEXT: v_add_f32_e32 v5, v10, v9 -; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX11-NEXT: v_add_f32_e32 v6, v12, v11 +; GFX11-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_and_b32 v6, 0xffff0000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v2, v2, v6 :: v_dual_and_b32 v3, 0xffff0000, v3 ; GFX11-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v2, v2, v5, 0x7060302 -; GFX11-NEXT: v_perm_b32 v3, v3, v6, 0x7060302 +; GFX11-NEXT: v_dual_add_f32 v10, v11, v10 :: v_dual_add_f32 v11, v13, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v1, v1, v10, 0x7060302 +; GFX11-NEXT: v_perm_b32 v0, v0, v11, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fadd <8 x bfloat> %a, %b ret <8 x bfloat> %op @@ -9260,252 +9210,254 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-LABEL: v_fadd_v16bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_add_f32_e32 v16, v17, v16 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; GFX8-NEXT: v_add_f32_e32 v7, v7, v15 +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v15, v17, v15 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: v_add_f32_e32 v14, v17, v14 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; GFX8-NEXT: 
v_lshlrev_b32_e32 v17, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v13, v17, v13 +; GFX8-NEXT: v_add_f32_e32 v4, v4, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v12, v17, v12 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_add_f32_e32 v11, v17, v11 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_add_f32_e32 v10, v17, v10 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v9 -; GFX8-NEXT: v_add_f32_e32 v8, v16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v8, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX8-NEXT: v_add_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v8, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX8-NEXT: v_add_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v11 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v4 -; GFX8-NEXT: v_add_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v4, v4, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_alignbit_b32 v4, v4, v8, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v13 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v5 -; GFX8-NEXT: v_add_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v13 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v8, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v6 -; GFX8-NEXT: v_add_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v14 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v6, v6, v8, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v15 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v7 -; GFX8-NEXT: v_add_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v15 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX8-NEXT: v_add_f32_e32 v7, v7, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v8 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 
16, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_add_f32_e32 v9, v17, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16 +; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16 +; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16 +; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fadd_v16bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_add_f32_e32 v7, v7, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_add_f32_e32 v15, v17, v15 +; GFX9-NEXT: v_add_f32_e32 v6, v6, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_add_f32_e32 v14, v17, v14 +; GFX9-NEXT: v_add_f32_e32 v5, v5, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_add_f32_e32 v13, v17, v13 +; GFX9-NEXT: v_add_f32_e32 v4, v4, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v12, v17, v12 +; GFX9-NEXT: v_add_f32_e32 v3, v3, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v11, v17, v11 +; GFX9-NEXT: v_add_f32_e32 v2, v2, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v10, v17, v10 +; GFX9-NEXT: v_add_f32_e32 v1, v1, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_add_f32_e32 v9, v17, v9 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v8 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_add_f32_e32 v8, v16, v8 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v9 -; GFX9-NEXT: v_perm_b32 v1, v1, v8, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_add_f32_e32 v2, 
v2, v9 -; GFX9-NEXT: v_perm_b32 v2, v2, v8, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v11 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_add_f32_e32 v3, v3, v9 -; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v4 -; GFX9-NEXT: v_add_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_add_f32_e32 v4, v4, v9 -; GFX9-NEXT: v_perm_b32 v4, v4, v8, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v5 -; GFX9-NEXT: v_add_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v13 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX9-NEXT: v_perm_b32 v5, v5, v8, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v6 -; GFX9-NEXT: v_add_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v14 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_add_f32_e32 v6, v6, v9 -; GFX9-NEXT: v_perm_b32 v6, v6, v8, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v7 -; GFX9-NEXT: v_add_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v15 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_add_f32_e32 v7, v7, v9 -; GFX9-NEXT: v_perm_b32 v7, v7, v8, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4 +; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4 +; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4 +; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4 +; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4 +; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4 +; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_v16bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v5 ; GFX10-NEXT: v_add_f32_e32 v16, v17, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v9 -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX10-NEXT: v_add_f32_e32 v7, v7, v15 +; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v4 +; GFX10-NEXT: v_add_f32_e32 v15, v17, v15 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v14 +; GFX10-NEXT: v_add_f32_e32 v14, v19, v18 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v13 +; GFX10-NEXT: v_add_f32_e32 v13, v21, v20 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v12, 
0xffff0000, v12 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v17, v18, v17 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v18, v20, v19 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_add_f32_e32 v19, v20, v19 +; GFX10-NEXT: v_add_f32_e32 v20, v22, v21 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v8 -; GFX10-NEXT: v_add_f32_e32 v8, v18, v17 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v9 -; GFX10-NEXT: v_add_f32_e32 v9, v20, v19 ; GFX10-NEXT: v_add_f32_e32 v2, v2, v10 -; GFX10-NEXT: v_perm_b32 v0, v0, v16, 0x7060302 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v11 -; GFX10-NEXT: v_perm_b32 v1, v1, v8, 0x7060302 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GFX10-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v13 -; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; GFX10-NEXT: v_add_f32_e32 v8, v9, v8 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v10 -; GFX10-NEXT: v_add_f32_e32 v9, v16, v11 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v11 ; GFX10-NEXT: v_add_f32_e32 v4, v4, v12 -; GFX10-NEXT: v_add_f32_e32 v10, v18, v17 -; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v11 -; GFX10-NEXT: v_add_f32_e32 v11, v13, v12 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v14 -; GFX10-NEXT: v_add_f32_e32 v12, v17, v16 -; GFX10-NEXT: v_add_f32_e32 v7, v7, v15 -; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 -; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x7060302 -; GFX10-NEXT: v_perm_b32 v5, v5, v10, 0x7060302 -; GFX10-NEXT: v_perm_b32 v6, v6, v11, 0x7060302 -; GFX10-NEXT: v_perm_b32 v7, v7, v12, 0x7060302 +; GFX10-NEXT: v_perm_b32 v0, v0, v20, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302 +; GFX10-NEXT: v_perm_b32 v2, v2, v18, 0x7060302 +; GFX10-NEXT: v_perm_b32 v3, v3, v17, 0x7060302 +; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 +; GFX10-NEXT: v_perm_b32 v5, v5, v14, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v6, v15, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fadd_v16bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v2 -; 
GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v10 -; GFX11-NEXT: v_dual_add_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v9 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v1, v1, v9 :: v_dual_and_b32 v10, 0xffff0000, v11 -; GFX11-NEXT: v_add_f32_e32 v9, v20, v19 -; GFX11-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v3, v3, v10 :: v_dual_and_b32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v8 -; GFX11-NEXT: v_add_f32_e32 v8, v18, v17 -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-NEXT: v_perm_b32 v0, v0, v16, 0x7060302 -; GFX11-NEXT: v_perm_b32 v1, v1, v8, 0x7060302 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_dual_add_f32 v10, v18, v17 :: v_dual_lshlrev_b32 v17, 16, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_add_f32 v4, v4, v12 :: v_dual_and_b32 v7, 0xffff0000, v7 -; GFX11-NEXT: v_dual_add_f32 v8, v9, v8 :: v_dual_add_f32 v9, v16, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v4, v4, v12 :: v_dual_and_b32 v5, 0xffff0000, v5 ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15 -; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v13 +; GFX11-NEXT: v_add_f32_e32 v13, v21, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-NEXT: v_dual_add_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v0, v0, v8 :: v_dual_and_b32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v7, v7, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v11 -; GFX11-NEXT: v_add_f32_e32 v11, v13, v12 -; GFX11-NEXT: 
v_add_f32_e32 v12, v17, v16 -; GFX11-NEXT: v_dual_add_f32 v6, v6, v14 :: v_dual_add_f32 v7, v7, v15 -; GFX11-NEXT: v_perm_b32 v4, v4, v9, 0x7060302 -; GFX11-NEXT: v_perm_b32 v5, v5, v10, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v6, v6, v11, 0x7060302 -; GFX11-NEXT: v_perm_b32 v7, v7, v12, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v15, v17, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v6, v6, v14 :: v_dual_lshlrev_b32 v17, 16, v11 +; GFX11-NEXT: v_add_f32_e32 v14, v19, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v17, v18, v17 :: v_dual_and_b32 v10, 0xffff0000, v10 +; GFX11-NEXT: v_perm_b32 v5, v5, v14, 0x7060302 +; GFX11-NEXT: v_perm_b32 v6, v6, v15, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v11 +; GFX11-NEXT: v_dual_add_f32 v18, v20, v19 :: v_dual_lshlrev_b32 v19, 16, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: v_dual_add_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_perm_b32 v3, v3, v17, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v19, v20, v19 :: v_dual_add_f32 v20, v22, v21 +; GFX11-NEXT: v_add_f32_e32 v1, v1, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v2, v2, v18, 0x7060302 +; GFX11-NEXT: v_perm_b32 v0, v0, v20, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v1, v1, v19, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fadd <16 x bfloat> %a, %b ret <16 x bfloat> %op @@ -9913,483 +9865,480 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-LABEL: v_fadd_v32bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v16 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX8-NEXT: v_add_f32_e32 v31, v32, v31 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v31, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; GFX8-NEXT: v_add_f32_e32 v30, v14, v30 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX8-NEXT: v_add_f32_e32 v14, v32, v14 +; GFX8-NEXT: v_add_f32_e32 v13, v13, v29 +; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX8-NEXT: v_add_f32_e32 v29, v32, v29 +; GFX8-NEXT: v_add_f32_e32 v12, v12, v28 +; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, 
v27 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX8-NEXT: v_add_f32_e32 v28, v32, v28 +; GFX8-NEXT: v_add_f32_e32 v11, v11, v27 +; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX8-NEXT: v_add_f32_e32 v27, v32, v27 +; GFX8-NEXT: v_add_f32_e32 v10, v10, v26 +; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX8-NEXT: v_add_f32_e32 v26, v32, v26 +; GFX8-NEXT: v_add_f32_e32 v9, v9, v25 +; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX8-NEXT: v_add_f32_e32 v8, v8, v24 +; GFX8-NEXT: buffer_load_dword v24, off, s[0:3], s32 +; GFX8-NEXT: v_add_f32_e32 v25, v32, v25 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16 +; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16 +; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16 +; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16 +; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16 +; GFX8-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v24 +; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX8-NEXT: v_add_f32_e32 v32, v32, v33 +; GFX8-NEXT: v_add_f32_e32 v15, v15, v24 +; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: v_add_f32_e32 v24, v33, v24 +; GFX8-NEXT: v_add_f32_e32 v7, v7, v23 +; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v23, v33, v23 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v22 +; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: v_add_f32_e32 v22, v33, v22 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v21, v33, v21 +; GFX8-NEXT: v_add_f32_e32 v4, v4, v20 +; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v20, v33, v20 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_add_f32_e32 v19, v33, v19 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v18 +; 
GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_add_f32_e32 v18, v33, v18 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v17 -; GFX8-NEXT: v_add_f32_e32 v16, v31, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v16 +; GFX8-NEXT: v_add_f32_e32 v17, v33, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GFX8-NEXT: v_add_f32_e32 v16, v17, v16 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v17 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v19 -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; GFX8-NEXT: v_add_f32_e32 v16, v17, v16 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v19 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v17 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v3, v3, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v20 -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4 -; GFX8-NEXT: v_add_f32_e32 v16, v17, v16 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v4, v4, v17 -; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_alignbit_b32 v4, v4, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v21 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; GFX8-NEXT: v_add_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v21 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v18 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v22 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v6 -; GFX8-NEXT: v_add_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v22 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v18 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v6, v6, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v23 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; GFX8-NEXT: v_add_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v23 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX8-NEXT: v_add_f32_e32 v7, v7, v18 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v24 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; GFX8-NEXT: v_add_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v24 -; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX8-NEXT: v_add_f32_e32 v8, v8, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_alignbit_b32 v8, v8, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v25 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; GFX8-NEXT: v_add_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v25 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX8-NEXT: v_add_f32_e32 v9, v9, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; 
GFX8-NEXT: v_alignbit_b32 v9, v9, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v26 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v10 -; GFX8-NEXT: v_add_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v26 -; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX8-NEXT: v_add_f32_e32 v10, v10, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: v_alignbit_b32 v10, v10, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v27 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v11 -; GFX8-NEXT: v_add_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v27 -; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX8-NEXT: v_add_f32_e32 v11, v11, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: v_alignbit_b32 v11, v11, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v28 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v12 -; GFX8-NEXT: v_add_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v28 -; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX8-NEXT: v_add_f32_e32 v12, v12, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_alignbit_b32 v12, v12, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v29 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; GFX8-NEXT: v_add_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v29 -; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX8-NEXT: v_add_f32_e32 v13, v13, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX8-NEXT: v_alignbit_b32 v13, v13, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v30 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v14 -; GFX8-NEXT: v_add_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v30 -; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX8-NEXT: v_add_f32_e32 v14, v14, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX8-NEXT: v_alignbit_b32 v14, v14, v16, 16 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v15 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX8-NEXT: v_add_f32_e32 v15, v15, v17 -; GFX8-NEXT: v_add_f32_e32 v16, v18, v16 ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX8-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16 +; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16 +; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16 +; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16 +; GFX8-NEXT: v_alignbit_b32 v14, v16, v31, 16 +; GFX8-NEXT: v_alignbit_b32 v15, v15, v32, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fadd_v32bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX9-NEXT: v_add_f32_e32 v31, v32, v31 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v16 +; GFX9-NEXT: v_add_f32_e32 v14, v14, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX9-NEXT: 
v_and_b32_e32 v13, 0xffff0000, v13 +; GFX9-NEXT: v_add_f32_e32 v30, v32, v30 +; GFX9-NEXT: v_add_f32_e32 v13, v13, v29 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_add_f32_e32 v29, v32, v29 +; GFX9-NEXT: v_add_f32_e32 v12, v12, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_add_f32_e32 v28, v32, v28 +; GFX9-NEXT: v_add_f32_e32 v11, v11, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_add_f32_e32 v27, v32, v27 +; GFX9-NEXT: v_add_f32_e32 v10, v10, v26 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_add_f32_e32 v26, v32, v26 +; GFX9-NEXT: v_add_f32_e32 v9, v9, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_add_f32_e32 v8, v8, v24 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 +; GFX9-NEXT: v_add_f32_e32 v25, v32, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v31, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4 +; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4 +; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4 +; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4 +; GFX9-NEXT: v_perm_b32 v12, v12, v29, s4 +; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4 +; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX9-NEXT: v_add_f32_e32 v32, v32, v33 +; GFX9-NEXT: v_add_f32_e32 v15, v15, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_add_f32_e32 v24, v33, v24 +; GFX9-NEXT: v_add_f32_e32 v7, v7, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_add_f32_e32 v23, v33, v23 +; GFX9-NEXT: v_add_f32_e32 v6, v6, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_add_f32_e32 v22, v33, v22 +; GFX9-NEXT: v_add_f32_e32 v5, v5, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_add_f32_e32 v21, v33, v21 +; GFX9-NEXT: v_add_f32_e32 v4, v4, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; 
GFX9-NEXT: v_add_f32_e32 v20, v33, v20 +; GFX9-NEXT: v_add_f32_e32 v3, v3, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v19, v33, v19 +; GFX9-NEXT: v_add_f32_e32 v2, v2, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_add_f32_e32 v16, v31, v16 +; GFX9-NEXT: v_add_f32_e32 v18, v33, v18 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v17 -; GFX9-NEXT: v_perm_b32 v1, v1, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v17 -; GFX9-NEXT: v_perm_b32 v2, v2, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v19 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_add_f32_e32 v3, v3, v17 -; GFX9-NEXT: v_perm_b32 v3, v3, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v20 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 -; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_add_f32_e32 v4, v4, v17 -; GFX9-NEXT: v_perm_b32 v4, v4, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 -; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v21 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_add_f32_e32 v5, v5, v17 -; GFX9-NEXT: v_perm_b32 v5, v5, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v22 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 -; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v22 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_add_f32_e32 v6, v6, v17 -; GFX9-NEXT: v_perm_b32 v6, v6, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v23 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_add_f32_e32 v7, v7, v17 -; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v24 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v8 -; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v24 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_add_f32_e32 v8, v8, v17 -; GFX9-NEXT: v_perm_b32 v8, v8, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v9 -; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v25 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_add_f32_e32 v9, v9, v17 -; GFX9-NEXT: v_perm_b32 v9, v9, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v26 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v10 -; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v26 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_add_f32_e32 v10, v10, v17 -; GFX9-NEXT: v_perm_b32 v10, v10, v16, s4 -; GFX9-NEXT: 
v_lshlrev_b32_e32 v16, 16, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v11 -; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v27 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_add_f32_e32 v11, v11, v17 -; GFX9-NEXT: v_perm_b32 v11, v11, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v28 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v12 -; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v28 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_add_f32_e32 v12, v12, v17 -; GFX9-NEXT: v_perm_b32 v12, v12, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v29 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v13 -; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v29 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_add_f32_e32 v13, v13, v17 -; GFX9-NEXT: v_perm_b32 v13, v13, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v30 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v14 -; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v30 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_add_f32_e32 v14, v14, v17 -; GFX9-NEXT: v_perm_b32 v14, v14, v16, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v15 -; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_add_f32_e32 v15, v15, v17 -; GFX9-NEXT: v_perm_b32 v15, v15, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v17, v33, v17 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v16 +; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4 +; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4 +; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4 +; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4 +; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4 +; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4 +; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4 +; GFX9-NEXT: v_perm_b32 v15, v15, v32, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_v32bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v21 -; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v5 -; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v22 -; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v23 -; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v7 -; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v17 -; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11 +; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, 
v10 +; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX10-NEXT: v_add_f32_e32 v39, v48, v39 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17 +; GFX10-NEXT: v_add_f32_e32 v11, v11, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v18 -; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v19 -; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v20 -; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v21 -; GFX10-NEXT: v_add_f32_e32 v21, v53, v52 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v22 -; GFX10-NEXT: v_add_f32_e32 v22, v55, v54 -; GFX10-NEXT: v_add_f32_e32 v7, v7, v23 -; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v24 -; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v8 -; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v25 -; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v9 +; GFX10-NEXT: v_add_f32_e32 v49, v50, v49 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16 +; GFX10-NEXT: v_add_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9 ; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v26 -; GFX10-NEXT: v_add_f32_e32 v32, v33, v32 -; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v10 -; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24 +; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8 +; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22 +; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21 +; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, 
v5 +; GFX10-NEXT: v_add_f32_e32 v33, v34, v33 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20 +; GFX10-NEXT: v_add_f32_e32 v14, v14, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v35, v36, v35 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19 +; GFX10-NEXT: v_add_f32_e32 v13, v13, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v37, v38, v37 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18 +; GFX10-NEXT: v_add_f32_e32 v12, v12, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v27 -; GFX10-NEXT: v_add_f32_e32 v34, v35, v34 -; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v11 -; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v17 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v28 -; GFX10-NEXT: v_add_f32_e32 v36, v37, v36 -; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v12 -; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX10-NEXT: v_add_f32_e32 v51, v52, v51 +; GFX10-NEXT: v_add_f32_e32 v9, v9, v25 +; GFX10-NEXT: v_add_f32_e32 v25, v54, v53 +; GFX10-NEXT: v_add_f32_e32 v8, v8, v24 +; GFX10-NEXT: v_add_f32_e32 v24, v64, v55 +; GFX10-NEXT: v_add_f32_e32 v7, v7, v23 +; GFX10-NEXT: v_add_f32_e32 v23, v66, v65 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v22 +; GFX10-NEXT: v_add_f32_e32 v22, v68, v67 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v21 +; GFX10-NEXT: v_add_f32_e32 v21, v30, v34 +; GFX10-NEXT: v_add_f32_e32 v29, v29, v36 +; GFX10-NEXT: v_add_f32_e32 v28, v28, v38 +; GFX10-NEXT: v_add_f32_e32 v27, v27, v48 +; GFX10-NEXT: v_add_f32_e32 v26, v26, v50 ; GFX10-NEXT: v_add_f32_e32 v2, v2, v18 -; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; GFX10-NEXT: v_add_f32_e32 v38, v39, v38 -; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v13 -; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX10-NEXT: v_add_f32_e32 v3, v3, v19 -; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v30 -; GFX10-NEXT: v_add_f32_e32 v48, v49, v48 -; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v14 -; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX10-NEXT: v_add_f32_e32 v4, v4, v20 -; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v15 -; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX10-NEXT: v_perm_b32 v6, v6, v21, 0x7060302 -; GFX10-NEXT: v_perm_b32 v7, v7, v22, 0x7060302 -; GFX10-NEXT: v_add_f32_e32 v50, v51, v50 -; GFX10-NEXT: v_add_f32_e32 v23, v65, v64 -; GFX10-NEXT: v_add_f32_e32 v8, v8, v24 -; GFX10-NEXT: v_add_f32_e32 v24, v67, v66 -; GFX10-NEXT: v_add_f32_e32 v9, v9, v25 -; GFX10-NEXT: v_add_f32_e32 v25, v33, v68 -; GFX10-NEXT: v_add_f32_e32 v10, v10, v26 -; GFX10-NEXT: v_add_f32_e32 v16, v35, v16 -; GFX10-NEXT: v_add_f32_e32 v11, v11, v27 -; GFX10-NEXT: v_add_f32_e32 v17, v37, v17 -; GFX10-NEXT: v_add_f32_e32 v12, v12, v28 -; GFX10-NEXT: v_add_f32_e32 v18, v39, v18 -; GFX10-NEXT: v_add_f32_e32 v13, v13, v29 -; GFX10-NEXT: v_add_f32_e32 v19, v49, v19 -; GFX10-NEXT: v_add_f32_e32 v14, v14, v30 -; GFX10-NEXT: v_perm_b32 v0, v0, v32, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v34, 0x7060302 -; GFX10-NEXT: 
v_perm_b32 v2, v2, v36, 0x7060302 -; GFX10-NEXT: v_perm_b32 v3, v3, v38, 0x7060302 -; GFX10-NEXT: v_perm_b32 v4, v4, v48, 0x7060302 -; GFX10-NEXT: v_perm_b32 v5, v5, v50, 0x7060302 -; GFX10-NEXT: v_perm_b32 v8, v8, v23, 0x7060302 -; GFX10-NEXT: v_perm_b32 v9, v9, v24, 0x7060302 -; GFX10-NEXT: v_perm_b32 v10, v10, v25, 0x7060302 -; GFX10-NEXT: v_perm_b32 v11, v11, v16, 0x7060302 -; GFX10-NEXT: v_perm_b32 v12, v12, v17, 0x7060302 -; GFX10-NEXT: v_perm_b32 v13, v13, v18, 0x7060302 -; GFX10-NEXT: v_perm_b32 v14, v14, v19, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v27, 0x7060302 +; GFX10-NEXT: v_perm_b32 v0, v0, v26, 0x7060302 +; GFX10-NEXT: v_perm_b32 v2, v2, v28, 0x7060302 +; GFX10-NEXT: v_perm_b32 v3, v3, v29, 0x7060302 +; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 +; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 +; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 +; GFX10-NEXT: v_perm_b32 v9, v9, v51, 0x7060302 +; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302 +; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302 +; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302 +; GFX10-NEXT: v_perm_b32 v13, v13, v35, 0x7060302 +; GFX10-NEXT: v_perm_b32 v14, v14, v33, 0x7060302 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v31 -; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v31 -; GFX10-NEXT: v_add_f32_e32 v20, v20, v21 -; GFX10-NEXT: v_add_f32_e32 v15, v15, v22 -; GFX10-NEXT: v_perm_b32 v15, v15, v20, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v31 +; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v31 +; GFX10-NEXT: v_add_f32_e32 v16, v32, v16 +; GFX10-NEXT: v_add_f32_e32 v15, v15, v17 +; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fadd_v32bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v26 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v10 -; GFX11-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v27 -; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v21 -; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v23 -; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v24 -; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v8 -; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v5 -; GFX11-NEXT: v_dual_add_f32 v10, v10, v26 :: v_dual_and_b32 v5, 0xffff0000, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22 +; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11 +; GFX11-NEXT: v_dual_add_f32 v0, v0, v16 :: v_dual_and_b32 v11, 0xffff0000, v11 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11-NEXT: 
v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v28 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24 +; GFX11-NEXT: v_dual_add_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24 +; GFX11-NEXT: v_dual_add_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10 +; GFX11-NEXT: v_dual_add_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10 +; GFX11-NEXT: v_dual_add_f32 v2, v2, v18 :: v_dual_add_f32 v3, v3, v19 +; GFX11-NEXT: v_dual_add_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v49, 16, v26 +; GFX11-NEXT: v_dual_add_f32 v9, v9, v25 :: v_dual_and_b32 v26, 0xffff0000, v26 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v22 +; GFX11-NEXT: v_dual_add_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v37, 16, v28 ; GFX11-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v29 -; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v10, v10, v26 +; GFX11-NEXT: v_add_f32_e32 v26, v52, v51 +; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v25, v54, v53 +; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 +; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v31 ; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v31 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13 ; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v30 -; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v14 -; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX11-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v9 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v20 -; GFX11-NEXT: v_dual_add_f32 v11, v11, v27 :: v_dual_and_b32 v20, 0xffff0000, v20 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-NEXT: 
v_dual_add_f32 v26, v71, v70 :: v_dual_lshlrev_b32 v49, 16, v4 -; GFX11-NEXT: v_dual_add_f32 v13, v13, v29 :: v_dual_and_b32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v27 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_f32_e32 v4, v4, v20 -; GFX11-NEXT: v_dual_add_f32 v8, v8, v24 :: v_dual_add_f32 v9, v9, v25 -; GFX11-NEXT: v_add_f32_e32 v25, v69, v68 -; GFX11-NEXT: v_dual_add_f32 v20, v51, v50 :: v_dual_lshlrev_b32 v39, 16, v3 -; GFX11-NEXT: v_add_f32_e32 v27, v81, v80 -; GFX11-NEXT: v_add_f32_e32 v12, v12, v28 -; GFX11-NEXT: v_dual_add_f32 v28, v83, v82 :: v_dual_add_f32 v29, v85, v84 -; GFX11-NEXT: v_dual_add_f32 v6, v6, v22 :: v_dual_and_b32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_add_f32_e32 v22, v55, v54 -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v18 -; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v17 -; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-NEXT: v_dual_add_f32 v8, v8, v24 :: v_dual_and_b32 v27, 0xffff0000, v27 +; GFX11-NEXT: v_add_f32_e32 v24, v64, v55 +; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; GFX11-NEXT: v_add_f32_e32 v7, v7, v23 +; GFX11-NEXT: v_add_f32_e32 v23, v66, v65 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v12, v12, v28 :: v_dual_and_b32 v29, 0xffff0000, v29 +; GFX11-NEXT: v_dual_add_f32 v28, v48, v39 :: v_dual_lshlrev_b32 v33, 16, v30 +; GFX11-NEXT: v_dual_add_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX11-NEXT: v_dual_add_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14 +; GFX11-NEXT: v_dual_add_f32 v27, v50, v49 :: v_dual_and_b32 v30, 0xffff0000, v30 +; GFX11-NEXT: v_add_f32_e32 v29, v38, v37 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-NEXT: v_add_f32_e32 v37, v86, v85 +; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 ; GFX11-NEXT: v_add_f32_e32 v14, v14, v30 -; GFX11-NEXT: v_dual_add_f32 v7, v7, v23 :: v_dual_and_b32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_add_f32_e32 v23, v65, v64 -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX11-NEXT: v_dual_add_f32 v24, v67, v66 :: v_dual_and_b32 v21, 0xffff0000, v21 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v18 -; GFX11-NEXT: v_dual_add_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v32, 16, v16 -; GFX11-NEXT: v_add_f32_e32 v18, v39, v38 -; GFX11-NEXT: v_dual_add_f32 v3, v3, v19 :: v_dual_and_b32 v16, 0xffff0000, v16 -; GFX11-NEXT: v_add_f32_e32 v19, v49, v48 -; GFX11-NEXT: v_add_f32_e32 v17, v37, v36 -; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v0 -; GFX11-NEXT: v_dual_add_f32 v5, v5, v21 :: v_dual_and_b32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_add_f32_e32 v21, v53, v52 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v2, v2, v17, 0x7060302 -; GFX11-NEXT: v_perm_b32 v3, v3, v18, 0x7060302 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v16 -; GFX11-NEXT: v_add_f32_e32 v16, v35, v34 -; GFX11-NEXT: v_add_f32_e32 v32, v33, v32 -; GFX11-NEXT: v_perm_b32 v4, v4, v19, 0x7060302 -; GFX11-NEXT: v_perm_b32 v5, v5, v20, 0x7060302 -; GFX11-NEXT: v_perm_b32 v6, v6, v21, 0x7060302 -; GFX11-NEXT: v_perm_b32 v1, v1, v16, 0x7060302 -; GFX11-NEXT: v_perm_b32 v0, v0, v32, 
0x7060302 -; GFX11-NEXT: v_perm_b32 v7, v7, v22, 0x7060302 -; GFX11-NEXT: v_perm_b32 v8, v8, v23, 0x7060302 -; GFX11-NEXT: v_perm_b32 v9, v9, v24, 0x7060302 -; GFX11-NEXT: v_perm_b32 v10, v10, v25, 0x7060302 -; GFX11-NEXT: v_perm_b32 v11, v11, v26, 0x7060302 -; GFX11-NEXT: v_perm_b32 v12, v12, v27, 0x7060302 -; GFX11-NEXT: v_perm_b32 v13, v13, v28, 0x7060302 -; GFX11-NEXT: v_perm_b32 v14, v14, v29, 0x7060302 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v31 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v16, v86, v16 :: v_dual_and_b32 v17, 0xffff0000, v31 -; GFX11-NEXT: v_add_f32_e32 v15, v15, v17 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v30, v36, v35 :: v_dual_add_f32 v33, v34, v33 +; GFX11-NEXT: v_dual_add_f32 v34, v80, v71 :: v_dual_add_f32 v35, v82, v81 +; GFX11-NEXT: v_add_f32_e32 v36, v84, v83 +; GFX11-NEXT: v_dual_add_f32 v16, v32, v16 :: v_dual_add_f32 v15, v15, v17 +; GFX11-NEXT: v_perm_b32 v0, v0, v37, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v2, v2, v35, 0x7060302 +; GFX11-NEXT: v_perm_b32 v1, v1, v36, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v3, v34, 0x7060302 +; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 +; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 +; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302 +; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302 +; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302 +; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302 +; GFX11-NEXT: v_perm_b32 v14, v14, v33, 0x7060302 ; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fadd <32 x bfloat> %a, %b @@ -10681,83 +10630,52 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-LABEL: v_fsub_v3bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_sub_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_sub_f32_e32 v3, v4, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_sub_f32_e32 v2, v4, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fsub_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_sub_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_sub_f32_e32 v3, v4, v3 ; 
GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_sub_f32_e32 v2, v4, v2 -; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fsub_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_sub_f32_e32 v4, v5, v4 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_sub_f32_e32 v2, v7, v6 ; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 +; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_fsub_v3bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_sub_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v4, v5, v4 -; GFX11-NEXT: v_sub_f32_e32 v2, v7, v6 -; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 -; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fsub <3 x bfloat> %a, %b ret <3 x bfloat> %op } @@ -10808,82 +10726,81 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-LABEL: v_fsub_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_sub_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_sub_f32_e32 v4, v5, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_sub_f32_e32 v2, v4, v2 ; 
GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fsub_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_sub_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_sub_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_sub_f32_e32 v3, v5, v3 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_sub_f32_e32 v2, v4, v2 -; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fsub_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_sub_f32_e32 v4, v5, v4 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_sub_f32_e32 v2, v7, v6 ; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 +; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fsub_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_sub_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v4, v5, v4 -; GFX11-NEXT: v_sub_f32_e32 v2, v7, v6 -; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX11-NEXT: v_and_b32_e32 v0, 
0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_dual_sub_f32 v4, v5, v4 :: v_dual_and_b32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fsub <4 x bfloat> %a, %b ret <4 x bfloat> %op @@ -11068,83 +10985,52 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-LABEL: v_fmul_v3bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_mul_f32_e32 v2, v4, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fmul_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 
0xffff0000, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_mul_f32_e32 v4, v5, v4 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_mul_f32_e32 v2, v7, v6 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 +; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_fmul_v3bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mul_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX11-NEXT: v_mul_f32_e32 v2, v7, v6 -; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 -; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fmul <3 x bfloat> %a, %b ret <3 x bfloat> %op } @@ -11195,82 +11081,81 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-LABEL: v_fmul_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_mul_f32_e32 v2, v4, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fmul_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 ; 
GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX10-NEXT: v_mul_f32_e32 v5, v7, v6 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_mul_f32_e32 v2, v7, v6 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 +; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fmul_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mul_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX11-NEXT: v_mul_f32_e32 v2, v7, v6 -; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_dual_mul_f32 v4, v5, v4 :: v_dual_and_b32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX11-NEXT: v_mul_f32_e32 v5, v7, v6 +; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fmul <4 x bfloat> %a, %b ret <4 x bfloat> %op @@ -11354,138 +11239,138 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-LABEL: v_fmul_v8bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_mul_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v8, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX8-NEXT: v_mul_f32_e32 v3, v3, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_mul_f32_e32 v7, v9, v7 +; GFX8-NEXT: v_mul_f32_e32 v2, v2, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_mul_f32_e32 v6, v9, v6 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v5 -; GFX8-NEXT: v_mul_f32_e32 v4, v8, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_mul_f32_e32 v2, v2, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v4, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GFX8-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_mul_f32_e32 v5, v9, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fmul_v8bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_mul_f32_e32 v3, v3, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_mul_f32_e32 v7, v9, v7 +; GFX9-NEXT: v_mul_f32_e32 v2, v2, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_mul_f32_e32 v6, v9, v6 +; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; 
GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_mul_f32_e32 v5, v9, v5 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v8, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_mul_f32_e32 v4, v8, v4 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5 -; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_mul_f32_e32 v2, v2, v5 -; GFX9-NEXT: v_perm_b32 v2, v2, v4, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_mul_f32_e32 v3, v3, v5 -; GFX9-NEXT: v_perm_b32 v3, v3, v4, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4 +; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v8bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_mul_f32_e32 v9, v11, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GFX10-NEXT: v_mul_f32_e32 v8, v9, v8 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_mul_f32_e32 v4, v11, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_mul_f32_e32 v10, v11, v10 +; GFX10-NEXT: v_mul_f32_e32 v11, v13, v12 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v5 -; GFX10-NEXT: v_mul_f32_e32 v5, v10, v9 ; GFX10-NEXT: v_mul_f32_e32 v2, v2, v6 -; GFX10-NEXT: v_mul_f32_e32 v6, v12, v11 ; GFX10-NEXT: v_mul_f32_e32 v3, v3, v7 -; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 -; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302 -; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302 +; GFX10-NEXT: v_perm_b32 v0, v0, v11, 0x7060302 +; GFX10-NEXT: 
v_perm_b32 v1, v1, v10, 0x7060302 +; GFX10-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 +; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fmul_v8bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_dual_mul_f32 v8, v9, v8 :: v_dual_mul_f32 v9, v11, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mul_f32 v1, v1, v5 :: v_dual_and_b32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v4 -; GFX11-NEXT: v_mul_f32_e32 v4, v11, v10 -; GFX11-NEXT: v_dual_mul_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v7 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX11-NEXT: v_mul_f32_e32 v5, v10, v9 -; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 -; GFX11-NEXT: v_mul_f32_e32 v2, v2, v6 -; GFX11-NEXT: v_mul_f32_e32 v6, v12, v11 +; GFX11-NEXT: v_dual_mul_f32 v1, v1, v5 :: v_dual_and_b32 v6, 0xffff0000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mul_f32 v2, v2, v6 :: v_dual_and_b32 v3, 0xffff0000, v3 ; GFX11-NEXT: v_mul_f32_e32 v3, v3, v7 -; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v2, v2, v5, 0x7060302 -; GFX11-NEXT: v_perm_b32 v3, v3, v6, 0x7060302 +; GFX11-NEXT: v_dual_mul_f32 v10, v11, v10 :: v_dual_mul_f32 v11, v13, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v1, v1, v10, 0x7060302 +; GFX11-NEXT: v_perm_b32 v0, v0, v11, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fmul <8 x bfloat> %a, %b ret <8 x bfloat> %op @@ -11637,252 +11522,254 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-LABEL: v_fmul_v16bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v8 +; GFX8-NEXT: 
v_lshlrev_b32_e32 v16, 16, v15 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_mul_f32_e32 v16, v17, v16 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; GFX8-NEXT: v_mul_f32_e32 v7, v7, v15 +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_mul_f32_e32 v15, v17, v15 +; GFX8-NEXT: v_mul_f32_e32 v6, v6, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: v_mul_f32_e32 v14, v17, v14 +; GFX8-NEXT: v_mul_f32_e32 v5, v5, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_mul_f32_e32 v13, v17, v13 +; GFX8-NEXT: v_mul_f32_e32 v4, v4, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_mul_f32_e32 v12, v17, v12 +; GFX8-NEXT: v_mul_f32_e32 v3, v3, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_mul_f32_e32 v11, v17, v11 +; GFX8-NEXT: v_mul_f32_e32 v2, v2, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_mul_f32_e32 v10, v17, v10 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v9 -; GFX8-NEXT: v_mul_f32_e32 v8, v16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v8, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX8-NEXT: v_mul_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_mul_f32_e32 v2, v2, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v8, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX8-NEXT: v_mul_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v11 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_mul_f32_e32 v3, v3, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v4 -; GFX8-NEXT: v_mul_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX8-NEXT: v_mul_f32_e32 v4, v4, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_alignbit_b32 v4, v4, v8, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v13 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v5 -; GFX8-NEXT: v_mul_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v13 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX8-NEXT: v_mul_f32_e32 v5, v5, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v8, 16 -; GFX8-NEXT: 
v_lshlrev_b32_e32 v8, 16, v14 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v6 -; GFX8-NEXT: v_mul_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v14 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX8-NEXT: v_mul_f32_e32 v6, v6, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v6, v6, v8, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v15 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v7 -; GFX8-NEXT: v_mul_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v15 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX8-NEXT: v_mul_f32_e32 v7, v7, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v8 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_mul_f32_e32 v9, v17, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16 +; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16 +; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16 +; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: v_fmul_v16bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_mul_f32_e32 v7, v7, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_mul_f32_e32 v15, v17, v15 +; GFX9-NEXT: v_mul_f32_e32 v6, v6, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_mul_f32_e32 v14, v17, v14 +; GFX9-NEXT: v_mul_f32_e32 v5, v5, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_mul_f32_e32 v13, v17, v13 +; GFX9-NEXT: v_mul_f32_e32 v4, v4, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_mul_f32_e32 v12, v17, v12 +; GFX9-NEXT: v_mul_f32_e32 v3, v3, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_mul_f32_e32 v11, v17, v11 +; GFX9-NEXT: v_mul_f32_e32 v2, v2, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, 
v9 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_mul_f32_e32 v10, v17, v10 +; GFX9-NEXT: v_mul_f32_e32 v1, v1, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_mul_f32_e32 v9, v17, v9 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v8 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_mul_f32_e32 v8, v16, v8 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v9 -; GFX9-NEXT: v_perm_b32 v1, v1, v8, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_mul_f32_e32 v2, v2, v9 -; GFX9-NEXT: v_perm_b32 v2, v2, v8, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v11 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_mul_f32_e32 v3, v3, v9 -; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v4 -; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_mul_f32_e32 v4, v4, v9 -; GFX9-NEXT: v_perm_b32 v4, v4, v8, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v5 -; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v13 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_mul_f32_e32 v5, v5, v9 -; GFX9-NEXT: v_perm_b32 v5, v5, v8, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v6 -; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v14 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_mul_f32_e32 v6, v6, v9 -; GFX9-NEXT: v_perm_b32 v6, v6, v8, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v7 -; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v15 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_mul_f32_e32 v7, v7, v9 -; GFX9-NEXT: v_perm_b32 v7, v7, v8, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4 +; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4 +; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4 +; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4 +; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4 +; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4 +; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v16bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; 
GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v5 ; GFX10-NEXT: v_mul_f32_e32 v16, v17, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v9 -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX10-NEXT: v_mul_f32_e32 v7, v7, v15 +; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v4 +; GFX10-NEXT: v_mul_f32_e32 v15, v17, v15 +; GFX10-NEXT: v_mul_f32_e32 v6, v6, v14 +; GFX10-NEXT: v_mul_f32_e32 v14, v19, v18 +; GFX10-NEXT: v_mul_f32_e32 v5, v5, v13 +; GFX10-NEXT: v_mul_f32_e32 v13, v21, v20 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_mul_f32_e32 v17, v18, v17 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_mul_f32_e32 v18, v20, v19 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_mul_f32_e32 v19, v20, v19 +; GFX10-NEXT: v_mul_f32_e32 v20, v22, v21 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v8 -; GFX10-NEXT: v_mul_f32_e32 v8, v18, v17 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v9 -; GFX10-NEXT: v_mul_f32_e32 v9, v20, v19 ; GFX10-NEXT: v_mul_f32_e32 v2, v2, v10 -; GFX10-NEXT: v_perm_b32 v0, v0, v16, 0x7060302 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v11 -; GFX10-NEXT: v_perm_b32 v1, v1, v8, 0x7060302 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GFX10-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v13 -; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; GFX10-NEXT: v_mul_f32_e32 v8, v9, v8 -; GFX10-NEXT: v_mul_f32_e32 v3, v3, v10 -; GFX10-NEXT: v_mul_f32_e32 v9, v16, v11 +; GFX10-NEXT: v_mul_f32_e32 v3, v3, v11 ; GFX10-NEXT: v_mul_f32_e32 v4, v4, v12 -; GFX10-NEXT: v_mul_f32_e32 v10, v18, v17 -; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: v_mul_f32_e32 v5, v5, v11 -; GFX10-NEXT: v_mul_f32_e32 v11, v13, v12 -; GFX10-NEXT: v_mul_f32_e32 v6, v6, v14 -; GFX10-NEXT: 
v_mul_f32_e32 v12, v17, v16 -; GFX10-NEXT: v_mul_f32_e32 v7, v7, v15 -; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 -; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x7060302 -; GFX10-NEXT: v_perm_b32 v5, v5, v10, 0x7060302 -; GFX10-NEXT: v_perm_b32 v6, v6, v11, 0x7060302 -; GFX10-NEXT: v_perm_b32 v7, v7, v12, 0x7060302 +; GFX10-NEXT: v_perm_b32 v0, v0, v20, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302 +; GFX10-NEXT: v_perm_b32 v2, v2, v18, 0x7060302 +; GFX10-NEXT: v_perm_b32 v3, v3, v17, 0x7060302 +; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 +; GFX10-NEXT: v_perm_b32 v5, v5, v14, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v6, v15, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fmul_v16bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mul_f32_e32 v2, v2, v10 -; GFX11-NEXT: v_dual_mul_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v9 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mul_f32 v1, v1, v9 :: v_dual_and_b32 v10, 0xffff0000, v11 -; GFX11-NEXT: v_mul_f32_e32 v9, v20, v19 -; GFX11-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mul_f32 v3, v3, v10 :: v_dual_and_b32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v8 -; GFX11-NEXT: v_mul_f32_e32 v8, v18, v17 -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-NEXT: v_perm_b32 v0, v0, v16, 0x7060302 -; GFX11-NEXT: v_perm_b32 v1, v1, v8, 0x7060302 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_dual_mul_f32 v10, v18, v17 :: v_dual_lshlrev_b32 v17, 16, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mul_f32 v4, v4, v12 :: v_dual_and_b32 v7, 0xffff0000, v7 -; GFX11-NEXT: v_dual_mul_f32 v8, v9, v8 :: v_dual_mul_f32 v9, v16, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mul_f32 v4, v4, v12 :: v_dual_and_b32 v5, 0xffff0000, v5 ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15 -; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 -; 
GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX11-NEXT: v_mul_f32_e32 v5, v5, v13 +; GFX11-NEXT: v_mul_f32_e32 v13, v21, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-NEXT: v_dual_mul_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mul_f32 v0, v0, v8 :: v_dual_and_b32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 +; GFX11-NEXT: v_mul_f32_e32 v7, v7, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 -; GFX11-NEXT: v_mul_f32_e32 v5, v5, v11 -; GFX11-NEXT: v_mul_f32_e32 v11, v13, v12 -; GFX11-NEXT: v_mul_f32_e32 v12, v17, v16 -; GFX11-NEXT: v_dual_mul_f32 v6, v6, v14 :: v_dual_mul_f32 v7, v7, v15 -; GFX11-NEXT: v_perm_b32 v4, v4, v9, 0x7060302 -; GFX11-NEXT: v_perm_b32 v5, v5, v10, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v6, v6, v11, 0x7060302 -; GFX11-NEXT: v_perm_b32 v7, v7, v12, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x7060302 +; GFX11-NEXT: v_mul_f32_e32 v15, v17, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_mul_f32 v6, v6, v14 :: v_dual_lshlrev_b32 v17, 16, v11 +; GFX11-NEXT: v_mul_f32_e32 v14, v19, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_mul_f32 v17, v18, v17 :: v_dual_and_b32 v10, 0xffff0000, v10 +; GFX11-NEXT: v_perm_b32 v5, v5, v14, 0x7060302 +; GFX11-NEXT: v_perm_b32 v6, v6, v15, 0x7060302 +; GFX11-NEXT: v_mul_f32_e32 v3, v3, v11 +; GFX11-NEXT: v_dual_mul_f32 v18, v20, v19 :: v_dual_lshlrev_b32 v19, 16, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: v_dual_mul_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_perm_b32 v3, v3, v17, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mul_f32 v19, v20, v19 :: v_dual_mul_f32 v20, v22, v21 +; GFX11-NEXT: v_mul_f32_e32 v1, v1, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v2, v2, v18, 0x7060302 +; GFX11-NEXT: v_perm_b32 v0, v0, v20, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v1, v1, v19, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fmul <16 x bfloat> %a, %b ret <16 x bfloat> %op @@ -12290,483 +12177,480 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-LABEL: v_fmul_v32bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v16 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX8-NEXT: v_and_b32_e32 v0, 
0xffff0000, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX8-NEXT: v_mul_f32_e32 v31, v32, v31 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v31, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; GFX8-NEXT: v_mul_f32_e32 v30, v14, v30 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX8-NEXT: v_mul_f32_e32 v14, v32, v14 +; GFX8-NEXT: v_mul_f32_e32 v13, v13, v29 +; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX8-NEXT: v_mul_f32_e32 v29, v32, v29 +; GFX8-NEXT: v_mul_f32_e32 v12, v12, v28 +; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX8-NEXT: v_mul_f32_e32 v28, v32, v28 +; GFX8-NEXT: v_mul_f32_e32 v11, v11, v27 +; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX8-NEXT: v_mul_f32_e32 v27, v32, v27 +; GFX8-NEXT: v_mul_f32_e32 v10, v10, v26 +; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX8-NEXT: v_mul_f32_e32 v26, v32, v26 +; GFX8-NEXT: v_mul_f32_e32 v9, v9, v25 +; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX8-NEXT: v_mul_f32_e32 v8, v8, v24 +; GFX8-NEXT: buffer_load_dword v24, off, s[0:3], s32 +; GFX8-NEXT: v_mul_f32_e32 v25, v32, v25 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16 +; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16 +; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16 +; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16 +; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16 +; GFX8-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v24 +; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX8-NEXT: v_mul_f32_e32 v32, v32, v33 +; GFX8-NEXT: v_mul_f32_e32 v15, v15, v24 +; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: v_mul_f32_e32 v24, v33, v24 +; GFX8-NEXT: v_mul_f32_e32 v7, v7, v23 +; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_mul_f32_e32 v23, v33, v23 +; GFX8-NEXT: v_mul_f32_e32 v6, 
v6, v22 +; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: v_mul_f32_e32 v22, v33, v22 +; GFX8-NEXT: v_mul_f32_e32 v5, v5, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_mul_f32_e32 v21, v33, v21 +; GFX8-NEXT: v_mul_f32_e32 v4, v4, v20 +; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_mul_f32_e32 v20, v33, v20 +; GFX8-NEXT: v_mul_f32_e32 v3, v3, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_mul_f32_e32 v19, v33, v19 +; GFX8-NEXT: v_mul_f32_e32 v2, v2, v18 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_mul_f32_e32 v18, v33, v18 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v17 -; GFX8-NEXT: v_mul_f32_e32 v16, v31, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v16 +; GFX8-NEXT: v_mul_f32_e32 v17, v33, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GFX8-NEXT: v_mul_f32_e32 v16, v17, v16 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_mul_f32_e32 v2, v2, v17 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v19 -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; GFX8-NEXT: v_mul_f32_e32 v16, v17, v16 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v19 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_mul_f32_e32 v3, v3, v17 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v3, v3, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v20 -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4 -; GFX8-NEXT: v_mul_f32_e32 v16, v17, v16 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX8-NEXT: v_mul_f32_e32 v4, v4, v17 -; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_alignbit_b32 v4, v4, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v21 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; GFX8-NEXT: v_mul_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v21 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX8-NEXT: v_mul_f32_e32 v5, v5, v18 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v22 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v6 -; GFX8-NEXT: v_mul_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v22 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX8-NEXT: v_mul_f32_e32 v6, v6, v18 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, 
v6 -; GFX8-NEXT: v_alignbit_b32 v6, v6, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v23 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; GFX8-NEXT: v_mul_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v23 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX8-NEXT: v_mul_f32_e32 v7, v7, v18 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v24 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; GFX8-NEXT: v_mul_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v24 -; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX8-NEXT: v_mul_f32_e32 v8, v8, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_alignbit_b32 v8, v8, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v25 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; GFX8-NEXT: v_mul_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v25 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX8-NEXT: v_mul_f32_e32 v9, v9, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX8-NEXT: v_alignbit_b32 v9, v9, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v26 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v10 -; GFX8-NEXT: v_mul_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v26 -; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX8-NEXT: v_mul_f32_e32 v10, v10, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: v_alignbit_b32 v10, v10, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v27 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v11 -; GFX8-NEXT: v_mul_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v27 -; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX8-NEXT: v_mul_f32_e32 v11, v11, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: v_alignbit_b32 v11, v11, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v28 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v12 -; GFX8-NEXT: v_mul_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v28 -; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX8-NEXT: v_mul_f32_e32 v12, v12, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_alignbit_b32 v12, v12, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v29 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; GFX8-NEXT: v_mul_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v29 -; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX8-NEXT: v_mul_f32_e32 v13, v13, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX8-NEXT: v_alignbit_b32 v13, v13, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v30 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v14 -; GFX8-NEXT: v_mul_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v30 -; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX8-NEXT: v_mul_f32_e32 v14, v14, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX8-NEXT: v_alignbit_b32 v14, v14, v16, 16 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v15 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX8-NEXT: v_mul_f32_e32 v15, v15, v17 -; GFX8-NEXT: v_mul_f32_e32 v16, v18, v16 ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX8-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16 
+; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16 +; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16 +; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16 +; GFX8-NEXT: v_alignbit_b32 v14, v16, v31, 16 +; GFX8-NEXT: v_alignbit_b32 v15, v15, v32, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fmul_v32bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX9-NEXT: v_mul_f32_e32 v31, v32, v31 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v16 +; GFX9-NEXT: v_mul_f32_e32 v14, v14, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX9-NEXT: v_mul_f32_e32 v30, v32, v30 +; GFX9-NEXT: v_mul_f32_e32 v13, v13, v29 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_mul_f32_e32 v29, v32, v29 +; GFX9-NEXT: v_mul_f32_e32 v12, v12, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_mul_f32_e32 v28, v32, v28 +; GFX9-NEXT: v_mul_f32_e32 v11, v11, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_mul_f32_e32 v27, v32, v27 +; GFX9-NEXT: v_mul_f32_e32 v10, v10, v26 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_mul_f32_e32 v26, v32, v26 +; GFX9-NEXT: v_mul_f32_e32 v9, v9, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_mul_f32_e32 v8, v8, v24 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 +; GFX9-NEXT: v_mul_f32_e32 v25, v32, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v31, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4 +; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4 +; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4 +; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4 +; GFX9-NEXT: v_perm_b32 v12, v12, v29, s4 +; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4 +; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX9-NEXT: v_mul_f32_e32 v32, v32, v33 +; GFX9-NEXT: v_mul_f32_e32 v15, v15, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: 
v_mul_f32_e32 v24, v33, v24 +; GFX9-NEXT: v_mul_f32_e32 v7, v7, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_mul_f32_e32 v23, v33, v23 +; GFX9-NEXT: v_mul_f32_e32 v6, v6, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_mul_f32_e32 v22, v33, v22 +; GFX9-NEXT: v_mul_f32_e32 v5, v5, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_mul_f32_e32 v21, v33, v21 +; GFX9-NEXT: v_mul_f32_e32 v4, v4, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_mul_f32_e32 v20, v33, v20 +; GFX9-NEXT: v_mul_f32_e32 v3, v3, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_mul_f32_e32 v19, v33, v19 +; GFX9-NEXT: v_mul_f32_e32 v2, v2, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_mul_f32_e32 v16, v31, v16 +; GFX9-NEXT: v_mul_f32_e32 v18, v33, v18 ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v17 -; GFX9-NEXT: v_perm_b32 v1, v1, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_mul_f32_e32 v2, v2, v17 -; GFX9-NEXT: v_perm_b32 v2, v2, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v19 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_mul_f32_e32 v3, v3, v17 -; GFX9-NEXT: v_perm_b32 v3, v3, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v20 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 -; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_mul_f32_e32 v4, v4, v17 -; GFX9-NEXT: v_perm_b32 v4, v4, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 -; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v21 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_mul_f32_e32 v5, v5, v17 -; GFX9-NEXT: v_perm_b32 v5, v5, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v22 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 -; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v22 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_mul_f32_e32 v6, v6, v17 -; GFX9-NEXT: v_perm_b32 v6, v6, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v23 -; GFX9-NEXT: v_and_b32_e32 
v7, 0xffff0000, v7 -; GFX9-NEXT: v_mul_f32_e32 v7, v7, v17 -; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v24 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v8 -; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v24 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_mul_f32_e32 v8, v8, v17 -; GFX9-NEXT: v_perm_b32 v8, v8, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v9 -; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v25 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_mul_f32_e32 v9, v9, v17 -; GFX9-NEXT: v_perm_b32 v9, v9, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v26 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v10 -; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v26 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_mul_f32_e32 v10, v10, v17 -; GFX9-NEXT: v_perm_b32 v10, v10, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v11 -; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v27 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_mul_f32_e32 v11, v11, v17 -; GFX9-NEXT: v_perm_b32 v11, v11, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v28 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v12 -; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v28 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_mul_f32_e32 v12, v12, v17 -; GFX9-NEXT: v_perm_b32 v12, v12, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v29 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v13 -; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v29 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_mul_f32_e32 v13, v13, v17 -; GFX9-NEXT: v_perm_b32 v13, v13, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v30 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v14 -; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v30 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_mul_f32_e32 v14, v14, v17 -; GFX9-NEXT: v_perm_b32 v14, v14, v16, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v15 -; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_mul_f32_e32 v15, v15, v17 -; GFX9-NEXT: v_perm_b32 v15, v15, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_mul_f32_e32 v17, v33, v17 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v16 +; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4 +; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4 +; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4 +; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4 +; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4 +; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4 +; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4 +; GFX9-NEXT: v_perm_b32 v15, v15, v32, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v32bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v21 -; GFX10-NEXT: v_lshlrev_b32_e32 v51, 
16, v5 -; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v22 -; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v23 -; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v7 -; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v17 -; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11 +; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX10-NEXT: v_mul_f32_e32 v39, v48, v39 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17 +; GFX10-NEXT: v_mul_f32_e32 v11, v11, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v18 -; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v19 -; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v20 -; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX10-NEXT: v_mul_f32_e32 v5, v5, v21 -; GFX10-NEXT: v_mul_f32_e32 v21, v53, v52 -; GFX10-NEXT: v_mul_f32_e32 v6, v6, v22 -; GFX10-NEXT: v_mul_f32_e32 v22, v55, v54 -; GFX10-NEXT: v_mul_f32_e32 v7, v7, v23 -; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v24 -; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v8 -; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v25 -; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v9 +; GFX10-NEXT: v_mul_f32_e32 v49, v50, v49 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16 +; GFX10-NEXT: v_mul_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9 ; GFX10-NEXT: v_and_b32_e32 v25, 
0xffff0000, v25 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v26 -; GFX10-NEXT: v_mul_f32_e32 v32, v33, v32 -; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v10 -; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24 +; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8 +; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22 +; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21 +; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_mul_f32_e32 v33, v34, v33 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20 +; GFX10-NEXT: v_mul_f32_e32 v14, v14, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_mul_f32_e32 v35, v36, v35 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19 +; GFX10-NEXT: v_mul_f32_e32 v13, v13, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_mul_f32_e32 v37, v38, v37 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18 +; GFX10-NEXT: v_mul_f32_e32 v12, v12, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v27 -; GFX10-NEXT: v_mul_f32_e32 v34, v35, v34 -; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v11 -; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v17 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v28 -; GFX10-NEXT: v_mul_f32_e32 v36, v37, v36 -; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v12 -; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX10-NEXT: v_mul_f32_e32 v51, v52, v51 +; GFX10-NEXT: v_mul_f32_e32 v9, v9, v25 +; GFX10-NEXT: v_mul_f32_e32 v25, v54, v53 +; GFX10-NEXT: v_mul_f32_e32 v8, v8, v24 +; GFX10-NEXT: v_mul_f32_e32 v24, v64, v55 +; GFX10-NEXT: v_mul_f32_e32 v7, v7, v23 +; GFX10-NEXT: v_mul_f32_e32 v23, v66, v65 +; GFX10-NEXT: v_mul_f32_e32 v6, v6, v22 +; GFX10-NEXT: v_mul_f32_e32 v22, v68, v67 +; GFX10-NEXT: v_mul_f32_e32 v5, v5, v21 +; GFX10-NEXT: v_mul_f32_e32 v21, v30, v34 +; GFX10-NEXT: v_mul_f32_e32 v29, v29, v36 +; GFX10-NEXT: v_mul_f32_e32 v28, v28, v38 +; GFX10-NEXT: v_mul_f32_e32 v27, v27, v48 +; GFX10-NEXT: v_mul_f32_e32 v26, v26, v50 ; GFX10-NEXT: v_mul_f32_e32 v2, v2, v18 -; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; GFX10-NEXT: v_mul_f32_e32 v38, v39, v38 -; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v13 -; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX10-NEXT: v_mul_f32_e32 v3, v3, v19 -; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v30 -; GFX10-NEXT: v_mul_f32_e32 v48, v49, v48 -; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v14 -; GFX10-NEXT: v_and_b32_e32 v30, 
0xffff0000, v30 -; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX10-NEXT: v_mul_f32_e32 v4, v4, v20 -; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v15 -; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX10-NEXT: v_perm_b32 v6, v6, v21, 0x7060302 -; GFX10-NEXT: v_perm_b32 v7, v7, v22, 0x7060302 -; GFX10-NEXT: v_mul_f32_e32 v50, v51, v50 -; GFX10-NEXT: v_mul_f32_e32 v23, v65, v64 -; GFX10-NEXT: v_mul_f32_e32 v8, v8, v24 -; GFX10-NEXT: v_mul_f32_e32 v24, v67, v66 -; GFX10-NEXT: v_mul_f32_e32 v9, v9, v25 -; GFX10-NEXT: v_mul_f32_e32 v25, v33, v68 -; GFX10-NEXT: v_mul_f32_e32 v10, v10, v26 -; GFX10-NEXT: v_mul_f32_e32 v16, v35, v16 -; GFX10-NEXT: v_mul_f32_e32 v11, v11, v27 -; GFX10-NEXT: v_mul_f32_e32 v17, v37, v17 -; GFX10-NEXT: v_mul_f32_e32 v12, v12, v28 -; GFX10-NEXT: v_mul_f32_e32 v18, v39, v18 -; GFX10-NEXT: v_mul_f32_e32 v13, v13, v29 -; GFX10-NEXT: v_mul_f32_e32 v19, v49, v19 -; GFX10-NEXT: v_mul_f32_e32 v14, v14, v30 -; GFX10-NEXT: v_perm_b32 v0, v0, v32, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v34, 0x7060302 -; GFX10-NEXT: v_perm_b32 v2, v2, v36, 0x7060302 -; GFX10-NEXT: v_perm_b32 v3, v3, v38, 0x7060302 -; GFX10-NEXT: v_perm_b32 v4, v4, v48, 0x7060302 -; GFX10-NEXT: v_perm_b32 v5, v5, v50, 0x7060302 -; GFX10-NEXT: v_perm_b32 v8, v8, v23, 0x7060302 -; GFX10-NEXT: v_perm_b32 v9, v9, v24, 0x7060302 -; GFX10-NEXT: v_perm_b32 v10, v10, v25, 0x7060302 -; GFX10-NEXT: v_perm_b32 v11, v11, v16, 0x7060302 -; GFX10-NEXT: v_perm_b32 v12, v12, v17, 0x7060302 -; GFX10-NEXT: v_perm_b32 v13, v13, v18, 0x7060302 -; GFX10-NEXT: v_perm_b32 v14, v14, v19, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v27, 0x7060302 +; GFX10-NEXT: v_perm_b32 v0, v0, v26, 0x7060302 +; GFX10-NEXT: v_perm_b32 v2, v2, v28, 0x7060302 +; GFX10-NEXT: v_perm_b32 v3, v3, v29, 0x7060302 +; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 +; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 +; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 +; GFX10-NEXT: v_perm_b32 v9, v9, v51, 0x7060302 +; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302 +; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302 +; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302 +; GFX10-NEXT: v_perm_b32 v13, v13, v35, 0x7060302 +; GFX10-NEXT: v_perm_b32 v14, v14, v33, 0x7060302 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v31 -; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v31 -; GFX10-NEXT: v_mul_f32_e32 v20, v20, v21 -; GFX10-NEXT: v_mul_f32_e32 v15, v15, v22 -; GFX10-NEXT: v_perm_b32 v15, v15, v20, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v31 +; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v31 +; GFX10-NEXT: v_mul_f32_e32 v16, v32, v16 +; GFX10-NEXT: v_mul_f32_e32 v15, v15, v17 +; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fmul_v32bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v26 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v10 -; GFX11-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v27 -; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v21 -; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v23 -; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v24 -; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v8 -; GFX11-NEXT: 
v_and_b32_e32 v24, 0xffff0000, v24 -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v5 -; GFX11-NEXT: v_dual_mul_f32 v10, v10, v26 :: v_dual_and_b32 v5, 0xffff0000, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22 +; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11 +; GFX11-NEXT: v_dual_mul_f32 v0, v0, v16 :: v_dual_and_b32 v11, 0xffff0000, v11 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v28 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24 +; GFX11-NEXT: v_dual_mul_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24 +; GFX11-NEXT: v_dual_mul_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10 +; GFX11-NEXT: v_dual_mul_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10 +; GFX11-NEXT: v_dual_mul_f32 v2, v2, v18 :: v_dual_mul_f32 v3, v3, v19 +; GFX11-NEXT: v_dual_mul_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v49, 16, v26 +; GFX11-NEXT: v_dual_mul_f32 v9, v9, v25 :: v_dual_and_b32 v26, 0xffff0000, v26 +; GFX11-NEXT: v_mul_f32_e32 v6, v6, v22 +; GFX11-NEXT: v_dual_mul_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v37, 16, v28 ; GFX11-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v29 -; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_mul_f32_e32 v10, v10, v26 +; GFX11-NEXT: v_mul_f32_e32 v26, v52, v51 +; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 +; GFX11-NEXT: v_mul_f32_e32 v25, v54, v53 +; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 +; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302 +; 
GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v31 ; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v31 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13 ; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v30 -; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v14 -; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX11-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v9 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v20 -; GFX11-NEXT: v_dual_mul_f32 v11, v11, v27 :: v_dual_and_b32 v20, 0xffff0000, v20 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-NEXT: v_dual_mul_f32 v26, v71, v70 :: v_dual_lshlrev_b32 v49, 16, v4 -; GFX11-NEXT: v_dual_mul_f32 v13, v13, v29 :: v_dual_and_b32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v27 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_mul_f32_e32 v4, v4, v20 -; GFX11-NEXT: v_dual_mul_f32 v8, v8, v24 :: v_dual_mul_f32 v9, v9, v25 -; GFX11-NEXT: v_mul_f32_e32 v25, v69, v68 -; GFX11-NEXT: v_dual_mul_f32 v20, v51, v50 :: v_dual_lshlrev_b32 v39, 16, v3 -; GFX11-NEXT: v_mul_f32_e32 v27, v81, v80 -; GFX11-NEXT: v_mul_f32_e32 v12, v12, v28 -; GFX11-NEXT: v_dual_mul_f32 v28, v83, v82 :: v_dual_mul_f32 v29, v85, v84 -; GFX11-NEXT: v_dual_mul_f32 v6, v6, v22 :: v_dual_and_b32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_mul_f32_e32 v22, v55, v54 -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v18 -; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v17 -; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-NEXT: v_dual_mul_f32 v8, v8, v24 :: v_dual_and_b32 v27, 0xffff0000, v27 +; GFX11-NEXT: v_mul_f32_e32 v24, v64, v55 +; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; GFX11-NEXT: v_mul_f32_e32 v7, v7, v23 +; GFX11-NEXT: v_mul_f32_e32 v23, v66, v65 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mul_f32 v12, v12, v28 :: v_dual_and_b32 v29, 0xffff0000, v29 +; GFX11-NEXT: v_dual_mul_f32 v28, v48, v39 :: v_dual_lshlrev_b32 v33, 16, v30 +; GFX11-NEXT: v_dual_mul_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX11-NEXT: v_dual_mul_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14 +; GFX11-NEXT: v_dual_mul_f32 v27, v50, v49 :: v_dual_and_b32 v30, 0xffff0000, v30 +; GFX11-NEXT: v_mul_f32_e32 v29, v38, v37 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-NEXT: v_mul_f32_e32 v37, v86, v85 +; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 ; GFX11-NEXT: v_mul_f32_e32 v14, v14, v30 -; GFX11-NEXT: v_dual_mul_f32 v7, v7, v23 :: v_dual_and_b32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_mul_f32_e32 v23, v65, v64 -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX11-NEXT: v_dual_mul_f32 v24, v67, v66 :: v_dual_and_b32 v21, 0xffff0000, v21 -; 
GFX11-NEXT: v_mul_f32_e32 v2, v2, v18 -; GFX11-NEXT: v_dual_mul_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v32, 16, v16 -; GFX11-NEXT: v_mul_f32_e32 v18, v39, v38 -; GFX11-NEXT: v_dual_mul_f32 v3, v3, v19 :: v_dual_and_b32 v16, 0xffff0000, v16 -; GFX11-NEXT: v_mul_f32_e32 v19, v49, v48 -; GFX11-NEXT: v_mul_f32_e32 v17, v37, v36 -; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v0 -; GFX11-NEXT: v_dual_mul_f32 v5, v5, v21 :: v_dual_and_b32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_mul_f32_e32 v21, v53, v52 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v2, v2, v17, 0x7060302 -; GFX11-NEXT: v_perm_b32 v3, v3, v18, 0x7060302 -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v16 -; GFX11-NEXT: v_mul_f32_e32 v16, v35, v34 -; GFX11-NEXT: v_mul_f32_e32 v32, v33, v32 -; GFX11-NEXT: v_perm_b32 v4, v4, v19, 0x7060302 -; GFX11-NEXT: v_perm_b32 v5, v5, v20, 0x7060302 -; GFX11-NEXT: v_perm_b32 v6, v6, v21, 0x7060302 -; GFX11-NEXT: v_perm_b32 v1, v1, v16, 0x7060302 -; GFX11-NEXT: v_perm_b32 v0, v0, v32, 0x7060302 -; GFX11-NEXT: v_perm_b32 v7, v7, v22, 0x7060302 -; GFX11-NEXT: v_perm_b32 v8, v8, v23, 0x7060302 -; GFX11-NEXT: v_perm_b32 v9, v9, v24, 0x7060302 -; GFX11-NEXT: v_perm_b32 v10, v10, v25, 0x7060302 -; GFX11-NEXT: v_perm_b32 v11, v11, v26, 0x7060302 -; GFX11-NEXT: v_perm_b32 v12, v12, v27, 0x7060302 -; GFX11-NEXT: v_perm_b32 v13, v13, v28, 0x7060302 -; GFX11-NEXT: v_perm_b32 v14, v14, v29, 0x7060302 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v31 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mul_f32 v16, v86, v16 :: v_dual_and_b32 v17, 0xffff0000, v31 -; GFX11-NEXT: v_mul_f32_e32 v15, v15, v17 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mul_f32 v30, v36, v35 :: v_dual_mul_f32 v33, v34, v33 +; GFX11-NEXT: v_dual_mul_f32 v34, v80, v71 :: v_dual_mul_f32 v35, v82, v81 +; GFX11-NEXT: v_mul_f32_e32 v36, v84, v83 +; GFX11-NEXT: v_dual_mul_f32 v16, v32, v16 :: v_dual_mul_f32 v15, v15, v17 +; GFX11-NEXT: v_perm_b32 v0, v0, v37, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v2, v2, v35, 0x7060302 +; GFX11-NEXT: v_perm_b32 v1, v1, v36, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v3, v34, 0x7060302 +; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 +; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 +; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302 +; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302 +; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302 +; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302 +; GFX11-NEXT: v_perm_b32 v14, v14, v33, 0x7060302 ; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fmul <32 x bfloat> %a, %b @@ -13396,83 +13280,52 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-LABEL: v_minnum_v3bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 
16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v4, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minnum_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_min_f32_e32 v2, v4, v2 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_min_f32_e32 v4, v5, v4 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_min_f32_e32 v2, v7, v6 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 +; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_minnum_v3bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_min_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v4, v5, v4 -; GFX11-NEXT: v_min_f32_e32 v2, v7, v6 -; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 -; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) ret <3 x bfloat> %op } @@ 
-13539,82 +13392,81 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-LABEL: v_minnum_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_min_f32_e32 v4, v5, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v4, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minnum_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_min_f32_e32 v3, v5, v3 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_min_f32_e32 v2, v4, v2 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_min_f32_e32 v2, v7, v6 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v2, 
0x7060302 +; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minnum_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_min_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v4, v5, v4 -; GFX11-NEXT: v_min_f32_e32 v2, v7, v6 -; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_dual_min_f32 v4, v5, v4 :: v_dual_and_b32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX11-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) ret <4 x bfloat> %op @@ -13730,138 +13582,138 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-LABEL: v_minnum_v8bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_min_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v8, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_min_f32_e32 v7, v9, v7 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_min_f32_e32 v6, v9, v6 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v5 -; GFX8-NEXT: v_min_f32_e32 v4, v8, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_min_f32_e32 v4, 
v5, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v4, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GFX8-NEXT: v_min_f32_e32 v4, v5, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_min_f32_e32 v5, v9, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minnum_v8bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_min_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v8, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_min_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_min_f32_e32 v7, v9, v7 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_min_f32_e32 v4, v8, v4 +; GFX9-NEXT: v_min_f32_e32 v6, v9, v6 ; GFX9-NEXT: v_min_f32_e32 v1, v1, v5 -; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX9-NEXT: v_min_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_min_f32_e32 v2, v2, v5 -; GFX9-NEXT: v_perm_b32 v2, v2, v4, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GFX9-NEXT: v_min_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_min_f32_e32 v3, v3, v5 -; GFX9-NEXT: v_perm_b32 v3, v3, v4, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_min_f32_e32 v5, v9, v5 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4 
+; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_v8bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v8, v9, v8 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_min_f32_e32 v9, v11, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GFX10-NEXT: v_min_f32_e32 v8, v9, v8 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_min_f32_e32 v4, v11, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v10, v11, v10 +; GFX10-NEXT: v_min_f32_e32 v11, v13, v12 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v5 -; GFX10-NEXT: v_min_f32_e32 v5, v10, v9 ; GFX10-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX10-NEXT: v_min_f32_e32 v6, v12, v11 ; GFX10-NEXT: v_min_f32_e32 v3, v3, v7 -; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 -; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302 -; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302 +; GFX10-NEXT: v_perm_b32 v0, v0, v11, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v10, 0x7060302 +; GFX10-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 +; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minnum_v8bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_dual_min_f32 v8, v9, v8 :: v_dual_min_f32 v9, v11, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_min_f32 v1, v1, v5 :: v_dual_and_b32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX11-NEXT: v_min_f32_e32 v4, v11, v10 -; GFX11-NEXT: v_dual_min_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v7 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX11-NEXT: v_min_f32_e32 v5, v10, v9 -; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 -; GFX11-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX11-NEXT: v_min_f32_e32 v6, v12, v11 +; GFX11-NEXT: v_dual_min_f32 v1, v1, v5 :: v_dual_and_b32 v6, 0xffff0000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_min_f32 v2, v2, v6 :: v_dual_and_b32 v3, 0xffff0000, v3 ; GFX11-NEXT: v_min_f32_e32 v3, v3, v7 -; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v2, v2, v5, 0x7060302 -; GFX11-NEXT: v_perm_b32 v3, v3, v6, 0x7060302 +; GFX11-NEXT: v_dual_min_f32 v10, v11, v10 :: v_dual_min_f32 v11, v13, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v1, v1, v10, 0x7060302 +; GFX11-NEXT: v_perm_b32 v0, v0, v11, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) ret <8 x bfloat> %op @@ -14077,252 +13929,254 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-LABEL: v_minnum_v16bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_min_f32_e32 v16, v17, v16 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; GFX8-NEXT: v_min_f32_e32 v7, v7, v15 +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_min_f32_e32 v15, v17, v15 +; GFX8-NEXT: v_min_f32_e32 v6, v6, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: v_min_f32_e32 v14, v17, v14 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v13, v17, v13 +; GFX8-NEXT: v_min_f32_e32 v4, v4, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX8-NEXT: 
v_lshlrev_b32_e32 v17, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_min_f32_e32 v12, v17, v12 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_min_f32_e32 v11, v17, v11 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_min_f32_e32 v10, v17, v10 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v9 -; GFX8-NEXT: v_min_f32_e32 v8, v16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v8, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX8-NEXT: v_min_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v8, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX8-NEXT: v_min_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v11 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v3, v3, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v4 -; GFX8-NEXT: v_min_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v4, v4, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_alignbit_b32 v4, v4, v8, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v13 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v5 -; GFX8-NEXT: v_min_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v13 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v8, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v6 -; GFX8-NEXT: v_min_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v14 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX8-NEXT: v_min_f32_e32 v6, v6, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v6, v6, v8, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v15 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v7 -; GFX8-NEXT: v_min_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v15 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX8-NEXT: v_min_f32_e32 v7, v7, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v8 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_min_f32_e32 v9, v17, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16 +; 
GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16 +; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16 +; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16 +; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minnum_v16bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_min_f32_e32 v7, v7, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_min_f32_e32 v15, v17, v15 +; GFX9-NEXT: v_min_f32_e32 v6, v6, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_min_f32_e32 v14, v17, v14 +; GFX9-NEXT: v_min_f32_e32 v5, v5, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_min_f32_e32 v13, v17, v13 +; GFX9-NEXT: v_min_f32_e32 v4, v4, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_min_f32_e32 v12, v17, v12 +; GFX9-NEXT: v_min_f32_e32 v3, v3, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_min_f32_e32 v11, v17, v11 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_min_f32_e32 v10, v17, v10 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_min_f32_e32 v9, v17, v9 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v8 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_min_f32_e32 v8, v16, v8 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v9 -; GFX9-NEXT: v_perm_b32 v1, v1, v8, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX9-NEXT: v_min_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_min_f32_e32 v2, v2, v9 -; GFX9-NEXT: v_perm_b32 v2, v2, v8, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX9-NEXT: v_min_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v11 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, 
v3 -; GFX9-NEXT: v_min_f32_e32 v3, v3, v9 -; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v4 -; GFX9-NEXT: v_min_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_min_f32_e32 v4, v4, v9 -; GFX9-NEXT: v_perm_b32 v4, v4, v8, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v5 -; GFX9-NEXT: v_min_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v13 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_min_f32_e32 v5, v5, v9 -; GFX9-NEXT: v_perm_b32 v5, v5, v8, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v6 -; GFX9-NEXT: v_min_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v14 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_min_f32_e32 v6, v6, v9 -; GFX9-NEXT: v_perm_b32 v6, v6, v8, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v7 -; GFX9-NEXT: v_min_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v15 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_min_f32_e32 v7, v7, v9 -; GFX9-NEXT: v_perm_b32 v7, v7, v8, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4 +; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4 +; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4 +; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4 +; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4 +; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4 +; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_v16bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v5 ; GFX10-NEXT: v_min_f32_e32 v16, v17, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v9 -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX10-NEXT: v_min_f32_e32 v7, v7, v15 +; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v4 +; GFX10-NEXT: v_min_f32_e32 v15, v17, v15 +; GFX10-NEXT: v_min_f32_e32 v6, v6, v14 +; GFX10-NEXT: v_min_f32_e32 v14, v19, v18 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v13 +; GFX10-NEXT: v_min_f32_e32 v13, v21, v20 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v17, v18, v17 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v18, v20, v19 ; GFX10-NEXT: 
v_and_b32_e32 v10, 0xffff0000, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_min_f32_e32 v19, v20, v19 +; GFX10-NEXT: v_min_f32_e32 v20, v22, v21 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v8 -; GFX10-NEXT: v_min_f32_e32 v8, v18, v17 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v9 -; GFX10-NEXT: v_min_f32_e32 v9, v20, v19 ; GFX10-NEXT: v_min_f32_e32 v2, v2, v10 -; GFX10-NEXT: v_perm_b32 v0, v0, v16, 0x7060302 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v11 -; GFX10-NEXT: v_perm_b32 v1, v1, v8, 0x7060302 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GFX10-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v13 -; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; GFX10-NEXT: v_min_f32_e32 v8, v9, v8 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v10 -; GFX10-NEXT: v_min_f32_e32 v9, v16, v11 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v11 ; GFX10-NEXT: v_min_f32_e32 v4, v4, v12 -; GFX10-NEXT: v_min_f32_e32 v10, v18, v17 -; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v11 -; GFX10-NEXT: v_min_f32_e32 v11, v13, v12 -; GFX10-NEXT: v_min_f32_e32 v6, v6, v14 -; GFX10-NEXT: v_min_f32_e32 v12, v17, v16 -; GFX10-NEXT: v_min_f32_e32 v7, v7, v15 -; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 -; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x7060302 -; GFX10-NEXT: v_perm_b32 v5, v5, v10, 0x7060302 -; GFX10-NEXT: v_perm_b32 v6, v6, v11, 0x7060302 -; GFX10-NEXT: v_perm_b32 v7, v7, v12, 0x7060302 +; GFX10-NEXT: v_perm_b32 v0, v0, v20, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302 +; GFX10-NEXT: v_perm_b32 v2, v2, v18, 0x7060302 +; GFX10-NEXT: v_perm_b32 v3, v3, v17, 0x7060302 +; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 +; GFX10-NEXT: v_perm_b32 v5, v5, v14, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v6, v15, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minnum_v16bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_3) -; GFX11-NEXT: v_min_f32_e32 v2, v2, v10 -; GFX11-NEXT: v_dual_min_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v9 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_min_f32 v1, v1, v9 :: v_dual_and_b32 v10, 0xffff0000, v11 -; GFX11-NEXT: v_min_f32_e32 v9, v20, v19 -; GFX11-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_min_f32 v3, v3, v10 :: v_dual_and_b32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_min_f32_e32 v0, v0, v8 -; GFX11-NEXT: v_min_f32_e32 v8, v18, v17 -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-NEXT: v_perm_b32 v0, v0, v16, 0x7060302 -; GFX11-NEXT: v_perm_b32 v1, v1, v8, 0x7060302 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_dual_min_f32 v10, v18, v17 :: v_dual_lshlrev_b32 v17, 16, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_min_f32 v4, v4, v12 :: v_dual_and_b32 v7, 0xffff0000, v7 -; GFX11-NEXT: v_dual_min_f32 v8, v9, v8 :: v_dual_min_f32 v9, v16, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_min_f32 v4, v4, v12 :: v_dual_and_b32 v5, 0xffff0000, v5 ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15 -; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX11-NEXT: v_min_f32_e32 v5, v5, v13 +; GFX11-NEXT: v_min_f32_e32 v13, v21, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-NEXT: v_dual_min_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_min_f32 v0, v0, v8 :: v_dual_and_b32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 +; GFX11-NEXT: v_min_f32_e32 v7, v7, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 -; GFX11-NEXT: v_min_f32_e32 v5, v5, v11 -; GFX11-NEXT: v_min_f32_e32 v11, v13, v12 -; GFX11-NEXT: v_min_f32_e32 v12, v17, v16 -; GFX11-NEXT: v_dual_min_f32 v6, v6, v14 :: v_dual_min_f32 v7, v7, v15 -; GFX11-NEXT: v_perm_b32 v4, v4, v9, 0x7060302 -; GFX11-NEXT: v_perm_b32 v5, v5, v10, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v6, v6, v11, 0x7060302 -; GFX11-NEXT: v_perm_b32 v7, v7, v12, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x7060302 +; GFX11-NEXT: v_min_f32_e32 v15, v17, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_min_f32 v6, v6, v14 :: v_dual_lshlrev_b32 v17, 16, v11 +; GFX11-NEXT: v_min_f32_e32 v14, v19, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_min_f32 v17, v18, v17 :: v_dual_and_b32 v10, 0xffff0000, v10 +; GFX11-NEXT: v_perm_b32 v5, v5, v14, 0x7060302 +; GFX11-NEXT: v_perm_b32 v6, v6, v15, 0x7060302 +; GFX11-NEXT: v_min_f32_e32 v3, v3, v11 +; GFX11-NEXT: v_dual_min_f32 v18, v20, v19 :: v_dual_lshlrev_b32 v19, 16, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: v_dual_min_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_perm_b32 v3, v3, v17, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_min_f32 v19, v20, v19 :: v_dual_min_f32 v20, v22, v21 +; GFX11-NEXT: v_min_f32_e32 v1, v1, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v2, v2, v18, 0x7060302 +; GFX11-NEXT: v_perm_b32 v0, v0, v20, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v1, v1, v19, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) ret <16 x bfloat> %op @@ -14858,483 +14712,480 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-LABEL: v_minnum_v32bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v16 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX8-NEXT: v_min_f32_e32 v31, v32, v31 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v31, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_min_f32_e32 v1, v1, v17 -; GFX8-NEXT: v_min_f32_e32 v16, v31, v16 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GFX8-NEXT: v_min_f32_e32 v16, v17, v16 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v19 -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; GFX8-NEXT: v_min_f32_e32 v16, v17, v16 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v19 -; GFX8-NEXT: 
v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v3, v3, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v3, v3, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v20 -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4 -; GFX8-NEXT: v_min_f32_e32 v16, v17, v16 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v4, v4, v17 -; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_alignbit_b32 v4, v4, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v21 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; GFX8-NEXT: v_min_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v21 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v22 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v6 -; GFX8-NEXT: v_min_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v22 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX8-NEXT: v_min_f32_e32 v6, v6, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v6, v6, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v23 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; GFX8-NEXT: v_min_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v23 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX8-NEXT: v_min_f32_e32 v7, v7, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v24 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; GFX8-NEXT: v_min_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v24 +; GFX8-NEXT: v_min_f32_e32 v30, v14, v30 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX8-NEXT: v_min_f32_e32 v14, v32, v14 +; GFX8-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX8-NEXT: v_min_f32_e32 v29, v32, v29 +; GFX8-NEXT: v_min_f32_e32 v12, v12, v28 +; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX8-NEXT: v_min_f32_e32 v28, v32, v28 +; GFX8-NEXT: v_min_f32_e32 v11, v11, v27 +; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX8-NEXT: v_min_f32_e32 v27, v32, v27 +; GFX8-NEXT: v_min_f32_e32 v10, v10, v26 +; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX8-NEXT: v_min_f32_e32 v26, v32, v26 +; GFX8-NEXT: v_min_f32_e32 v9, v9, v25 +; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX8-NEXT: v_min_f32_e32 v8, v8, v18 +; GFX8-NEXT: v_min_f32_e32 v8, v8, v24 +; GFX8-NEXT: buffer_load_dword v24, off, s[0:3], s32 +; GFX8-NEXT: 
v_min_f32_e32 v25, v32, v25 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_alignbit_b32 v8, v8, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v25 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; GFX8-NEXT: v_min_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v25 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX8-NEXT: v_min_f32_e32 v9, v9, v18 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX8-NEXT: v_alignbit_b32 v9, v9, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v26 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v10 -; GFX8-NEXT: v_min_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v26 -; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX8-NEXT: v_min_f32_e32 v10, v10, v18 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: v_alignbit_b32 v10, v10, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v27 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v11 -; GFX8-NEXT: v_min_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v27 -; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX8-NEXT: v_min_f32_e32 v11, v11, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: v_alignbit_b32 v11, v11, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v28 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v12 -; GFX8-NEXT: v_min_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v28 -; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX8-NEXT: v_min_f32_e32 v12, v12, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_alignbit_b32 v12, v12, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v29 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; GFX8-NEXT: v_min_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v29 -; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX8-NEXT: v_min_f32_e32 v13, v13, v18 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX8-NEXT: v_alignbit_b32 v13, v13, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v30 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v14 -; GFX8-NEXT: v_min_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v30 -; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX8-NEXT: v_min_f32_e32 v14, v14, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX8-NEXT: v_alignbit_b32 v14, v14, v16, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16 +; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16 +; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16 +; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16 +; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16 +; GFX8-NEXT: v_alignbit_b32 v13, v13, v14, 16 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v24 +; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX8-NEXT: v_min_f32_e32 v32, v32, v33 +; GFX8-NEXT: v_min_f32_e32 v15, v15, v24 +; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: v_min_f32_e32 v24, v33, v24 +; GFX8-NEXT: v_min_f32_e32 v7, v7, v23 +; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_min_f32_e32 v23, v33, v23 +; 
GFX8-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: v_min_f32_e32 v22, v33, v22 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v21, v33, v21 +; GFX8-NEXT: v_min_f32_e32 v4, v4, v20 +; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_min_f32_e32 v20, v33, v20 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_min_f32_e32 v19, v33, v19 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v18 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX8-NEXT: v_min_f32_e32 v15, v15, v17 -; GFX8-NEXT: v_min_f32_e32 v16, v18, v16 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_min_f32_e32 v18, v33, v18 +; GFX8-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v16 +; GFX8-NEXT: v_min_f32_e32 v17, v33, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX8-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16 +; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16 +; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16 +; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16 +; GFX8-NEXT: v_alignbit_b32 v14, v16, v31, 16 +; GFX8-NEXT: v_alignbit_b32 v15, v15, v32, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minnum_v32bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX9-NEXT: v_min_f32_e32 v31, v32, v31 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v16 +; GFX9-NEXT: v_min_f32_e32 v14, v14, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX9-NEXT: v_min_f32_e32 
v30, v32, v30 +; GFX9-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_min_f32_e32 v29, v32, v29 +; GFX9-NEXT: v_min_f32_e32 v12, v12, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_min_f32_e32 v28, v32, v28 +; GFX9-NEXT: v_min_f32_e32 v11, v11, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_min_f32_e32 v27, v32, v27 +; GFX9-NEXT: v_min_f32_e32 v10, v10, v26 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_min_f32_e32 v26, v32, v26 +; GFX9-NEXT: v_min_f32_e32 v9, v9, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_min_f32_e32 v8, v8, v24 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 +; GFX9-NEXT: v_min_f32_e32 v25, v32, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v31, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4 +; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4 +; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4 +; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4 +; GFX9-NEXT: v_perm_b32 v12, v12, v29, s4 +; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4 +; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX9-NEXT: v_min_f32_e32 v32, v32, v33 +; GFX9-NEXT: v_min_f32_e32 v15, v15, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_min_f32_e32 v24, v33, v24 +; GFX9-NEXT: v_min_f32_e32 v7, v7, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_min_f32_e32 v23, v33, v23 +; GFX9-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_min_f32_e32 v22, v33, v22 +; GFX9-NEXT: v_min_f32_e32 v5, v5, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_min_f32_e32 v21, v33, v21 +; GFX9-NEXT: v_min_f32_e32 v4, v4, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_min_f32_e32 v20, v33, v20 +; GFX9-NEXT: v_min_f32_e32 
v3, v3, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_min_f32_e32 v19, v33, v19 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_min_f32_e32 v16, v31, v16 +; GFX9-NEXT: v_min_f32_e32 v18, v33, v18 ; GFX9-NEXT: v_min_f32_e32 v1, v1, v17 -; GFX9-NEXT: v_perm_b32 v1, v1, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_min_f32_e32 v2, v2, v17 -; GFX9-NEXT: v_perm_b32 v2, v2, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v19 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_min_f32_e32 v3, v3, v17 -; GFX9-NEXT: v_perm_b32 v3, v3, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v20 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 -; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_min_f32_e32 v4, v4, v17 -; GFX9-NEXT: v_perm_b32 v4, v4, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 -; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v21 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_min_f32_e32 v5, v5, v17 -; GFX9-NEXT: v_perm_b32 v5, v5, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v22 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 -; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v22 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_min_f32_e32 v6, v6, v17 -; GFX9-NEXT: v_perm_b32 v6, v6, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v23 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_min_f32_e32 v7, v7, v17 -; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v24 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v8 -; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v24 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_min_f32_e32 v8, v8, v17 -; GFX9-NEXT: v_perm_b32 v8, v8, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v9 -; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v25 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_min_f32_e32 v9, v9, v17 -; GFX9-NEXT: v_perm_b32 v9, v9, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v26 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v10 -; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v26 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_min_f32_e32 v10, v10, v17 -; GFX9-NEXT: v_perm_b32 v10, v10, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v11 -; 
GFX9-NEXT: v_min_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v27 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_min_f32_e32 v11, v11, v17 -; GFX9-NEXT: v_perm_b32 v11, v11, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v28 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v12 -; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v28 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_min_f32_e32 v12, v12, v17 -; GFX9-NEXT: v_perm_b32 v12, v12, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v29 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v13 -; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v29 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_min_f32_e32 v13, v13, v17 -; GFX9-NEXT: v_perm_b32 v13, v13, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v30 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v14 -; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v30 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_min_f32_e32 v14, v14, v17 -; GFX9-NEXT: v_perm_b32 v14, v14, v16, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v15 -; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_min_f32_e32 v15, v15, v17 -; GFX9-NEXT: v_perm_b32 v15, v15, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_min_f32_e32 v17, v33, v17 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v16 +; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4 +; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4 +; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4 +; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4 +; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4 +; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4 +; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4 +; GFX9-NEXT: v_perm_b32 v15, v15, v32, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_v32bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v21 -; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v5 -; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v22 -; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v23 -; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v7 -; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v17 -; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11 +; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX10-NEXT: 
v_and_b32_e32 v10, 0xffff0000, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX10-NEXT: v_min_f32_e32 v39, v48, v39 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17 +; GFX10-NEXT: v_min_f32_e32 v11, v11, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v18 -; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v19 -; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v20 -; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v21 -; GFX10-NEXT: v_min_f32_e32 v21, v53, v52 -; GFX10-NEXT: v_min_f32_e32 v6, v6, v22 -; GFX10-NEXT: v_min_f32_e32 v22, v55, v54 -; GFX10-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v24 -; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v8 -; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v25 -; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v9 +; GFX10-NEXT: v_min_f32_e32 v49, v50, v49 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16 +; GFX10-NEXT: v_min_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9 ; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v26 -; GFX10-NEXT: v_min_f32_e32 v32, v33, v32 -; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v10 -; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24 +; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8 +; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22 +; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21 +; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_min_f32_e32 v33, v34, v33 +; GFX10-NEXT: 
v_lshlrev_b32_e32 v34, 16, v20 +; GFX10-NEXT: v_min_f32_e32 v14, v14, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v35, v36, v35 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19 +; GFX10-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v37, v38, v37 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18 +; GFX10-NEXT: v_min_f32_e32 v12, v12, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v27 -; GFX10-NEXT: v_min_f32_e32 v34, v35, v34 -; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v11 -; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v17 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v28 -; GFX10-NEXT: v_min_f32_e32 v36, v37, v36 -; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v12 -; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX10-NEXT: v_min_f32_e32 v51, v52, v51 +; GFX10-NEXT: v_min_f32_e32 v9, v9, v25 +; GFX10-NEXT: v_min_f32_e32 v25, v54, v53 +; GFX10-NEXT: v_min_f32_e32 v8, v8, v24 +; GFX10-NEXT: v_min_f32_e32 v24, v64, v55 +; GFX10-NEXT: v_min_f32_e32 v7, v7, v23 +; GFX10-NEXT: v_min_f32_e32 v23, v66, v65 +; GFX10-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX10-NEXT: v_min_f32_e32 v22, v68, v67 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v21 +; GFX10-NEXT: v_min_f32_e32 v21, v30, v34 +; GFX10-NEXT: v_min_f32_e32 v29, v29, v36 +; GFX10-NEXT: v_min_f32_e32 v28, v28, v38 +; GFX10-NEXT: v_min_f32_e32 v27, v27, v48 +; GFX10-NEXT: v_min_f32_e32 v26, v26, v50 ; GFX10-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; GFX10-NEXT: v_min_f32_e32 v38, v39, v38 -; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v13 -; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX10-NEXT: v_min_f32_e32 v3, v3, v19 -; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v30 -; GFX10-NEXT: v_min_f32_e32 v48, v49, v48 -; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v14 -; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX10-NEXT: v_min_f32_e32 v4, v4, v20 -; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v15 -; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX10-NEXT: v_perm_b32 v6, v6, v21, 0x7060302 -; GFX10-NEXT: v_perm_b32 v7, v7, v22, 0x7060302 -; GFX10-NEXT: v_min_f32_e32 v50, v51, v50 -; GFX10-NEXT: v_min_f32_e32 v23, v65, v64 -; GFX10-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX10-NEXT: v_min_f32_e32 v24, v67, v66 -; GFX10-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX10-NEXT: v_min_f32_e32 v25, v33, v68 -; GFX10-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX10-NEXT: v_min_f32_e32 v16, v35, v16 -; GFX10-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX10-NEXT: v_min_f32_e32 v17, v37, v17 -; GFX10-NEXT: v_min_f32_e32 v12, v12, v28 -; GFX10-NEXT: v_min_f32_e32 v18, v39, v18 -; GFX10-NEXT: v_min_f32_e32 v13, v13, v29 -; GFX10-NEXT: v_min_f32_e32 v19, v49, v19 -; GFX10-NEXT: v_min_f32_e32 v14, v14, v30 -; GFX10-NEXT: v_perm_b32 v0, v0, v32, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v34, 0x7060302 -; GFX10-NEXT: v_perm_b32 v2, v2, v36, 0x7060302 -; GFX10-NEXT: v_perm_b32 v3, 
v3, v38, 0x7060302 -; GFX10-NEXT: v_perm_b32 v4, v4, v48, 0x7060302 -; GFX10-NEXT: v_perm_b32 v5, v5, v50, 0x7060302 -; GFX10-NEXT: v_perm_b32 v8, v8, v23, 0x7060302 -; GFX10-NEXT: v_perm_b32 v9, v9, v24, 0x7060302 -; GFX10-NEXT: v_perm_b32 v10, v10, v25, 0x7060302 -; GFX10-NEXT: v_perm_b32 v11, v11, v16, 0x7060302 -; GFX10-NEXT: v_perm_b32 v12, v12, v17, 0x7060302 -; GFX10-NEXT: v_perm_b32 v13, v13, v18, 0x7060302 -; GFX10-NEXT: v_perm_b32 v14, v14, v19, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v27, 0x7060302 +; GFX10-NEXT: v_perm_b32 v0, v0, v26, 0x7060302 +; GFX10-NEXT: v_perm_b32 v2, v2, v28, 0x7060302 +; GFX10-NEXT: v_perm_b32 v3, v3, v29, 0x7060302 +; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 +; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 +; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 +; GFX10-NEXT: v_perm_b32 v9, v9, v51, 0x7060302 +; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302 +; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302 +; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302 +; GFX10-NEXT: v_perm_b32 v13, v13, v35, 0x7060302 +; GFX10-NEXT: v_perm_b32 v14, v14, v33, 0x7060302 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v31 -; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v31 -; GFX10-NEXT: v_min_f32_e32 v20, v20, v21 -; GFX10-NEXT: v_min_f32_e32 v15, v15, v22 -; GFX10-NEXT: v_perm_b32 v15, v15, v20, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v31 +; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v31 +; GFX10-NEXT: v_min_f32_e32 v16, v32, v16 +; GFX10-NEXT: v_min_f32_e32 v15, v15, v17 +; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minnum_v32bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v26 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v10 -; GFX11-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v27 -; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v21 -; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v23 -; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v24 -; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v8 -; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v5 -; GFX11-NEXT: v_dual_min_f32 v10, v10, v26 :: v_dual_and_b32 v5, 0xffff0000, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22 +; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11 +; GFX11-NEXT: v_dual_min_f32 v0, v0, v16 :: v_dual_and_b32 v11, 0xffff0000, v11 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v67, 
16, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v28 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24 +; GFX11-NEXT: v_dual_min_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24 +; GFX11-NEXT: v_dual_min_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10 +; GFX11-NEXT: v_dual_min_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10 +; GFX11-NEXT: v_dual_min_f32 v2, v2, v18 :: v_dual_min_f32 v3, v3, v19 +; GFX11-NEXT: v_dual_min_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v49, 16, v26 +; GFX11-NEXT: v_dual_min_f32 v9, v9, v25 :: v_dual_and_b32 v26, 0xffff0000, v26 +; GFX11-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX11-NEXT: v_dual_min_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v37, 16, v28 ; GFX11-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v29 -; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_min_f32_e32 v10, v10, v26 +; GFX11-NEXT: v_min_f32_e32 v26, v52, v51 +; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 +; GFX11-NEXT: v_min_f32_e32 v25, v54, v53 +; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 +; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v31 ; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v31 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13 ; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v30 -; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v14 -; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX11-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v9 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v20 -; GFX11-NEXT: v_dual_min_f32 v11, v11, v27 :: v_dual_and_b32 v20, 0xffff0000, v20 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-NEXT: v_dual_min_f32 v26, v71, v70 :: v_dual_lshlrev_b32 v49, 16, v4 -; GFX11-NEXT: 
v_dual_min_f32 v13, v13, v29 :: v_dual_and_b32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v27 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_min_f32_e32 v4, v4, v20 -; GFX11-NEXT: v_dual_min_f32 v8, v8, v24 :: v_dual_min_f32 v9, v9, v25 -; GFX11-NEXT: v_min_f32_e32 v25, v69, v68 -; GFX11-NEXT: v_dual_min_f32 v20, v51, v50 :: v_dual_lshlrev_b32 v39, 16, v3 -; GFX11-NEXT: v_min_f32_e32 v27, v81, v80 -; GFX11-NEXT: v_min_f32_e32 v12, v12, v28 -; GFX11-NEXT: v_dual_min_f32 v28, v83, v82 :: v_dual_min_f32 v29, v85, v84 -; GFX11-NEXT: v_dual_min_f32 v6, v6, v22 :: v_dual_and_b32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_min_f32_e32 v22, v55, v54 -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v18 -; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v17 -; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-NEXT: v_dual_min_f32 v8, v8, v24 :: v_dual_and_b32 v27, 0xffff0000, v27 +; GFX11-NEXT: v_min_f32_e32 v24, v64, v55 +; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; GFX11-NEXT: v_min_f32_e32 v7, v7, v23 +; GFX11-NEXT: v_min_f32_e32 v23, v66, v65 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_min_f32 v12, v12, v28 :: v_dual_and_b32 v29, 0xffff0000, v29 +; GFX11-NEXT: v_dual_min_f32 v28, v48, v39 :: v_dual_lshlrev_b32 v33, 16, v30 +; GFX11-NEXT: v_dual_min_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX11-NEXT: v_dual_min_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14 +; GFX11-NEXT: v_dual_min_f32 v27, v50, v49 :: v_dual_and_b32 v30, 0xffff0000, v30 +; GFX11-NEXT: v_min_f32_e32 v29, v38, v37 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-NEXT: v_min_f32_e32 v37, v86, v85 +; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 ; GFX11-NEXT: v_min_f32_e32 v14, v14, v30 -; GFX11-NEXT: v_dual_min_f32 v7, v7, v23 :: v_dual_and_b32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_min_f32_e32 v23, v65, v64 -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX11-NEXT: v_dual_min_f32 v24, v67, v66 :: v_dual_and_b32 v21, 0xffff0000, v21 -; GFX11-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX11-NEXT: v_dual_min_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v32, 16, v16 -; GFX11-NEXT: v_min_f32_e32 v18, v39, v38 -; GFX11-NEXT: v_dual_min_f32 v3, v3, v19 :: v_dual_and_b32 v16, 0xffff0000, v16 -; GFX11-NEXT: v_min_f32_e32 v19, v49, v48 -; GFX11-NEXT: v_min_f32_e32 v17, v37, v36 -; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v0 -; GFX11-NEXT: v_dual_min_f32 v5, v5, v21 :: v_dual_and_b32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_min_f32_e32 v21, v53, v52 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v2, v2, v17, 0x7060302 -; GFX11-NEXT: v_perm_b32 v3, v3, v18, 0x7060302 -; GFX11-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX11-NEXT: v_min_f32_e32 v16, v35, v34 -; GFX11-NEXT: v_min_f32_e32 v32, v33, v32 -; GFX11-NEXT: v_perm_b32 v4, v4, v19, 0x7060302 -; GFX11-NEXT: v_perm_b32 v5, v5, v20, 0x7060302 -; GFX11-NEXT: v_perm_b32 v6, v6, v21, 0x7060302 -; GFX11-NEXT: v_perm_b32 v1, v1, v16, 0x7060302 -; GFX11-NEXT: v_perm_b32 v0, v0, v32, 0x7060302 -; GFX11-NEXT: v_perm_b32 v7, v7, v22, 0x7060302 -; GFX11-NEXT: v_perm_b32 
v8, v8, v23, 0x7060302 -; GFX11-NEXT: v_perm_b32 v9, v9, v24, 0x7060302 -; GFX11-NEXT: v_perm_b32 v10, v10, v25, 0x7060302 -; GFX11-NEXT: v_perm_b32 v11, v11, v26, 0x7060302 -; GFX11-NEXT: v_perm_b32 v12, v12, v27, 0x7060302 -; GFX11-NEXT: v_perm_b32 v13, v13, v28, 0x7060302 -; GFX11-NEXT: v_perm_b32 v14, v14, v29, 0x7060302 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v31 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_min_f32 v16, v86, v16 :: v_dual_and_b32 v17, 0xffff0000, v31 -; GFX11-NEXT: v_min_f32_e32 v15, v15, v17 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_min_f32 v30, v36, v35 :: v_dual_min_f32 v33, v34, v33 +; GFX11-NEXT: v_dual_min_f32 v34, v80, v71 :: v_dual_min_f32 v35, v82, v81 +; GFX11-NEXT: v_min_f32_e32 v36, v84, v83 +; GFX11-NEXT: v_dual_min_f32 v16, v32, v16 :: v_dual_min_f32 v15, v15, v17 +; GFX11-NEXT: v_perm_b32 v0, v0, v37, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v2, v2, v35, 0x7060302 +; GFX11-NEXT: v_perm_b32 v1, v1, v36, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v3, v34, 0x7060302 +; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 +; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 +; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302 +; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302 +; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302 +; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302 +; GFX11-NEXT: v_perm_b32 v14, v14, v33, 0x7060302 ; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) @@ -15553,83 +15404,52 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-LABEL: v_maxnum_v3bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v4, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maxnum_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX9-NEXT: v_max_f32_e32 v0, 
v0, v2 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_max_f32_e32 v2, v4, v2 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maxnum_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_max_f32_e32 v4, v5, v4 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_max_f32_e32 v2, v7, v6 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 +; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_maxnum_v3bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v4, v5, v4 -; GFX11-NEXT: v_max_f32_e32 v2, v7, v6 -; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 -; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) ret <3 x bfloat> %op } @@ -15696,82 +15516,81 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-LABEL: v_maxnum_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_max_f32_e32 v4, v5, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_max_f32_e32 v4, v5, v4 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: 
v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maxnum_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_max_f32_e32 v3, v5, v3 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_max_f32_e32 v2, v4, v2 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maxnum_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_max_f32_e32 v2, v7, v6 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 +; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maxnum_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v4, v5, v4 -; GFX11-NEXT: v_max_f32_e32 v2, v7, v6 -; 
GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_dual_max_f32 v4, v5, v4 :: v_dual_and_b32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX11-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) ret <4 x bfloat> %op @@ -15887,138 +15706,138 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-LABEL: v_maxnum_v8bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_max_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v8, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX8-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_max_f32_e32 v7, v9, v7 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_max_f32_e32 v6, v9, v6 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v5 -; GFX8-NEXT: v_max_f32_e32 v4, v8, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_max_f32_e32 v4, v5, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v4, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GFX8-NEXT: v_max_f32_e32 v4, v5, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_max_f32_e32 v5, v9, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX8-NEXT: v_alignbit_b32 
v1, v1, v6, 16 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maxnum_v8bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_max_f32_e32 v7, v9, v7 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_max_f32_e32 v6, v9, v6 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_max_f32_e32 v5, v9, v5 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v8, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_max_f32_e32 v4, v8, v4 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v5 -; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX9-NEXT: v_max_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v5 -; GFX9-NEXT: v_perm_b32 v2, v2, v4, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GFX9-NEXT: v_max_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v5 -; GFX9-NEXT: v_perm_b32 v3, v3, v4, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4 +; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maxnum_v8bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_max_f32_e32 v9, v11, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GFX10-NEXT: v_max_f32_e32 v8, v9, v8 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v4, 
0xffff0000, v4 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_max_f32_e32 v4, v11, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v10, v11, v10 +; GFX10-NEXT: v_max_f32_e32 v11, v13, v12 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v5 -; GFX10-NEXT: v_max_f32_e32 v5, v10, v9 ; GFX10-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX10-NEXT: v_max_f32_e32 v6, v12, v11 ; GFX10-NEXT: v_max_f32_e32 v3, v3, v7 -; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 -; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302 -; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302 +; GFX10-NEXT: v_perm_b32 v0, v0, v11, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v10, 0x7060302 +; GFX10-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 +; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maxnum_v8bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_dual_max_f32 v8, v9, v8 :: v_dual_max_f32 v9, v11, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v1, v1, v5 :: v_dual_and_b32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX11-NEXT: v_max_f32_e32 v4, v11, v10 -; GFX11-NEXT: v_dual_max_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v7 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX11-NEXT: v_max_f32_e32 v5, v10, v9 -; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX11-NEXT: v_max_f32_e32 v6, v12, v11 +; GFX11-NEXT: v_dual_max_f32 v1, v1, v5 :: v_dual_and_b32 v6, 0xffff0000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_max_f32 v2, v2, v6 :: v_dual_and_b32 v3, 0xffff0000, v3 ; GFX11-NEXT: v_max_f32_e32 v3, v3, v7 -; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v2, v2, v5, 0x7060302 -; GFX11-NEXT: v_perm_b32 v3, v3, v6, 0x7060302 +; GFX11-NEXT: v_dual_max_f32 v10, v11, v10 :: v_dual_max_f32 v11, v13, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v1, v1, v10, 0x7060302 +; GFX11-NEXT: v_perm_b32 v0, v0, v11, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) ret <8 x bfloat> %op @@ -16234,252 +16053,254 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-LABEL: v_maxnum_v16bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_max_f32_e32 v16, v17, v16 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; GFX8-NEXT: v_max_f32_e32 v7, v7, v15 +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_max_f32_e32 v15, v17, v15 +; GFX8-NEXT: v_max_f32_e32 v6, v6, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: v_max_f32_e32 v14, v17, v14 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v13, v17, v13 +; GFX8-NEXT: v_max_f32_e32 v4, v4, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_max_f32_e32 v12, v17, v12 +; GFX8-NEXT: v_max_f32_e32 v3, v3, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_max_f32_e32 v11, v17, v11 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_max_f32_e32 v10, v17, v10 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v9 -; GFX8-NEXT: v_max_f32_e32 v8, v16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v8, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX8-NEXT: v_max_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_max_f32_e32 v2, 
v2, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v8, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX8-NEXT: v_max_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v11 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_max_f32_e32 v3, v3, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v4 -; GFX8-NEXT: v_max_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v4, v4, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_alignbit_b32 v4, v4, v8, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v13 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v5 -; GFX8-NEXT: v_max_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v13 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v8, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v6 -; GFX8-NEXT: v_max_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v14 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX8-NEXT: v_max_f32_e32 v6, v6, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v6, v6, v8, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v15 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v7 -; GFX8-NEXT: v_max_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v15 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX8-NEXT: v_max_f32_e32 v7, v7, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v8 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_max_f32_e32 v9, v17, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16 +; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16 +; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16 +; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maxnum_v16bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_max_f32_e32 v7, v7, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_max_f32_e32 v15, v17, v15 +; GFX9-NEXT: v_max_f32_e32 v6, v6, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 
+; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_max_f32_e32 v14, v17, v14 +; GFX9-NEXT: v_max_f32_e32 v5, v5, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_max_f32_e32 v13, v17, v13 +; GFX9-NEXT: v_max_f32_e32 v4, v4, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_max_f32_e32 v12, v17, v12 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_max_f32_e32 v11, v17, v11 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_max_f32_e32 v10, v17, v10 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_max_f32_e32 v9, v17, v9 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v8 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_max_f32_e32 v8, v16, v8 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v9 -; GFX9-NEXT: v_perm_b32 v1, v1, v8, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX9-NEXT: v_max_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v9 -; GFX9-NEXT: v_perm_b32 v2, v2, v8, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX9-NEXT: v_max_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v11 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v9 -; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v4 -; GFX9-NEXT: v_max_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_max_f32_e32 v4, v4, v9 -; GFX9-NEXT: v_perm_b32 v4, v4, v8, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v5 -; GFX9-NEXT: v_max_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v13 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_max_f32_e32 v5, v5, v9 -; GFX9-NEXT: v_perm_b32 v5, v5, v8, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v6 -; GFX9-NEXT: v_max_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v14 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_max_f32_e32 v6, v6, v9 -; GFX9-NEXT: v_perm_b32 v6, v6, v8, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v7 -; GFX9-NEXT: v_max_f32_e32 v8, v9, v8 -; 
GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v15 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_max_f32_e32 v7, v7, v9 -; GFX9-NEXT: v_perm_b32 v7, v7, v8, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4 +; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4 +; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4 +; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4 +; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4 +; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4 +; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maxnum_v16bf16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v5 ; GFX10-NEXT: v_max_f32_e32 v16, v17, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v9 -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX10-NEXT: v_max_f32_e32 v7, v7, v15 +; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v4 +; GFX10-NEXT: v_max_f32_e32 v15, v17, v15 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v14 +; GFX10-NEXT: v_max_f32_e32 v14, v19, v18 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v13 +; GFX10-NEXT: v_max_f32_e32 v13, v21, v20 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v17, v18, v17 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v18, v20, v19 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_max_f32_e32 v19, v20, v19 +; GFX10-NEXT: v_max_f32_e32 v20, v22, v21 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v8 -; GFX10-NEXT: v_max_f32_e32 v8, v18, v17 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v9 -; GFX10-NEXT: v_max_f32_e32 v9, v20, v19 ; GFX10-NEXT: v_max_f32_e32 v2, v2, v10 -; GFX10-NEXT: v_perm_b32 v0, v0, v16, 0x7060302 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v11 -; GFX10-NEXT: v_perm_b32 v1, v1, v8, 0x7060302 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GFX10-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; 
GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v13 -; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; GFX10-NEXT: v_max_f32_e32 v8, v9, v8 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v10 -; GFX10-NEXT: v_max_f32_e32 v9, v16, v11 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v11 ; GFX10-NEXT: v_max_f32_e32 v4, v4, v12 -; GFX10-NEXT: v_max_f32_e32 v10, v18, v17 -; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v11 -; GFX10-NEXT: v_max_f32_e32 v11, v13, v12 -; GFX10-NEXT: v_max_f32_e32 v6, v6, v14 -; GFX10-NEXT: v_max_f32_e32 v12, v17, v16 -; GFX10-NEXT: v_max_f32_e32 v7, v7, v15 -; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 -; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x7060302 -; GFX10-NEXT: v_perm_b32 v5, v5, v10, 0x7060302 -; GFX10-NEXT: v_perm_b32 v6, v6, v11, 0x7060302 -; GFX10-NEXT: v_perm_b32 v7, v7, v12, 0x7060302 +; GFX10-NEXT: v_perm_b32 v0, v0, v20, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302 +; GFX10-NEXT: v_perm_b32 v2, v2, v18, 0x7060302 +; GFX10-NEXT: v_perm_b32 v3, v3, v17, 0x7060302 +; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 +; GFX10-NEXT: v_perm_b32 v5, v5, v14, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v6, v15, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maxnum_v16bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_max_f32_e32 v2, v2, v10 -; GFX11-NEXT: v_dual_max_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v9 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v1, v1, v9 :: v_dual_and_b32 v10, 0xffff0000, v11 -; GFX11-NEXT: v_max_f32_e32 v9, v20, v19 -; GFX11-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v3, v3, v10 :: v_dual_and_b32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_max_f32_e32 v0, v0, v8 -; GFX11-NEXT: v_max_f32_e32 v8, v18, v17 -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-NEXT: v_perm_b32 v0, v0, v16, 0x7060302 -; GFX11-NEXT: v_perm_b32 v1, v1, v8, 0x7060302 -; GFX11-NEXT: 
v_lshlrev_b32_e32 v8, 16, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_dual_max_f32 v10, v18, v17 :: v_dual_lshlrev_b32 v17, 16, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_max_f32 v4, v4, v12 :: v_dual_and_b32 v7, 0xffff0000, v7 -; GFX11-NEXT: v_dual_max_f32 v8, v9, v8 :: v_dual_max_f32 v9, v16, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_max_f32 v4, v4, v12 :: v_dual_and_b32 v5, 0xffff0000, v5 ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15 -; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX11-NEXT: v_max_f32_e32 v5, v5, v13 +; GFX11-NEXT: v_max_f32_e32 v13, v21, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-NEXT: v_dual_max_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_max_f32 v0, v0, v8 :: v_dual_and_b32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 +; GFX11-NEXT: v_max_f32_e32 v7, v7, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 -; GFX11-NEXT: v_max_f32_e32 v5, v5, v11 -; GFX11-NEXT: v_max_f32_e32 v11, v13, v12 -; GFX11-NEXT: v_max_f32_e32 v12, v17, v16 -; GFX11-NEXT: v_dual_max_f32 v6, v6, v14 :: v_dual_max_f32 v7, v7, v15 -; GFX11-NEXT: v_perm_b32 v4, v4, v9, 0x7060302 -; GFX11-NEXT: v_perm_b32 v5, v5, v10, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v6, v6, v11, 0x7060302 -; GFX11-NEXT: v_perm_b32 v7, v7, v12, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x7060302 +; GFX11-NEXT: v_max_f32_e32 v15, v17, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_max_f32 v6, v6, v14 :: v_dual_lshlrev_b32 v17, 16, v11 +; GFX11-NEXT: v_max_f32_e32 v14, v19, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_max_f32 v17, v18, v17 :: v_dual_and_b32 v10, 0xffff0000, v10 +; GFX11-NEXT: v_perm_b32 v5, v5, v14, 0x7060302 +; GFX11-NEXT: v_perm_b32 v6, v6, v15, 0x7060302 +; GFX11-NEXT: v_max_f32_e32 v3, v3, v11 +; GFX11-NEXT: v_dual_max_f32 v18, v20, v19 :: v_dual_lshlrev_b32 
v19, 16, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: v_dual_max_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_perm_b32 v3, v3, v17, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_max_f32 v19, v20, v19 :: v_dual_max_f32 v20, v22, v21 +; GFX11-NEXT: v_max_f32_e32 v1, v1, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v2, v2, v18, 0x7060302 +; GFX11-NEXT: v_perm_b32 v0, v0, v20, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v1, v1, v19, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) ret <16 x bfloat> %op @@ -17015,483 +16836,480 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-LABEL: v_maxnum_v32bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v16 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX8-NEXT: v_max_f32_e32 v31, v32, v31 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v31, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; GFX8-NEXT: v_max_f32_e32 v30, v14, v30 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX8-NEXT: v_max_f32_e32 v14, v32, v14 +; GFX8-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX8-NEXT: v_max_f32_e32 v29, v32, v29 +; GFX8-NEXT: v_max_f32_e32 v12, v12, v28 +; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX8-NEXT: v_max_f32_e32 v28, v32, v28 +; GFX8-NEXT: v_max_f32_e32 v11, v11, v27 +; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX8-NEXT: v_max_f32_e32 v27, v32, v27 +; GFX8-NEXT: v_max_f32_e32 v10, v10, v26 +; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX8-NEXT: v_max_f32_e32 v26, v32, v26 +; GFX8-NEXT: v_max_f32_e32 v9, v9, v25 +; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX8-NEXT: v_max_f32_e32 v8, v8, v24 +; GFX8-NEXT: buffer_load_dword v24, off, s[0:3], s32 +; GFX8-NEXT: v_max_f32_e32 v25, v32, v25 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; 
GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16 +; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16 +; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16 +; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16 +; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16 +; GFX8-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v24 +; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX8-NEXT: v_max_f32_e32 v32, v32, v33 +; GFX8-NEXT: v_max_f32_e32 v15, v15, v24 +; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: v_max_f32_e32 v24, v33, v24 +; GFX8-NEXT: v_max_f32_e32 v7, v7, v23 +; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_max_f32_e32 v23, v33, v23 +; GFX8-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: v_max_f32_e32 v22, v33, v22 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v21, v33, v21 +; GFX8-NEXT: v_max_f32_e32 v4, v4, v20 +; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_max_f32_e32 v20, v33, v20 +; GFX8-NEXT: v_max_f32_e32 v3, v3, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_max_f32_e32 v19, v33, v19 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v18 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_max_f32_e32 v18, v33, v18 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v17 -; GFX8-NEXT: v_max_f32_e32 v16, v31, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v16 +; GFX8-NEXT: v_max_f32_e32 v17, v33, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GFX8-NEXT: v_max_f32_e32 v16, v17, v16 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v17 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v19 -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; GFX8-NEXT: v_max_f32_e32 v16, v17, v16 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v19 -; 
GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_max_f32_e32 v3, v3, v17 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v3, v3, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v20 -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4 -; GFX8-NEXT: v_max_f32_e32 v16, v17, v16 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v4, v4, v17 -; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_alignbit_b32 v4, v4, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v21 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; GFX8-NEXT: v_max_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v21 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v18 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v22 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v6 -; GFX8-NEXT: v_max_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v22 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX8-NEXT: v_max_f32_e32 v6, v6, v18 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v6, v6, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v23 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; GFX8-NEXT: v_max_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v23 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX8-NEXT: v_max_f32_e32 v7, v7, v18 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v24 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; GFX8-NEXT: v_max_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v24 -; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX8-NEXT: v_max_f32_e32 v8, v8, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_alignbit_b32 v8, v8, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v25 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; GFX8-NEXT: v_max_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v25 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX8-NEXT: v_max_f32_e32 v9, v9, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX8-NEXT: v_alignbit_b32 v9, v9, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v26 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v10 -; GFX8-NEXT: v_max_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v26 -; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX8-NEXT: v_max_f32_e32 v10, v10, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: v_alignbit_b32 v10, v10, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v27 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v11 -; GFX8-NEXT: v_max_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v27 -; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX8-NEXT: v_max_f32_e32 v11, v11, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: v_alignbit_b32 v11, v11, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v28 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v12 -; GFX8-NEXT: v_max_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v28 -; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX8-NEXT: v_max_f32_e32 v12, v12, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_alignbit_b32 v12, v12, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v29 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; GFX8-NEXT: 
v_max_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v29 -; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX8-NEXT: v_max_f32_e32 v13, v13, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX8-NEXT: v_alignbit_b32 v13, v13, v16, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v30 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v14 -; GFX8-NEXT: v_max_f32_e32 v16, v18, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v30 -; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX8-NEXT: v_max_f32_e32 v14, v14, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX8-NEXT: v_alignbit_b32 v14, v14, v16, 16 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v15 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX8-NEXT: v_max_f32_e32 v15, v15, v17 -; GFX8-NEXT: v_max_f32_e32 v16, v18, v16 ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX8-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16 +; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16 +; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16 +; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16 +; GFX8-NEXT: v_alignbit_b32 v14, v16, v31, 16 +; GFX8-NEXT: v_alignbit_b32 v15, v15, v32, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maxnum_v32bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX9-NEXT: v_max_f32_e32 v31, v32, v31 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v16 +; GFX9-NEXT: v_max_f32_e32 v14, v14, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX9-NEXT: v_max_f32_e32 v30, v32, v30 +; GFX9-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_max_f32_e32 v29, v32, v29 +; GFX9-NEXT: v_max_f32_e32 v12, v12, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_max_f32_e32 v28, v32, v28 +; GFX9-NEXT: v_max_f32_e32 v11, v11, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_max_f32_e32 v27, v32, v27 +; GFX9-NEXT: v_max_f32_e32 v10, v10, v26 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_max_f32_e32 v26, v32, v26 +; GFX9-NEXT: v_max_f32_e32 v9, v9, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24 
+; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_max_f32_e32 v8, v8, v24 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 +; GFX9-NEXT: v_max_f32_e32 v25, v32, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v31, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4 +; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4 +; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4 +; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4 +; GFX9-NEXT: v_perm_b32 v12, v12, v29, s4 +; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4 +; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX9-NEXT: v_max_f32_e32 v32, v32, v33 +; GFX9-NEXT: v_max_f32_e32 v15, v15, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_max_f32_e32 v24, v33, v24 +; GFX9-NEXT: v_max_f32_e32 v7, v7, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_max_f32_e32 v23, v33, v23 +; GFX9-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_max_f32_e32 v22, v33, v22 +; GFX9-NEXT: v_max_f32_e32 v5, v5, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_max_f32_e32 v21, v33, v21 +; GFX9-NEXT: v_max_f32_e32 v4, v4, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_max_f32_e32 v20, v33, v20 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_max_f32_e32 v19, v33, v19 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_max_f32_e32 v16, v31, v16 +; GFX9-NEXT: v_max_f32_e32 v18, v33, v18 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v17 -; GFX9-NEXT: v_perm_b32 v1, v1, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v17 -; GFX9-NEXT: v_perm_b32 v2, v2, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v19 -; GFX9-NEXT: 
v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v17 -; GFX9-NEXT: v_perm_b32 v3, v3, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v20 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 -; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_max_f32_e32 v4, v4, v17 -; GFX9-NEXT: v_perm_b32 v4, v4, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 -; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v21 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_max_f32_e32 v5, v5, v17 -; GFX9-NEXT: v_perm_b32 v5, v5, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v22 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 -; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v22 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_max_f32_e32 v6, v6, v17 -; GFX9-NEXT: v_perm_b32 v6, v6, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v23 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_max_f32_e32 v7, v7, v17 -; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v24 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v8 -; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v24 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_max_f32_e32 v8, v8, v17 -; GFX9-NEXT: v_perm_b32 v8, v8, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v9 -; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v25 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_max_f32_e32 v9, v9, v17 -; GFX9-NEXT: v_perm_b32 v9, v9, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v26 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v10 -; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v26 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_max_f32_e32 v10, v10, v17 -; GFX9-NEXT: v_perm_b32 v10, v10, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v11 -; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v27 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_max_f32_e32 v11, v11, v17 -; GFX9-NEXT: v_perm_b32 v11, v11, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v28 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v12 -; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v28 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_max_f32_e32 v12, v12, v17 -; GFX9-NEXT: v_perm_b32 v12, v12, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v29 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v13 -; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v29 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_max_f32_e32 v13, v13, v17 -; GFX9-NEXT: v_perm_b32 v13, v13, v16, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v30 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v14 -; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v30 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_max_f32_e32 v14, v14, v17 -; GFX9-NEXT: v_perm_b32 v14, v14, v16, s4 -; GFX9-NEXT: 
s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v15 -; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_max_f32_e32 v15, v15, v17 -; GFX9-NEXT: v_perm_b32 v15, v15, v16, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_max_f32_e32 v17, v33, v17 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v16 +; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4 +; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4 +; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4 +; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4 +; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4 +; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4 +; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4 +; GFX9-NEXT: v_perm_b32 v15, v15, v32, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maxnum_v32bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v21 -; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v5 -; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v22 -; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v23 -; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v7 -; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v17 -; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11 +; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX10-NEXT: v_max_f32_e32 v39, v48, v39 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17 +; GFX10-NEXT: v_max_f32_e32 v11, v11, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v18 -; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v19 -; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v3 -; GFX10-NEXT: 
v_and_b32_e32 v19, 0xffff0000, v19 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v20 -; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v21 -; GFX10-NEXT: v_max_f32_e32 v21, v53, v52 -; GFX10-NEXT: v_max_f32_e32 v6, v6, v22 -; GFX10-NEXT: v_max_f32_e32 v22, v55, v54 -; GFX10-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v24 -; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v8 -; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v25 -; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v9 +; GFX10-NEXT: v_max_f32_e32 v49, v50, v49 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16 +; GFX10-NEXT: v_max_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9 ; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v26 -; GFX10-NEXT: v_max_f32_e32 v32, v33, v32 -; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v10 -; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24 +; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8 +; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22 +; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21 +; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_max_f32_e32 v33, v34, v33 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20 +; GFX10-NEXT: v_max_f32_e32 v14, v14, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v35, v36, v35 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19 +; GFX10-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v37, v38, v37 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18 +; GFX10-NEXT: v_max_f32_e32 v12, v12, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v27 -; GFX10-NEXT: v_max_f32_e32 v34, v35, v34 -; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v11 -; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v17 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v28 -; GFX10-NEXT: v_max_f32_e32 v36, v37, v36 -; GFX10-NEXT: 
v_lshlrev_b32_e32 v37, 16, v12 -; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX10-NEXT: v_max_f32_e32 v51, v52, v51 +; GFX10-NEXT: v_max_f32_e32 v9, v9, v25 +; GFX10-NEXT: v_max_f32_e32 v25, v54, v53 +; GFX10-NEXT: v_max_f32_e32 v8, v8, v24 +; GFX10-NEXT: v_max_f32_e32 v24, v64, v55 +; GFX10-NEXT: v_max_f32_e32 v7, v7, v23 +; GFX10-NEXT: v_max_f32_e32 v23, v66, v65 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX10-NEXT: v_max_f32_e32 v22, v68, v67 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v21 +; GFX10-NEXT: v_max_f32_e32 v21, v30, v34 +; GFX10-NEXT: v_max_f32_e32 v29, v29, v36 +; GFX10-NEXT: v_max_f32_e32 v28, v28, v38 +; GFX10-NEXT: v_max_f32_e32 v27, v27, v48 +; GFX10-NEXT: v_max_f32_e32 v26, v26, v50 ; GFX10-NEXT: v_max_f32_e32 v2, v2, v18 -; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; GFX10-NEXT: v_max_f32_e32 v38, v39, v38 -; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v13 -; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX10-NEXT: v_max_f32_e32 v3, v3, v19 -; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v30 -; GFX10-NEXT: v_max_f32_e32 v48, v49, v48 -; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v14 -; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX10-NEXT: v_max_f32_e32 v4, v4, v20 -; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v15 -; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX10-NEXT: v_perm_b32 v6, v6, v21, 0x7060302 -; GFX10-NEXT: v_perm_b32 v7, v7, v22, 0x7060302 -; GFX10-NEXT: v_max_f32_e32 v50, v51, v50 -; GFX10-NEXT: v_max_f32_e32 v23, v65, v64 -; GFX10-NEXT: v_max_f32_e32 v8, v8, v24 -; GFX10-NEXT: v_max_f32_e32 v24, v67, v66 -; GFX10-NEXT: v_max_f32_e32 v9, v9, v25 -; GFX10-NEXT: v_max_f32_e32 v25, v33, v68 -; GFX10-NEXT: v_max_f32_e32 v10, v10, v26 -; GFX10-NEXT: v_max_f32_e32 v16, v35, v16 -; GFX10-NEXT: v_max_f32_e32 v11, v11, v27 -; GFX10-NEXT: v_max_f32_e32 v17, v37, v17 -; GFX10-NEXT: v_max_f32_e32 v12, v12, v28 -; GFX10-NEXT: v_max_f32_e32 v18, v39, v18 -; GFX10-NEXT: v_max_f32_e32 v13, v13, v29 -; GFX10-NEXT: v_max_f32_e32 v19, v49, v19 -; GFX10-NEXT: v_max_f32_e32 v14, v14, v30 -; GFX10-NEXT: v_perm_b32 v0, v0, v32, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v34, 0x7060302 -; GFX10-NEXT: v_perm_b32 v2, v2, v36, 0x7060302 -; GFX10-NEXT: v_perm_b32 v3, v3, v38, 0x7060302 -; GFX10-NEXT: v_perm_b32 v4, v4, v48, 0x7060302 -; GFX10-NEXT: v_perm_b32 v5, v5, v50, 0x7060302 -; GFX10-NEXT: v_perm_b32 v8, v8, v23, 0x7060302 -; GFX10-NEXT: v_perm_b32 v9, v9, v24, 0x7060302 -; GFX10-NEXT: v_perm_b32 v10, v10, v25, 0x7060302 -; GFX10-NEXT: v_perm_b32 v11, v11, v16, 0x7060302 -; GFX10-NEXT: v_perm_b32 v12, v12, v17, 0x7060302 -; GFX10-NEXT: v_perm_b32 v13, v13, v18, 0x7060302 -; GFX10-NEXT: v_perm_b32 v14, v14, v19, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v27, 0x7060302 +; GFX10-NEXT: v_perm_b32 v0, v0, v26, 0x7060302 +; GFX10-NEXT: v_perm_b32 v2, v2, v28, 0x7060302 +; GFX10-NEXT: v_perm_b32 v3, v3, v29, 0x7060302 +; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 +; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 +; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 +; GFX10-NEXT: v_perm_b32 v9, v9, v51, 0x7060302 +; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302 +; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302 +; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302 +; GFX10-NEXT: v_perm_b32 v13, v13, v35, 0x7060302 +; GFX10-NEXT: 
v_perm_b32 v14, v14, v33, 0x7060302 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v31 -; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v31 -; GFX10-NEXT: v_max_f32_e32 v20, v20, v21 -; GFX10-NEXT: v_max_f32_e32 v15, v15, v22 -; GFX10-NEXT: v_perm_b32 v15, v15, v20, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v31 +; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v31 +; GFX10-NEXT: v_max_f32_e32 v16, v32, v16 +; GFX10-NEXT: v_max_f32_e32 v15, v15, v17 +; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maxnum_v32bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v26 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v10 -; GFX11-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v27 -; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v21 -; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v23 -; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v24 -; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v8 -; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v5 -; GFX11-NEXT: v_dual_max_f32 v10, v10, v26 :: v_dual_and_b32 v5, 0xffff0000, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22 +; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11 +; GFX11-NEXT: v_dual_max_f32 v0, v0, v16 :: v_dual_and_b32 v11, 0xffff0000, v11 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v28 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-NEXT: v_and_b32_e32 v2, 
0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24 +; GFX11-NEXT: v_dual_max_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24 +; GFX11-NEXT: v_dual_max_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10 +; GFX11-NEXT: v_dual_max_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10 +; GFX11-NEXT: v_dual_max_f32 v2, v2, v18 :: v_dual_max_f32 v3, v3, v19 +; GFX11-NEXT: v_dual_max_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v49, 16, v26 +; GFX11-NEXT: v_dual_max_f32 v9, v9, v25 :: v_dual_and_b32 v26, 0xffff0000, v26 +; GFX11-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX11-NEXT: v_dual_max_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v37, 16, v28 ; GFX11-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v29 -; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_max_f32_e32 v10, v10, v26 +; GFX11-NEXT: v_max_f32_e32 v26, v52, v51 +; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 +; GFX11-NEXT: v_max_f32_e32 v25, v54, v53 +; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 +; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v31 ; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v31 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13 ; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v30 -; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v14 -; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX11-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v9 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v20 -; GFX11-NEXT: v_dual_max_f32 v11, v11, v27 :: v_dual_and_b32 v20, 0xffff0000, v20 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-NEXT: v_dual_max_f32 v26, v71, v70 :: v_dual_lshlrev_b32 v49, 16, v4 -; GFX11-NEXT: v_dual_max_f32 v13, v13, v29 :: v_dual_and_b32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v27 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_max_f32_e32 v4, v4, v20 -; GFX11-NEXT: v_dual_max_f32 v8, v8, v24 :: v_dual_max_f32 v9, v9, v25 -; GFX11-NEXT: v_max_f32_e32 v25, v69, v68 -; GFX11-NEXT: v_dual_max_f32 v20, v51, v50 :: v_dual_lshlrev_b32 v39, 16, v3 -; GFX11-NEXT: v_max_f32_e32 v27, v81, v80 -; GFX11-NEXT: v_max_f32_e32 v12, v12, v28 -; GFX11-NEXT: v_dual_max_f32 v28, v83, v82 :: v_dual_max_f32 v29, v85, v84 -; GFX11-NEXT: v_dual_max_f32 v6, v6, v22 :: v_dual_and_b32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_max_f32_e32 v22, v55, v54 -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v18 -; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v17 -; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-NEXT: v_dual_max_f32 v8, v8, v24 :: v_dual_and_b32 v27, 0xffff0000, v27 +; GFX11-NEXT: v_max_f32_e32 v24, v64, v55 +; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; 
GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; GFX11-NEXT: v_max_f32_e32 v7, v7, v23 +; GFX11-NEXT: v_max_f32_e32 v23, v66, v65 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_max_f32 v12, v12, v28 :: v_dual_and_b32 v29, 0xffff0000, v29 +; GFX11-NEXT: v_dual_max_f32 v28, v48, v39 :: v_dual_lshlrev_b32 v33, 16, v30 +; GFX11-NEXT: v_dual_max_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX11-NEXT: v_dual_max_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14 +; GFX11-NEXT: v_dual_max_f32 v27, v50, v49 :: v_dual_and_b32 v30, 0xffff0000, v30 +; GFX11-NEXT: v_max_f32_e32 v29, v38, v37 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-NEXT: v_max_f32_e32 v37, v86, v85 +; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 ; GFX11-NEXT: v_max_f32_e32 v14, v14, v30 -; GFX11-NEXT: v_dual_max_f32 v7, v7, v23 :: v_dual_and_b32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_max_f32_e32 v23, v65, v64 -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX11-NEXT: v_dual_max_f32 v24, v67, v66 :: v_dual_and_b32 v21, 0xffff0000, v21 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v18 -; GFX11-NEXT: v_dual_max_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v32, 16, v16 -; GFX11-NEXT: v_max_f32_e32 v18, v39, v38 -; GFX11-NEXT: v_dual_max_f32 v3, v3, v19 :: v_dual_and_b32 v16, 0xffff0000, v16 -; GFX11-NEXT: v_max_f32_e32 v19, v49, v48 -; GFX11-NEXT: v_max_f32_e32 v17, v37, v36 -; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v0 -; GFX11-NEXT: v_dual_max_f32 v5, v5, v21 :: v_dual_and_b32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_max_f32_e32 v21, v53, v52 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v2, v2, v17, 0x7060302 -; GFX11-NEXT: v_perm_b32 v3, v3, v18, 0x7060302 -; GFX11-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX11-NEXT: v_max_f32_e32 v16, v35, v34 -; GFX11-NEXT: v_max_f32_e32 v32, v33, v32 -; GFX11-NEXT: v_perm_b32 v4, v4, v19, 0x7060302 -; GFX11-NEXT: v_perm_b32 v5, v5, v20, 0x7060302 -; GFX11-NEXT: v_perm_b32 v6, v6, v21, 0x7060302 -; GFX11-NEXT: v_perm_b32 v1, v1, v16, 0x7060302 -; GFX11-NEXT: v_perm_b32 v0, v0, v32, 0x7060302 -; GFX11-NEXT: v_perm_b32 v7, v7, v22, 0x7060302 -; GFX11-NEXT: v_perm_b32 v8, v8, v23, 0x7060302 -; GFX11-NEXT: v_perm_b32 v9, v9, v24, 0x7060302 -; GFX11-NEXT: v_perm_b32 v10, v10, v25, 0x7060302 -; GFX11-NEXT: v_perm_b32 v11, v11, v26, 0x7060302 -; GFX11-NEXT: v_perm_b32 v12, v12, v27, 0x7060302 -; GFX11-NEXT: v_perm_b32 v13, v13, v28, 0x7060302 -; GFX11-NEXT: v_perm_b32 v14, v14, v29, 0x7060302 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v31 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v16, v86, v16 :: v_dual_and_b32 v17, 0xffff0000, v31 -; GFX11-NEXT: v_max_f32_e32 v15, v15, v17 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_max_f32 v30, v36, v35 :: v_dual_max_f32 v33, v34, v33 +; GFX11-NEXT: v_dual_max_f32 v34, v80, v71 :: v_dual_max_f32 v35, v82, v81 +; GFX11-NEXT: v_max_f32_e32 v36, v84, v83 +; GFX11-NEXT: v_dual_max_f32 v16, v32, v16 :: v_dual_max_f32 v15, v15, v17 +; GFX11-NEXT: v_perm_b32 v0, v0, v37, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v2, v2, v35, 0x7060302 +; GFX11-NEXT: v_perm_b32 v1, v1, v36, 0x7060302 +; 
GFX11-NEXT: v_perm_b32 v3, v3, v34, 0x7060302 +; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 +; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 +; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302 +; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302 +; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302 +; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302 +; GFX11-NEXT: v_perm_b32 v14, v14, v33, 0x7060302 ; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) @@ -21043,13 +20861,13 @@ define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) { ; GFX8-LABEL: v_fptosi_v3bf16_to_v3i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_cvt_i32_f32_sdwa v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GFX8-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX8-NEXT: v_cvt_i32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fptosi_v3bf16_to_v3i16: @@ -21140,65 +20958,65 @@ define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) { ; GFX8-LABEL: v_fptosi_v4bf16_to_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_cvt_i32_f32_sdwa v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GFX8-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX8-NEXT: v_cvt_i32_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GFX8-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX8-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX8-NEXT: v_cvt_i32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX8-NEXT: v_cvt_i32_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fptosi_v4bf16_to_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: 
v_cvt_i32_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fptosi_v4bf16_to_v4i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 -; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fptosi_v4bf16_to_v4i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fptosi <4 x bfloat> %x to <4 x i16> ret <4 x i16> %op @@ -22538,13 +22356,11 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX8-NEXT: v_cvt_f32_i32_sdwa v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_v3i16_to_v3bf16: @@ -22552,11 +22368,10 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v3i16_to_v3bf16: @@ -22564,29 +22379,10 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX10-NEXT: v_cvt_f32_i32_sdwa v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_sitofp_v3i16_to_v3bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_ashrrev_i32_e32 v2, 16, v0 -; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX11-NEXT: v_ashrrev_i32_e32 v3, 16, v1 -; GFX11-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 -; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX11-NEXT: s_setpc_b64 s[30:31] %op = sitofp <3 x i16> %x to <3 x bfloat> ret <3 x bfloat> %op } @@ -22629,55 +22425,55 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX8-LABEL: v_sitofp_v4i16_to_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX8-NEXT: v_cvt_f32_i32_sdwa v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v0, v3, v0, 16 ; GFX8-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_v4i16_to_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v4i16_to_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX10-NEXT: v_cvt_f32_i32_sdwa v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX10-NEXT: v_perm_b32 v0, v3, v0, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_sitofp_v4i16_to_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_ashrrev_i32_e32 v2, 16, v0 +; GFX11-NEXT: v_ashrrev_i32_e32 v2, 16, v1 +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 16, v0 ; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX11-NEXT: v_ashrrev_i32_e32 v3, 16, v1 ; GFX11-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 -; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v3, v0, 0x7060302 +; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = sitofp <4 x i16> %x to <4 x bfloat> ret <4 x bfloat> %op @@ -22813,12 +22609,12 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX8-LABEL: v_sitofp_v3i32_to_v3bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX8-NEXT: v_cvt_f32_i32_e32 v3, v1 +; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v0, v2, 
v0, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_v3i32_to_v3bf16: @@ -22875,22 +22671,22 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX8-LABEL: v_sitofp_v4i32_to_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cvt_f32_i32_e32 v3, v3 ; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_i32_e32 v3, v3 ; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX8-NEXT: v_alignbit_b32 v1, v3, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_v4i32_to_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v3 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 @@ -22900,9 +22696,9 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX10-LABEL: v_sitofp_v4i32_to_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f32_i32_e32 v3, v3 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 @@ -22911,11 +22707,11 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX11-LABEL: v_sitofp_v4i32_to_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -23330,130 +23126,130 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX8-LABEL: v_sitofp_v3i64_to_v3bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v7, v0, v1 -; GFX8-NEXT: v_ffbh_i32_e32 v6, v1 +; GFX8-NEXT: v_xor_b32_e32 v7, v4, v5 +; GFX8-NEXT: v_ffbh_i32_e32 v6, v5 ; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v7 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, -1, v6 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 32, v7 ; GFX8-NEXT: v_min_u32_e32 v6, v6, v7 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] -; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, v2, v3 -; GFX8-NEXT: v_cvt_f32_i32_e32 v7, v0 -; GFX8-NEXT: v_ffbh_i32_e32 v0, v3 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v0 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v1 -; GFX8-NEXT: v_min_u32_e32 v8, v0, v1 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6 -; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] +; GFX8-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; 
GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v6 +; GFX8-NEXT: v_ldexp_f32 v6, v4, v5 +; GFX8-NEXT: v_xor_b32_e32 v5, v0, v1 +; GFX8-NEXT: v_ffbh_i32_e32 v4, v1 +; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v5 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v5 +; GFX8-NEXT: v_min_u32_e32 v7, v4, v5 +; GFX8-NEXT: v_lshlrev_b64 v[4:5], v7, v[0:1] +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX8-NEXT: v_min_u32_e32 v0, 1, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: v_xor_b32_e32 v5, v2, v3 +; GFX8-NEXT: v_ffbh_i32_e32 v4, v3 +; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v5 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v5 +; GFX8-NEXT: v_min_u32_e32 v4, v4, v5 +; GFX8-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v8 -; GFX8-NEXT: v_ldexp_f32 v2, v7, v2 -; GFX8-NEXT: v_ldexp_f32 v3, v0, v1 -; GFX8-NEXT: v_xor_b32_e32 v1, v4, v5 -; GFX8-NEXT: v_ffbh_i32_e32 v0, v5 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v0 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v1 -; GFX8-NEXT: v_min_u32_e32 v6, v0, v1 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v3, v2, 16 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6 -; GFX8-NEXT: v_ldexp_f32 v1, v1, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v4 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v7 +; GFX8-NEXT: v_ldexp_f32 v2, v2, v3 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_v3i64_to_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v7, v4, v5 +; GFX9-NEXT: v_ffbh_i32_e32 v6, v5 +; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v7 +; GFX9-NEXT: v_add_u32_e32 v6, -1, v6 +; GFX9-NEXT: v_add_u32_e32 v7, 32, v7 +; GFX9-NEXT: v_min_u32_e32 v6, v6, v7 +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] ; GFX9-NEXT: v_xor_b32_e32 v7, v0, v1 +; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6 ; GFX9-NEXT: v_ffbh_i32_e32 v6, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v7 ; GFX9-NEXT: v_add_u32_e32 v6, -1, v6 ; GFX9-NEXT: v_add_u32_e32 v7, 32, v7 +; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4 ; GFX9-NEXT: v_min_u32_e32 v6, v6, v7 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] -; GFX9-NEXT: v_sub_u32_e32 v6, 32, v6 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX9-NEXT: v_ldexp_f32 v4, v4, v5 +; GFX9-NEXT: v_or_b32_e32 v5, v1, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v7, v0 ; GFX9-NEXT: v_ffbh_i32_e32 v0, v3 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GFX9-NEXT: v_add_u32_e32 v0, -1, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 32, v1 -; GFX9-NEXT: v_min_u32_e32 v8, v0, v1 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] -; GFX9-NEXT: v_ldexp_f32 v2, v7, v6 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, v4, v5 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v0 -; 
GFX9-NEXT: v_ffbh_i32_e32 v0, v5 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GFX9-NEXT: v_add_u32_e32 v0, -1, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 32, v1 ; GFX9-NEXT: v_min_u32_e32 v7, v0, v1 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; GFX9-NEXT: v_sub_u32_e32 v6, 32, v8 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v5 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GFX9-NEXT: v_ldexp_f32 v3, v3, v6 -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v3, v2, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX9-NEXT: v_sub_u32_e32 v1, 32, v6 +; GFX9-NEXT: v_ldexp_f32 v1, v2, v1 ; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7 -; GFX9-NEXT: v_ldexp_f32 v1, v1, v2 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX9-NEXT: v_alignbit_b32 v1, s4, v4, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v3i64_to_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_xor_b32_e32 v7, v0, v1 -; GFX10-NEXT: v_xor_b32_e32 v8, v2, v3 -; GFX10-NEXT: v_xor_b32_e32 v9, v4, v5 -; GFX10-NEXT: v_ffbh_i32_e32 v6, v1 -; GFX10-NEXT: v_ffbh_i32_e32 v10, v3 +; GFX10-NEXT: v_xor_b32_e32 v8, v4, v5 +; GFX10-NEXT: v_xor_b32_e32 v9, v2, v3 +; GFX10-NEXT: v_ffbh_i32_e32 v6, v5 +; GFX10-NEXT: v_ffbh_i32_e32 v10, v1 ; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v7 -; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; GFX10-NEXT: v_ffbh_i32_e32 v11, v5 +; GFX10-NEXT: v_ffbh_i32_e32 v11, v3 ; GFX10-NEXT: v_ashrrev_i32_e32 v9, 31, v9 +; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v8 ; GFX10-NEXT: v_add_nc_u32_e32 v6, -1, v6 -; GFX10-NEXT: v_add_nc_u32_e32 v7, 32, v7 ; GFX10-NEXT: v_add_nc_u32_e32 v10, -1, v10 -; GFX10-NEXT: v_add_nc_u32_e32 v8, 32, v8 +; GFX10-NEXT: v_add_nc_u32_e32 v7, 32, v7 ; GFX10-NEXT: v_add_nc_u32_e32 v11, -1, v11 ; GFX10-NEXT: v_add_nc_u32_e32 v9, 32, v9 -; GFX10-NEXT: v_min_u32_e32 v6, v6, v7 -; GFX10-NEXT: v_min_u32_e32 v7, v10, v8 -; GFX10-NEXT: v_min_u32_e32 v8, v11, v9 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] +; GFX10-NEXT: v_add_nc_u32_e32 v8, 32, v8 +; GFX10-NEXT: v_min_u32_e32 v7, v10, v7 +; GFX10-NEXT: v_min_u32_e32 v9, v11, v9 +; GFX10-NEXT: v_min_u32_e32 v6, v6, v8 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v9, v[2:3] +; GFX10-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] ; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v6 -; GFX10-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3] -; GFX10-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX10-NEXT: v_min_u32_e32 v2, 1, v2 ; GFX10-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX10-NEXT: v_or_b32_e32 v1, v3, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v7 ; GFX10-NEXT: v_or_b32_e32 v2, v5, v4 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, 32, v8 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v7 +; GFX10-NEXT: v_sub_nc_u32_e32 v4, 32, v9 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX10-NEXT: v_ldexp_f32 v0, v0, v6 -; GFX10-NEXT: v_ldexp_f32 v1, v1, v3 -; GFX10-NEXT: v_ldexp_f32 v2, v2, v4 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v3 +; GFX10-NEXT: v_ldexp_f32 v1, v1, v4 +; GFX10-NEXT: v_ldexp_f32 v2, v2, v6 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX10-NEXT: v_alignbit_b32 v1, s4, v2, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -23579,236 +23375,231 @@ define <4 x bfloat> 
@v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-LABEL: v_sitofp_v4i64_to_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v9, v0, v1 -; GFX8-NEXT: v_ffbh_i32_e32 v8, v1 +; GFX8-NEXT: v_xor_b32_e32 v9, v4, v5 +; GFX8-NEXT: v_ffbh_i32_e32 v8, v5 ; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v9 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, -1, v8 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 32, v9 ; GFX8-NEXT: v_min_u32_e32 v8, v8, v9 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] +; GFX8-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: v_xor_b32_e32 v5, v6, v7 +; GFX8-NEXT: v_cvt_f32_i32_e32 v9, v4 +; GFX8-NEXT: v_ffbh_i32_e32 v4, v7 +; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v5 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v5 +; GFX8-NEXT: v_min_u32_e32 v10, v4, v5 +; GFX8-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7] +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v8 +; GFX8-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX8-NEXT: v_ldexp_f32 v5, v9, v6 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v10 +; GFX8-NEXT: v_xor_b32_e32 v7, v0, v1 +; GFX8-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX8-NEXT: v_ffbh_i32_e32 v6, v1 +; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v7 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, -1, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 32, v7 +; GFX8-NEXT: v_min_u32_e32 v6, v6, v7 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_xor_b32_e32 v1, v2, v3 -; GFX8-NEXT: v_cvt_f32_i32_e32 v9, v0 +; GFX8-NEXT: v_cvt_f32_i32_e32 v7, v0 ; GFX8-NEXT: v_ffbh_i32_e32 v0, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v1 -; GFX8-NEXT: v_min_u32_e32 v10, v0, v1 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v10, v[2:3] -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v8 +; GFX8-NEXT: v_min_u32_e32 v8, v0, v1 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX8-NEXT: v_ldexp_f32 v3, v9, v2 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v10 -; GFX8-NEXT: v_xor_b32_e32 v2, v4, v5 -; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX8-NEXT: v_ffbh_i32_e32 v1, v5 -; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, -1, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v2 -; GFX8-NEXT: v_min_u32_e32 v8, v1, v2 -; GFX8-NEXT: v_lshlrev_b64 v[1:2], v8, v[4:5] +; GFX8-NEXT: v_ldexp_f32 v1, v7, v2 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v8 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_min_u32_e32 v1, 1, v1 -; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX8-NEXT: v_xor_b32_e32 v2, v6, v7 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 -; GFX8-NEXT: v_cvt_f32_i32_e32 v3, v1 -; GFX8-NEXT: v_ffbh_i32_e32 v1, v7 -; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, -1, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v2 -; GFX8-NEXT: v_min_u32_e32 v4, v1, v2 -; GFX8-NEXT: v_lshlrev_b64 v[1:2], v4, v[6:7] -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v8 -; GFX8-NEXT: v_min_u32_e32 v1, 1, v1 -; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX8-NEXT: v_ldexp_f32 v2, v3, v5 -; GFX8-NEXT: 
v_sub_u32_e32 v3, vcc, 32, v4 -; GFX8-NEXT: v_ldexp_f32 v1, v1, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX8-NEXT: v_alignbit_b32 v1, v4, v5, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_v4i64_to_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v9, v0, v1 -; GFX9-NEXT: v_ffbh_i32_e32 v8, v1 +; GFX9-NEXT: v_xor_b32_e32 v9, v4, v5 +; GFX9-NEXT: v_ffbh_i32_e32 v8, v5 ; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v9 ; GFX9-NEXT: v_add_u32_e32 v8, -1, v8 ; GFX9-NEXT: v_add_u32_e32 v9, 32, v9 ; GFX9-NEXT: v_min_u32_e32 v8, v8, v9 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] +; GFX9-NEXT: v_sub_u32_e32 v8, 32, v8 +; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX9-NEXT: v_xor_b32_e32 v5, v6, v7 +; GFX9-NEXT: v_cvt_f32_i32_e32 v9, v4 +; GFX9-NEXT: v_ffbh_i32_e32 v4, v7 +; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5 +; GFX9-NEXT: v_add_u32_e32 v4, -1, v4 +; GFX9-NEXT: v_add_u32_e32 v5, 32, v5 +; GFX9-NEXT: v_min_u32_e32 v10, v4, v5 +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7] +; GFX9-NEXT: v_ldexp_f32 v6, v9, v8 +; GFX9-NEXT: v_xor_b32_e32 v8, v0, v1 +; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX9-NEXT: v_ffbh_i32_e32 v7, v1 +; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v8 +; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX9-NEXT: v_add_u32_e32 v7, -1, v7 +; GFX9-NEXT: v_add_u32_e32 v8, 32, v8 +; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX9-NEXT: v_min_u32_e32 v7, v7, v8 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] +; GFX9-NEXT: v_sub_u32_e32 v5, 32, v10 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v9, v1, v0 +; GFX9-NEXT: v_ldexp_f32 v4, v4, v5 +; GFX9-NEXT: v_or_b32_e32 v5, v1, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3 ; GFX9-NEXT: v_ffbh_i32_e32 v0, v3 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GFX9-NEXT: v_add_u32_e32 v0, -1, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 32, v1 -; GFX9-NEXT: v_min_u32_e32 v10, v0, v1 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v10, v[2:3] -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v9 +; GFX9-NEXT: v_min_u32_e32 v8, v0, v1 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v5 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v8 -; GFX9-NEXT: v_ldexp_f32 v3, v2, v1 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v10 -; GFX9-NEXT: v_xor_b32_e32 v2, v4, v5 -; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX9-NEXT: v_ffbh_i32_e32 v1, v5 -; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v2 -; GFX9-NEXT: v_add_u32_e32 v1, -1, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 32, v2 -; GFX9-NEXT: v_min_u32_e32 v8, v1, v2 -; GFX9-NEXT: v_lshlrev_b64 v[1:2], v8, v[4:5] -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_min_u32_e32 v1, 1, v1 -; GFX9-NEXT: v_or_b32_e32 v3, v2, v1 -; GFX9-NEXT: v_xor_b32_e32 v2, v6, v7 -; GFX9-NEXT: v_ffbh_i32_e32 v1, v7 -; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v2 -; GFX9-NEXT: v_add_u32_e32 v1, -1, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 32, v2 -; GFX9-NEXT: v_min_u32_e32 v4, v1, v2 -; GFX9-NEXT: v_lshlrev_b64 v[1:2], v4, v[6:7] -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GFX9-NEXT: v_min_u32_e32 v1, 1, v1 -; GFX9-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7 +; GFX9-NEXT: v_ldexp_f32 v1, v2, v1 ; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8 -; 
GFX9-NEXT: v_ldexp_f32 v2, v3, v2 -; GFX9-NEXT: v_sub_u32_e32 v3, 32, v4 -; GFX9-NEXT: v_ldexp_f32 v1, v1, v3 -; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX9-NEXT: v_perm_b32 v1, v4, v6, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v4i64_to_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_xor_b32_e32 v8, v0, v1 -; GFX10-NEXT: v_ffbh_i32_e32 v9, v1 -; GFX10-NEXT: v_ffbh_i32_e32 v10, v3 -; GFX10-NEXT: v_xor_b32_e32 v11, v2, v3 -; GFX10-NEXT: v_xor_b32_e32 v13, v4, v5 +; GFX10-NEXT: v_xor_b32_e32 v8, v4, v5 +; GFX10-NEXT: v_ffbh_i32_e32 v9, v5 +; GFX10-NEXT: v_xor_b32_e32 v11, v6, v7 +; GFX10-NEXT: v_xor_b32_e32 v13, v0, v1 +; GFX10-NEXT: v_xor_b32_e32 v14, v2, v3 ; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v8 ; GFX10-NEXT: v_add_nc_u32_e32 v9, -1, v9 -; GFX10-NEXT: v_xor_b32_e32 v15, v6, v7 -; GFX10-NEXT: v_ffbh_i32_e32 v12, v5 -; GFX10-NEXT: v_ffbh_i32_e32 v14, v7 -; GFX10-NEXT: v_add_nc_u32_e32 v8, 32, v8 +; GFX10-NEXT: v_ffbh_i32_e32 v10, v7 +; GFX10-NEXT: v_ffbh_i32_e32 v12, v1 ; GFX10-NEXT: v_ashrrev_i32_e32 v11, 31, v11 +; GFX10-NEXT: v_add_nc_u32_e32 v8, 32, v8 +; GFX10-NEXT: v_ashrrev_i32_e32 v14, 31, v14 +; GFX10-NEXT: v_add_nc_u32_e32 v10, -1, v10 ; GFX10-NEXT: v_add_nc_u32_e32 v12, -1, v12 -; GFX10-NEXT: v_add_nc_u32_e32 v14, -1, v14 -; GFX10-NEXT: v_min_u32_e32 v8, v9, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v9, -1, v10 -; GFX10-NEXT: v_ashrrev_i32_e32 v10, 31, v13 -; GFX10-NEXT: v_ashrrev_i32_e32 v13, 31, v15 ; GFX10-NEXT: v_add_nc_u32_e32 v11, 32, v11 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] -; GFX10-NEXT: v_add_nc_u32_e32 v10, 32, v10 -; GFX10-NEXT: v_add_nc_u32_e32 v13, 32, v13 -; GFX10-NEXT: v_min_u32_e32 v9, v9, v11 +; GFX10-NEXT: v_min_u32_e32 v8, v9, v8 +; GFX10-NEXT: v_ashrrev_i32_e32 v9, 31, v13 +; GFX10-NEXT: v_ffbh_i32_e32 v13, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v14, 32, v14 +; GFX10-NEXT: v_min_u32_e32 v10, v10, v11 +; GFX10-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] +; GFX10-NEXT: v_add_nc_u32_e32 v9, 32, v9 +; GFX10-NEXT: v_add_nc_u32_e32 v13, -1, v13 +; GFX10-NEXT: v_lshlrev_b64 v[6:7], v10, v[6:7] +; GFX10-NEXT: v_min_u32_e32 v9, v12, v9 +; GFX10-NEXT: v_min_u32_e32 v11, v13, v14 +; GFX10-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX10-NEXT: v_min_u32_e32 v6, 1, v6 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3] +; GFX10-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX10-NEXT: v_or_b32_e32 v5, v7, v6 +; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v11 ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX10-NEXT: v_min_u32_e32 v10, v12, v10 -; GFX10-NEXT: v_min_u32_e32 v11, v14, v13 -; GFX10-NEXT: v_lshlrev_b64 v[2:3], v9, v[2:3] -; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX10-NEXT: v_lshlrev_b64 v[4:5], v10, v[4:5] -; GFX10-NEXT: v_lshlrev_b64 v[6:7], v11, v[6:7] ; GFX10-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX10-NEXT: v_min_u32_e32 v1, 1, v4 -; GFX10-NEXT: v_min_u32_e32 v4, 1, v6 +; GFX10-NEXT: v_sub_nc_u32_e32 v7, 32, v10 +; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX10-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v4 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v8 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v10 -; GFX10-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX10-NEXT: v_or_b32_e32 v4, v7, v4 -; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX10-NEXT: v_cvt_f32_i32_e32 v4, v5 +; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX10-NEXT: 
v_sub_nc_u32_e32 v5, 32, v9 -; GFX10-NEXT: v_sub_nc_u32_e32 v7, 32, v11 -; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX10-NEXT: v_cvt_f32_i32_e32 v4, v4 -; GFX10-NEXT: v_ldexp_f32 v0, v0, v3 -; GFX10-NEXT: v_ldexp_f32 v2, v2, v5 -; GFX10-NEXT: v_ldexp_f32 v1, v1, v6 +; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX10-NEXT: v_ldexp_f32 v1, v1, v3 ; GFX10-NEXT: v_ldexp_f32 v3, v4, v7 -; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v5 +; GFX10-NEXT: v_ldexp_f32 v2, v2, v6 ; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_sitofp_v4i64_to_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v8, v0, v1 -; GFX11-NEXT: v_cls_i32_e32 v9, v1 -; GFX11-NEXT: v_cls_i32_e32 v10, v3 -; GFX11-NEXT: v_xor_b32_e32 v11, v2, v3 -; GFX11-NEXT: v_xor_b32_e32 v13, v4, v5 +; GFX11-NEXT: v_xor_b32_e32 v8, v4, v5 +; GFX11-NEXT: v_cls_i32_e32 v9, v5 +; GFX11-NEXT: v_xor_b32_e32 v11, v6, v7 +; GFX11-NEXT: v_xor_b32_e32 v13, v0, v1 +; GFX11-NEXT: v_xor_b32_e32 v14, v2, v3 ; GFX11-NEXT: v_ashrrev_i32_e32 v8, 31, v8 ; GFX11-NEXT: v_add_nc_u32_e32 v9, -1, v9 -; GFX11-NEXT: v_xor_b32_e32 v15, v6, v7 -; GFX11-NEXT: v_cls_i32_e32 v12, v5 -; GFX11-NEXT: v_cls_i32_e32 v14, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 32, v8 +; GFX11-NEXT: v_cls_i32_e32 v10, v7 +; GFX11-NEXT: v_cls_i32_e32 v12, v1 ; GFX11-NEXT: v_ashrrev_i32_e32 v11, 31, v11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v8, 32, v8 +; GFX11-NEXT: v_ashrrev_i32_e32 v14, 31, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v10, -1, v10 ; GFX11-NEXT: v_add_nc_u32_e32 v12, -1, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v14, -1, v14 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_min_u32_e32 v8, v9, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v9, -1, v10 -; GFX11-NEXT: v_ashrrev_i32_e32 v10, 31, v13 -; GFX11-NEXT: v_ashrrev_i32_e32 v13, 31, v15 ; GFX11-NEXT: v_add_nc_u32_e32 v11, 32, v11 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] +; GFX11-NEXT: v_min_u32_e32 v8, v9, v8 +; GFX11-NEXT: v_ashrrev_i32_e32 v9, 31, v13 +; GFX11-NEXT: v_cls_i32_e32 v13, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 32, v14 +; GFX11-NEXT: v_min_u32_e32 v10, v10, v11 +; GFX11-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] +; GFX11-NEXT: v_add_nc_u32_e32 v9, 32, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v13, -1, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b64 v[6:7], v10, v[6:7] +; GFX11-NEXT: v_min_u32_e32 v9, v12, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_min_u32_e32 v11, v13, v14 +; GFX11-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX11-NEXT: v_min_u32_e32 v6, 1, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v10, 32, v10 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 32, v13 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_min_u32_e32 v9, v9, v11 +; GFX11-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX11-NEXT: v_or_b32_e32 v5, v7, v6 +; GFX11-NEXT: v_sub_nc_u32_e32 v6, 32, v11 ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_min_u32_e32 v10, v12, v10 -; 
GFX11-NEXT: v_min_u32_e32 v11, v14, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[2:3], v9, v[2:3] -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[4:5], v10, v[4:5] -; GFX11-NEXT: v_lshlrev_b64 v[6:7], v11, v[6:7] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_min_u32_e32 v1, 1, v4 -; GFX11-NEXT: v_min_u32_e32 v4, 1, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_sub_nc_u32_e32 v7, 32, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v4 ; GFX11-NEXT: v_sub_nc_u32_e32 v3, 32, v8 -; GFX11-NEXT: v_sub_nc_u32_e32 v6, 32, v10 -; GFX11-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX11-NEXT: v_or_b32_e32 v4, v7, v4 -; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX11-NEXT: v_cvt_f32_i32_e32 v4, v5 +; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v9 -; GFX11-NEXT: v_sub_nc_u32_e32 v7, 32, v11 -; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX11-NEXT: v_cvt_f32_i32_e32 v4, v4 -; GFX11-NEXT: v_ldexp_f32 v0, v0, v3 -; GFX11-NEXT: v_ldexp_f32 v2, v2, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_ldexp_f32 v1, v1, v6 +; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX11-NEXT: v_ldexp_f32 v1, v1, v3 ; GFX11-NEXT: v_ldexp_f32 v3, v4, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_ldexp_f32 v0, v0, v5 +; GFX11-NEXT: v_ldexp_f32 v2, v2, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 ; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = sitofp <4 x i64> %x to <4 x bfloat> ret <4 x bfloat> %op @@ -23961,13 +23752,11 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX8-NEXT: v_cvt_f32_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_v3i16_to_v3bf16: @@ -23975,11 +23764,10 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_cvt_f32_u32_sdwa v3, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v3i16_to_v3bf16: @@ -23987,29 +23775,10 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f32_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 +; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_uitofp_v3i16_to_v3bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 -; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 -; GFX11-NEXT: s_setpc_b64 s[30:31] %op = uitofp <3 x i16> %x to <3 x bfloat> ret <3 x bfloat> %op } @@ -24052,55 +23821,55 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX8-LABEL: v_uitofp_v4i16_to_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX8-NEXT: v_cvt_f32_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v0, v3, v0, 16 ; GFX8-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_v4i16_to_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_cvt_f32_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v4i16_to_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX10-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX10-NEXT: v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f32_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 +; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_uitofp_v4i16_to_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 -; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = uitofp <4 x i16> %x to <4 x bfloat> ret <4 x bfloat> %op @@ -24236,12 +24005,12 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX8-LABEL: v_uitofp_v3i32_to_v3bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX8-NEXT: v_cvt_f32_u32_e32 v3, v1 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_v3i32_to_v3bf16: @@ -24298,22 +24067,22 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX8-LABEL: v_uitofp_v4i32_to_v4bf16: ; GFX8: 
; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX8-NEXT: v_alignbit_b32 v1, v3, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_v4i32_to_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 @@ -24323,9 +24092,9 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX10-LABEL: v_uitofp_v4i32_to_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 @@ -24334,11 +24103,11 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX11-LABEL: v_uitofp_v4i32_to_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -24653,65 +24422,65 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX8-LABEL: v_uitofp_v3i64_to_v3bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_ffbh_u32_e32 v6, v1 +; GFX8-NEXT: v_ffbh_u32_e32 v6, v5 ; GFX8-NEXT: v_min_u32_e32 v6, 32, v6 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] -; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_f32_u32_e32 v7, v0 -; GFX8-NEXT: v_ffbh_u32_e32 v0, v3 -; GFX8-NEXT: v_min_u32_e32 v8, 32, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6 -; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] +; GFX8-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v6 +; GFX8-NEXT: v_ldexp_f32 v6, v4, v5 +; GFX8-NEXT: v_ffbh_u32_e32 v4, v1 +; GFX8-NEXT: v_min_u32_e32 v7, 32, v4 +; GFX8-NEXT: v_lshlrev_b64 v[4:5], v7, v[0:1] +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX8-NEXT: v_min_u32_e32 v0, 1, v4 +; GFX8-NEXT: v_ffbh_u32_e32 v4, v3 +; GFX8-NEXT: v_min_u32_e32 v4, 32, v4 +; GFX8-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2 ; 
GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v8 -; GFX8-NEXT: v_ldexp_f32 v2, v7, v2 -; GFX8-NEXT: v_ldexp_f32 v3, v0, v1 -; GFX8-NEXT: v_ffbh_u32_e32 v0, v5 -; GFX8-NEXT: v_min_u32_e32 v6, 32, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v3, v2, 16 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6 -; GFX8-NEXT: v_ldexp_f32 v1, v1, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v4 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v7 +; GFX8-NEXT: v_ldexp_f32 v2, v2, v3 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_v3i64_to_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_ffbh_u32_e32 v6, v5 +; GFX9-NEXT: v_min_u32_e32 v6, 32, v6 +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6 ; GFX9-NEXT: v_ffbh_u32_e32 v6, v1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4 ; GFX9-NEXT: v_min_u32_e32 v6, 32, v6 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] -; GFX9-NEXT: v_sub_u32_e32 v6, 32, v6 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v0 +; GFX9-NEXT: v_ldexp_f32 v4, v4, v5 +; GFX9-NEXT: v_or_b32_e32 v5, v1, v0 ; GFX9-NEXT: v_ffbh_u32_e32 v0, v3 -; GFX9-NEXT: v_min_u32_e32 v8, 32, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] -; GFX9-NEXT: v_ldexp_f32 v2, v7, v6 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v0 -; GFX9-NEXT: v_ffbh_u32_e32 v0, v5 ; GFX9-NEXT: v_min_u32_e32 v7, 32, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; GFX9-NEXT: v_sub_u32_e32 v6, 32, v8 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v5 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v0 -; GFX9-NEXT: v_ldexp_f32 v3, v3, v6 -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v3, v2, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX9-NEXT: v_sub_u32_e32 v1, 32, v6 +; GFX9-NEXT: v_ldexp_f32 v1, v2, v1 ; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7 -; GFX9-NEXT: v_ldexp_f32 v1, v1, v2 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX9-NEXT: v_alignbit_b32 v1, s4, v4, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v3i64_to_v3bf16: @@ -24726,21 +24495,21 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3] ; GFX10-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] -; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v6 +; GFX10-NEXT: v_sub_nc_u32_e32 v8, 32, v8 ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX10-NEXT: v_min_u32_e32 v2, 1, v2 ; GFX10-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX10-NEXT: v_or_b32_e32 v1, v3, v2 ; GFX10-NEXT: v_or_b32_e32 v2, v5, v4 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v7 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, 32, v8 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v6 +; GFX10-NEXT: v_sub_nc_u32_e32 v4, 32, v7 ; 
GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX10-NEXT: v_ldexp_f32 v0, v0, v6 -; GFX10-NEXT: v_ldexp_f32 v1, v1, v3 -; GFX10-NEXT: v_ldexp_f32 v2, v2, v4 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v3 +; GFX10-NEXT: v_ldexp_f32 v1, v1, v4 +; GFX10-NEXT: v_ldexp_f32 v2, v2, v8 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX10-NEXT: v_alignbit_b32 v1, s4, v2, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -24834,129 +24603,129 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-LABEL: v_uitofp_v4i64_to_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_ffbh_u32_e32 v8, v1 +; GFX8-NEXT: v_ffbh_u32_e32 v8, v5 ; GFX8-NEXT: v_min_u32_e32 v8, 32, v8 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] +; GFX8-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: v_cvt_f32_u32_e32 v9, v4 +; GFX8-NEXT: v_ffbh_u32_e32 v4, v7 +; GFX8-NEXT: v_min_u32_e32 v10, 32, v4 +; GFX8-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7] +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v8 +; GFX8-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX8-NEXT: v_ldexp_f32 v5, v9, v6 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v10 +; GFX8-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX8-NEXT: v_ffbh_u32_e32 v6, v1 +; GFX8-NEXT: v_min_u32_e32 v6, 32, v6 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_f32_u32_e32 v9, v0 +; GFX8-NEXT: v_cvt_f32_u32_e32 v7, v0 ; GFX8-NEXT: v_ffbh_u32_e32 v0, v3 -; GFX8-NEXT: v_min_u32_e32 v10, 32, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v10, v[2:3] -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v8 +; GFX8-NEXT: v_min_u32_e32 v8, 32, v0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v10 -; GFX8-NEXT: v_ldexp_f32 v3, v9, v2 -; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX8-NEXT: v_ffbh_u32_e32 v1, v5 -; GFX8-NEXT: v_min_u32_e32 v8, 32, v1 -; GFX8-NEXT: v_lshlrev_b64 v[1:2], v8, v[4:5] +; GFX8-NEXT: v_ldexp_f32 v1, v7, v2 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v8 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_min_u32_e32 v1, 1, v1 -; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 -; GFX8-NEXT: v_cvt_f32_u32_e32 v3, v1 -; GFX8-NEXT: v_ffbh_u32_e32 v1, v7 -; GFX8-NEXT: v_min_u32_e32 v4, 32, v1 -; GFX8-NEXT: v_lshlrev_b64 v[1:2], v4, v[6:7] -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v8 -; GFX8-NEXT: v_min_u32_e32 v1, 1, v1 -; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX8-NEXT: v_ldexp_f32 v2, v3, v5 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v4 -; GFX8-NEXT: v_ldexp_f32 v1, v1, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX8-NEXT: v_alignbit_b32 v1, v4, v5, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_v4i64_to_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_ffbh_u32_e32 v8, v1 +; GFX9-NEXT: v_ffbh_u32_e32 v8, v5 ; GFX9-NEXT: v_min_u32_e32 v8, 32, v8 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] 
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] +; GFX9-NEXT: v_sub_u32_e32 v8, 32, v8 +; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v9, v4 +; GFX9-NEXT: v_ffbh_u32_e32 v4, v7 +; GFX9-NEXT: v_min_u32_e32 v10, 32, v4 +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7] +; GFX9-NEXT: v_ffbh_u32_e32 v7, v1 +; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX9-NEXT: v_min_u32_e32 v7, 32, v7 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] +; GFX9-NEXT: v_sub_u32_e32 v5, 32, v10 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v9, v1, v0 +; GFX9-NEXT: v_ldexp_f32 v4, v4, v5 +; GFX9-NEXT: v_or_b32_e32 v5, v1, v0 ; GFX9-NEXT: v_ffbh_u32_e32 v0, v3 -; GFX9-NEXT: v_min_u32_e32 v10, 32, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v10, v[2:3] -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v9 +; GFX9-NEXT: v_ldexp_f32 v6, v9, v8 +; GFX9-NEXT: v_min_u32_e32 v8, 32, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v5 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v8 -; GFX9-NEXT: v_ldexp_f32 v3, v2, v1 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v10 -; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX9-NEXT: v_ffbh_u32_e32 v1, v5 -; GFX9-NEXT: v_min_u32_e32 v8, 32, v1 -; GFX9-NEXT: v_lshlrev_b64 v[1:2], v8, v[4:5] -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_min_u32_e32 v1, 1, v1 -; GFX9-NEXT: v_or_b32_e32 v3, v2, v1 -; GFX9-NEXT: v_ffbh_u32_e32 v1, v7 -; GFX9-NEXT: v_min_u32_e32 v4, 32, v1 -; GFX9-NEXT: v_lshlrev_b64 v[1:2], v4, v[6:7] -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GFX9-NEXT: v_min_u32_e32 v1, 1, v1 -; GFX9-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7 +; GFX9-NEXT: v_ldexp_f32 v1, v2, v1 ; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8 -; GFX9-NEXT: v_ldexp_f32 v2, v3, v2 -; GFX9-NEXT: v_sub_u32_e32 v3, 32, v4 -; GFX9-NEXT: v_ldexp_f32 v1, v1, v3 -; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX9-NEXT: v_perm_b32 v1, v4, v6, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v4i64_to_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_ffbh_u32_e32 v8, v1 -; GFX10-NEXT: v_ffbh_u32_e32 v9, v3 -; GFX10-NEXT: v_ffbh_u32_e32 v10, v5 +; GFX10-NEXT: v_ffbh_u32_e32 v8, v5 +; GFX10-NEXT: v_ffbh_u32_e32 v9, v1 +; GFX10-NEXT: v_ffbh_u32_e32 v10, v3 ; GFX10-NEXT: v_ffbh_u32_e32 v11, v7 ; GFX10-NEXT: v_min_u32_e32 v8, 32, v8 ; GFX10-NEXT: v_min_u32_e32 v9, 32, v9 ; GFX10-NEXT: v_min_u32_e32 v10, 32, v10 ; GFX10-NEXT: v_min_u32_e32 v11, 32, v11 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[2:3], v9, v[2:3] -; GFX10-NEXT: v_lshlrev_b64 v[4:5], v10, v[4:5] +; GFX10-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v10, v[2:3] ; GFX10-NEXT: v_lshlrev_b64 v[6:7], v11, v[6:7] ; GFX10-NEXT: v_sub_nc_u32_e32 v8, 32, v8 +; GFX10-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX10-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX10-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX10-NEXT: v_min_u32_e32 v6, 1, v6 +; GFX10-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX10-NEXT: 
v_or_b32_e32 v1, v3, v2 -; GFX10-NEXT: v_or_b32_e32 v2, v5, v4 -; GFX10-NEXT: v_or_b32_e32 v4, v7, v6 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v9 +; GFX10-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX10-NEXT: v_or_b32_e32 v3, v7, v6 +; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v11 +; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v4 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX10-NEXT: v_sub_nc_u32_e32 v5, 32, v9 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v5, 32, v10 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v11 -; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v4 -; GFX10-NEXT: v_ldexp_f32 v0, v0, v8 -; GFX10-NEXT: v_ldexp_f32 v1, v1, v3 -; GFX10-NEXT: v_ldexp_f32 v2, v2, v5 -; GFX10-NEXT: v_ldexp_f32 v3, v4, v6 -; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 +; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v10 +; GFX10-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX10-NEXT: v_ldexp_f32 v4, v4, v8 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v5 +; GFX10-NEXT: v_ldexp_f32 v2, v2, v6 +; GFX10-NEXT: v_ldexp_f32 v1, v3, v1 +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_uitofp_v4i64_to_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_clz_i32_u32_e32 v8, v1 -; GFX11-NEXT: v_clz_i32_u32_e32 v9, v3 -; GFX11-NEXT: v_clz_i32_u32_e32 v10, v5 +; GFX11-NEXT: v_clz_i32_u32_e32 v8, v5 +; GFX11-NEXT: v_clz_i32_u32_e32 v9, v1 +; GFX11-NEXT: v_clz_i32_u32_e32 v10, v3 ; GFX11-NEXT: v_clz_i32_u32_e32 v11, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_min_u32_e32 v8, 32, v8 @@ -24965,37 +24734,37 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX11-NEXT: v_min_u32_e32 v10, 32, v10 ; GFX11-NEXT: v_min_u32_e32 v11, 32, v11 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] -; GFX11-NEXT: v_lshlrev_b64 v[2:3], v9, v[2:3] +; GFX11-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[4:5], v10, v[4:5] +; GFX11-NEXT: v_lshlrev_b64 v[2:3], v10, v[2:3] ; GFX11-NEXT: v_lshlrev_b64 v[6:7], v11, v[6:7] ; GFX11-NEXT: v_sub_nc_u32_e32 v8, 32, v8 +; GFX11-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX11-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX11-NEXT: v_min_u32_e32 v6, 1, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_or_b32_e32 v1, v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v2, v5, v4 -; GFX11-NEXT: v_or_b32_e32 v4, v7, v6 -; GFX11-NEXT: v_sub_nc_u32_e32 v3, 32, v9 -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v10 -; GFX11-NEXT: v_sub_nc_u32_e32 v6, 32, v11 -; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v4 -; GFX11-NEXT: v_ldexp_f32 v0, v0, v8 -; GFX11-NEXT: v_ldexp_f32 v1, v1, v3 -; GFX11-NEXT: v_ldexp_f32 v2, v2, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_ldexp_f32 v3, v4, v6 -; GFX11-NEXT: 
v_perm_b32 v0, v1, v0, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 +; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v7, v6 +; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v11 +; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v9 +; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX11-NEXT: v_sub_nc_u32_e32 v6, 32, v10 +; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX11-NEXT: v_ldexp_f32 v4, v4, v8 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_ldexp_f32 v2, v2, v6 +; GFX11-NEXT: v_ldexp_f32 v1, v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 +; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = uitofp <4 x i64> %x to <4 x bfloat> ret <4 x bfloat> %op @@ -25672,18 +25441,14 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b) ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_select_v3bf16: @@ -25692,51 +25457,25 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, 
v4, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v0, vcc_lo -; GFX10-NEXT: v_perm_b32 v0, v3, v1, 0x5040100 -; GFX10-NEXT: v_perm_b32 v1, v4, v2, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_select_v3bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v3, v6, v5 :: v_dual_cndmask_b32 v2, v4, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v3, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v4, v2, 0x5040100 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b ret <3 x bfloat> %op @@ -25794,18 +25533,18 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_select_v4bf16: @@ -25814,51 +25553,25 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, 
v2, vcc -; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v0, vcc_lo -; GFX10-NEXT: v_perm_b32 v0, v3, v1, 0x5040100 -; GFX10-NEXT: v_perm_b32 v1, v4, v2, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_select_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v3, v6, v5 :: v_dual_cndmask_b32 v2, v4, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v3, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v4, v2, 0x5040100 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b ret <4 x bfloat> %op @@ -25932,24 +25645,24 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b) ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_or_b32_sdwa v2, v2, 
v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_select_v6bf16: @@ -25958,65 +25671,28 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX9-NEXT: v_perm_b32 v2, v3, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v6bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v6 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo -; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX10-NEXT: v_perm_b32 v1, v4, v2, 0x5040100 -; GFX10-NEXT: v_perm_b32 v2, v5, v3, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_select_v6bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v1 :: v_dual_cndmask_b32 v3, v6, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_dual_cndmask_b32 v2, v5, v2 :: v_dual_cndmask_b32 v5, v10, v9 -; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_cndmask_b32 v4, v8, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 
0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v4, v2, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v2, v5, v3, 0x5040100 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v1 :: v_dual_cndmask_b32 v1, v5, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = select i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b ret <6 x bfloat> %op @@ -26106,30 +25782,30 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_select_v8bf16: @@ -26138,80 +25814,30 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc -; GFX9-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9-NEXT: v_perm_b32 v2, v3, v2, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX9-NEXT: v_perm_b32 v3, v4, v3, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v8bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v8 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v10, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, v6, v2, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v7, v12, v11, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v6, v10, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX10-NEXT: v_perm_b32 v1, v2, v5, 0x5040100 -; GFX10-NEXT: v_perm_b32 v2, v6, v3, 0x5040100 -; GFX10-NEXT: v_perm_b32 v3, v7, v4, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_select_v8bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_dual_cndmask_b32 v0, v5, v1 :: v_dual_cndmask_b32 v5, v6, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v10, v9, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; GFX11-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v7, v12, v11 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v10, v9, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v1, v2, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v6, v3, 0x5040100 -; GFX11-NEXT: 
v_perm_b32 v3, v7, v4, 0x5040100 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v5, v1 :: v_dual_cndmask_b32 v1, v6, v2 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v7, v3 :: v_dual_cndmask_b32 v3, v8, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = select i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b ret <8 x bfloat> %op @@ -26373,54 +25999,54 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; GFX8-NEXT: v_cndmask_b32_e32 v16, v16, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; GFX8-NEXT: v_cndmask_b32_e32 v14, v14, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; GFX8-NEXT: v_cndmask_b32_e32 v13, v13, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v12, v12, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; GFX8-NEXT: v_cndmask_b32_e32 v11, v11, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v10 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v12 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v13 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v14 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v15 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v7, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v11 +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v14 +; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v15 +; GFX8-NEXT: v_cndmask_b32_e32 v17, v18, v17, vcc +; GFX8-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v16 +; GFX8-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v17 +; GFX8-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_select_v16bf16: @@ -26429,140 +26055,40 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; GFX9-NEXT: v_perm_b32 v2, v3, v2, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX9-NEXT: v_perm_b32 v3, v4, v3, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc -; GFX9-NEXT: 
v_lshrrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GFX9-NEXT: v_perm_b32 v4, v5, v4, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc -; GFX9-NEXT: v_perm_b32 v5, v6, v5, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v7, vcc -; GFX9-NEXT: v_perm_b32 v6, v7, v6, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc -; GFX9-NEXT: v_perm_b32 v7, v8, v7, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v16bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v10 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v9, v18, v17, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v10, v19, v0, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v11 -; GFX10-NEXT: v_perm_b32 v0, v9, v1, 0x5040100 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GFX10-NEXT: v_perm_b32 v1, v10, v2, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v18, v17, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v12 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v13 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo -; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v9, v17, v11, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v10, v14, v6, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v14 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v7 -; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v15 -; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v11, v13, v12, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v14, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 -; GFX10-NEXT: v_perm_b32 v4, v9, v5, 0x5040100 -; GFX10-NEXT: v_perm_b32 v5, v6, v10, 0x5040100 -; GFX10-NEXT: v_perm_b32 v6, v11, v7, 0x5040100 -; GFX10-NEXT: v_perm_b32 v7, v12, v8, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_select_v16bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v17, 
16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v9, v18, v17, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v10, v19, v0, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v11 -; GFX11-NEXT: v_perm_b32 v0, v9, v1, 0x5040100 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GFX11-NEXT: v_perm_b32 v1, v10, v2, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v2, v11, v3 :: v_dual_cndmask_b32 v3, v18, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v13 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo -; GFX11-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 -; GFX11-NEXT: v_dual_cndmask_b32 v3, v12, v4 :: v_dual_cndmask_b32 v4, v10, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v9, v17, v11 :: v_dual_cndmask_b32 v10, v14, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v16 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v7, v15, v7 :: v_dual_cndmask_b32 v8, v16, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v13, v12, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v12, v17, v14, vcc_lo -; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v9, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v6, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v11, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v12, v8, 0x5040100 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v9, v1 :: v_dual_cndmask_b32 v1, v10, v2 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v11, v3 :: v_dual_cndmask_b32 v3, v12, v4 +; GFX11-NEXT: v_dual_cndmask_b32 v4, v13, v5 :: v_dual_cndmask_b32 v5, v14, v6 +; GFX11-NEXT: v_dual_cndmask_b32 v6, v15, v7 :: v_dual_cndmask_b32 v7, v16, v8 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = select i1 %cond, <16 x bfloat> %a, <16 x bfloat> %b ret <16 x bfloat> %op @@ -26980,106 +26506,106 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v18 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v17, v2, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc -; GFX8-NEXT: 
v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v19 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v20 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v17, v4, vcc -; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v21 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v18, v5, vcc -; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v22 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v19, v6, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v23 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v19, v7, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v24 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v19, v8, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v25 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX8-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v26 -; GFX8-NEXT: v_cndmask_b32_e32 v10, v19, v10, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v27 -; GFX8-NEXT: v_cndmask_b32_e32 v11, v19, v11, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v28 -; GFX8-NEXT: v_cndmask_b32_e32 v12, v19, v12, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX8-NEXT: 
v_lshrrev_b32_e32 v19, 16, v29 -; GFX8-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX8-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v30 -; GFX8-NEXT: v_cndmask_b32_e32 v14, v19, v14, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX8-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; GFX8-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX8-NEXT: v_cndmask_b32_e32 v30, v30, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; GFX8-NEXT: v_lshrrev_b32_e32 v29, 16, v28 +; GFX8-NEXT: v_cndmask_b32_e32 v29, v29, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; GFX8-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX8-NEXT: v_cndmask_b32_e32 v28, v28, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; GFX8-NEXT: v_cndmask_b32_e32 v27, v27, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; GFX8-NEXT: v_cndmask_b32_e32 v26, v26, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; GFX8-NEXT: v_cndmask_b32_e32 v25, v25, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; GFX8-NEXT: v_cndmask_b32_e32 v24, v24, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; GFX8-NEXT: v_cndmask_b32_e32 v23, v23, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v22, v22, v0, vcc +; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v20 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v15, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v15, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX8-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v16, v0, v16, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v33, v0, v33, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v15 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v18 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX8-NEXT: v_or_b32_sdwa v15, v15, v16 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v15, v32, v15, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX8-NEXT: v_cndmask_b32_e32 v32, v32, v0, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v21, v21, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; GFX8-NEXT: v_cndmask_b32_e32 v20, v20, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; GFX8-NEXT: v_cndmask_b32_e32 v19, v19, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v18, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v22 +; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v23 +; GFX8-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v24 +; GFX8-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v25 +; GFX8-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v26 +; GFX8-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v27 +; GFX8-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v28 +; GFX8-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v29 +; GFX8-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v30 +; GFX8-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v31 +; GFX8-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; GFX8-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v33 +; GFX8-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_select_v32bf16: @@ -27088,90 +26614,25 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; 
GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v2, vcc -; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc -; GFX9-NEXT: v_perm_b32 v2, v3, v2, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v17, v4, vcc -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 -; GFX9-NEXT: v_perm_b32 v3, v4, v3, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v18, v5, vcc -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 -; GFX9-NEXT: v_perm_b32 v4, v5, v4, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v19, v6, vcc -; GFX9-NEXT: v_perm_b32 v5, v6, v5, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v19, v7, vcc -; GFX9-NEXT: v_perm_b32 v6, v7, v6, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v19, v8, vcc -; GFX9-NEXT: v_perm_b32 v7, v8, v7, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc -; GFX9-NEXT: v_perm_b32 v8, v9, v8, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v19, v10, vcc -; GFX9-NEXT: v_perm_b32 v9, v10, v9, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v19, v11, vcc -; GFX9-NEXT: v_perm_b32 v10, v11, v10, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v28 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v19, v12, vcc -; GFX9-NEXT: v_perm_b32 v11, v12, v11, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc -; GFX9-NEXT: v_perm_b32 v12, v13, v12, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v19, v14, vcc -; GFX9-NEXT: v_perm_b32 v13, v14, v13, s4 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: 
v_cndmask_b32_e32 v14, v17, v15, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v15, vcc -; GFX9-NEXT: v_perm_b32 v14, v15, v14, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc -; GFX9-NEXT: v_perm_b32 v15, v16, v15, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v32bf16: @@ -27181,89 +26642,25 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v18 -; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v19 -; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v6 -; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v22 -; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v7 -; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v23 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX10-NEXT: v_lshrrev_b32_e32 v64, 16, v24 -; GFX10-NEXT: v_lshrrev_b32_e32 v65, 16, v9 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX10-NEXT: v_lshrrev_b32_e32 v66, 16, v10 -; GFX10-NEXT: v_lshrrev_b32_e32 v67, 16, v26 -; GFX10-NEXT: v_lshrrev_b32_e32 v68, 16, v11 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v27 -; GFX10-NEXT: v_cndmask_b32_e32 v33, v34, v33, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v35, v36, v35, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v39, v48, v39, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v49, v50, v49, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v22, v52, v51, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v23, v54, v53, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v28 -; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v13 -; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v29 -; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v30 -; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v24, v64, v55, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v25, v0, v65, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v26, v67, v66, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v68, vcc_lo -; GFX10-NEXT: v_perm_b32 v0, v33, v1, 0x5040100 -; GFX10-NEXT: v_perm_b32 v1, v35, v2, 0x5040100 -; GFX10-NEXT: v_perm_b32 v2, 
v37, v3, 0x5040100 -; GFX10-NEXT: v_perm_b32 v3, v39, v4, 0x5040100 -; GFX10-NEXT: v_perm_b32 v4, v49, v5, 0x5040100 -; GFX10-NEXT: v_perm_b32 v5, v22, v6, 0x5040100 -; GFX10-NEXT: v_perm_b32 v6, v23, v7, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v18, v18, v34, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v19, v19, v36, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v20, v20, v38, vcc_lo -; GFX10-NEXT: v_perm_b32 v7, v24, v8, 0x5040100 -; GFX10-NEXT: v_perm_b32 v8, v25, v9, 0x5040100 -; GFX10-NEXT: v_perm_b32 v9, v26, v10, 0x5040100 -; GFX10-NEXT: v_perm_b32 v10, v17, v11, 0x5040100 -; GFX10-NEXT: v_perm_b32 v11, v18, v12, 0x5040100 -; GFX10-NEXT: v_perm_b32 v12, v19, v13, 0x5040100 -; GFX10-NEXT: v_perm_b32 v13, v20, v14, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v31 +; GFX10-NEXT: v_cndmask_b32_e32 v14, v31, v15, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v32 -; GFX10-NEXT: v_cndmask_b32_e32 v15, v31, v15, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v17, v22, v48, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v21, v23, v21, vcc_lo -; GFX10-NEXT: v_perm_b32 v14, v17, v15, 0x5040100 -; GFX10-NEXT: v_perm_b32 v15, v21, v16, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v15, v32, v16, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_select_v32bf16: @@ -27273,80 +26670,17 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> ; GFX11-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v28 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v30 -; GFX11-NEXT: v_cndmask_b32_e32 v17, v34, v33, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v18, v36, v35, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v19 
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v85, 16, v16 -; GFX11-NEXT: v_dual_cndmask_b32 v10, v26, v10 :: v_dual_cndmask_b32 v11, v27, v11 -; GFX11-NEXT: v_dual_cndmask_b32 v27, v70, v69 :: v_dual_cndmask_b32 v12, v28, v12 -; GFX11-NEXT: v_dual_cndmask_b32 v28, v80, v71 :: v_dual_cndmask_b32 v13, v29, v13 -; GFX11-NEXT: v_cndmask_b32_e32 v29, v82, v81, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v30, v0, v83, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v17, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v18, v2, 0x5040100 -; GFX11-NEXT: v_dual_cndmask_b32 v19, v38, v37 :: v_dual_cndmask_b32 v4, v20, v4 -; GFX11-NEXT: v_dual_cndmask_b32 v20, v48, v39 :: v_dual_cndmask_b32 v5, v21, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v21, v50, v49, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v6, v22, v6 :: v_dual_cndmask_b32 v7, v23, v7 -; GFX11-NEXT: v_dual_cndmask_b32 v22, v52, v51 :: v_dual_cndmask_b32 v23, v54, v53 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v24, v64, v55 :: v_dual_cndmask_b32 v9, v25, v9 -; GFX11-NEXT: v_dual_cndmask_b32 v25, v66, v65 :: v_dual_cndmask_b32 v26, v68, v67 -; GFX11-NEXT: v_perm_b32 v2, v19, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v20, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v21, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v22, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v23, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v24, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v25, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v26, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v27, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v28, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v29, v13, 0x5040100 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v31 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v17, v1 :: v_dual_cndmask_b32 v1, v18, v2 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v19, v3 :: v_dual_cndmask_b32 v3, v20, v4 +; GFX11-NEXT: v_dual_cndmask_b32 v4, v21, v5 :: v_dual_cndmask_b32 v5, v22, v6 +; GFX11-NEXT: v_dual_cndmask_b32 v6, v23, v7 :: v_dual_cndmask_b32 v7, v24, v8 +; GFX11-NEXT: v_dual_cndmask_b32 v8, v25, v9 :: v_dual_cndmask_b32 v9, v26, v10 +; GFX11-NEXT: v_dual_cndmask_b32 v10, v27, v11 :: v_dual_cndmask_b32 v11, v28, v12 +; GFX11-NEXT: v_dual_cndmask_b32 v12, v29, v13 :: v_dual_cndmask_b32 v13, v30, v14 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; GFX11-NEXT: v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v16, v32, v16 -; GFX11-NEXT: v_perm_b32 v13, v30, v14, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v17, v17, v84 :: v_dual_cndmask_b32 v18, v18, v85 -; GFX11-NEXT: v_perm_b32 v14, v17, v15, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v15, 
v18, v16, 0x5040100 +; GFX11-NEXT: v_dual_cndmask_b32 v14, v31, v15 :: v_dual_cndmask_b32 v15, v32, v16 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = select i1 %cond, <32 x bfloat> %a, <32 x bfloat> %b ret <32 x bfloat> %op @@ -27413,10 +26747,17 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: s_lshr_b32 s0, s1, 16 +; GFX8-NEXT: s_lshr_b32 s2, s3, 16 ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1 @@ -27424,17 +26765,10 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> ; ; GFX9-LABEL: s_select_v3bf16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: s_lshr_b32 s5, s2, 16 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -27445,39 +26779,28 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> ; ; GFX10-LABEL: s_select_v3bf16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-NEXT: s_lshr_b32 s5, s2, 16 -; GFX10-NEXT: v_cndmask_b32_e32 v2, s2, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, s5, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, s3, v0, vcc_lo -; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x5040100 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v1 -; GFX10-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, s2, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, s3, v2, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_select_v3bf16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-NEXT: s_lshr_b32 s5, s2, 16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, s5, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) 
| instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v2, s2, v2, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v0, s3, v0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x5040100 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, s2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, s3, v2, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s0, v1 -; GFX11-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 ; GFX11-NEXT: ; return to shader part epilog %cond = icmp eq i32 %c, 0 %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b @@ -27582,73 +26905,40 @@ define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; GFX8-NEXT: v_readfirstlane_b32 s1, v0 ; GFX8-NEXT: ; return to shader part epilog ; -; GFX9-LABEL: s_select_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s4, s1, 16 -; GFX9-NEXT: s_lshr_b32 s5, s3, 16 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-LABEL: s_select_v4bf16: +; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: s_mov_b32 s1, 0x5040100 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 -; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_perm_b32 v1, v1, v2, s1 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: v_readfirstlane_b32 s1, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_select_v4bf16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10-NEXT: s_lshr_b32 s6, s0, 16 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: s_lshr_b32 s5, s3, 16 -; GFX10-NEXT: s_lshr_b32 s0, s2, 16 -; GFX10-NEXT: v_cndmask_b32_e32 v1, s5, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, s0, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, s3, v3, vcc_lo -; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x5040100 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, s3, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, s2, v2, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_select_v4bf16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s4, s1, 16 -; GFX11-NEXT: s_lshr_b32 s6, s0, 16 -; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s6 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s1 -; GFX11-NEXT: s_lshr_b32 s5, s3, 16 -; GFX11-NEXT: s_lshr_b32 s0, s2, 16 -; GFX11-NEXT: v_cndmask_b32_e32 v1, s5, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v2, s0, v2, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v3, s3, v3, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, s3, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, s2, v2, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-NEXT: v_readfirstlane_b32 s0, v1 ; GFX11-NEXT: ; return to shader part epilog %cond = icmp eq i32 %c, 0 %op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b @@ -27887,96 +27177,95 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo ; GFX8-LABEL: v_vselect_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc -; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_vselect_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc +; GFX9-NEXT: 
v_lshrrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v6 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_vselect_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc_lo ; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_vselect_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v7, v5 :: v_dual_and_b32 v3, 1, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_dual_cndmask_b32 v0, v6, v4 :: v_dual_and_b32 v3, 1, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v6, v4 :: v_dual_and_b32 v1, 1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v7 ; 
GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -28062,172 +27351,170 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo ; GFX8-LABEL: v_vselect_v8bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX8-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v17, v16, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v15, v11, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v10, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v12 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v9, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v12, v8, vcc -; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v1, v13, v9, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v13 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX8-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v2, v14, v10, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v10 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v14 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX8-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v3, v15, v11, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v11 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v15 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_vselect_v8bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX9-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v14 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v13 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v9, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v12 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v13, v9, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v13 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX9-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v14, v10, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v14 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GFX9-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX9-NEXT: v_perm_b32 v2, v3, v2, s4 -; GFX9-NEXT: 
v_cndmask_b32_e32 v3, v15, v11, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v15 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX9-NEXT: v_perm_b32 v3, v4, v3, s4 +; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 +; GFX9-NEXT: v_perm_b32 v2, v5, v4, s4 +; GFX9-NEXT: v_perm_b32 v3, v7, v6, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_vselect_v8bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v10 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v14 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v12 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX10-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v13 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 ; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v17, v16, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v14, v10, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v14 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v12, v8, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v10 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v9, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 ; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v11 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v8, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc_lo ; GFX10-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v12, v10, vcc_lo ; GFX10-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_vselect_v8bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: 
v_lshrrev_b32_e32 v16, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX11-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX11-NEXT: v_dual_cndmask_b32 v6, v15, v11 :: v_dual_and_b32 v1, 1, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX11-NEXT: v_dual_cndmask_b32 v4, v14, v10 :: v_dual_and_b32 v3, 1, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v12 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v12 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v17, v16, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v13, v9 :: v_dual_and_b32 v7, 1, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_dual_cndmask_b32 v0, v12, v8 :: v_dual_and_b32 v7, 1, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v13 -; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: v_dual_cndmask_b32 v1, v17, v16 :: v_dual_and_b32 v6, 1, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v14, v10, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v14 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v12, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v15 -; GFX11-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v13, v9, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 ; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 -; GFX11-NEXT: v_dual_cndmask_b32 v4, v14, v10 :: v_dual_and_b32 v5, 1, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v11 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v9, v8, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc_lo ; GFX11-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v12, v10, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = select <8 x 
i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b @@ -28469,244 +27756,259 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX8-LABEL: v_vselect_v16bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v16 -; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v24 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v16, v1, s[4:5] -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v3 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v25 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v25, v17, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v2, 1, v4 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v5 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v26 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v26, v18, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v3, 1, v6 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX8-NEXT: v_and_b32_e32 v3, 1, v7 -; GFX8-NEXT: buffer_load_dword v7, off, s[0:3], s32 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v3 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v3 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v4 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v5 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v6 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v7 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v8 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v9 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v10 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 +; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GFX8-NEXT: v_writelane_b32 v31, s30, 0 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v12 +; GFX8-NEXT: v_and_b32_e32 v3, 1, v13 +; GFX8-NEXT: v_writelane_b32 v31, s31, 1 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v30 +; GFX8-NEXT: v_writelane_b32 v31, s34, 2 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v11 +; GFX8-NEXT: v_and_b32_e32 v4, 1, v14 +; GFX8-NEXT: v_and_b32_e32 v5, 1, v15 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v3, v2, s[28:29] +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v20 
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v28 +; GFX8-NEXT: v_writelane_b32 v31, s35, 3 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v23 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v4 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v10, v3, v2, s[20:21] +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v21 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v29 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[24:25] ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v19 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v27 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[4:5] -; GFX8-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v27, v19, vcc -; GFX8-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 -; GFX8-NEXT: v_and_b32_e32 v5, 1, v11 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v4, v28, v20, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v20 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v28 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 -; GFX8-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v8, v29, v21, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v21 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v29 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GFX8-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v9, vcc -; GFX8-NEXT: v_and_b32_e32 v6, 1, v13 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v27 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v4, s[16:17] +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v24 +; GFX8-NEXT: v_cndmask_b32_e64 v7, v30, v22, s[26:27] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v27, v19, s[14:15] +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v8, v29, v21, s[22:23] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v28, v20, s[18:19] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v26, v18, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v14, v25, v17, s[6:7] +; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v10 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; GFX8-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e32 v8, v30, v22, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v22 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v30 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX8-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v9, vcc -; GFX8-NEXT: v_and_b32_e32 v15, 1, v15 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 -; GFX8-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v23 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v8, v7, v23, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v12, v0, v23, s[30:31] +; GFX8-NEXT: v_cndmask_b32_e64 v13, v2, v1, s[34:35] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v26 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[12:13] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v25 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[8:9] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v15, v0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v15, v24, v16, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v13 +; GFX8-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v7, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_readlane_b32 s35, v31, 3 +; GFX8-NEXT: v_readlane_b32 s34, v31, 2 +; GFX8-NEXT: v_readlane_b32 s31, v31, 1 +; GFX8-NEXT: v_readlane_b32 s30, v31, 0 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_vselect_v16bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 1, v14 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 1, v15 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v4 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; GFX9-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; GFX9-NEXT: v_and_b32_e32 v12, 1, v13 +; GFX9-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX9-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v12 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v10 +; GFX9-NEXT: v_and_b32_e32 v10, 1, v11 +; GFX9-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v6 +; GFX9-NEXT: v_and_b32_e32 v6, 1, v7 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v30 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v10 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v8 +; GFX9-NEXT: v_and_b32_e32 v8, 1, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[4:5] +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v29 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v28 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[12:13] +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v27 +; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v11, v13, v11, 
s[16:17] +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v30, v22, vcc +; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v15, v26, v18, s[18:19] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v29, v21, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v10, v28, v20, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v12, v27, v19, s[14:15] +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v5, v5, v8, s4 +; GFX9-NEXT: v_perm_b32 v6, v7, v6, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e64 v14, v4, v23, s[20:21] +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v13, v4, v13, s[22:23] +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v26 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[24:25] +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v18, v17, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v1 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v24 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v16, v1, s[4:5] -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 -; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v24 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 1, v3 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v25, v17, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[4:5] -; GFX9-NEXT: v_perm_b32 v1, v2, v1, s6 -; GFX9-NEXT: v_and_b32_e32 v2, 1, v4 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 1, v5 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v26, v18, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[4:5] -; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6 -; GFX9-NEXT: v_and_b32_e32 v3, 1, v6 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX9-NEXT: v_and_b32_e32 v3, 1, v7 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v27 -; GFX9-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v27, v19, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[4:5] -; GFX9-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 -; GFX9-NEXT: v_and_b32_e32 v5, 1, v11 -; GFX9-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v28, v20, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v28 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 -; GFX9-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX9-NEXT: v_perm_b32 v4, v8, v4, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v29, v21, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, 
v29 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GFX9-NEXT: v_and_b32_e32 v6, 1, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v10, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 -; GFX9-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX9-NEXT: v_perm_b32 v5, v5, v8, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v30, v22, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v30 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX9-NEXT: v_and_b32_e32 v15, 1, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 -; GFX9-NEXT: v_perm_b32 v6, v6, v8, s6 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v23 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v23, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; GFX9-NEXT: v_perm_b32 v7, v7, v8, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 +; GFX9-NEXT: v_perm_b32 v2, v4, v15, s4 +; GFX9-NEXT: v_perm_b32 v3, v11, v12, s4 +; GFX9-NEXT: v_perm_b32 v4, v9, v10, s4 +; GFX9-NEXT: v_perm_b32 v7, v13, v14, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_vselect_v16bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX10-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX10-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX10-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX10-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v30 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 +; GFX10-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX10-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; GFX10-NEXT: v_cndmask_b32_e32 v12, v30, v22, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 +; GFX10-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX10-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX10-NEXT: v_cndmask_b32_e32 v13, v34, v33, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 +; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; GFX10-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 +; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v25 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX10-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX10-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v18 -; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v26 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v33, v32, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v11, v36, v35, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v24 ; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 -; 
GFX10-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v19 -; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v27 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX10-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX10-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v28 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v35, v34, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 -; GFX10-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX10-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v29 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 -; GFX10-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 +; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v26 ; GFX10-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v22 -; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v30 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v37, v36, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v38, v37, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 ; GFX10-NEXT: v_and_b32_e32 v15, 1, v15 -; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v23 -; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v23 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v52, v51, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v54, v53, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v50, v49, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 +; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 ; GFX10-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v39, v38, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v49, v48, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v48, v39, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 ; GFX10-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v51, v50, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 ; GFX10-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v12, v30, v22, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 -; GFX10-NEXT: v_cndmask_b32_e32 v13, v53, v52, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v31 ; GFX10-NEXT: v_cndmask_b32_e32 v14, v31, v23, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 -; GFX10-NEXT: 
v_cndmask_b32_e32 v15, v3, v54, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v15, v3, v32, vcc_lo ; GFX10-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 ; GFX10-NEXT: v_perm_b32 v6, v13, v12, 0x5040100 ; GFX10-NEXT: v_perm_b32 v7, v15, v14, 0x5040100 @@ -28716,78 +28018,79 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v23 -; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; GFX11-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 ; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v22 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_and_b32_e32 v15, 1, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v26 +; GFX11-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX11-NEXT: v_dual_cndmask_b32 v12, v30, v22 :: v_dual_and_b32 v11, 1, v11 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v34, v33, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 +; GFX11-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v24 +; GFX11-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 ; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11-NEXT: v_dual_cndmask_b32 v0, v24, v16 :: v_dual_and_b32 v3, 1, v3 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v33, v32, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX11-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v29 -; GFX11-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v36, v35, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 ; GFX11-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX11-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v35, v34, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 -; GFX11-NEXT: v_and_b32_e32 v13, 1, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v25 ; GFX11-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 +; GFX11-NEXT: v_and_b32_e32 v4, 1, v4 +; 
GFX11-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v38, v37, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 ; GFX11-NEXT: v_dual_cndmask_b32 v4, v26, v18 :: v_dual_and_b32 v7, 1, v7 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v52, v51, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v54, v53, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 -; GFX11-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v27 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v37, v36, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX11-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v50, v49, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 ; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 -; GFX11-NEXT: v_dual_cndmask_b32 v7, v39, v38 :: v_dual_and_b32 v8, 1, v8 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v49, v48, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v7, v48, v39, vcc_lo ; GFX11-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v51, v50, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 ; GFX11-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v12, v30, v22, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v53, v52, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v31 +; GFX11-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 ; GFX11-NEXT: v_cndmask_b32_e32 v14, v31, v23, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v15, v3, v54, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v15, v3, v32, vcc_lo ; GFX11-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 ; GFX11-NEXT: v_perm_b32 v6, v13, v12, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_perm_b32 v7, v15, v14, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = select <16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b @@ -29252,171 +28555,235 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8-NEXT: v_writelane_b32 v31, s30, 0 ; GFX8-NEXT: v_writelane_b32 v31, s31, 1 ; GFX8-NEXT: v_writelane_b32 v31, s34, 2 +; 
GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_writelane_b32 v31, s35, 3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v1 ; GFX8-NEXT: v_writelane_b32 v31, s36, 4 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX8-NEXT: v_writelane_b32 v31, s37, 5 -; GFX8-NEXT: v_and_b32_e32 v21, 1, v21 -; GFX8-NEXT: v_and_b32_e32 v18, 1, v18 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v3 ; GFX8-NEXT: v_writelane_b32 v31, s38, 6 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v21 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v18 -; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 -; GFX8-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 -; GFX8-NEXT: v_and_b32_e32 v17, 1, v17 -; GFX8-NEXT: v_and_b32_e32 v16, 1, v16 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v4 ; GFX8-NEXT: v_writelane_b32 v31, s39, 7 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v17 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v16 -; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 -; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 -; GFX8-NEXT: v_and_b32_e32 v15, 1, v15 -; GFX8-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v5 ; GFX8-NEXT: v_writelane_b32 v31, s40, 8 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[36:37], 1, v15 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[38:39], 1, v14 -; GFX8-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:76 -; GFX8-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:12 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v6 ; GFX8-NEXT: v_writelane_b32 v31, s41, 9 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v7 ; GFX8-NEXT: v_writelane_b32 v31, s42, 10 -; GFX8-NEXT: v_and_b32_e32 v13, 1, v13 -; GFX8-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v8 ; GFX8-NEXT: v_writelane_b32 v31, s43, 11 -; GFX8-NEXT: v_and_b32_e32 v20, 1, v20 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v13 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v12 -; GFX8-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:80 -; GFX8-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:16 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v20 -; GFX8-NEXT: buffer_load_ushort v20, off, s[0:3], s32 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v9 ; GFX8-NEXT: v_writelane_b32 v31, s44, 12 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v10 ; GFX8-NEXT: v_writelane_b32 v31, s45, 13 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v11 ; GFX8-NEXT: v_writelane_b32 v31, s46, 14 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v12 ; GFX8-NEXT: v_writelane_b32 v31, s47, 15 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v13 ; GFX8-NEXT: v_writelane_b32 v31, s48, 16 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v14 ; GFX8-NEXT: v_writelane_b32 v31, s49, 17 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v15 ; GFX8-NEXT: v_writelane_b32 v31, s50, 18 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v16 ; GFX8-NEXT: v_writelane_b32 v31, s51, 19 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[36:37], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v17 ; GFX8-NEXT: 
v_writelane_b32 v31, s52, 20 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[38:39], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v18 ; GFX8-NEXT: v_writelane_b32 v31, s53, 21 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v19 ; GFX8-NEXT: v_writelane_b32 v31, s54, 22 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v20 ; GFX8-NEXT: v_writelane_b32 v31, s55, 23 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v21 ; GFX8-NEXT: v_writelane_b32 v31, s56, 24 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v22 ; GFX8-NEXT: v_writelane_b32 v31, s57, 25 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[48:49], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v23 ; GFX8-NEXT: v_writelane_b32 v31, s58, 26 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[50:51], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v24 ; GFX8-NEXT: v_writelane_b32 v31, s59, 27 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[52:53], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v25 ; GFX8-NEXT: v_writelane_b32 v31, s60, 28 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[54:55], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v26 ; GFX8-NEXT: v_writelane_b32 v31, s61, 29 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v27 ; GFX8-NEXT: v_writelane_b32 v31, s62, 30 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v28 ; GFX8-NEXT: v_writelane_b32 v31, s63, 31 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v29 ; GFX8-NEXT: v_writelane_b32 v31, s64, 32 -; GFX8-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX8-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v30 ; GFX8-NEXT: v_writelane_b32 v31, s65, 33 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[50:51], 1, v8 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[52:53], 1, v7 -; GFX8-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 -; GFX8-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[64:65], 1, v0 +; GFX8-NEXT: buffer_load_ushort v0, off, s[0:3], s32 ; GFX8-NEXT: v_writelane_b32 v31, s66, 34 -; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v2 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[64:65], 1, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_writelane_b32 v31, s67, 35 -; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[66:67], 1, v0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[54:55], 1, v6 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v5 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v4 -; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 -; GFX8-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; GFX8-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX8-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX8-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v10 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[48:49], 1, v9 -; GFX8-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 -; GFX8-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 -; GFX8-NEXT: v_and_b32_e32 v25, 1, v25 -; GFX8-NEXT: v_and_b32_e32 v24, 1, v24 -; GFX8-NEXT: v_and_b32_e32 v23, 1, v23 -; GFX8-NEXT: v_and_b32_e32 v22, 1, v22 -; GFX8-NEXT: v_and_b32_e32 v19, 1, v19 -; GFX8-NEXT: s_waitcnt vmcnt(14) -; GFX8-NEXT: 
v_lshrrev_b32_e32 v2, 16, v18 -; GFX8-NEXT: s_waitcnt vmcnt(13) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v21 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[64:65] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v18, v21, s[66:67] -; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:36 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: s_waitcnt vmcnt(13) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v16 -; GFX8-NEXT: s_waitcnt vmcnt(12) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v17 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[60:61] -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e64 v1, v16, v17, s[62:63] -; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:40 +; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 +; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 +; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 +; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 +; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 +; GFX8-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 +; GFX8-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16 +; GFX8-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 +; GFX8-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20 +; GFX8-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 +; GFX8-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24 +; GFX8-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 +; GFX8-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 +; GFX8-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 +; GFX8-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 +; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 +; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 +; GFX8-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 +; GFX8-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; GFX8-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 -; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 -; GFX8-NEXT: s_waitcnt vmcnt(13) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v15 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v14 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[56:57] -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e64 v2, v14, v15, s[58:59] -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: s_waitcnt vmcnt(11) -; GFX8-NEXT: v_cndmask_b32_e64 v3, v12, v13, s[54:55] -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v13 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v11 -; GFX8-NEXT: s_waitcnt vmcnt(10) -; GFX8-NEXT: v_and_b32_e32 v11, 1, v20 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[52:53] -; GFX8-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 -; GFX8-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v25 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v24 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v23 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v22 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v19 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v11 -; GFX8-NEXT: buffer_load_dword 
v19, off, s[0:3], s32 offset:112 -; GFX8-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:108 -; GFX8-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 -; GFX8-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:32 -; GFX8-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:104 -; GFX8-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 -; GFX8-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 -; GFX8-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 -; GFX8-NEXT: v_and_b32_e32 v26, 1, v26 -; GFX8-NEXT: v_and_b32_e32 v28, 1, v28 -; GFX8-NEXT: v_and_b32_e32 v27, 1, v27 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v26 -; GFX8-NEXT: v_and_b32_e32 v29, 1, v29 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v28 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v27 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v29 -; GFX8-NEXT: v_and_b32_e32 v30, 1, v30 -; GFX8-NEXT: s_waitcnt vmcnt(14) -; GFX8-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[50:51] -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[48:49] -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_or_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v30 +; GFX8-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 +; GFX8-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48 +; GFX8-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 +; GFX8-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52 +; GFX8-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 +; GFX8-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56 +; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 +; GFX8-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:128 +; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v28, 16, v33 +; GFX8-NEXT: v_cndmask_b32_e64 v28, v34, v28, s[66:67] +; GFX8-NEXT: v_cndmask_b32_e64 v29, v29, v33, s[64:65] +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v30 +; GFX8-NEXT: v_cndmask_b32_e64 v33, v34, v33, s[62:63] +; GFX8-NEXT: v_cndmask_b32_e64 v30, v30, v32, s[60:61] +; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v26 +; GFX8-NEXT: v_cndmask_b32_e64 v32, v34, v32, s[58:59] +; GFX8-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[56:57] +; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v24 +; GFX8-NEXT: v_cndmask_b32_e64 v27, v34, v27, s[54:55] +; GFX8-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[52:53] +; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; GFX8-NEXT: v_cndmask_b32_e64 v25, v34, v25, s[50:51] +; GFX8-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[48:49] +; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX8-NEXT: v_cndmask_b32_e64 v23, v34, v23, s[46:47] +; GFX8-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[44:45] +; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v18 +; GFX8-NEXT: v_cndmask_b32_e64 v21, 
v34, v21, s[42:43] +; GFX8-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[40:41] +; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v19, v34, v19, s[38:39] +; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[36:37] +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; GFX8-NEXT: v_cndmask_b32_e64 v17, v34, v17, s[34:35] +; GFX8-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[30:31] +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v12 +; GFX8-NEXT: v_cndmask_b32_e64 v15, v34, v15, s[28:29] +; GFX8-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[26:27] +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; GFX8-NEXT: v_cndmask_b32_e64 v13, v34, v13, s[24:25] +; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[22:23] +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v11, v34, v11, s[20:21] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[18:19] +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v9, v34, v9, s[16:17] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[14:15] +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v7, v34, v7, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v34, v5, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v34, v3, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v15 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v17 +; GFX8-NEXT: v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v6, v12, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v7, v14, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v23 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v25 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v27 +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v32 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v28 +; GFX8-NEXT: v_or_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v9, v18, v9 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v10, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v11, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v12, v24, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v13, v26, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v14, v30, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_readlane_b32 s67, v31, 35 ; GFX8-NEXT: v_readlane_b32 s66, v31, 34 ; GFX8-NEXT: v_readlane_b32 s65, v31, 33 @@ -29427,18 +28794,6 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8-NEXT: v_readlane_b32 s60, v31, 28 ; GFX8-NEXT: v_readlane_b32 s59, v31, 27 ; GFX8-NEXT: v_readlane_b32 s58, v31, 26 -; GFX8-NEXT: v_cndmask_b32_e64 v7, v5, v6, s[46:47] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[44:45] -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_or_b32_sdwa v5, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v10 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v9, v10, s[42:43] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[40:41] -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_readlane_b32 s57, v31, 25 ; GFX8-NEXT: v_readlane_b32 s56, v31, 24 ; GFX8-NEXT: v_readlane_b32 s55, v31, 23 @@ -29457,43 +28812,6 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8-NEXT: v_readlane_b32 s42, v31, 10 ; GFX8-NEXT: v_readlane_b32 s41, v31, 9 ; GFX8-NEXT: v_readlane_b32 s40, v31, 8 -; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v22 -; GFX8-NEXT: s_waitcnt vmcnt(5) -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v23 -; GFX8-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[36:37] -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v18 -; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v25 -; GFX8-NEXT: v_cndmask_b32_e64 v7, v22, v23, s[38:39] -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v9, v10, v9, s[30:31] -; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e64 v8, v25, v18, s[34:35] -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX8-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e64 v9, v24, v16, s[28:29] -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v16 -; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v24 -; GFX8-NEXT: v_cndmask_b32_e64 v10, v16, v10, s[26:27] -; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e64 v10, v11, v21, s[24:25] -; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v21 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v16, s[22:23] -; GFX8-NEXT: buffer_load_dword v16, 
off, s[0:3], s32 offset:60 -; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_cndmask_b32_e64 v11, v19, v20, s[20:21] -; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GFX8-NEXT: v_cndmask_b32_e64 v19, v19, v20, s[16:17] -; GFX8-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 -; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX8-NEXT: v_or_b32_sdwa v11, v11, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_readlane_b32 s39, v31, 7 ; GFX8-NEXT: v_readlane_b32 s38, v31, 6 ; GFX8-NEXT: v_readlane_b32 s37, v31, 5 @@ -29502,33 +28820,6 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8-NEXT: v_readlane_b32 s34, v31, 2 ; GFX8-NEXT: v_readlane_b32 s31, v31, 1 ; GFX8-NEXT: v_readlane_b32 s30, v31, 0 -; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_cndmask_b32_e64 v19, v12, v18, s[14:15] -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_cndmask_b32_e64 v12, v12, v18, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v18, v13, v17, s[10:11] -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v17, s[8:9] -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_cndmask_b32_e64 v17, v14, v16, s[6:7] -; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX8-NEXT: v_cndmask_b32_e64 v14, v14, v16, s[4:5] -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX8-NEXT: v_or_b32_sdwa v14, v17, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v16, v15, v20, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v20 -; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX8-NEXT: v_cndmask_b32_e64 v15, v15, v17, s[18:19] -; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX8-NEXT: v_or_b32_sdwa v12, v19, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v13, v18, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] @@ -29544,169 +28835,223 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX9-NEXT: v_writelane_b32 v31, s30, 0 ; GFX9-NEXT: v_writelane_b32 v31, s31, 1 ; GFX9-NEXT: v_writelane_b32 v31, s34, 2 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_writelane_b32 v31, s35, 3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v3 ; GFX9-NEXT: v_writelane_b32 v31, s36, 4 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX9-NEXT: v_writelane_b32 v31, s37, 5 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v5 ; GFX9-NEXT: v_writelane_b32 v31, s38, 6 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v4 ; GFX9-NEXT: v_writelane_b32 v31, s39, 7 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v7 ; 
GFX9-NEXT: v_writelane_b32 v31, s40, 8 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v6 ; GFX9-NEXT: v_writelane_b32 v31, s41, 9 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v9 ; GFX9-NEXT: v_writelane_b32 v31, s42, 10 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v8 ; GFX9-NEXT: v_writelane_b32 v31, s43, 11 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v11 ; GFX9-NEXT: v_writelane_b32 v31, s44, 12 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v10 ; GFX9-NEXT: v_writelane_b32 v31, s45, 13 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v13 ; GFX9-NEXT: v_writelane_b32 v31, s46, 14 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v12 ; GFX9-NEXT: v_writelane_b32 v31, s47, 15 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v15 ; GFX9-NEXT: v_writelane_b32 v31, s48, 16 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v14 ; GFX9-NEXT: v_writelane_b32 v31, s49, 17 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v17 ; GFX9-NEXT: v_writelane_b32 v31, s50, 18 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[36:37], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v16 ; GFX9-NEXT: v_writelane_b32 v31, s51, 19 -; GFX9-NEXT: v_and_b32_e32 v21, 1, v21 -; GFX9-NEXT: v_and_b32_e32 v18, 1, v18 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[38:39], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v19 ; GFX9-NEXT: v_writelane_b32 v31, s52, 20 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v21 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v18 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 -; GFX9-NEXT: v_and_b32_e32 v17, 1, v17 -; GFX9-NEXT: v_and_b32_e32 v16, 1, v16 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v18 ; GFX9-NEXT: v_writelane_b32 v31, s53, 21 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v17 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v16 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v21 ; GFX9-NEXT: v_writelane_b32 v31, s54, 22 -; GFX9-NEXT: v_and_b32_e32 v15, 1, v15 -; GFX9-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v20 ; GFX9-NEXT: v_writelane_b32 v31, s55, 23 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[36:37], 1, v15 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[38:39], 1, v14 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:12 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v23 ; GFX9-NEXT: v_writelane_b32 v31, s56, 24 -; GFX9-NEXT: v_and_b32_e32 v13, 1, v13 -; GFX9-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[48:49], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v22 ; GFX9-NEXT: v_writelane_b32 v31, s57, 25 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v13 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v12 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:16 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[50:51], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v25 ; GFX9-NEXT: v_writelane_b32 v31, s58, 26 
-; GFX9-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[52:53], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v24 ; GFX9-NEXT: v_writelane_b32 v31, s59, 27 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v5 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v4 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 -; GFX9-NEXT: v_and_b32_e32 v20, 1, v20 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v20 -; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[54:55], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v27 ; GFX9-NEXT: v_writelane_b32 v31, s60, 28 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v26 ; GFX9-NEXT: v_writelane_b32 v31, s61, 29 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v29 ; GFX9-NEXT: v_writelane_b32 v31, s62, 30 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v28 ; GFX9-NEXT: v_writelane_b32 v31, s63, 31 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 ; GFX9-NEXT: v_writelane_b32 v31, s64, 32 ; GFX9-NEXT: v_writelane_b32 v31, s65, 33 ; GFX9-NEXT: v_writelane_b32 v31, s66, 34 -; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_writelane_b32 v31, s67, 35 -; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[64:65], 1, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[64:65], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v30 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[66:67], 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v3 -; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX9-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[54:55], 1, v6 -; GFX9-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[52:53], 1, v7 -; GFX9-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[50:51], 1, v8 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[48:49], 1, v9 -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:24 -; GFX9-NEXT: v_and_b32_e32 v24, 1, v24 -; GFX9-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v24 -; GFX9-NEXT: v_and_b32_e32 v23, 1, v23 -; GFX9-NEXT: v_and_b32_e32 v22, 1, v22 -; GFX9-NEXT: v_and_b32_e32 v19, 1, v19 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v11 -; GFX9-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:48 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v23 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v22 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v19 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v10 -; GFX9-NEXT: v_and_b32_e32 v26, 1, v26 -; GFX9-NEXT: v_and_b32_e32 v25, 1, v25 -; GFX9-NEXT: v_and_b32_e32 v28, 1, v28 -; GFX9-NEXT: v_and_b32_e32 v27, 1, v27 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v26 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v25 -; GFX9-NEXT: v_and_b32_e32 v29, 1, v29 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v28 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v27 -; GFX9-NEXT: v_and_b32_e32 v30, 1, v30 -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v21 -; GFX9-NEXT: 
v_cndmask_b32_e64 v0, v18, v21, s[66:67] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[64:65] -; GFX9-NEXT: s_mov_b32 s64, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s64 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v16 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v17 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v16, v17, s[62:63] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[60:61] -; GFX9-NEXT: v_perm_b32 v1, v2, v1, s64 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_cndmask_b32_e64 v2, v14, v15, s[58:59] -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v15 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e64 v29, v32, v33, s[66:67] +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX9-NEXT: v_cndmask_b32_e64 v32, v32, v33, s[64:65] +; GFX9-NEXT: v_cndmask_b32_e64 v33, v28, v30, s[62:63] +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX9-NEXT: v_cndmask_b32_e64 v28, v28, v30, s[60:61] +; GFX9-NEXT: v_cndmask_b32_e64 v30, v26, v27, s[58:59] +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX9-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[56:57] +; GFX9-NEXT: v_cndmask_b32_e64 v27, v24, v25, s[54:55] +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 
v24, 16, v24 +; GFX9-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[52:53] +; GFX9-NEXT: v_cndmask_b32_e64 v25, v22, v23, s[50:51] +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX9-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[48:49] +; GFX9-NEXT: v_cndmask_b32_e64 v23, v20, v21, s[46:47] +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX9-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[44:45] +; GFX9-NEXT: v_cndmask_b32_e64 v21, v18, v19, s[42:43] +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX9-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[40:41] +; GFX9-NEXT: v_cndmask_b32_e64 v19, v16, v17, s[38:39] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[36:37] +; GFX9-NEXT: v_cndmask_b32_e64 v17, v14, v15, s[34:35] +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v14, v3, s[56:57] -; GFX9-NEXT: v_perm_b32 v2, v3, v2, s64 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_cndmask_b32_e64 v3, v12, v13, s[54:55] +; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[30:31] +; GFX9-NEXT: v_cndmask_b32_e64 v15, v12, v13, s[28:29] ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[52:53] -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 -; GFX9-NEXT: v_perm_b32 v3, v12, v3, s64 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v29 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_cndmask_b32_e64 v12, v4, v5, s[50:51] +; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[26:27] +; GFX9-NEXT: v_cndmask_b32_e64 v13, v10, v11, s[24:25] +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[22:23] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v8, v9, s[20:21] +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[18:19] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[16:17] +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v4, v5, s[12:13] ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[48:49] -; GFX9-NEXT: v_perm_b32 v4, v4, v12, s64 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_and_b32_e32 v11, 1, v20 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v11 -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:32 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v30 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, v3, 
s[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5] +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v1, v2, v5, s4 +; GFX9-NEXT: v_perm_b32 v2, v4, v7, s4 +; GFX9-NEXT: v_perm_b32 v3, v6, v9, s4 +; GFX9-NEXT: v_perm_b32 v4, v8, v11, s4 +; GFX9-NEXT: v_perm_b32 v5, v10, v13, s4 +; GFX9-NEXT: v_perm_b32 v6, v12, v15, s4 +; GFX9-NEXT: v_perm_b32 v7, v14, v17, s4 +; GFX9-NEXT: v_perm_b32 v8, v16, v19, s4 +; GFX9-NEXT: v_perm_b32 v9, v18, v21, s4 +; GFX9-NEXT: v_perm_b32 v10, v20, v23, s4 +; GFX9-NEXT: v_perm_b32 v11, v22, v25, s4 +; GFX9-NEXT: v_perm_b32 v12, v24, v27, s4 +; GFX9-NEXT: v_perm_b32 v13, v26, v30, s4 +; GFX9-NEXT: v_perm_b32 v14, v28, v33, s4 +; GFX9-NEXT: v_perm_b32 v15, v32, v29, s4 ; GFX9-NEXT: v_readlane_b32 s67, v31, 35 ; GFX9-NEXT: v_readlane_b32 s66, v31, 34 ; GFX9-NEXT: v_readlane_b32 s65, v31, 33 +; GFX9-NEXT: v_readlane_b32 s64, v31, 32 ; GFX9-NEXT: v_readlane_b32 s63, v31, 31 ; GFX9-NEXT: v_readlane_b32 s62, v31, 30 ; GFX9-NEXT: v_readlane_b32 s61, v31, 29 @@ -29722,54 +29067,11 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX9-NEXT: v_readlane_b32 s51, v31, 19 ; GFX9-NEXT: v_readlane_b32 s50, v31, 18 ; GFX9-NEXT: v_readlane_b32 s49, v31, 17 -; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: v_cndmask_b32_e64 v5, v6, v7, s[46:47] -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[44:45] -; GFX9-NEXT: v_perm_b32 v5, v6, v5, s64 ; GFX9-NEXT: v_readlane_b32 s48, v31, 16 ; GFX9-NEXT: v_readlane_b32 s47, v31, 15 ; GFX9-NEXT: v_readlane_b32 s46, v31, 14 ; GFX9-NEXT: v_readlane_b32 s45, v31, 13 ; GFX9-NEXT: v_readlane_b32 s44, v31, 12 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v9, s[42:43] -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[40:41] -; GFX9-NEXT: v_perm_b32 v6, v7, v6, s64 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v22 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v23 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v22, v23, s[38:39] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[36:37] -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v20 -; GFX9-NEXT: v_perm_b32 v7, v8, v7, s64 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v20, v18, s[34:35] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[30:31] -; GFX9-NEXT: v_perm_b32 v8, v9, v8, s64 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v11, v16, s[28:29] -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v16, s[26:27] -; GFX9-NEXT: v_perm_b32 v9, v11, v9, s64 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v10, v21, s[24:25] -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v16, s[22:23] -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 -; GFX9-NEXT: v_perm_b32 v10, v10, v11, s64 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 
v19, v24, s[20:21] -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GFX9-NEXT: v_cndmask_b32_e64 v19, v19, v20, s[16:17] -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 -; GFX9-NEXT: v_perm_b32 v11, v19, v11, s64 ; GFX9-NEXT: v_readlane_b32 s43, v31, 11 ; GFX9-NEXT: v_readlane_b32 s42, v31, 10 ; GFX9-NEXT: v_readlane_b32 s41, v31, 9 @@ -29782,31 +29084,6 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX9-NEXT: v_readlane_b32 s34, v31, 2 ; GFX9-NEXT: v_readlane_b32 s31, v31, 1 ; GFX9-NEXT: v_readlane_b32 s30, v31, 0 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_cndmask_b32_e64 v19, v12, v18, s[14:15] -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v18, s[12:13] -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_cndmask_b32_e64 v18, v13, v17, s[10:11] -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v17, s[8:9] -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e64 v17, v14, v16, s[6:7] -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, v16, s[4:5] -; GFX9-NEXT: v_perm_b32 v14, v14, v17, s64 -; GFX9-NEXT: v_perm_b32 v12, v12, v19, s64 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v16, v15, v20, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX9-NEXT: v_cndmask_b32_e64 v15, v15, v17, s[18:19] -; GFX9-NEXT: v_perm_b32 v13, v13, v18, s64 -; GFX9-NEXT: v_perm_b32 v15, v15, v16, s64 -; GFX9-NEXT: v_readlane_b32 s64, v31, 32 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] @@ -29820,205 +29097,208 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX10-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 1, v6 -; GFX10-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX10-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX10-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX10-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX10-NEXT: v_and_b32_e32 v16, 1, v16 -; GFX10-NEXT: s_clause 0x15 -; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 -; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 -; GFX10-NEXT: buffer_load_ushort v36, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:76 -; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 -; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 -; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 -; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 -; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:84 -; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:88 -; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 -; 
GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 -; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:28 -; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 -; GFX10-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:32 -; GFX10-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:36 -; GFX10-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:104 -; GFX10-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:40 -; GFX10-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:100 -; GFX10-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:52 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v0 -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v2 -; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s7, 1, v4 -; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:120 -; GFX10-NEXT: v_cmp_eq_u32_e64 s8, 1, v3 -; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 -; GFX10-NEXT: v_cmp_eq_u32_e64 s9, 1, v8 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:116 -; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GFX10-NEXT: v_cmp_eq_u32_e64 s10, 1, v10 -; GFX10-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 -; GFX10-NEXT: v_cmp_eq_u32_e64 s11, 1, v12 -; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:60 -; GFX10-NEXT: v_cmp_eq_u32_e64 s12, 1, v14 -; GFX10-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 -; GFX10-NEXT: v_cmp_eq_u32_e64 s13, 1, v16 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 -; GFX10-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 -; GFX10-NEXT: v_writelane_b32 v31, s30, 0 +; GFX10-NEXT: v_and_b32_e32 v29, 1, v29 ; GFX10-NEXT: v_and_b32_e32 v30, 1, v30 ; GFX10-NEXT: v_and_b32_e32 v28, 1, v28 ; GFX10-NEXT: v_and_b32_e32 v26, 1, v26 ; GFX10-NEXT: v_and_b32_e32 v24, 1, v24 -; GFX10-NEXT: v_writelane_b32 v31, s31, 1 ; GFX10-NEXT: v_and_b32_e32 v22, 1, v22 ; GFX10-NEXT: v_and_b32_e32 v20, 1, v20 -; GFX10-NEXT: v_and_b32_e32 v17, 1, v17 -; GFX10-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX10-NEXT: v_writelane_b32 v31, s34, 2 -; GFX10-NEXT: v_and_b32_e32 v29, 1, v29 -; GFX10-NEXT: v_and_b32_e32 v27, 1, v27 -; GFX10-NEXT: v_and_b32_e32 v25, 1, v25 -; GFX10-NEXT: v_and_b32_e32 v23, 1, v23 -; GFX10-NEXT: v_and_b32_e32 v21, 1, v21 -; GFX10-NEXT: v_and_b32_e32 v19, 1, v19 ; GFX10-NEXT: v_and_b32_e32 v18, 1, v18 -; GFX10-NEXT: v_and_b32_e32 v15, 1, v15 -; GFX10-NEXT: v_and_b32_e32 v13, 1, v13 -; GFX10-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX10-NEXT: v_and_b32_e32 v16, 1, v16 +; GFX10-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX10-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX10-NEXT: s_clause 0x14 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 +; GFX10-NEXT: buffer_load_ushort v34, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:128 +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:64 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 +; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:116 +; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:52 +; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:120 +; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:56 +; GFX10-NEXT: buffer_load_dword 
v50, off, s[0:3], s32 offset:32 +; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 +; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 +; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:104 +; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:40 +; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:108 +; GFX10-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:44 +; GFX10-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:112 +; GFX10-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:72 +; GFX10-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:76 +; GFX10-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:80 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v29 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92 +; GFX10-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:28 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v30 +; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v28 +; GFX10-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v26 +; GFX10-NEXT: v_cmp_eq_u32_e64 s7, 1, v24 +; GFX10-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 +; GFX10-NEXT: v_cmp_eq_u32_e64 s8, 1, v22 +; GFX10-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:20 +; GFX10-NEXT: v_cmp_eq_u32_e64 s9, 1, v20 +; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16 +; GFX10-NEXT: v_cmp_eq_u32_e64 s10, 1, v18 +; GFX10-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; GFX10-NEXT: v_cmp_eq_u32_e64 s11, 1, v16 +; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 +; GFX10-NEXT: v_cmp_eq_u32_e64 s12, 1, v14 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68 +; GFX10-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:24 +; GFX10-NEXT: v_cmp_eq_u32_e64 s13, 1, v12 +; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 +; GFX10-NEXT: v_writelane_b32 v31, s30, 0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX10-NEXT: v_writelane_b32 v31, s31, 1 +; GFX10-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX10-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_writelane_b32 v31, s34, 2 ; GFX10-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX10-NEXT: v_cmp_eq_u32_e64 s15, 1, v20 -; GFX10-NEXT: v_cmp_eq_u32_e64 s16, 1, v22 -; GFX10-NEXT: v_cmp_eq_u32_e64 s17, 1, v24 -; GFX10-NEXT: v_cmp_eq_u32_e64 s18, 1, v26 -; GFX10-NEXT: v_cmp_eq_u32_e64 s19, 1, v28 -; GFX10-NEXT: v_cmp_eq_u32_e64 s20, 1, v30 -; GFX10-NEXT: v_cmp_eq_u32_e64 s22, 1, v7 -; GFX10-NEXT: v_cmp_eq_u32_e64 s23, 1, v9 -; GFX10-NEXT: v_cmp_eq_u32_e64 s27, 1, v17 +; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX10-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX10-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX10-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX10-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX10-NEXT: v_and_b32_e32 v17, 1, v17 +; GFX10-NEXT: v_and_b32_e32 v19, 1, v19 +; GFX10-NEXT: v_and_b32_e32 v21, 1, v21 +; GFX10-NEXT: v_and_b32_e32 v23, 1, v23 +; GFX10-NEXT: v_and_b32_e32 v25, 1, v25 +; GFX10-NEXT: v_and_b32_e32 v27, 1, v27 +; GFX10-NEXT: v_cmp_eq_u32_e64 s14, 1, v10 +; GFX10-NEXT: v_cmp_eq_u32_e64 s15, 1, v8 +; GFX10-NEXT: v_cmp_eq_u32_e64 s16, 1, v6 +; GFX10-NEXT: v_cmp_eq_u32_e64 s17, 1, v4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s18, 1, v2 +; GFX10-NEXT: 
v_cmp_eq_u32_e64 s19, 1, v0 ; GFX10-NEXT: v_writelane_b32 v31, s35, 3 -; GFX10-NEXT: v_cmp_eq_u32_e64 s14, 1, v18 -; GFX10-NEXT: v_cmp_eq_u32_e64 s21, 1, v5 -; GFX10-NEXT: v_cmp_eq_u32_e64 s24, 1, v11 -; GFX10-NEXT: v_cmp_eq_u32_e64 s25, 1, v13 +; GFX10-NEXT: v_cmp_eq_u32_e64 s20, 1, v27 +; GFX10-NEXT: v_cmp_eq_u32_e64 s21, 1, v25 +; GFX10-NEXT: v_cmp_eq_u32_e64 s22, 1, v23 +; GFX10-NEXT: v_cmp_eq_u32_e64 s23, 1, v21 +; GFX10-NEXT: v_cmp_eq_u32_e64 s24, 1, v19 +; GFX10-NEXT: v_cmp_eq_u32_e64 s25, 1, v17 ; GFX10-NEXT: v_cmp_eq_u32_e64 s26, 1, v15 -; GFX10-NEXT: v_cmp_eq_u32_e64 s28, 1, v19 -; GFX10-NEXT: v_cmp_eq_u32_e64 s29, 1, v21 -; GFX10-NEXT: v_cmp_eq_u32_e64 s30, 1, v23 -; GFX10-NEXT: v_cmp_eq_u32_e64 s31, 1, v25 -; GFX10-NEXT: v_cmp_eq_u32_e64 s34, 1, v27 -; GFX10-NEXT: v_cmp_eq_u32_e64 s35, 1, v29 +; GFX10-NEXT: v_cmp_eq_u32_e64 s27, 1, v13 +; GFX10-NEXT: v_cmp_eq_u32_e64 s28, 1, v11 +; GFX10-NEXT: v_cmp_eq_u32_e64 s29, 1, v7 +; GFX10-NEXT: v_cmp_eq_u32_e64 s30, 1, v3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s31, 1, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s34, 1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s35, 1, v9 ; GFX10-NEXT: s_waitcnt vmcnt(32) -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v32 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v32 ; GFX10-NEXT: s_waitcnt vmcnt(31) -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v33 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v32, v33, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; GFX10-NEXT: s_waitcnt vmcnt(30) +; GFX10-NEXT: v_and_b32_e32 v2, 1, v34 ; GFX10-NEXT: s_waitcnt vmcnt(29) -; GFX10-NEXT: v_cndmask_b32_e64 v11, v34, v35, s5 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v35 ; GFX10-NEXT: s_waitcnt vmcnt(28) -; GFX10-NEXT: v_and_b32_e32 v17, 1, v36 -; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v35 -; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v34 -; GFX10-NEXT: s_waitcnt vmcnt(26) -; GFX10-NEXT: v_cndmask_b32_e64 v18, v37, v38, s7 -; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v38 -; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v37 +; GFX10-NEXT: v_cndmask_b32_e64 v15, v35, v36, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v36 +; GFX10-NEXT: v_cndmask_b32_e64 v17, v33, v32, s5 +; GFX10-NEXT: s_waitcnt vmcnt(25) +; GFX10-NEXT: v_cndmask_b32_e64 v19, v38, v39, s7 ; GFX10-NEXT: s_waitcnt vmcnt(24) -; GFX10-NEXT: v_cndmask_b32_e64 v21, v39, v48, s8 -; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v48 -; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v39 -; GFX10-NEXT: s_waitcnt vmcnt(22) -; GFX10-NEXT: v_cndmask_b32_e64 v24, v50, v49, s9 -; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v49 -; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v50 -; GFX10-NEXT: s_waitcnt vmcnt(20) -; GFX10-NEXT: v_cndmask_b32_e64 v27, v51, v52, s10 -; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v52 -; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v51 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v48 +; GFX10-NEXT: s_waitcnt vmcnt(23) +; GFX10-NEXT: v_cndmask_b32_e64 v13, v48, v49, s6 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v49 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v38 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v37 ; GFX10-NEXT: s_waitcnt vmcnt(18) -; GFX10-NEXT: v_cndmask_b32_e64 v30, v53, v54, s11 +; GFX10-NEXT: v_cndmask_b32_e64 v27, v53, v54, s10 +; GFX10-NEXT: s_waitcnt vmcnt(17) +; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v55 +; GFX10-NEXT: s_waitcnt vmcnt(16) +; GFX10-NEXT: v_cndmask_b32_e64 v21, v55, v64, s9 +; GFX10-NEXT: s_waitcnt vmcnt(15) +; GFX10-NEXT: v_cndmask_b32_e64 v11, v65, v37, s8 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v65 +; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v64 ; GFX10-NEXT: 
v_lshrrev_b32_e32 v32, 16, v54 ; GFX10-NEXT: v_lshrrev_b32_e32 v33, 16, v53 -; GFX10-NEXT: s_waitcnt vmcnt(16) -; GFX10-NEXT: v_cndmask_b32_e64 v34, v55, v64, s12 -; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v64 -; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v55 -; GFX10-NEXT: s_waitcnt vmcnt(12) -; GFX10-NEXT: v_cndmask_b32_e64 v37, v68, v65, s13 -; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v65 -; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v68 -; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v67 -; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v66 +; GFX10-NEXT: v_cndmask_b32_e64 v34, v51, v52, s11 +; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v52 +; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v51 ; GFX10-NEXT: s_waitcnt vmcnt(9) -; GFX10-NEXT: v_cndmask_b32_e64 v52, v0, v2, s16 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v37, v30, v50, s12 +; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v50 +; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX10-NEXT: v_cndmask_b32_e64 v39, v29, v69, s13 +; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v69 +; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v29 ; GFX10-NEXT: s_waitcnt vmcnt(6) -; GFX10-NEXT: v_cndmask_b32_e64 v53, v8, v69, s17 -; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v69 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v55, v4, v3, s18 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: s_waitcnt vmcnt(3) -; GFX10-NEXT: v_cndmask_b32_e64 v64, v10, v12, s19 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e64 v51, v1, v6, s15 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v65, v14, v16, s20 +; GFX10-NEXT: v_cndmask_b32_e64 v50, v24, v22, s15 +; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX10-NEXT: s_waitcnt vmcnt(5) +; GFX10-NEXT: v_cndmask_b32_e64 v51, v68, v20, s16 +; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v68 +; GFX10-NEXT: s_waitcnt vmcnt(4) +; GFX10-NEXT: v_cndmask_b32_e64 v53, v67, v18, s17 +; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_cndmask_b32_e64 v49, v28, v26, s14 +; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v67 +; GFX10-NEXT: v_cndmask_b32_e64 v55, v66, v16, s18 ; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX10-NEXT: v_lshrrev_b32_e32 v64, 16, v66 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e64 v65, v14, v12, s19 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v9, v7, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v17 -; GFX10-NEXT: v_cndmask_b32_e64 v48, v66, v67, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v15, v13, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v20, v19, s21 -; GFX10-NEXT: v_cndmask_b32_e64 v15, v23, v22, s22 -; GFX10-NEXT: v_cndmask_b32_e64 v19, v26, v25, s23 -; GFX10-NEXT: v_cndmask_b32_e64 v20, v29, v28, s24 -; GFX10-NEXT: v_cndmask_b32_e64 v22, v33, v32, s25 -; GFX10-NEXT: v_cndmask_b32_e64 v23, v36, v35, s26 -; GFX10-NEXT: v_cndmask_b32_e64 v25, v39, v38, s27 -; GFX10-NEXT: v_cndmask_b32_e64 v26, v50, v49, s28 -; GFX10-NEXT: v_cndmask_b32_e64 v28, v1, v6, s29 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v0, v2, 
s30 -; GFX10-NEXT: v_cndmask_b32_e64 v29, v8, v54, s31 -; GFX10-NEXT: v_cndmask_b32_e64 v32, v4, v3, s34 -; GFX10-NEXT: v_cndmask_b32_e64 v33, v10, v12, s35 -; GFX10-NEXT: v_cndmask_b32_e32 v16, v14, v16, vcc_lo -; GFX10-NEXT: v_perm_b32 v0, v7, v5, 0x5040100 -; GFX10-NEXT: v_perm_b32 v1, v9, v11, 0x5040100 -; GFX10-NEXT: v_perm_b32 v2, v13, v18, 0x5040100 -; GFX10-NEXT: v_perm_b32 v3, v15, v21, 0x5040100 -; GFX10-NEXT: v_perm_b32 v4, v19, v24, 0x5040100 -; GFX10-NEXT: v_perm_b32 v5, v20, v27, 0x5040100 -; GFX10-NEXT: v_perm_b32 v6, v22, v30, 0x5040100 -; GFX10-NEXT: v_perm_b32 v7, v23, v34, 0x5040100 -; GFX10-NEXT: v_perm_b32 v8, v25, v37, 0x5040100 -; GFX10-NEXT: v_perm_b32 v9, v26, v48, 0x5040100 -; GFX10-NEXT: v_perm_b32 v10, v28, v51, 0x5040100 -; GFX10-NEXT: v_perm_b32 v11, v17, v52, 0x5040100 -; GFX10-NEXT: v_perm_b32 v12, v29, v53, 0x5040100 -; GFX10-NEXT: v_perm_b32 v13, v32, v55, 0x5040100 -; GFX10-NEXT: v_perm_b32 v14, v33, v64, 0x5040100 -; GFX10-NEXT: v_perm_b32 v15, v16, v65, 0x5040100 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v66, v1, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v67, v6, v5, s20 +; GFX10-NEXT: v_cndmask_b32_e64 v68, v8, v7, s21 +; GFX10-NEXT: v_cndmask_b32_e64 v69, v10, v9, s22 +; GFX10-NEXT: v_cndmask_b32_e64 v10, v25, v23, s23 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v33, v32, s24 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v36, v35, s25 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v30, v38, s26 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v29, v48, s27 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v28, v26, s28 +; GFX10-NEXT: v_cndmask_b32_e64 v20, v52, v20, s29 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v14, v12, s31 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v16, s30 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v54, v18, s34 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v24, v22, s35 +; GFX10-NEXT: v_cndmask_b32_e64 v16, v4, v3, s4 +; GFX10-NEXT: v_perm_b32 v0, v0, v65, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v1, v55, 0x5040100 +; GFX10-NEXT: v_perm_b32 v2, v2, v53, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v20, v51, 0x5040100 +; GFX10-NEXT: v_perm_b32 v4, v12, v50, 0x5040100 +; GFX10-NEXT: v_perm_b32 v5, v5, v49, 0x5040100 +; GFX10-NEXT: v_perm_b32 v6, v6, v39, 0x5040100 +; GFX10-NEXT: v_perm_b32 v7, v7, v37, 0x5040100 +; GFX10-NEXT: v_perm_b32 v8, v8, v34, 0x5040100 +; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x5040100 +; GFX10-NEXT: v_perm_b32 v10, v10, v21, 0x5040100 +; GFX10-NEXT: v_perm_b32 v11, v69, v11, 0x5040100 +; GFX10-NEXT: v_perm_b32 v12, v68, v19, 0x5040100 +; GFX10-NEXT: v_perm_b32 v13, v67, v13, 0x5040100 +; GFX10-NEXT: v_perm_b32 v14, v66, v17, 0x5040100 +; GFX10-NEXT: v_perm_b32 v15, v16, v15, 0x5040100 ; GFX10-NEXT: v_readlane_b32 s35, v31, 3 ; GFX10-NEXT: v_readlane_b32 s34, v31, 2 ; GFX10-NEXT: v_readlane_b32 s31, v31, 1 @@ -30035,205 +29315,198 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x20 ; GFX11-NEXT: scratch_load_u16 v31, off, s32 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:84 -; GFX11-NEXT: 
scratch_load_b32 v49, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v64, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v65, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v66, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v67, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v68, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v69, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v70, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v71, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v80, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v81, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v82, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v83, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v84, off, s32 offset:124 -; GFX11-NEXT: scratch_load_b32 v85, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v86, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v87, off, s32 offset:64 -; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX11-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_and_b32_e32 v27, 1, v27 -; GFX11-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX11-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX11-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:124 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v64, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v65, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v66, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v67, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v68, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v69, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v70, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v71, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v80, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v81, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v82, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v83, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v84, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v85, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v86, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v87, off, s32 offset:4 +; GFX11-NEXT: v_and_b32_e32 v30, 1, v30 +; GFX11-NEXT: v_and_b32_e32 
v28, 1, v28 +; GFX11-NEXT: v_and_b32_e32 v26, 1, v26 +; GFX11-NEXT: v_and_b32_e32 v24, 1, v24 +; GFX11-NEXT: v_and_b32_e32 v22, 1, v22 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v30 +; GFX11-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX11-NEXT: v_and_b32_e32 v20, 1, v20 +; GFX11-NEXT: v_and_b32_e32 v18, 1, v18 +; GFX11-NEXT: v_and_b32_e32 v16, 1, v16 ; GFX11-NEXT: s_waitcnt vmcnt(30) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v32, v33, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX11-NEXT: v_and_b32_e32 v29, 1, v29 +; GFX11-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28 +; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 ; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_and_b32_e32 v30, 1, v30 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt vmcnt(28) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v34, v35, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 -; GFX11-NEXT: v_and_b32_e32 v23, 1, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v28, v34, v35, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26 +; GFX11-NEXT: v_and_b32_e32 v7, 1, v7 ; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35 ; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_and_b32_e32 v28, 1, v28 +; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX11-NEXT: s_waitcnt vmcnt(26) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v36, v37, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX11-NEXT: v_and_b32_e32 v25, 1, v25 +; GFX11-NEXT: v_cndmask_b32_e32 v26, v36, v37, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24 +; GFX11-NEXT: v_and_b32_e32 v5, 1, v5 ; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v37 ; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-NEXT: v_and_b32_e32 v26, 1, v26 +; GFX11-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX11-NEXT: s_waitcnt vmcnt(24) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v38, v39, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 -; GFX11-NEXT: v_and_b32_e32 v19, 1, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v24, v38, v39, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22 +; GFX11-NEXT: v_and_b32_e32 v11, 1, v11 ; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v39 ; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GFX11-NEXT: v_and_b32_e32 v24, 1, v24 +; GFX11-NEXT: v_and_b32_e32 v6, 1, v6 ; GFX11-NEXT: s_waitcnt vmcnt(22) -; GFX11-NEXT: v_cndmask_b32_e32 v8, v48, v49, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 -; GFX11-NEXT: v_and_b32_e32 v21, 1, v21 +; GFX11-NEXT: v_cndmask_b32_e32 v22, v48, v49, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v20 +; GFX11-NEXT: v_and_b32_e32 v9, 1, v9 ; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v49 ; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GFX11-NEXT: v_and_b32_e32 v22, 1, v22 +; GFX11-NEXT: v_and_b32_e32 v8, 1, v8 ; GFX11-NEXT: s_waitcnt vmcnt(20) -; GFX11-NEXT: v_cndmask_b32_e32 v10, v50, v51, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 +; GFX11-NEXT: v_cndmask_b32_e32 v20, v50, v51, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v18 ; GFX11-NEXT: v_and_b32_e32 v15, 1, v15 ; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v51 ; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v50 -; GFX11-NEXT: v_and_b32_e32 v20, 1, v20 +; GFX11-NEXT: v_and_b32_e32 v10, 1, v10 ; GFX11-NEXT: s_waitcnt vmcnt(18) -; GFX11-NEXT: v_cndmask_b32_e32 v12, v52, v53, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 -; GFX11-NEXT: v_and_b32_e32 v17, 1, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v18, v52, v53, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16 +; GFX11-NEXT: v_and_b32_e32 v13, 1, v13 ; 
GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v53 ; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v52 -; GFX11-NEXT: v_and_b32_e32 v18, 1, v18 +; GFX11-NEXT: v_and_b32_e32 v12, 1, v12 ; GFX11-NEXT: s_waitcnt vmcnt(16) -; GFX11-NEXT: v_cndmask_b32_e32 v14, v54, v55, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v16, v54, v55, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v55 ; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v54 -; GFX11-NEXT: v_and_b32_e32 v16, 1, v16 +; GFX11-NEXT: v_and_b32_e32 v14, 1, v14 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 ; GFX11-NEXT: s_waitcnt vmcnt(14) -; GFX11-NEXT: v_dual_cndmask_b32 v16, v64, v65 :: v_dual_and_b32 v11, 1, v11 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v18 -; GFX11-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX11-NEXT: v_dual_cndmask_b32 v14, v64, v65 :: v_dual_and_b32 v19, 1, v19 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 +; GFX11-NEXT: v_and_b32_e32 v17, 1, v17 ; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v65 ; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v64 ; GFX11-NEXT: s_waitcnt vmcnt(12) -; GFX11-NEXT: v_cndmask_b32_e32 v18, v66, v67, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v20 -; GFX11-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v66, v67, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 +; GFX11-NEXT: v_and_b32_e32 v23, 1, v23 ; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v67 ; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v66 ; GFX11-NEXT: s_waitcnt vmcnt(10) -; GFX11-NEXT: v_cndmask_b32_e32 v20, v68, v69, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22 -; GFX11-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v68, v69, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX11-NEXT: v_and_b32_e32 v21, 1, v21 ; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v69 ; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v68 ; GFX11-NEXT: s_waitcnt vmcnt(8) -; GFX11-NEXT: v_cndmask_b32_e32 v22, v70, v71, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24 -; GFX11-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v70, v71, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX11-NEXT: v_and_b32_e32 v27, 1, v27 ; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v71 ; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v70 ; GFX11-NEXT: s_waitcnt vmcnt(6) -; GFX11-NEXT: v_cndmask_b32_e32 v24, v80, v81, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26 -; GFX11-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v80, v81, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX11-NEXT: v_and_b32_e32 v25, 1, v25 ; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v81 ; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v80 ; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: v_cndmask_b32_e32 v26, v82, v83, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v82, v83, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 ; GFX11-NEXT: v_and_b32_e32 v31, 1, v31 ; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v83 ; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v82 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_cndmask_b32_e32 v28, v84, v85, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v30 -; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v84, v85, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v29, 1, v29 ; GFX11-NEXT: v_lshrrev_b32_e32 v85, 16, v85 ; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v84 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cndmask_b32_e32 v30, v86, 
v87, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v86, v87, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v31 ; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v87 ; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v86 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v32, v33, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v29 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v34, v35, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v27 +; GFX11-NEXT: v_cndmask_b32_e32 v27, v36, v37, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v25 +; GFX11-NEXT: v_cndmask_b32_e32 v25, v38, v39, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v21 +; GFX11-NEXT: v_cndmask_b32_e32 v21, v50, v51, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v52, v53, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v54, v55, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v15, v64, v65, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v66, v67, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v68, v69, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v80, v81, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v34, v35, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v3, v84, v85, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v86, v87, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 -; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v36, v37, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v38, v39, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v82, v83, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 +; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 ; GFX11-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v48, v49, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v50, v51, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 -; GFX11-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v52, v53, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_perm_b32 v6, v13, v12, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v15, v54, v55, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v17 +; GFX11-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v70, v71, vcc_lo +; GFX11-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 ; GFX11-NEXT: v_perm_b32 v7, v15, v14, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v17, v64, v65, vcc_lo -; GFX11-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 1, v19 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v8, v17, v16, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v19, v66, v67, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v21 -; GFX11-NEXT: v_perm_b32 v9, v19, v18, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v21, v68, v69, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v23 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_perm_b32 v10, v21, v20, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v23, v70, v71, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v25 ; GFX11-NEXT: v_perm_b32 v11, v23, v22, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v25, v80, v81, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v27 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 +; GFX11-NEXT: v_perm_b32 v8, v17, v16, 0x5040100 +; GFX11-NEXT: v_perm_b32 v9, v19, v18, 0x5040100 ; GFX11-NEXT: v_perm_b32 v12, v25, v24, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v27, v82, v83, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v29 ; GFX11-NEXT: v_perm_b32 v13, v27, v26, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v29, v84, v85, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v31 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_perm_b32 v14, v29, v28, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v86, v87, vcc_lo ; GFX11-NEXT: v_perm_b32 v15, v31, v30, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = select <32 x i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b @@ -30446,98 +29719,61 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> ; GFX8-LABEL: v_fma_v3bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_fma_f32 v0, v0, v2, v4 -; GFX8-NEXT: v_fma_f32 v6, v8, v7, v6 +; GFX8-NEXT: v_fma_f32 v3, v6, v5, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_fma_f32 v2, v6, v4, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_fma_f32 v1, v1, v3, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fma_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: 
v_fma_f32 v1, v1, v3, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_fma_f32 v6, v8, v7, v6 +; GFX9-NEXT: v_fma_f32 v3, v6, v5, v3 ; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v6, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX9-NEXT: v_fma_f32 v2, v6, v4, v2 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_fma_f32 v1, v1, v3, v4 -; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fma_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v7 ; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2 -; GFX10-NEXT: v_fmac_f32_e32 v9, v11, v10 ; GFX10-NEXT: v_fmac_f32_e32 v5, v1, v3 ; GFX10-NEXT: v_perm_b32 v0, v4, v6, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v5, v9, 0x7060302 +; GFX10-NEXT: v_alignbit_b32 v1, s4, v5, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_fma_v3bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_fmac_f32 v5, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_fmac_f32_e32 v4, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_fmac_f32 v9, v11, v10 :: v_dual_fmac_f32 v6, v8, v7 -; GFX11-NEXT: v_perm_b32 v1, v5, v9, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v4, v6, 0x7060302 -; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) ret <3 x bfloat> %op } @@ -30596,97 +29832,97 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ; GFX8-LABEL: v_fma_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_fma_f32 v0, v0, v2, v4 -; GFX8-NEXT: v_fma_f32 v6, v8, v7, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_fma_f32 v2, v6, v4, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_fma_f32 v1, v1, v3, v4 +; GFX8-NEXT: v_fma_f32 v6, v8, v7, v6 +; GFX8-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_fma_f32 v0, v0, v2, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX8-NEXT: v_fma_f32 v3, v7, v5, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fma_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_fma_f32 v6, v8, v7, v6 +; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_fma_f32 v6, v8, v7, v6 +; GFX9-NEXT: v_fma_f32 v3, v7, v5, v3 ; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v6, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX9-NEXT: v_fma_f32 v2, v6, v4, v2 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_fma_f32 v1, v1, v3, v4 -; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fma_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; GFX10-NEXT: 
v_lshlrev_b32_e32 v8, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v7 -; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2 ; GFX10-NEXT: v_fmac_f32_e32 v9, v11, v10 +; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2 ; GFX10-NEXT: v_fmac_f32_e32 v5, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v4, v6, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v5, v9, 0x7060302 +; GFX10-NEXT: v_perm_b32 v0, v4, v9, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v5, v6, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fma_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_fmac_f32 v9, v11, v10 :: v_dual_lshlrev_b32 v6, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; GFX11-NEXT: v_dual_fmac_f32 v4, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v3 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_fmac_f32 v5, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_fmac_f32_e32 v4, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_fmac_f32 v9, v11, v10 :: v_dual_fmac_f32 v6, v8, v7 -; GFX11-NEXT: v_perm_b32 v1, v5, v9, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v4, v6, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v4, v9, 0x7060302 +; GFX11-NEXT: v_fmac_f32_e32 v5, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f32_e32 v6, v8, v7 +; GFX11-NEXT: v_perm_b32 v1, v5, v6, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) ret <4 x bfloat> %op @@ -30948,133 +30184,79 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX8-LABEL: v_fmuladd_v3bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: 
v_lshlrev_b32_e32 v7, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; GFX8-NEXT: v_mul_f32_e32 v6, v7, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v4 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fmuladd_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; GFX9-NEXT: v_mul_f32_e32 v6, v7, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX9-NEXT: v_add_f32_e32 v6, v6, v7 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_perm_b32 v0, v0, v6, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: 
v_alignbit_b32 v1, s4, v1, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmuladd_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_mul_f32_e32 v6, v7, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; GFX10-NEXT: v_mul_f32_e32 v6, v8, v7 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX10-NEXT: v_mul_f32_e32 v3, v7, v6 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX10-NEXT: v_add_f32_e32 v3, v4, v6 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_fmuladd_v3bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX11-NEXT: v_dual_mul_f32 v6, v7, v6 :: v_dual_lshlrev_b32 v7, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_mul_f32_e32 v6, v8, v7 -; GFX11-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX11-NEXT: v_dual_add_f32 v0, v0, v3 :: v_dual_and_b32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_add_f32 v2, v2, v7 :: v_dual_and_b32 v5, 0xffff0000, v5 -; GFX11-NEXT: v_add_f32_e32 v3, v4, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_f32_e32 v1, v1, v5 -; 
GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 -; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) ret <3 x bfloat> %op } @@ -31149,132 +30331,131 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX8-LABEL: v_fmuladd_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; GFX8-NEXT: v_mul_f32_e32 v6, v7, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v4 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_mul_f32_e32 v2, v4, v2 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX8-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v5 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_add_f32_e32 v6, v6, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fmuladd_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; GFX9-NEXT: v_mul_f32_e32 v6, v7, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v4 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_mul_f32_e32 v2, v4, v2 ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: 
v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX9-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX9-NEXT: v_add_f32_e32 v6, v6, v7 +; GFX9-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_perm_b32 v0, v0, v6, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmuladd_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_mul_f32_e32 v6, v7, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; GFX10-NEXT: v_mul_f32_e32 v6, v8, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_mul_f32_e32 v3, v8, v7 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX10-NEXT: v_add_f32_e32 v3, v4, v6 +; GFX10-NEXT: v_add_f32_e32 v2, v6, v2 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 +; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fmuladd_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; 
GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mul_f32 v6, v7, v6 :: v_dual_and_b32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX11-NEXT: v_dual_mul_f32 v6, v7, v6 :: v_dual_lshlrev_b32 v7, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_mul_f32_e32 v6, v8, v7 -; GFX11-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v2, v6, v2 :: v_dual_mul_f32 v1, v1, v3 +; GFX11-NEXT: v_mul_f32_e32 v3, v8, v7 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX11-NEXT: v_dual_add_f32 v0, v0, v3 :: v_dual_and_b32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_add_f32 v2, v2, v7 :: v_dual_and_b32 v5, 0xffff0000, v5 -; GFX11-NEXT: v_add_f32_e32 v3, v4, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_and_b32 v3, 0xffff0000, v3 ; GFX11-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) ret <4 x bfloat> %op diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll index ef2be37d62d9d..160bae51193ea 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -3165,11 +3165,11 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; 
VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 @@ -3178,16 +3178,14 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v20, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v19, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v20, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16: @@ -3205,11 +3203,11 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3219,16 +3217,14 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v20, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v16, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v17, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v18, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v19, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v20, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16: @@ 
-3236,11 +3232,11 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x5 ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:16 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt vmcnt(5) @@ -3260,20 +3256,17 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: buffer_store_b32 v32, off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: buffer_store_b32 v33, off, s[0:3], 0 dlc +; GFX11-NEXT: buffer_store_b32 v34, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: buffer_store_b32 v34, off, s[0:3], 0 dlc +; GFX11-NEXT: buffer_store_b32 v35, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: buffer_store_b32 v35, off, s[0:3], 0 dlc +; GFX11-NEXT: buffer_store_b32 v36, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_store_b32 v36, off, s[0:3], 0 dlc +; GFX11-NEXT: buffer_store_b64 v[32:33], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll index ca3336beccf77..8dcd0f504e509 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll @@ -948,57 +948,52 @@ define <3 x i1> @isnan_v3bf16(<3 x bfloat> %x) nounwind { ; GFX8CHECK-LABEL: isnan_v3bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v2, 0x7fff -; GFX8CHECK-NEXT: v_and_b32_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8CHECK-NEXT: v_and_b32_e32 v2, 0x7fff, v1 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80 -; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX8CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX8CHECK-NEXT: v_bfe_u32 v1, v0, 16, 15 +; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v1 -; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v3 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v2 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX9CHECK-LABEL: isnan_v3bf16: ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fff -; GFX9CHECK-NEXT: v_and_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9CHECK-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80 -; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 +; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v3 ; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v1 +; GFX9CHECK-NEXT: v_cmp_gt_i16_sdwa s[4:5], v3, s4 src0_sel:WORD_1 src1_sel:DWORD ; GFX9CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v3 -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX10CHECK-LABEL: isnan_v3bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_mov_b32_e32 v2, 0x7fff -; GFX10CHECK-NEXT: v_and_b32_e32 v3, 0x7fff, v0 +; GFX10CHECK-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v0 +; GFX10CHECK-NEXT: v_mov_b32_e32 v3, 0x7f80 ; GFX10CHECK-NEXT: v_and_b32_e32 v4, 0x7fff, v1 -; GFX10CHECK-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v3 -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v2 -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_cmp_gt_i16_sdwa s4, v2, v3 src0_sel:WORD_1 src1_sel:DWORD +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v4 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX11CHECK-LABEL: isnan_v3bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; GFX11CHECK-NEXT: v_and_b32_e32 v3, 0x7fff, v1 -; GFX11CHECK-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v2 @@ -1032,18 +1027,17 @@ define <4 x i1> @isnan_v4bf16(<4 x bfloat> %x) nounwind { ; GFX8CHECK-LABEL: isnan_v4bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX8CHECK-NEXT: v_bfe_u32 v3, v1, 16, 15 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80 -; GFX8CHECK-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX8CHECK-NEXT: v_and_b32_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX8CHECK-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v1 +; GFX8CHECK-NEXT: v_bfe_u32 v1, v0, 16, 15 +; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v1 -; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v4 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; 
GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v2 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v3 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1051,56 +1045,49 @@ define <4 x i1> @isnan_v4bf16(<4 x bfloat> %x) nounwind { ; GFX9CHECK-LABEL: isnan_v4bf16: ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fff -; GFX9CHECK-NEXT: s_movk_i32 s5, 0x7f80 -; GFX9CHECK-NEXT: v_and_b32_sdwa v4, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX9CHECK-NEXT: v_and_b32_sdwa v3, v1, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s5, v0 +; GFX9CHECK-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v1 +; GFX9CHECK-NEXT: s_movk_i32 s6, 0x7f80 +; GFX9CHECK-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 +; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s6, v1 +; GFX9CHECK-NEXT: v_cmp_gt_i16_sdwa s[4:5], v1, s6 src0_sel:WORD_1 src1_sel:DWORD ; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s5, v1 +; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s6, v3 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GFX9CHECK-NEXT: v_cmp_gt_i16_sdwa s[4:5], v3, s6 src0_sel:WORD_1 src1_sel:DWORD ; GFX9CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s5, v4 -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s5, v3 -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX10CHECK-LABEL: isnan_v4bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10CHECK-NEXT: v_and_b32_e32 v3, 0x7fff, v0 -; GFX10CHECK-NEXT: v_mov_b32_e32 v2, 0x7fff -; GFX10CHECK-NEXT: v_and_b32_e32 v4, 0x7fff, v1 +; GFX10CHECK-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0 +; GFX10CHECK-NEXT: v_mov_b32_e32 v5, 0x7f80 +; GFX10CHECK-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v1 ; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v3 -; GFX10CHECK-NEXT: v_and_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10CHECK-NEXT: v_and_b32_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10CHECK-NEXT: v_cmp_gt_i16_sdwa s4, v3, v5 src0_sel:WORD_1 src1_sel:DWORD ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v4 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 +; GFX10CHECK-NEXT: v_cmp_gt_i16_sdwa s4, v4, v5 src0_sel:WORD_1 src1_sel:DWORD ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v1 -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v5 -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX11CHECK-LABEL: isnan_v4bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; GFX11CHECK-NEXT: v_and_b32_e32 v3, 0x7fff, v3 +; GFX11CHECK-NEXT: v_and_b32_e32 
v0, 0x7fff7fff, v0 +; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 ; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 -; GFX11CHECK-NEXT: v_and_b32_e32 v4, 0x7fff, v2 +; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v1 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v3 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v4 +; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v3 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] %1 = call <4 x i1> @llvm.is.fpclass.v4bf16(<4 x bfloat> %x, i32 3) ; nan diff --git a/llvm/test/CodeGen/AMDGPU/select-undef.ll b/llvm/test/CodeGen/AMDGPU/select-undef.ll index 8ca33dce73f88..5b9866a3c9157 100644 --- a/llvm/test/CodeGen/AMDGPU/select-undef.ll +++ b/llvm/test/CodeGen/AMDGPU/select-undef.ll @@ -303,9 +303,8 @@ ret: ret void } -; FIXME: This shouldn't have the 0 initialization ; GCN-LABEL: {{^}}undef_v3bf16: -; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} +; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}} ; GCN: s_cbranch_vccnz define amdgpu_kernel void @undef_v3bf16(ptr addrspace(3) %ptr, i1 %cond) { diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll index dfe98bbbaddf9..d76bb48b4a82a 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -2703,30 +2703,30 @@ define <4 x bfloat> @shuffle_v4bf16_234u(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-LABEL: shuffle_v4bf16_234u: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_234u: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_234u: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 -; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 @@ -2935,32 +2935,35 @@ define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-LABEL: shuffle_v4bf16_357u: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_alignbit_b32 v1, s4, v5, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v0, v4, v6, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v5, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_357u: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_alignbit_b32 v1, s4, v5, 16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_perm_b32 v0, v4, v6, 0x7060302 -; GFX10-NEXT: v_alignbit_b32 v1, s4, v5, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_357u: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:4 -; GFX11-NEXT: global_load_b64 v[0:1], v[2:3], off +; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_alignbit_b32 v1, s0, v3, 16 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 -; GFX11-NEXT: v_alignbit_b32 v1, s0, v1, 16 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 @@ -2972,24 +2975,29 @@ define <4 x bfloat> @shuffle_v4bf16_0101(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-LABEL: shuffle_v4bf16_0101: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_0101: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v0, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_0101: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 @@ -3029,31 +3037,39 @@ define <4 x bfloat> @shuffle_v4bf16_0145(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-LABEL: shuffle_v4bf16_0145: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 +; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; 
GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_bfi_b32 v0, s4, v4, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: v_bfi_b32 v1, s4, v5, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_0145: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1 +; GFX10-NEXT: ; kill: killed $vgpr2 killed $vgpr3 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v4, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_bfi_b32 v1, 0xffff, v5, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_0145: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v0, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v1, 0xffff, v1, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 @@ -3065,30 +3081,33 @@ define <4 x bfloat> @shuffle_v4bf16_0167(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-LABEL: shuffle_v4bf16_0167: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[0:1], off +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_bfi_b32 v0, s4, v5, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_0167: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 +; GFX10-NEXT: global_load_dwordx2 v[5:6], v[0:1], off +; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v5, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_0167: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v0, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 @@ -3102,7 +3121,9 @@ define <4 x bfloat> @shuffle_v4bf16_2301(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_bfi_b32 v1, 
s4, v1, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3111,6 +3132,7 @@ define <4 x bfloat> @shuffle_v4bf16_2301(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_bfi_b32 v1, 0xffff, v1, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3119,6 +3141,7 @@ define <4 x bfloat> @shuffle_v4bf16_2301(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v1, 0xffff, v1, v1 ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 @@ -3161,30 +3184,33 @@ define <4 x bfloat> @shuffle_v4bf16_2345(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-LABEL: shuffle_v4bf16_2345: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_bfi_b32 v1, s4, v5, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_2345: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX10-NEXT: global_load_dword v5, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_bfi_b32 v1, 0xffff, v5, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_2345: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 -; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_bfi_b32 v1, 0xffff, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 @@ -3233,33 +3259,39 @@ define <4 x bfloat> @shuffle_v4bf16_4501(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-LABEL: shuffle_v4bf16_4501: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v[2:3], off -; GFX9-NEXT: global_load_dword v5, v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 +; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_bfi_b32 v0, s4, v4, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: v_bfi_b32 v1, s4, v5, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_4501: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[2:3], off -; GFX10-NEXT: global_load_dword v5, v[0:1], off +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX10-NEXT: 
global_load_dwordx2 v[5:6], v[0:1], off +; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1 +; GFX10-NEXT: ; kill: killed $vgpr2 killed $vgpr3 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v4, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_bfi_b32 v1, 0xffff, v5, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_4501: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v[2:3], off -; GFX11-NEXT: global_load_b32 v1, v[0:1], off +; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off +; GFX11-NEXT: global_load_b64 v[3:4], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v1, 0xffff, v3, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 @@ -3271,32 +3303,33 @@ define <4 x bfloat> @shuffle_v4bf16_4523(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-LABEL: shuffle_v4bf16_4523: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v[2:3], off -; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_bfi_b32 v0, s4, v5, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_4523: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[2:3], off -; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v5, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_4523: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v[2:3], off +; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 @@ -3309,24 +3342,29 @@ define <4 x bfloat> @shuffle_v4bf16_4545(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-LABEL: shuffle_v4bf16_4545: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_4545: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v[2:3], off +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) +; 
GFX10-NEXT: v_bfi_b32 v0, 0xffff, v0, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_4545: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v[2:3], off +; GFX11-NEXT: global_load_b64 v[0:1], v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 @@ -3366,32 +3404,33 @@ define <4 x bfloat> @shuffle_v4bf16_6701(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-LABEL: shuffle_v4bf16_6701: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[0:1], off ; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 -; GFX9-NEXT: global_load_dword v5, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_bfi_b32 v1, s4, v5, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_6701: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[5:6], v[0:1], off ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 -; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_bfi_b32 v1, 0xffff, v5, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_6701: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4 -; GFX11-NEXT: global_load_b32 v1, v[0:1], off +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: global_load_b32 v0, v[2:3], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: v_bfi_b32 v1, 0xffff, v4, v4 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 @@ -3443,7 +3482,9 @@ define <4 x bfloat> @shuffle_v4bf16_6745(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3452,6 +3493,7 @@ define <4 x bfloat> @shuffle_v4bf16_6745(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[1:2], v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_bfi_b32 v1, 0xffff, v1, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3460,6 +3502,7 @@ define <4 x bfloat> @shuffle_v4bf16_6745(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v1, 0xffff, v1, v1 ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 @@ -3578,31 +3621,34 @@ define <4 x bfloat> @shuffle_v4bf16_3456(ptr addrspace(1) %arg0, ptr addrspace(1 
; GFX9-LABEL: shuffle_v4bf16_3456: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_alignbit_b32 v1, v5, v4, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_alignbit_b32 v0, v4, v6, 16 -; GFX9-NEXT: v_alignbit_b32 v1, v5, v4, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_3456: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v0, v4, v6, 16 -; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_3456: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 -; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_alignbit_b32 v1, v3, v2, 16 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GFX11-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GFX11-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 @@ -3693,7 +3739,7 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-LABEL: shuffle_v4bf16_0000: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 @@ -3703,7 +3749,7 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-LABEL: shuffle_v4bf16_0000: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040100 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -3712,7 +3758,7 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-LABEL: shuffle_v4bf16_0000: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x5040100 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3728,7 +3774,7 @@ define <4 x bfloat> @shuffle_v4bf16_1010(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-LABEL: shuffle_v4bf16_1010: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_alignbit_b32 v0, v0, v0, 16 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 @@ -3737,7 +3783,7 @@ define <4 x bfloat> @shuffle_v4bf16_1010(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-LABEL: shuffle_v4bf16_1010: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -3746,7 +3792,7 @@ define <4 x bfloat> @shuffle_v4bf16_1010(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-LABEL: shuffle_v4bf16_1010: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v0, v0, v0, 16 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3762,7 +3808,7 @@ define <4 x bfloat> @shuffle_v4bf16_1100(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-LABEL: shuffle_v4bf16_1100: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: s_mov_b32 s5, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3773,7 +3819,7 @@ define <4 x bfloat> @shuffle_v4bf16_1100(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-LABEL: shuffle_v4bf16_1100: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v[0:1], off +; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_perm_b32 v0, v1, v1, 0x7060302 ; GFX10-NEXT: v_perm_b32 v1, v1, v1, 0x5040100 @@ -3782,7 +3828,7 @@ define <4 x bfloat> @shuffle_v4bf16_1100(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-LABEL: shuffle_v4bf16_1100: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v[0:1], off +; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_perm_b32 v0, v1, v1, 0x7060302 ; GFX11-NEXT: v_perm_b32 v1, v1, v1, 0x5040100 @@ -3897,24 +3943,29 @@ define <4 x bfloat> @shuffle_v8bf16_0101(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-LABEL: shuffle_v8bf16_0101: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v8bf16_0101: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v0, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v8bf16_0101: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0 @@ -3954,31 +4005,39 @@ define <4 x bfloat> @shuffle_v8bf16_4589(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-LABEL: shuffle_v8bf16_4589: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:8 -; GFX9-NEXT: 
global_load_dword v5, v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:8 +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 +; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_bfi_b32 v0, s4, v4, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: v_bfi_b32 v1, s4, v5, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v8bf16_4589: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:8 -; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:8 +; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1 +; GFX10-NEXT: ; kill: killed $vgpr2 killed $vgpr3 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v4, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_bfi_b32 v1, 0xffff, v5, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v8bf16_4589: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:8 -; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:8 +; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v0, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v1, 0xffff, v1, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0 %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1 @@ -4067,9 +4126,11 @@ define <4 x bfloat> @shuffle_v3bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_mov_b32 s5, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v0 +; GFX9-NEXT: v_perm_b32 v1, v1, v1, s5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v3bf16_0122: @@ -4077,6 +4138,7 @@ define <4 x bfloat> @shuffle_v3bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v0, v0 ; GFX10-NEXT: v_perm_b32 v1, v1, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4085,6 +4147,7 @@ define <4 x bfloat> @shuffle_v3bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v0, v0 ; GFX11-NEXT: v_perm_b32 v1, v1, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <3 x bfloat>, ptr addrspace(1) %arg0 @@ -4190,27 +4253,27 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 
0xffff0000, v1 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_fma_f32 v7, v8, v9, v7 ; GFX9-NEXT: v_fma_f32 v0, v8, v4, v0 -; GFX9-NEXT: v_fma_f32 v4, v11, v4, v12 -; GFX9-NEXT: v_fma_f32 v1, v11, v9, v1 +; GFX9-NEXT: v_fma_f32 v8, v12, v9, v11 +; GFX9-NEXT: v_fma_f32 v1, v12, v4, v1 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 ; GFX9-NEXT: v_fma_f32 v0, v2, v10, v0 -; GFX9-NEXT: v_fma_f32 v2, v2, v5, v7 -; GFX9-NEXT: v_fma_f32 v1, v3, v5, v1 -; GFX9-NEXT: v_fma_f32 v3, v3, v10, v4 +; GFX9-NEXT: v_fma_f32 v2, v2, v5, v4 +; GFX9-NEXT: v_fma_f32 v1, v3, v10, v1 +; GFX9-NEXT: v_fma_f32 v3, v3, v5, v7 +; GFX9-NEXT: v_perm_b32 v1, v3, v1, s0 ; GFX9-NEXT: v_perm_b32 v0, v2, v0, s0 -; GFX9-NEXT: v_perm_b32 v1, v1, v3, s0 ; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -4233,27 +4296,27 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_fmac_f32_e32 v7, v8, v9 ; GFX10-NEXT: v_fmac_f32_e32 v0, v8, v4 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_fmac_f32_e32 v12, v11, v9 -; GFX10-NEXT: v_fmac_f32_e32 v1, v11, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX10-NEXT: v_fmac_f32_e32 v11, v10, v4 +; GFX10-NEXT: v_fmac_f32_e32 v1, v10, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v5 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v12 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v11 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_fmac_f32_e32 v0, v2, v10 -; GFX10-NEXT: v_fmac_f32_e32 v4, v2, v5 -; GFX10-NEXT: v_fmac_f32_e32 v7, v3, v5 -; GFX10-NEXT: v_fmac_f32_e32 v1, v3, v10 -; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v7, v1, 0x7060302 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_fmac_f32_e32 v0, v2, v12 +; GFX10-NEXT: v_fmac_f32_e32 v4, v3, v12 +; GFX10-NEXT: v_fmac_f32_e32 v1, v3, v5 +; GFX10-NEXT: v_fmac_f32_e32 v7, v2, v5 +; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX10-NEXT: v_perm_b32 v0, v7, v0, 0x7060302 ; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; @@ -4269,36 +4332,36 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX11-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX11-NEXT: global_load_b64 v[4:5], v6, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v1 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: 
v_lshlrev_b32_e32 v11, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_fmac_f32 v1, v11, v4 :: v_dual_and_b32 v2, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f32_e32 v1, v10, v9 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_fmac_f32 v1, v3, v10 :: v_dual_fmac_f32 v0, v8, v4 +; GFX11-NEXT: v_dual_fmac_f32 v1, v3, v5 :: v_dual_lshlrev_b32 v4, 16, v4 +; GFX11-NEXT: v_dual_fmac_f32 v11, v10, v4 :: v_dual_lshlrev_b32 v8, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_fmac_f32 v7, v8, v9 :: v_dual_and_b32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_fmac_f32_e32 v0, v8, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v11 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fmac_f32_e32 v0, v2, v10 -; GFX11-NEXT: v_fmac_f32_e32 v12, v11, v9 -; GFX11-NEXT: v_fmac_f32_e32 v7, v8, v9 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_fmac_f32 v4, v2, v5 :: v_dual_and_b32 v7, 0xffff0000, v12 -; GFX11-NEXT: v_fmac_f32_e32 v7, v3, v5 +; GFX11-NEXT: v_dual_fmac_f32 v4, v3, v12 :: v_dual_fmac_f32 v7, v2, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v4, v0, 0x7060302 -; GFX11-NEXT: v_perm_b32 v1, v7, v1, 0x7060302 +; GFX11-NEXT: v_fmac_f32_e32 v0, v2, v12 +; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v7, v0, 0x7060302 ; GFX11-NEXT: global_store_b64 v6, v[0:1], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -4336,28 +4399,32 @@ define <4 x bfloat> @shuffle_v4bf16_0456(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-LABEL: shuffle_v4bf16_0456: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v6, v[0:1], off -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 +; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: 
v_perm_b32 v0, v4, v6, s4 -; GFX9-NEXT: v_alignbit_b32 v1, v5, v4, 16 +; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4 +; GFX9-NEXT: v_alignbit_b32 v1, v6, v5, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_0456: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v6, v[0:1], off -; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1 +; GFX10-NEXT: ; kill: killed $vgpr2 killed $vgpr3 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v4, v6, 0x5040100 -; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16 +; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040100 +; GFX10-NEXT: v_alignbit_b32 v1, v6, v5, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_0456: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 From 597086c60959dd5b3c032552e8b42dd1d053f233 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 5 Jan 2024 08:44:19 +0700 Subject: [PATCH 302/313] DAG: Implement promotion for strict_fp_round (#74332) Needs an AMDGPU hack to get the selection to work. The ordinary variant is custom lowered through an almost equivalent target node that would need a strict variant for additional known bits optimizations. --- .../include/llvm/Target/TargetSelectionDAG.td | 13 ++++ .../SelectionDAG/LegalizeFloatTypes.cpp | 46 ++++++++++++- .../SelectionDAG/LegalizeIntegerTypes.cpp | 15 ++++- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 2 + llvm/lib/Target/AMDGPU/SIInstructions.td | 9 ++- llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll | 67 +++++++++++++++++++ 6 files changed, 149 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index d4fc9d8a96db2..22360353790db 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -614,6 +614,12 @@ def strict_sint_to_fp : SDNode<"ISD::STRICT_SINT_TO_FP", SDTIntToFPOp, [SDNPHasChain]>; def strict_uint_to_fp : SDNode<"ISD::STRICT_UINT_TO_FP", SDTIntToFPOp, [SDNPHasChain]>; + +def strict_f16_to_fp : SDNode<"ISD::STRICT_FP16_TO_FP", + SDTIntToFPOp, [SDNPHasChain]>; +def strict_fp_to_f16 : SDNode<"ISD::STRICT_FP_TO_FP16", + SDTFPToIntOp, [SDNPHasChain]>; + def strict_fsetcc : SDNode<"ISD::STRICT_FSETCC", SDTSetCC, [SDNPHasChain]>; def strict_fsetccs : SDNode<"ISD::STRICT_FSETCCS", SDTSetCC, [SDNPHasChain]>; @@ -1576,6 +1582,13 @@ def any_fsetccs : PatFrags<(ops node:$lhs, node:$rhs, node:$pred), [(strict_fsetccs node:$lhs, node:$rhs, node:$pred), (setcc node:$lhs, node:$rhs, node:$pred)]>; +def any_f16_to_fp : PatFrags<(ops node:$src), + [(f16_to_fp node:$src), + (strict_f16_to_fp node:$src)]>; +def any_fp_to_f16 : PatFrags<(ops node:$src), + [(fp_to_f16 node:$src), + (strict_fp_to_f16 node:$src)]>; + multiclass binary_atomic_op_ord { def NAME#_monotonic : PatFrag<(ops node:$ptr, node:$val), (!cast(NAME) node:$ptr, node:$val)> { diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 65919a64b8065..6e0e1e23419be 100644 --- 
a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -2181,6 +2181,24 @@ static ISD::NodeType GetPromotionOpcode(EVT OpVT, EVT RetVT) { report_fatal_error("Attempt at an invalid promotion-related conversion"); } +static ISD::NodeType GetPromotionOpcodeStrict(EVT OpVT, EVT RetVT) { + if (OpVT == MVT::f16) + return ISD::STRICT_FP16_TO_FP; + + if (RetVT == MVT::f16) + return ISD::STRICT_FP_TO_FP16; + + if (OpVT == MVT::bf16) { + // TODO: return ISD::STRICT_BF16_TO_FP; + } + + if (RetVT == MVT::bf16) { + // TODO: return ISD::STRICT_FP_TO_BF16; + } + + report_fatal_error("Attempt at an invalid promotion-related conversion"); +} + bool DAGTypeLegalizer::PromoteFloatOperand(SDNode *N, unsigned OpNo) { LLVM_DEBUG(dbgs() << "Promote float operand " << OpNo << ": "; N->dump(&DAG)); SDValue R = SDValue(); @@ -2281,7 +2299,7 @@ SDValue DAGTypeLegalizer::PromoteFloatOp_FP_EXTEND(SDNode *N, unsigned OpNo) { SDValue DAGTypeLegalizer::PromoteFloatOp_STRICT_FP_EXTEND(SDNode *N, unsigned OpNo) { - assert(OpNo == 1); + assert(OpNo == 1 && "Promoting unpromotable operand"); SDValue Op = GetPromotedFloat(N->getOperand(1)); EVT VT = N->getValueType(0); @@ -2416,6 +2434,9 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) { case ISD::FFREXP: R = PromoteFloatRes_FFREXP(N); break; case ISD::FP_ROUND: R = PromoteFloatRes_FP_ROUND(N); break; + case ISD::STRICT_FP_ROUND: + R = PromoteFloatRes_STRICT_FP_ROUND(N); + break; case ISD::LOAD: R = PromoteFloatRes_LOAD(N); break; case ISD::SELECT: R = PromoteFloatRes_SELECT(N); break; case ISD::SELECT_CC: R = PromoteFloatRes_SELECT_CC(N); break; @@ -2621,6 +2642,29 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_FP_ROUND(SDNode *N) { return DAG.getNode(GetPromotionOpcode(VT, NVT), DL, NVT, Round); } +// Explicit operation to reduce precision. Reduce the value to half precision +// and promote it back to the legal type. 
+SDValue DAGTypeLegalizer::PromoteFloatRes_STRICT_FP_ROUND(SDNode *N) { + SDLoc DL(N); + + SDValue Chain = N->getOperand(0); + SDValue Op = N->getOperand(1); + EVT VT = N->getValueType(0); + EVT OpVT = Op->getValueType(0); + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + EVT IVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); + + // Round promoted float to desired precision + SDValue Round = DAG.getNode(GetPromotionOpcodeStrict(OpVT, VT), DL, + DAG.getVTList(IVT, MVT::Other), Chain, Op); + // Promote it back to the legal output type + SDValue Res = + DAG.getNode(GetPromotionOpcodeStrict(VT, NVT), DL, + DAG.getVTList(NVT, MVT::Other), Round.getValue(1), Round); + ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); + return Res; +} + SDValue DAGTypeLegalizer::PromoteFloatRes_LOAD(SDNode *N) { LoadSDNode *L = cast(N); EVT VT = N->getValueType(0); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 3d21bd22e6ef5..92598d8856193 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -165,7 +165,9 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::FP_TO_FP16: Res = PromoteIntRes_FP_TO_FP16_BF16(N); break; - + case ISD::STRICT_FP_TO_FP16: + Res = PromoteIntRes_STRICT_FP_TO_FP16_BF16(N); + break; case ISD::GET_ROUNDING: Res = PromoteIntRes_GET_ROUNDING(N); break; case ISD::AND: @@ -787,6 +789,16 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_FP16_BF16(SDNode *N) { return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0)); } +SDValue DAGTypeLegalizer::PromoteIntRes_STRICT_FP_TO_FP16_BF16(SDNode *N) { + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + SDLoc dl(N); + + SDValue Res = DAG.getNode(N->getOpcode(), dl, DAG.getVTList(NVT, MVT::Other), + N->getOperand(0), N->getOperand(1)); + ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); + return Res; +} + SDValue DAGTypeLegalizer::PromoteIntRes_XRINT(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); @@ -1804,6 +1816,7 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::FP16_TO_FP: case ISD::VP_UINT_TO_FP: case ISD::UINT_TO_FP: Res = PromoteIntOp_UINT_TO_FP(N); break; + case ISD::STRICT_FP16_TO_FP: case ISD::STRICT_UINT_TO_FP: Res = PromoteIntOp_STRICT_UINT_TO_FP(N); break; case ISD::ZERO_EXTEND: Res = PromoteIntOp_ZERO_EXTEND(N); break; case ISD::VP_ZERO_EXTEND: Res = PromoteIntOp_VP_ZERO_EXTEND(N); break; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 84b1b2c71fd0b..09f0bca8b8611 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -326,6 +326,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntRes_FP_TO_XINT(SDNode *N); SDValue PromoteIntRes_FP_TO_XINT_SAT(SDNode *N); SDValue PromoteIntRes_FP_TO_FP16_BF16(SDNode *N); + SDValue PromoteIntRes_STRICT_FP_TO_FP16_BF16(SDNode *N); SDValue PromoteIntRes_XRINT(SDNode *N); SDValue PromoteIntRes_FREEZE(SDNode *N); SDValue PromoteIntRes_INT_EXTEND(SDNode *N); @@ -699,6 +700,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteFloatRes_ExpOp(SDNode *N); SDValue PromoteFloatRes_FFREXP(SDNode *N); SDValue PromoteFloatRes_FP_ROUND(SDNode *N); + SDValue PromoteFloatRes_STRICT_FP_ROUND(SDNode *N); SDValue 
PromoteFloatRes_LOAD(SDNode *N); SDValue PromoteFloatRes_SELECT(SDNode *N); SDValue PromoteFloatRes_SELECT_CC(SDNode *N); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index e8c4d805dbba9..1158d16baa1ba 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1097,7 +1097,7 @@ def : Pat < multiclass f16_fp_Pats { // f16_to_fp patterns def : GCNPat < - (f32 (f16_to_fp i32:$src0)), + (f32 (any_f16_to_fp i32:$src0)), (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src0) >; @@ -1151,6 +1151,13 @@ multiclass f16_fp_Pats; + + // This is only used on targets without half support + // TODO: Introduce strict variant of AMDGPUfp_to_f16 and share custom lowering + def : GCNPat < + (i32 (strict_fp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), + (cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0) + >; } let SubtargetPredicate = NotHasTrue16BitInsts in diff --git a/llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll b/llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll index 0339fca4d56cf..8a3647a9b6e93 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll @@ -3,6 +3,9 @@ declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata) #0 declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata) #0 +declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata) #0 +declare <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float>, metadata, metadata) #0 +declare float @llvm.fabs.f32(float) define float @v_constrained_fpext_f16_to_f32(ptr addrspace(1) %ptr) #0 { ; GFX7-LABEL: v_constrained_fpext_f16_to_f32: @@ -40,4 +43,68 @@ define <2 x float> @v_constrained_fpext_v2f16_to_v2f32(ptr addrspace(1) %ptr) #0 ret <2 x float> %result } +define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict(float %arg, ptr addrspace(1) %ptr) #0 { +; GFX7-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret void +} + +define void @v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict(<2 x float> %arg, ptr addrspace(1) %ptr) #0 { +; GFX7-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + store <2 x half> %result, ptr addrspace(1) %ptr + ret void +} + +define 
void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict_fneg(float %arg, ptr addrspace(1) %ptr) #0 { +; GFX7-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_fneg: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %neg.arg = fneg float %arg + %result = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %neg.arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret void +} + +define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict_fabs(float %arg, ptr addrspace(1) %ptr) #0 { +; GFX7-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_fabs: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e64 v0, |v0| +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %abs.arg = call float @llvm.fabs.f32(float %arg) + %result = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %abs.arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret void +} + attributes #0 = { strictfp } From 2cf420d5b846a4733ef0ef7c8ed0ae0bfd1c6772 Mon Sep 17 00:00:00 2001 From: wanglei Date: Tue, 2 Jan 2024 10:55:02 +0800 Subject: [PATCH 303/313] [LoongArch] Emit function call code sequence as `PCADDU18I+JIRL` in medium code model According to the description of the psABI v2.20: https://github.com/loongson/la-abi-specs/releases/tag/v2.20, adjustments are made to the function call instructions under the medium code model. At the same time, AsmParser has already supported parsing the call36 and tail36 macro instructions. --- .../AsmParser/LoongArchAsmParser.cpp | 61 +++++++++++++++++++ .../LoongArch/LoongArchExpandPseudoInsts.cpp | 29 ++++----- .../Target/LoongArch/LoongArchInstrInfo.td | 23 ++++++- .../Target/LoongArch/LoongArchMCInstLower.cpp | 3 + .../LoongArch/LoongArchTargetMachine.cpp | 4 +- .../MCTargetDesc/LoongArchBaseInfo.h | 1 + .../MCTargetDesc/LoongArchELFObjectWriter.cpp | 2 + .../MCTargetDesc/LoongArchFixupKinds.h | 5 +- .../MCTargetDesc/LoongArchMCCodeEmitter.cpp | 3 + .../MCTargetDesc/LoongArchMCExpr.cpp | 3 + .../LoongArch/MCTargetDesc/LoongArchMCExpr.h | 1 + llvm/test/CodeGen/LoongArch/code-models.ll | 12 ++-- .../MC/LoongArch/Basic/Integer/invalid64.s | 2 +- llvm/test/MC/LoongArch/Macros/macros-call.s | 9 +++ .../MC/LoongArch/Relocations/relocations.s | 5 ++ 15 files changed, 134 insertions(+), 29 deletions(-) create mode 100644 llvm/test/MC/LoongArch/Macros/macros-call.s diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp index 66a37fce5dda1..46f63a4103f9f 100644 --- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp +++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp @@ -121,6 +121,10 @@ class LoongArchAsmParser : public MCTargetAsmParser { // Helper to emit pseudo instruction "li.w/d $rd, $imm". void emitLoadImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out); + // Helper to emit pseudo instruction "call36 sym" or "tail36 $rj, sym". 
+ void emitFuncCall36(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + bool IsTailCall); + public: enum LoongArchMatchResultTy { Match_Dummy = FIRST_TARGET_MATCH_RESULT_TY, @@ -400,6 +404,22 @@ class LoongArchOperand : public MCParsedAsmOperand { IsValidKind; } + bool isSImm20pcaddu18i() const { + if (!isImm()) + return false; + + int64_t Imm; + LoongArchMCExpr::VariantKind VK = LoongArchMCExpr::VK_LoongArch_None; + bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); + bool IsValidKind = VK == LoongArchMCExpr::VK_LoongArch_None || + VK == LoongArchMCExpr::VK_LoongArch_CALL36; + + return IsConstantImm + ? isInt<20>(Imm) && IsValidKind + : LoongArchAsmParser::classifySymbolRef(getImm(), VK) && + IsValidKind; + } + bool isSImm21lsl2() const { if (!isImm()) return false; @@ -1110,6 +1130,35 @@ void LoongArchAsmParser::emitLoadImm(MCInst &Inst, SMLoc IDLoc, } } +void LoongArchAsmParser::emitFuncCall36(MCInst &Inst, SMLoc IDLoc, + MCStreamer &Out, bool IsTailCall) { + // call36 sym + // expands to: + // pcaddu18i $ra, %call36(sym) + // jirl $ra, $ra, 0 + // + // tail36 $rj, sym + // expands to: + // pcaddu18i $rj, %call36(sym) + // jirl $r0, $rj, 0 + unsigned ScratchReg = + IsTailCall ? Inst.getOperand(0).getReg() : (unsigned)LoongArch::R1; + const MCExpr *Sym = + IsTailCall ? Inst.getOperand(1).getExpr() : Inst.getOperand(0).getExpr(); + const LoongArchMCExpr *LE = LoongArchMCExpr::create( + Sym, llvm::LoongArchMCExpr::VK_LoongArch_CALL36, getContext()); + + Out.emitInstruction( + MCInstBuilder(LoongArch::PCADDU18I).addReg(ScratchReg).addExpr(LE), + getSTI()); + Out.emitInstruction( + MCInstBuilder(LoongArch::JIRL) + .addReg(IsTailCall ? (unsigned)LoongArch::R0 : ScratchReg) + .addReg(ScratchReg) + .addImm(0), + getSTI()); +} + bool LoongArchAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, OperandVector &Operands, MCStreamer &Out) { @@ -1158,6 +1207,12 @@ bool LoongArchAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, case LoongArch::PseudoLI_D: emitLoadImm(Inst, IDLoc, Out); return false; + case LoongArch::PseudoCALL36: + emitFuncCall36(Inst, IDLoc, Out, /*IsTailCall=*/false); + return false; + case LoongArch::PseudoTAIL36: + emitFuncCall36(Inst, IDLoc, Out, /*IsTailCall=*/true); + return false; } Out.emitInstruction(Inst, getSTI()); return false; @@ -1439,6 +1494,12 @@ bool LoongArchAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, /*Upper=*/(1 << 19) - 1, "operand must be a symbol with modifier (e.g. %pc_hi20) or an integer " "in the range"); + case Match_InvalidSImm20pcaddu18i: + return generateImmOutOfRangeError( + Operands, ErrorInfo, /*Lower=*/-(1 << 19), + /*Upper=*/(1 << 19) - 1, + "operand must be a symbol with modifier (e.g. 
%call36) or an integer " + "in the range"); case Match_InvalidSImm21lsl2: return generateImmOutOfRangeError( Operands, ErrorInfo, /*Lower=*/-(1 << 22), /*Upper=*/(1 << 22) - 4, diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp index 72c1f1cec1983..8eda2dcc1633e 100644 --- a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp @@ -458,11 +458,11 @@ bool LoongArchPreRAExpandPseudo::expandFunctionCALL( } case CodeModel::Medium: { // CALL: - // pcalau12i $ra, %pc_hi20(func) - // jirl $ra, $ra, %pc_lo12(func) + // pcaddu18i $ra, %call36(func) + // jirl $ra, $ra, 0 // TAIL: - // pcalau12i $scratch, %pc_hi20(func) - // jirl $r0, $scratch, %pc_lo12(func) + // pcaddu18i $scratch, %call36(func) + // jirl $r0, $scratch, 0 Opcode = IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL; Register ScratchReg = @@ -470,18 +470,15 @@ bool LoongArchPreRAExpandPseudo::expandFunctionCALL( ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass) : LoongArch::R1; MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), ScratchReg); - CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(ScratchReg); - if (Func.isSymbol()) { - const char *FnName = Func.getSymbolName(); - MIB.addExternalSymbol(FnName, LoongArchII::MO_PCREL_HI); - CALL.addExternalSymbol(FnName, LoongArchII::MO_PCREL_LO); - break; - } - assert(Func.isGlobal() && "Expected a GlobalValue at this time"); - const GlobalValue *GV = Func.getGlobal(); - MIB.addGlobalAddress(GV, 0, LoongArchII::MO_PCREL_HI); - CALL.addGlobalAddress(GV, 0, LoongArchII::MO_PCREL_LO); + BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCADDU18I), ScratchReg); + + CALL = + BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(ScratchReg).addImm(0); + + if (Func.isSymbol()) + MIB.addExternalSymbol(Func.getSymbolName(), LoongArchII::MO_CALL36); + else + MIB.addDisp(Func, 0, LoongArchII::MO_CALL36); break; } case CodeModel::Large: { diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td index 2fea0f33e9eb4..21555f8856300 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td @@ -377,6 +377,10 @@ def simm20_lu32id : SImm20Operand { let ParserMatchClass = SImmAsmOperand<20, "lu32id">; } +def simm20_pcaddu18i : SImm20Operand { + let ParserMatchClass = SImmAsmOperand<20, "pcaddu18i">; +} + def simm21_lsl2 : Operand { let ParserMatchClass = SImmAsmOperand<21, "lsl2">; let EncoderMethod = "getImmOpValueAsr<2>"; @@ -832,7 +836,7 @@ def LU32I_D : Fmt1RI20<0x16000000, (outs GPR:$dst), "$rd, $imm20">; } def LU52I_D : ALU_2RI12<0x03000000, simm12_lu52id>; -def PCADDU18I : ALU_1RI20<0x1e000000, simm20>; +def PCADDU18I : ALU_1RI20<0x1e000000, simm20_pcaddu18i>; def MUL_D : ALU_3R<0x001d8000>; def MULH_D : ALU_3R<0x001e0000>; def MULH_DU : ALU_3R<0x001e8000>; @@ -1396,7 +1400,7 @@ def : Pat<(brind (add GPR:$rj, simm16_lsl2:$imm16)), (PseudoBRIND GPR:$rj, simm16_lsl2:$imm16)>; let isCall = 1, Defs = [R1] in -def PseudoCALL : Pseudo<(outs), (ins simm26_symbol:$func)>; +def PseudoCALL : Pseudo<(outs), (ins bare_symbol:$func)>; def : Pat<(loongarch_call tglobaladdr:$func), (PseudoCALL tglobaladdr:$func)>; def : Pat<(loongarch_call texternalsym:$func), (PseudoCALL texternalsym:$func)>; @@ -1416,7 +1420,7 @@ def PseudoRET : Pseudo<(outs), (ins), [(loongarch_ret)]>, PseudoInstExpansion<(JIRL R0, R1, 0)>; 
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3] in -def PseudoTAIL : Pseudo<(outs), (ins simm26_symbol:$dst)>; +def PseudoTAIL : Pseudo<(outs), (ins bare_symbol:$dst)>; def : Pat<(loongarch_tail (iPTR tglobaladdr:$dst)), (PseudoTAIL tglobaladdr:$dst)>; @@ -1439,6 +1443,19 @@ def PseudoJIRL_TAIL : Pseudo<(outs), (ins GPR:$rj, simm16_lsl2:$imm16)>, PseudoInstExpansion<(JIRL R0, GPR:$rj, simm16_lsl2:$imm16)>; +/// call36/taill36 macro instructions +let isCall = 1, isBarrier = 1, isCodeGenOnly = 0, isAsmParserOnly = 1, + Defs = [R1], Size = 8, hasSideEffects = 0, mayStore = 0, mayLoad = 0 in +def PseudoCALL36 : Pseudo<(outs), (ins bare_symbol:$dst), [], + "call36", "$dst">, + Requires<[IsLA64]>; +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3], + isCodeGenOnly = 0, isAsmParserOnly = 1, Size = 8, hasSideEffects = 0, + mayStore = 0, mayLoad = 0 in +def PseudoTAIL36 : Pseudo<(outs), (ins GPR:$tmp, bare_symbol:$dst), [], + "tail36", "$tmp, $dst">, + Requires<[IsLA64]>; + /// Load address (la*) macro instructions. // Define isCodeGenOnly = 0 to expose them to tablegened assembly parser. diff --git a/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp b/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp index 5daa9481c9072..98ad49f25e3f2 100644 --- a/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp @@ -95,6 +95,9 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym, case LoongArchII::MO_GD_PC_HI: Kind = LoongArchMCExpr::VK_LoongArch_TLS_GD_PC_HI20; break; + case LoongArchII::MO_CALL36: + Kind = LoongArchMCExpr::VK_LoongArch_CALL36; + break; // TODO: Handle more target-flags. } diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp index a5a4d78aceeef..62ae1dea00d6f 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp @@ -63,11 +63,11 @@ getEffectiveLoongArchCodeModel(const Triple &TT, switch (*CM) { case CodeModel::Small: - case CodeModel::Medium: return *CM; + case CodeModel::Medium: case CodeModel::Large: if (!TT.isArch64Bit()) - report_fatal_error("Large code model requires LA64"); + report_fatal_error("Medium/Large code model requires LA64"); return *CM; default: report_fatal_error( diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h index cee6dad1f095e..0692cb92b6944 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h @@ -47,6 +47,7 @@ enum { MO_IE_PC64_HI, MO_LD_PC_HI, MO_GD_PC_HI, + MO_CALL36 // TODO: Add more flags. }; } // end namespace LoongArchII diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp index fe19a4f2d3c86..1dec816f34733 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp @@ -90,6 +90,8 @@ unsigned LoongArchELFObjectWriter::getRelocType(MCContext &Ctx, return ELF::R_LARCH_TLS_LE64_LO20; case LoongArch::fixup_loongarch_tls_le64_hi12: return ELF::R_LARCH_TLS_LE64_HI12; + case LoongArch::fixup_loongarch_call36: + return ELF::R_LARCH_CALL36; // TODO: Handle more fixup-kinds. 
} } diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h index 178fa6e5262be..e827bae1f3e37 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h @@ -108,7 +108,10 @@ enum Fixups { // 20-bit fixup corresponding to %gd_hi20(foo) for instruction lu12i.w. fixup_loongarch_tls_gd_hi20, // Generate an R_LARCH_RELAX which indicates the linker may relax here. - fixup_loongarch_relax = FirstLiteralRelocationKind + ELF::R_LARCH_RELAX + fixup_loongarch_relax = FirstLiteralRelocationKind + ELF::R_LARCH_RELAX, + // 36-bit fixup corresponding to %call36(foo) for a pair instructions: + // pcaddu18i+jirl. + fixup_loongarch_call36 = FirstLiteralRelocationKind + ELF::R_LARCH_CALL36, }; } // end namespace LoongArch } // end namespace llvm diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp index d2ea062dc09a7..9ac0128f25172 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp @@ -241,6 +241,9 @@ LoongArchMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO, case LoongArchMCExpr::VK_LoongArch_TLS_GD_HI20: FixupKind = LoongArch::fixup_loongarch_tls_gd_hi20; break; + case LoongArchMCExpr::VK_LoongArch_CALL36: + FixupKind = LoongArch::fixup_loongarch_call36; + break; } } else if (Kind == MCExpr::SymbolRef && cast(Expr)->getKind() == diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp index 82c992b1cc8c4..8ca8876a19b93 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp @@ -138,6 +138,8 @@ StringRef LoongArchMCExpr::getVariantKindName(VariantKind Kind) { return "gd_pc_hi20"; case VK_LoongArch_TLS_GD_HI20: return "gd_hi20"; + case VK_LoongArch_CALL36: + return "call36"; } } @@ -180,6 +182,7 @@ LoongArchMCExpr::getVariantKindForName(StringRef name) { .Case("ld_hi20", VK_LoongArch_TLS_LD_HI20) .Case("gd_pc_hi20", VK_LoongArch_TLS_GD_PC_HI20) .Case("gd_hi20", VK_LoongArch_TLS_GD_HI20) + .Case("call36", VK_LoongArch_CALL36) .Default(VK_LoongArch_Invalid); } diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h index 93251f8241033..bd828116d7fa4 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h @@ -61,6 +61,7 @@ class LoongArchMCExpr : public MCTargetExpr { VK_LoongArch_TLS_LD_HI20, VK_LoongArch_TLS_GD_PC_HI20, VK_LoongArch_TLS_GD_HI20, + VK_LoongArch_CALL36, VK_LoongArch_Invalid // Must be the last item. 
}; diff --git a/llvm/test/CodeGen/LoongArch/code-models.ll b/llvm/test/CodeGen/LoongArch/code-models.ll index c610f645a06ae..7c6f46d5e9262 100644 --- a/llvm/test/CodeGen/LoongArch/code-models.ll +++ b/llvm/test/CodeGen/LoongArch/code-models.ll @@ -23,8 +23,8 @@ define i32 @call_globaladdress(i32 %a) nounwind { ; MEDIUM: # %bb.0: ; MEDIUM-NEXT: addi.d $sp, $sp, -16 ; MEDIUM-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill -; MEDIUM-NEXT: pcalau12i $ra, %pc_hi20(callee) -; MEDIUM-NEXT: jirl $ra, $ra, %pc_lo12(callee) +; MEDIUM-NEXT: pcaddu18i $ra, %call36(callee) +; MEDIUM-NEXT: jirl $ra, $ra, 0 ; MEDIUM-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload ; MEDIUM-NEXT: addi.d $sp, $sp, 16 ; MEDIUM-NEXT: ret @@ -68,8 +68,8 @@ define void @call_external_sym(ptr %dst) { ; MEDIUM-NEXT: .cfi_offset 1, -8 ; MEDIUM-NEXT: ori $a2, $zero, 1000 ; MEDIUM-NEXT: move $a1, $zero -; MEDIUM-NEXT: pcalau12i $ra, %pc_hi20(memset) -; MEDIUM-NEXT: jirl $ra, $ra, %pc_lo12(memset) +; MEDIUM-NEXT: pcaddu18i $ra, %call36(memset) +; MEDIUM-NEXT: jirl $ra, $ra, 0 ; MEDIUM-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload ; MEDIUM-NEXT: addi.d $sp, $sp, 16 ; MEDIUM-NEXT: ret @@ -105,8 +105,8 @@ define i32 @caller_tail(i32 %i) nounwind { ; ; MEDIUM-LABEL: caller_tail: ; MEDIUM: # %bb.0: # %entry -; MEDIUM-NEXT: pcalau12i $a1, %pc_hi20(callee_tail) -; MEDIUM-NEXT: jirl $zero, $a1, %pc_lo12(callee_tail) +; MEDIUM-NEXT: pcaddu18i $a1, %call36(callee_tail) +; MEDIUM-NEXT: jr $a1 ; ; LARGE-LABEL: caller_tail: ; LARGE: # %bb.0: # %entry diff --git a/llvm/test/MC/LoongArch/Basic/Integer/invalid64.s b/llvm/test/MC/LoongArch/Basic/Integer/invalid64.s index acddca9432a69..1c1c658ad440f 100644 --- a/llvm/test/MC/LoongArch/Basic/Integer/invalid64.s +++ b/llvm/test/MC/LoongArch/Basic/Integer/invalid64.s @@ -65,7 +65,7 @@ addu16i.d $a0, $a0, 32768 ## simm20 pcaddu18i $a0, 0x80000 -# CHECK: :[[#@LINE-1]]:16: error: immediate must be an integer in the range [-524288, 524287] +# CHECK: :[[#@LINE-1]]:16: error: operand must be a symbol with modifier (e.g. %call36) or an integer in the range [-524288, 524287] ## simm20_lu32id lu32i.d $a0, 0x80000 diff --git a/llvm/test/MC/LoongArch/Macros/macros-call.s b/llvm/test/MC/LoongArch/Macros/macros-call.s new file mode 100644 index 0000000000000..a648a39780381 --- /dev/null +++ b/llvm/test/MC/LoongArch/Macros/macros-call.s @@ -0,0 +1,9 @@ +# RUN: llvm-mc --triple=loongarch64 %s | FileCheck %s + +call36 sym_call +# CHECK: pcaddu18i $ra, %call36(sym_call) +# CHECK-NEXT: jirl $ra, $ra, 0 + +tail36 $t0, sym_tail +# CHECK: pcaddu18i $t0, %call36(sym_tail) +# CHECK-NEXT: jr $t0 diff --git a/llvm/test/MC/LoongArch/Relocations/relocations.s b/llvm/test/MC/LoongArch/Relocations/relocations.s index 042cc93470a1e..bec71e1038933 100644 --- a/llvm/test/MC/LoongArch/Relocations/relocations.s +++ b/llvm/test/MC/LoongArch/Relocations/relocations.s @@ -218,3 +218,8 @@ lu12i.w $t1, %gd_hi20(foo) # RELOC: R_LARCH_TLS_GD_HI20 foo 0x0 # INSTR: lu12i.w $t1, %gd_hi20(foo) # FIXUP: fixup A - offset: 0, value: %gd_hi20(foo), kind: FK_NONE + +pcaddu18i $t1, %call36(foo) +# RELOC: R_LARCH_CALL36 foo 0x0 +# INSTR: pcaddu18i $t1, %call36(foo) +# FIXUP: fixup A - offset: 0, value: %call36(foo), kind: FK_NONE From 3d6fc35b9071009c5ef37f879a12982c6a54db60 Mon Sep 17 00:00:00 2001 From: wanglei Date: Tue, 2 Jan 2024 10:57:40 +0800 Subject: [PATCH 304/313] [LoongArch] Pre-commit test for #76555. 
NFC --- .../LoongArch/psabi-restricted-scheduling.ll | 172 ++++++++++++++++++ 1 file changed, 172 insertions(+) create mode 100644 llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll diff --git a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll new file mode 100644 index 0000000000000..150a935d7bf82 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll @@ -0,0 +1,172 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc --mtriple=loongarch64 --code-model=medium --post-RA-scheduler=0 < %s \ +; RUN: | FileCheck %s --check-prefix=MEDIUM_NO_SCH +; RUN: llc --mtriple=loongarch64 --code-model=medium --post-RA-scheduler=1 < %s \ +; RUN: | FileCheck %s --check-prefix=MEDIUM_SCH +; RUN: llc --mtriple=loongarch64 --code-model=large --post-RA-scheduler=0 < %s \ +; RUN: | FileCheck %s --check-prefix=LARGE_NO_SCH +; RUN: llc --mtriple=loongarch64 --code-model=large --post-RA-scheduler=1 < %s \ +; RUN: | FileCheck %s --check-prefix=LARGE_SCH + +;; FIXME: According to the description of the psABI v2.30, the code sequences +;; of `PseudoLA*_LARGE` instruction and Medium code model's function call must +;; be adjacent. + +@g = dso_local global i64 zeroinitializer, align 4 +@G = global i64 zeroinitializer, align 4 +@gd = external thread_local global i64 +@ld = external thread_local(localdynamic) global i64 +@ie = external thread_local(initialexec) global i64 + +declare ptr @bar(i64) + +define void @foo() nounwind { +; MEDIUM_NO_SCH-LABEL: foo: +; MEDIUM_NO_SCH: # %bb.0: +; MEDIUM_NO_SCH-NEXT: addi.d $sp, $sp, -16 +; MEDIUM_NO_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; MEDIUM_NO_SCH-NEXT: pcalau12i $a0, %got_pc_hi20(G) +; MEDIUM_NO_SCH-NEXT: ld.d $a0, $a0, %got_pc_lo12(G) +; MEDIUM_NO_SCH-NEXT: ld.d $a0, $a0, 0 +; MEDIUM_NO_SCH-NEXT: pcalau12i $a0, %pc_hi20(g) +; MEDIUM_NO_SCH-NEXT: addi.d $a0, $a0, %pc_lo12(g) +; MEDIUM_NO_SCH-NEXT: ld.d $a0, $a0, 0 +; MEDIUM_NO_SCH-NEXT: ori $a0, $zero, 1 +; MEDIUM_NO_SCH-NEXT: pcaddu18i $ra, %call36(bar) +; MEDIUM_NO_SCH-NEXT: jirl $ra, $ra, 0 +; MEDIUM_NO_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(gd) +; MEDIUM_NO_SCH-NEXT: ld.d $a0, $a0, %ie_pc_lo12(gd) +; MEDIUM_NO_SCH-NEXT: ldx.d $a0, $a0, $tp +; MEDIUM_NO_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ld) +; MEDIUM_NO_SCH-NEXT: ld.d $a0, $a0, %ie_pc_lo12(ld) +; MEDIUM_NO_SCH-NEXT: ldx.d $a0, $a0, $tp +; MEDIUM_NO_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ie) +; MEDIUM_NO_SCH-NEXT: ld.d $a0, $a0, %ie_pc_lo12(ie) +; MEDIUM_NO_SCH-NEXT: ldx.d $a0, $a0, $tp +; MEDIUM_NO_SCH-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; MEDIUM_NO_SCH-NEXT: addi.d $sp, $sp, 16 +; MEDIUM_NO_SCH-NEXT: ret +; +; MEDIUM_SCH-LABEL: foo: +; MEDIUM_SCH: # %bb.0: +; MEDIUM_SCH-NEXT: addi.d $sp, $sp, -16 +; MEDIUM_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; MEDIUM_SCH-NEXT: pcalau12i $a0, %got_pc_hi20(G) +; MEDIUM_SCH-NEXT: pcaddu18i $ra, %call36(bar) +; MEDIUM_SCH-NEXT: ld.d $a0, $a0, %got_pc_lo12(G) +; MEDIUM_SCH-NEXT: ld.d $a0, $a0, 0 +; MEDIUM_SCH-NEXT: pcalau12i $a0, %pc_hi20(g) +; MEDIUM_SCH-NEXT: addi.d $a0, $a0, %pc_lo12(g) +; MEDIUM_SCH-NEXT: ld.d $a0, $a0, 0 +; MEDIUM_SCH-NEXT: ori $a0, $zero, 1 +; MEDIUM_SCH-NEXT: jirl $ra, $ra, 0 +; MEDIUM_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(gd) +; MEDIUM_SCH-NEXT: ld.d $a0, $a0, %ie_pc_lo12(gd) +; MEDIUM_SCH-NEXT: ldx.d $a0, $a0, $tp +; MEDIUM_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ld) +; MEDIUM_SCH-NEXT: ld.d $a0, $a0, %ie_pc_lo12(ld) +; 
MEDIUM_SCH-NEXT: ldx.d $a0, $a0, $tp +; MEDIUM_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ie) +; MEDIUM_SCH-NEXT: ld.d $a0, $a0, %ie_pc_lo12(ie) +; MEDIUM_SCH-NEXT: ldx.d $a0, $a0, $tp +; MEDIUM_SCH-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; MEDIUM_SCH-NEXT: addi.d $sp, $sp, 16 +; MEDIUM_SCH-NEXT: ret +; +; LARGE_NO_SCH-LABEL: foo: +; LARGE_NO_SCH: # %bb.0: +; LARGE_NO_SCH-NEXT: addi.d $sp, $sp, -16 +; LARGE_NO_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LARGE_NO_SCH-NEXT: pcalau12i $a0, %got_pc_hi20(G) +; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %got_pc_lo12(G) +; LARGE_NO_SCH-NEXT: lu32i.d $a1, %got64_pc_lo20(G) +; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(G) +; LARGE_NO_SCH-NEXT: ldx.d $a0, $a1, $a0 +; LARGE_NO_SCH-NEXT: ld.d $a0, $a0, 0 +; LARGE_NO_SCH-NEXT: pcalau12i $a0, %pc_hi20(g) +; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %pc_lo12(g) +; LARGE_NO_SCH-NEXT: lu32i.d $a1, %pc64_lo20(g) +; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g) +; LARGE_NO_SCH-NEXT: add.d $a0, $a1, $a0 +; LARGE_NO_SCH-NEXT: ld.d $a0, $a0, 0 +; LARGE_NO_SCH-NEXT: ori $a0, $zero, 1 +; LARGE_NO_SCH-NEXT: pcalau12i $a1, %got_pc_hi20(bar) +; LARGE_NO_SCH-NEXT: addi.d $ra, $zero, %got_pc_lo12(bar) +; LARGE_NO_SCH-NEXT: lu32i.d $ra, %got64_pc_lo20(bar) +; LARGE_NO_SCH-NEXT: lu52i.d $ra, $ra, %got64_pc_hi12(bar) +; LARGE_NO_SCH-NEXT: ldx.d $ra, $ra, $a1 +; LARGE_NO_SCH-NEXT: jirl $ra, $ra, 0 +; LARGE_NO_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(gd) +; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(gd) +; LARGE_NO_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(gd) +; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(gd) +; LARGE_NO_SCH-NEXT: ldx.d $a0, $a1, $a0 +; LARGE_NO_SCH-NEXT: ldx.d $a0, $a0, $tp +; LARGE_NO_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ld) +; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ld) +; LARGE_NO_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(ld) +; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ld) +; LARGE_NO_SCH-NEXT: ldx.d $a0, $a1, $a0 +; LARGE_NO_SCH-NEXT: ldx.d $a0, $a0, $tp +; LARGE_NO_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ie) +; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ie) +; LARGE_NO_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(ie) +; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ie) +; LARGE_NO_SCH-NEXT: ldx.d $a0, $a1, $a0 +; LARGE_NO_SCH-NEXT: ldx.d $a0, $a0, $tp +; LARGE_NO_SCH-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LARGE_NO_SCH-NEXT: addi.d $sp, $sp, 16 +; LARGE_NO_SCH-NEXT: ret +; +; LARGE_SCH-LABEL: foo: +; LARGE_SCH: # %bb.0: +; LARGE_SCH-NEXT: addi.d $sp, $sp, -16 +; LARGE_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LARGE_SCH-NEXT: addi.d $a1, $zero, %got_pc_lo12(G) +; LARGE_SCH-NEXT: pcalau12i $a0, %got_pc_hi20(G) +; LARGE_SCH-NEXT: addi.d $ra, $zero, %got_pc_lo12(bar) +; LARGE_SCH-NEXT: lu32i.d $a1, %got64_pc_lo20(G) +; LARGE_SCH-NEXT: lu32i.d $ra, %got64_pc_lo20(bar) +; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(G) +; LARGE_SCH-NEXT: lu52i.d $ra, $ra, %got64_pc_hi12(bar) +; LARGE_SCH-NEXT: ldx.d $a0, $a1, $a0 +; LARGE_SCH-NEXT: addi.d $a1, $zero, %pc_lo12(g) +; LARGE_SCH-NEXT: lu32i.d $a1, %pc64_lo20(g) +; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g) +; LARGE_SCH-NEXT: ld.d $a0, $a0, 0 +; LARGE_SCH-NEXT: pcalau12i $a0, %pc_hi20(g) +; LARGE_SCH-NEXT: add.d $a0, $a1, $a0 +; LARGE_SCH-NEXT: pcalau12i $a1, %got_pc_hi20(bar) +; LARGE_SCH-NEXT: ld.d $a0, $a0, 0 +; LARGE_SCH-NEXT: ldx.d $ra, $ra, $a1 +; LARGE_SCH-NEXT: ori $a0, $zero, 1 +; LARGE_SCH-NEXT: jirl $ra, $ra, 0 +; LARGE_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(gd) +; LARGE_SCH-NEXT: 
pcalau12i $a0, %ie_pc_hi20(gd) +; LARGE_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(gd) +; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(gd) +; LARGE_SCH-NEXT: ldx.d $a0, $a1, $a0 +; LARGE_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ld) +; LARGE_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(ld) +; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ld) +; LARGE_SCH-NEXT: ldx.d $a0, $a0, $tp +; LARGE_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ld) +; LARGE_SCH-NEXT: ldx.d $a0, $a1, $a0 +; LARGE_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ie) +; LARGE_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(ie) +; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ie) +; LARGE_SCH-NEXT: ldx.d $a0, $a0, $tp +; LARGE_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ie) +; LARGE_SCH-NEXT: ldx.d $a0, $a1, $a0 +; LARGE_SCH-NEXT: ldx.d $a0, $a0, $tp +; LARGE_SCH-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LARGE_SCH-NEXT: addi.d $sp, $sp, 16 +; LARGE_SCH-NEXT: ret + %V = load volatile i64, ptr @G + %v = load volatile i64, ptr @g + call void @bar(i64 1) + %v_gd = load volatile i64, ptr @gd + %v_ld = load volatile i64, ptr @ld + %v_ie = load volatile i64, ptr @ie + ret void +} From c56a5e895a96fec4292e9333d998cfa88770432a Mon Sep 17 00:00:00 2001 From: wanglei Date: Tue, 2 Jan 2024 10:57:15 +0800 Subject: [PATCH 305/313] [LoongArch] Reimplement the expansion of PseudoLA*_LARGE instructions (#76555) According to the description of the psABI v2.30: https://github.com/loongson/la-abi-specs/releases/tag/v2.30, moved the expansion of relevant pseudo-instructions from `LoongArchPreRAExpandPseudo` pass to `LoongArchExpandPseudo` pass, to ensure that the code sequences of `PseudoLA*_LARGE` instructions and Medium code model's function call are not scheduled. --- .../LoongArch/LoongArchExpandPseudoInsts.cpp | 519 +++++++++--------- .../LoongArch/LoongArchISelLowering.cpp | 24 +- .../Target/LoongArch/LoongArchISelLowering.h | 4 + .../Target/LoongArch/LoongArchInstrInfo.td | 83 ++- llvm/test/CodeGen/LoongArch/code-models.ll | 36 +- llvm/test/CodeGen/LoongArch/expand-call.ll | 2 +- llvm/test/CodeGen/LoongArch/global-address.ll | 32 +- .../LoongArch/psabi-restricted-scheduling.ll | 102 ++-- llvm/test/CodeGen/LoongArch/tls-models.ll | 68 +-- 9 files changed, 487 insertions(+), 383 deletions(-) diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp index 8eda2dcc1633e..f977f176066a5 100644 --- a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp @@ -62,43 +62,24 @@ class LoongArchPreRAExpandPseudo : public MachineFunctionPass { MachineBasicBlock::iterator &NextMBBI, unsigned FlagsHi, unsigned SecondOpcode, unsigned FlagsLo); - bool expandLargeAddressLoad(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, - unsigned LastOpcode, unsigned IdentifyingMO); - bool expandLargeAddressLoad(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, - unsigned LastOpcode, unsigned IdentifyingMO, - const MachineOperand &Symbol, Register DestReg, - bool EraseFromParent); bool expandLoadAddressPcrel(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, - bool Large = false); + MachineBasicBlock::iterator &NextMBBI); bool expandLoadAddressGot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, - bool Large = false); + MachineBasicBlock::iterator &NextMBBI); bool 
expandLoadAddressTLSLE(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI); bool expandLoadAddressTLSIE(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, - bool Large = false); + MachineBasicBlock::iterator &NextMBBI); bool expandLoadAddressTLSLD(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, - bool Large = false); + MachineBasicBlock::iterator &NextMBBI); bool expandLoadAddressTLSGD(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, - bool Large = false); - bool expandFunctionCALL(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, - bool IsTailCall); + MachineBasicBlock::iterator &NextMBBI); }; char LoongArchPreRAExpandPseudo::ID = 0; @@ -131,30 +112,16 @@ bool LoongArchPreRAExpandPseudo::expandMI( switch (MBBI->getOpcode()) { case LoongArch::PseudoLA_PCREL: return expandLoadAddressPcrel(MBB, MBBI, NextMBBI); - case LoongArch::PseudoLA_PCREL_LARGE: - return expandLoadAddressPcrel(MBB, MBBI, NextMBBI, /*Large=*/true); case LoongArch::PseudoLA_GOT: return expandLoadAddressGot(MBB, MBBI, NextMBBI); - case LoongArch::PseudoLA_GOT_LARGE: - return expandLoadAddressGot(MBB, MBBI, NextMBBI, /*Large=*/true); case LoongArch::PseudoLA_TLS_LE: return expandLoadAddressTLSLE(MBB, MBBI, NextMBBI); case LoongArch::PseudoLA_TLS_IE: return expandLoadAddressTLSIE(MBB, MBBI, NextMBBI); - case LoongArch::PseudoLA_TLS_IE_LARGE: - return expandLoadAddressTLSIE(MBB, MBBI, NextMBBI, /*Large=*/true); case LoongArch::PseudoLA_TLS_LD: return expandLoadAddressTLSLD(MBB, MBBI, NextMBBI); - case LoongArch::PseudoLA_TLS_LD_LARGE: - return expandLoadAddressTLSLD(MBB, MBBI, NextMBBI, /*Large=*/true); case LoongArch::PseudoLA_TLS_GD: return expandLoadAddressTLSGD(MBB, MBBI, NextMBBI); - case LoongArch::PseudoLA_TLS_GD_LARGE: - return expandLoadAddressTLSGD(MBB, MBBI, NextMBBI, /*Large=*/true); - case LoongArch::PseudoCALL: - return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/false); - case LoongArch::PseudoTAIL: - return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/true); } return false; } @@ -187,118 +154,9 @@ bool LoongArchPreRAExpandPseudo::expandPcalau12iInstPair( return true; } -bool LoongArchPreRAExpandPseudo::expandLargeAddressLoad( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode, - unsigned IdentifyingMO) { - MachineInstr &MI = *MBBI; - return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LastOpcode, IdentifyingMO, - MI.getOperand(2), MI.getOperand(0).getReg(), - true); -} - -bool LoongArchPreRAExpandPseudo::expandLargeAddressLoad( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode, - unsigned IdentifyingMO, const MachineOperand &Symbol, Register DestReg, - bool EraseFromParent) { - // Code Sequence: - // - // Part1: pcalau12i $scratch, %MO1(sym) - // Part0: addi.d $dest, $zero, %MO0(sym) - // Part2: lu32i.d $dest, %MO2(sym) - // Part3: lu52i.d $dest, $dest, %MO3(sym) - // Fin: LastOpcode $dest, $dest, $scratch - - unsigned MO0, MO1, MO2, MO3; - switch (IdentifyingMO) { - default: - llvm_unreachable("unsupported identifying MO"); - case LoongArchII::MO_PCREL_LO: - MO0 = IdentifyingMO; - MO1 = LoongArchII::MO_PCREL_HI; - MO2 = LoongArchII::MO_PCREL64_LO; - MO3 = LoongArchII::MO_PCREL64_HI; - break; - case 
LoongArchII::MO_GOT_PC_HI: - case LoongArchII::MO_LD_PC_HI: - case LoongArchII::MO_GD_PC_HI: - // These cases relocate just like the GOT case, except for Part1. - MO0 = LoongArchII::MO_GOT_PC_LO; - MO1 = IdentifyingMO; - MO2 = LoongArchII::MO_GOT_PC64_LO; - MO3 = LoongArchII::MO_GOT_PC64_HI; - break; - case LoongArchII::MO_IE_PC_LO: - MO0 = IdentifyingMO; - MO1 = LoongArchII::MO_IE_PC_HI; - MO2 = LoongArchII::MO_IE_PC64_LO; - MO3 = LoongArchII::MO_IE_PC64_HI; - break; - } - - MachineFunction *MF = MBB.getParent(); - MachineInstr &MI = *MBBI; - DebugLoc DL = MI.getDebugLoc(); - - assert(MF->getSubtarget().is64Bit() && - "Large code model requires LA64"); - - Register TmpPart1 = - MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass); - Register TmpPart0 = - DestReg.isVirtual() - ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass) - : DestReg; - Register TmpParts02 = - DestReg.isVirtual() - ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass) - : DestReg; - Register TmpParts023 = - DestReg.isVirtual() - ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass) - : DestReg; - - auto Part1 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), TmpPart1); - auto Part0 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADDI_D), TmpPart0) - .addReg(LoongArch::R0); - auto Part2 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU32I_D), TmpParts02) - // "rj" is needed due to InstrInfo pattern requirement. - .addReg(TmpPart0, RegState::Kill); - auto Part3 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU52I_D), TmpParts023) - .addReg(TmpParts02, RegState::Kill); - BuildMI(MBB, MBBI, DL, TII->get(LastOpcode), DestReg) - .addReg(TmpParts023) - .addReg(TmpPart1, RegState::Kill); - - if (Symbol.getType() == MachineOperand::MO_ExternalSymbol) { - const char *SymName = Symbol.getSymbolName(); - Part0.addExternalSymbol(SymName, MO0); - Part1.addExternalSymbol(SymName, MO1); - Part2.addExternalSymbol(SymName, MO2); - Part3.addExternalSymbol(SymName, MO3); - } else { - Part0.addDisp(Symbol, 0, MO0); - Part1.addDisp(Symbol, 0, MO1); - Part2.addDisp(Symbol, 0, MO2); - Part3.addDisp(Symbol, 0, MO3); - } - - if (EraseFromParent) - MI.eraseFromParent(); - - return true; -} - bool LoongArchPreRAExpandPseudo::expandLoadAddressPcrel( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, bool Large) { - if (Large) - // Emit the 5-insn large address load sequence with the `%pc` family of - // relocs. - return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D, - LoongArchII::MO_PCREL_LO); - + MachineBasicBlock::iterator &NextMBBI) { // Code Sequence: // pcalau12i $rd, %pc_hi20(sym) // addi.w/d $rd, $rd, %pc_lo12(sym) @@ -311,13 +169,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressPcrel( bool LoongArchPreRAExpandPseudo::expandLoadAddressGot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, bool Large) { - if (Large) - // Emit the 5-insn large address load sequence with the `%got_pc` family - // of relocs, loading the result from GOT with `ldx.d` in the end. 
- return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D, - LoongArchII::MO_GOT_PC_HI); - + MachineBasicBlock::iterator &NextMBBI) { // Code Sequence: // pcalau12i $rd, %got_pc_hi20(sym) // ld.w/d $rd, $rd, %got_pc_lo12(sym) @@ -378,13 +230,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLE( bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSIE( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, bool Large) { - if (Large) - // Emit the 5-insn large address load sequence with the `%ie_pc` family - // of relocs, loading the result with `ldx.d` in the end. - return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D, - LoongArchII::MO_IE_PC_LO); - + MachineBasicBlock::iterator &NextMBBI) { // Code Sequence: // pcalau12i $rd, %ie_pc_hi20(sym) // ld.w/d $rd, $rd, %ie_pc_lo12(sym) @@ -397,13 +243,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSIE( bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLD( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, bool Large) { - if (Large) - // Emit the 5-insn large address load sequence with the `%got_pc` family - // of relocs, with the `pcalau12i` insn relocated with `%ld_pc_hi20`. - return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D, - LoongArchII::MO_LD_PC_HI); - + MachineBasicBlock::iterator &NextMBBI) { // Code Sequence: // pcalau12i $rd, %ld_pc_hi20(sym) // addi.w/d $rd, $rd, %got_pc_lo12(sym) @@ -416,13 +256,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLD( bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSGD( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, bool Large) { - if (Large) - // Emit the 5-insn large address load sequence with the `%got_pc` family - // of relocs, with the `pcalau12i` insn relocated with `%gd_pc_hi20`. - return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D, - LoongArchII::MO_GD_PC_HI); - + MachineBasicBlock::iterator &NextMBBI) { // Code Sequence: // pcalau12i $rd, %gd_pc_hi20(sym) // addi.w/d $rd, $rd, %got_pc_lo12(sym) @@ -433,85 +267,6 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSGD( SecondOpcode, LoongArchII::MO_GOT_PC_LO); } -bool LoongArchPreRAExpandPseudo::expandFunctionCALL( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, bool IsTailCall) { - MachineFunction *MF = MBB.getParent(); - MachineInstr &MI = *MBBI; - DebugLoc DL = MI.getDebugLoc(); - const MachineOperand &Func = MI.getOperand(0); - MachineInstrBuilder CALL; - unsigned Opcode; - - switch (MF->getTarget().getCodeModel()) { - default: - report_fatal_error("Unsupported code model"); - break; - case CodeModel::Small: { - // CALL: - // bl func - // TAIL: - // b func - Opcode = IsTailCall ? LoongArch::PseudoB_TAIL : LoongArch::BL; - CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).add(Func); - break; - } - case CodeModel::Medium: { - // CALL: - // pcaddu18i $ra, %call36(func) - // jirl $ra, $ra, 0 - // TAIL: - // pcaddu18i $scratch, %call36(func) - // jirl $r0, $scratch, 0 - Opcode = - IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL; - Register ScratchReg = - IsTailCall - ? 
MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass) - : LoongArch::R1; - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCADDU18I), ScratchReg); - - CALL = - BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(ScratchReg).addImm(0); - - if (Func.isSymbol()) - MIB.addExternalSymbol(Func.getSymbolName(), LoongArchII::MO_CALL36); - else - MIB.addDisp(Func, 0, LoongArchII::MO_CALL36); - break; - } - case CodeModel::Large: { - // Emit the 5-insn large address load sequence, either directly or - // indirectly in case of going through the GOT, then JIRL_TAIL or - // JIRL_CALL to $addr. - Opcode = - IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL; - Register AddrReg = - IsTailCall - ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass) - : LoongArch::R1; - - bool UseGOT = Func.isGlobal() && !Func.getGlobal()->isDSOLocal(); - unsigned MO = UseGOT ? LoongArchII::MO_GOT_PC_HI : LoongArchII::MO_PCREL_LO; - unsigned LAOpcode = UseGOT ? LoongArch::LDX_D : LoongArch::ADD_D; - expandLargeAddressLoad(MBB, MBBI, NextMBBI, LAOpcode, MO, Func, AddrReg, - false); - CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(AddrReg).addImm(0); - break; - } - } - - // Transfer implicit operands. - CALL.copyImplicitOps(MI); - - // Transfer MI flags. - CALL.setMIFlags(MI.getFlags()); - - MI.eraseFromParent(); - return true; -} - class LoongArchExpandPseudo : public MachineFunctionPass { public: const LoongArchInstrInfo *TII; @@ -533,6 +288,35 @@ class LoongArchExpandPseudo : public MachineFunctionPass { MachineBasicBlock::iterator &NextMBBI); bool expandCopyCFR(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI); + bool expandLargeAddressLoad(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI, + unsigned LastOpcode, unsigned IdentifyingMO); + bool expandLargeAddressLoad(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI, + unsigned LastOpcode, unsigned IdentifyingMO, + const MachineOperand &Symbol, Register DestReg, + bool EraseFromParent); + bool expandLoadAddressPcrelLarge(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); + bool expandLoadAddressGotLarge(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); + bool expandLoadAddressTLSIELarge(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); + bool expandLoadAddressTLSLDLarge(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); + bool expandLoadAddressTLSGDLarge(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); + bool expandFunctionCALL(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI, + bool IsTailCall); }; char LoongArchExpandPseudo::ID = 0; @@ -567,6 +351,24 @@ bool LoongArchExpandPseudo::expandMI(MachineBasicBlock &MBB, switch (MBBI->getOpcode()) { case LoongArch::PseudoCopyCFR: return expandCopyCFR(MBB, MBBI, NextMBBI); + case LoongArch::PseudoLA_PCREL_LARGE: + return expandLoadAddressPcrelLarge(MBB, MBBI, NextMBBI); + case LoongArch::PseudoLA_GOT_LARGE: + return expandLoadAddressGotLarge(MBB, MBBI, NextMBBI); + case LoongArch::PseudoLA_TLS_IE_LARGE: + return expandLoadAddressTLSIELarge(MBB, MBBI, NextMBBI); + case LoongArch::PseudoLA_TLS_LD_LARGE: + return 
expandLoadAddressTLSLDLarge(MBB, MBBI, NextMBBI); + case LoongArch::PseudoLA_TLS_GD_LARGE: + return expandLoadAddressTLSGDLarge(MBB, MBBI, NextMBBI); + case LoongArch::PseudoCALL: + case LoongArch::PseudoCALL_MEDIUM: + case LoongArch::PseudoCALL_LARGE: + return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/false); + case LoongArch::PseudoTAIL: + case LoongArch::PseudoTAIL_MEDIUM: + case LoongArch::PseudoTAIL_LARGE: + return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/true); } return false; @@ -625,6 +427,213 @@ bool LoongArchExpandPseudo::expandCopyCFR( return true; } +bool LoongArchExpandPseudo::expandLargeAddressLoad( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode, + unsigned IdentifyingMO) { + MachineInstr &MI = *MBBI; + return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LastOpcode, IdentifyingMO, + MI.getOperand(2), MI.getOperand(0).getReg(), + true); +} + +bool LoongArchExpandPseudo::expandLargeAddressLoad( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode, + unsigned IdentifyingMO, const MachineOperand &Symbol, Register DestReg, + bool EraseFromParent) { + // Code Sequence: + // + // Part1: pcalau12i $dst, %MO1(sym) + // Part0: addi.d $t8, $zero, %MO0(sym) + // Part2: lu32i.d $t8, %MO2(sym) + // Part3: lu52i.d $t8, $t8, %MO3(sym) + // Fin: LastOpcode $dst, $t8, $dst + + unsigned MO0, MO1, MO2, MO3; + switch (IdentifyingMO) { + default: + llvm_unreachable("unsupported identifying MO"); + case LoongArchII::MO_PCREL_LO: + MO0 = IdentifyingMO; + MO1 = LoongArchII::MO_PCREL_HI; + MO2 = LoongArchII::MO_PCREL64_LO; + MO3 = LoongArchII::MO_PCREL64_HI; + break; + case LoongArchII::MO_GOT_PC_HI: + case LoongArchII::MO_LD_PC_HI: + case LoongArchII::MO_GD_PC_HI: + // These cases relocate just like the GOT case, except for Part1. + MO0 = LoongArchII::MO_GOT_PC_LO; + MO1 = IdentifyingMO; + MO2 = LoongArchII::MO_GOT_PC64_LO; + MO3 = LoongArchII::MO_GOT_PC64_HI; + break; + case LoongArchII::MO_IE_PC_LO: + MO0 = IdentifyingMO; + MO1 = LoongArchII::MO_IE_PC_HI; + MO2 = LoongArchII::MO_IE_PC64_LO; + MO3 = LoongArchII::MO_IE_PC64_HI; + break; + } + + MachineFunction *MF = MBB.getParent(); + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + Register ScratchReg = LoongArch::R20; // $t8 + + assert(MF->getSubtarget().is64Bit() && + "Large code model requires LA64"); + + auto Part1 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), DestReg); + auto Part0 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADDI_D), ScratchReg) + .addReg(LoongArch::R0); + auto Part2 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU32I_D), ScratchReg) + // "rj" is needed due to InstrInfo pattern requirement. 
+ .addReg(ScratchReg); + auto Part3 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU52I_D), ScratchReg) + .addReg(ScratchReg); + BuildMI(MBB, MBBI, DL, TII->get(LastOpcode), DestReg) + .addReg(ScratchReg) + .addReg(DestReg); + + if (Symbol.getType() == MachineOperand::MO_ExternalSymbol) { + const char *SymName = Symbol.getSymbolName(); + Part0.addExternalSymbol(SymName, MO0); + Part1.addExternalSymbol(SymName, MO1); + Part2.addExternalSymbol(SymName, MO2); + Part3.addExternalSymbol(SymName, MO3); + } else { + Part0.addDisp(Symbol, 0, MO0); + Part1.addDisp(Symbol, 0, MO1); + Part2.addDisp(Symbol, 0, MO2); + Part3.addDisp(Symbol, 0, MO3); + } + + if (EraseFromParent) + MI.eraseFromParent(); + + return true; +} + +bool LoongArchExpandPseudo::expandLoadAddressPcrelLarge( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + // Emit the 5-insn large address load sequence with the `%pc` family of + // relocs. + return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D, + LoongArchII::MO_PCREL_LO); +} + +bool LoongArchExpandPseudo::expandLoadAddressGotLarge( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + // Emit the 5-insn large address load sequence with the `%got_pc` family + // of relocs, loading the result from GOT with `ldx.d` in the end. + return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D, + LoongArchII::MO_GOT_PC_HI); +} + +bool LoongArchExpandPseudo::expandLoadAddressTLSIELarge( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + // Emit the 5-insn large address load sequence with the `%ie_pc` family + // of relocs, loading the result with `ldx.d` in the end. + return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D, + LoongArchII::MO_IE_PC_LO); +} + +bool LoongArchExpandPseudo::expandLoadAddressTLSLDLarge( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + // Emit the 5-insn large address load sequence with the `%got_pc` family + // of relocs, with the `pcalau12i` insn relocated with `%ld_pc_hi20`. + return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D, + LoongArchII::MO_LD_PC_HI); +} + +bool LoongArchExpandPseudo::expandLoadAddressTLSGDLarge( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + // Emit the 5-insn large address load sequence with the `%got_pc` family + // of relocs, with the `pcalau12i` insn relocated with `%gd_pc_hi20`. + return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D, + LoongArchII::MO_GD_PC_HI); +} + +bool LoongArchExpandPseudo::expandFunctionCALL( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI, bool IsTailCall) { + MachineFunction *MF = MBB.getParent(); + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + const MachineOperand &Func = MI.getOperand(0); + MachineInstrBuilder CALL; + unsigned Opcode; + + switch (MF->getTarget().getCodeModel()) { + default: + report_fatal_error("Unsupported code model"); + break; + case CodeModel::Small: { + // CALL: + // bl func + // TAIL: + // b func + Opcode = IsTailCall ? 
LoongArch::PseudoB_TAIL : LoongArch::BL; + CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).add(Func); + break; + } + case CodeModel::Medium: { + // CALL: + // pcaddu18i $ra, %call36(func) + // jirl $ra, $ra, 0 + // TAIL: + // pcaddu18i $t8, %call36(func) + // jr $t8 + Opcode = + IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL; + Register ScratchReg = IsTailCall ? LoongArch::R20 : LoongArch::R1; + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCADDU18I), ScratchReg); + + CALL = + BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(ScratchReg).addImm(0); + + if (Func.isSymbol()) + MIB.addExternalSymbol(Func.getSymbolName(), LoongArchII::MO_CALL36); + else + MIB.addDisp(Func, 0, LoongArchII::MO_CALL36); + break; + } + case CodeModel::Large: { + // Emit the 5-insn large address load sequence, either directly or + // indirectly in case of going through the GOT, then JIRL_TAIL or + // JIRL_CALL to $addr. + Opcode = + IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL; + Register AddrReg = IsTailCall ? LoongArch::R19 : LoongArch::R1; + + bool UseGOT = Func.isGlobal() && !Func.getGlobal()->isDSOLocal(); + unsigned MO = UseGOT ? LoongArchII::MO_GOT_PC_HI : LoongArchII::MO_PCREL_LO; + unsigned LAOpcode = UseGOT ? LoongArch::LDX_D : LoongArch::ADD_D; + expandLargeAddressLoad(MBB, MBBI, NextMBBI, LAOpcode, MO, Func, AddrReg, + false); + CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(AddrReg).addImm(0); + break; + } + } + + // Transfer implicit operands. + CALL.copyImplicitOps(MI); + + // Transfer MI flags. + CALL.setMIFlags(MI.getFlags()); + + MI.eraseFromParent(); + return true; +} + } // end namespace INITIALIZE_PASS(LoongArchPreRAExpandPseudo, "loongarch-prera-expand-pseudo", diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index e14bbadf9ed22..2cfb2c1d78117 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -3381,8 +3381,12 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const { // TODO: Add more target-dependent nodes later. NODE_NAME_CASE(CALL) + NODE_NAME_CASE(CALL_MEDIUM) + NODE_NAME_CASE(CALL_LARGE) NODE_NAME_CASE(RET) NODE_NAME_CASE(TAIL) + NODE_NAME_CASE(TAIL_MEDIUM) + NODE_NAME_CASE(TAIL_LARGE) NODE_NAME_CASE(SLL_W) NODE_NAME_CASE(SRA_W) NODE_NAME_CASE(SRL_W) @@ -4240,15 +4244,31 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI, // Emit the call. SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + unsigned Op; + switch (DAG.getTarget().getCodeModel()) { + default: + report_fatal_error("Unsupported code model"); + case CodeModel::Small: + Op = IsTailCall ? LoongArchISD::TAIL : LoongArchISD::CALL; + break; + case CodeModel::Medium: + assert(Subtarget.is64Bit() && "Medium code model requires LA64"); + Op = IsTailCall ? LoongArchISD::TAIL_MEDIUM : LoongArchISD::CALL_MEDIUM; + break; + case CodeModel::Large: + assert(Subtarget.is64Bit() && "Large code model requires LA64"); + Op = IsTailCall ? 
LoongArchISD::TAIL_LARGE : LoongArchISD::CALL_LARGE; + break; + } if (IsTailCall) { MF.getFrameInfo().setHasTailCall(); - SDValue Ret = DAG.getNode(LoongArchISD::TAIL, DL, NodeTys, Ops); + SDValue Ret = DAG.getNode(Op, DL, NodeTys, Ops); DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge); return Ret; } - Chain = DAG.getNode(LoongArchISD::CALL, DL, NodeTys, Ops); + Chain = DAG.getNode(Op, DL, NodeTys, Ops); DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); Glue = Chain.getValue(1); diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index 6f8878f9ccd51..2875aa82e424d 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -27,8 +27,12 @@ enum NodeType : unsigned { // TODO: add more LoongArchISDs CALL, + CALL_MEDIUM, + CALL_LARGE, RET, TAIL, + TAIL_MEDIUM, + TAIL_LARGE, // 32-bit shifts, directly matching the semantics of the named LoongArch // instructions. diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td index 21555f8856300..78074c0128766 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td @@ -69,6 +69,18 @@ def loongarch_ret : SDNode<"LoongArchISD::RET", SDTNone, def loongarch_tail : SDNode<"LoongArchISD::TAIL", SDT_LoongArchCall, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; +def loongarch_call_medium : SDNode<"LoongArchISD::CALL_MEDIUM", SDT_LoongArchCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def loongarch_tail_medium : SDNode<"LoongArchISD::TAIL_MEDIUM", SDT_LoongArchCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def loongarch_call_large : SDNode<"LoongArchISD::CALL_LARGE", SDT_LoongArchCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def loongarch_tail_large : SDNode<"LoongArchISD::TAIL_LARGE", SDT_LoongArchCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; def loongarch_sll_w : SDNode<"LoongArchISD::SLL_W", SDT_LoongArchIntBinOpW>; def loongarch_sra_w : SDNode<"LoongArchISD::SRA_W", SDT_LoongArchIntBinOpW>; def loongarch_srl_w : SDNode<"LoongArchISD::SRL_W", SDT_LoongArchIntBinOpW>; @@ -1399,16 +1411,43 @@ def : Pat<(brind GPR:$rj), (PseudoBRIND GPR:$rj, 0)>; def : Pat<(brind (add GPR:$rj, simm16_lsl2:$imm16)), (PseudoBRIND GPR:$rj, simm16_lsl2:$imm16)>; +// Function call with 'Small' code model. let isCall = 1, Defs = [R1] in def PseudoCALL : Pseudo<(outs), (ins bare_symbol:$func)>; def : Pat<(loongarch_call tglobaladdr:$func), (PseudoCALL tglobaladdr:$func)>; def : Pat<(loongarch_call texternalsym:$func), (PseudoCALL texternalsym:$func)>; +// Function call with 'Medium' code model. +let isCall = 1, Defs = [R1, R20], Size = 8 in +def PseudoCALL_MEDIUM : Pseudo<(outs), (ins bare_symbol:$func)>; + +let Predicates = [IsLA64] in { +def : Pat<(loongarch_call_medium tglobaladdr:$func), + (PseudoCALL_MEDIUM tglobaladdr:$func)>; +def : Pat<(loongarch_call_medium texternalsym:$func), + (PseudoCALL_MEDIUM texternalsym:$func)>; +} // Predicates = [IsLA64] + +// Function call with 'Large' code model. 
+let isCall = 1, Defs = [R1, R20], Size = 24 in +def PseudoCALL_LARGE: Pseudo<(outs), (ins bare_symbol:$func)>; + +let Predicates = [IsLA64] in { +def : Pat<(loongarch_call_large tglobaladdr:$func), + (PseudoCALL_LARGE tglobaladdr:$func)>; +def : Pat<(loongarch_call_large texternalsym:$func), + (PseudoCALL_LARGE texternalsym:$func)>; +} // Predicates = [IsLA64] + let isCall = 1, Defs = [R1] in def PseudoCALLIndirect : Pseudo<(outs), (ins GPR:$rj), [(loongarch_call GPR:$rj)]>, PseudoInstExpansion<(JIRL R1, GPR:$rj, 0)>; +let Predicates = [IsLA64] in { +def : Pat<(loongarch_call_medium GPR:$rj), (PseudoCALLIndirect GPR:$rj)>; +def : Pat<(loongarch_call_large GPR:$rj), (PseudoCALLIndirect GPR:$rj)>; +} let isCall = 1, hasSideEffects = 0, mayStore = 0, mayLoad = 0, Defs = [R1] in def PseudoJIRL_CALL : Pseudo<(outs), (ins GPR:$rj, simm16_lsl2:$imm16)>, @@ -1419,6 +1458,7 @@ let isBarrier = 1, isReturn = 1, isTerminator = 1 in def PseudoRET : Pseudo<(outs), (ins), [(loongarch_ret)]>, PseudoInstExpansion<(JIRL R0, R1, 0)>; +// Tail call with 'Small' code model. let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3] in def PseudoTAIL : Pseudo<(outs), (ins bare_symbol:$dst)>; @@ -1427,10 +1467,38 @@ def : Pat<(loongarch_tail (iPTR tglobaladdr:$dst)), def : Pat<(loongarch_tail (iPTR texternalsym:$dst)), (PseudoTAIL texternalsym:$dst)>; +// Tail call with 'Medium' code model. +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, + Uses = [R3], Defs = [R20], Size = 8 in +def PseudoTAIL_MEDIUM : Pseudo<(outs), (ins bare_symbol:$dst)>; + +let Predicates = [IsLA64] in { +def : Pat<(loongarch_tail_medium (iPTR tglobaladdr:$dst)), + (PseudoTAIL_MEDIUM tglobaladdr:$dst)>; +def : Pat<(loongarch_tail_medium (iPTR texternalsym:$dst)), + (PseudoTAIL_MEDIUM texternalsym:$dst)>; +} // Predicates = [IsLA64] + +// Tail call with 'Large' code model. 
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, + Uses = [R3], Defs = [R19, R20], Size = 24 in +def PseudoTAIL_LARGE : Pseudo<(outs), (ins bare_symbol:$dst)>; + +let Predicates = [IsLA64] in { +def : Pat<(loongarch_tail_large (iPTR tglobaladdr:$dst)), + (PseudoTAIL_LARGE tglobaladdr:$dst)>; +def : Pat<(loongarch_tail_large (iPTR texternalsym:$dst)), + (PseudoTAIL_LARGE texternalsym:$dst)>; +} // Predicates = [IsLA64] + let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3] in def PseudoTAILIndirect : Pseudo<(outs), (ins GPRT:$rj), [(loongarch_tail GPRT:$rj)]>, PseudoInstExpansion<(JIRL R0, GPR:$rj, 0)>; +let Predicates = [IsLA64] in { +def : Pat<(loongarch_tail_medium GPR:$rj), (PseudoTAILIndirect GPR:$rj)>; +def : Pat<(loongarch_tail_large GPR:$rj), (PseudoTAILIndirect GPR:$rj)>; +} let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, hasSideEffects = 0, mayStore = 0, mayLoad = 0, Uses = [R3] in @@ -1468,6 +1536,7 @@ def PseudoLA_ABS_LARGE : Pseudo<(outs GPR:$dst), "la.abs", "$dst, $src">; def PseudoLA_PCREL : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], "la.pcrel", "$dst, $src">; +let Defs = [R20], Size = 20 in def PseudoLA_PCREL_LARGE : Pseudo<(outs GPR:$dst), (ins GPR:$tmp, bare_symbol:$src), [], "la.pcrel", "$dst, $tmp, $src">, @@ -1479,28 +1548,30 @@ let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 0, isAsmParserOnly = 1 in { def PseudoLA_GOT : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], "la.got", "$dst, $src">; +def PseudoLA_TLS_IE : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], + "la.tls.ie", "$dst, $src">; +def PseudoLA_TLS_LD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], + "la.tls.ld", "$dst, $src">; +def PseudoLA_TLS_GD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], + "la.tls.gd", "$dst, $src">; +let Defs = [R20], Size = 20 in { def PseudoLA_GOT_LARGE : Pseudo<(outs GPR:$dst), (ins GPR:$tmp, bare_symbol:$src), [], "la.got", "$dst, $tmp, $src">, Requires<[IsLA64]>; -def PseudoLA_TLS_IE : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], - "la.tls.ie", "$dst, $src">; def PseudoLA_TLS_IE_LARGE : Pseudo<(outs GPR:$dst), (ins GPR:$tmp, bare_symbol:$src), [], "la.tls.ie", "$dst, $tmp, $src">, Requires<[IsLA64]>; -def PseudoLA_TLS_LD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], - "la.tls.ld", "$dst, $src">; def PseudoLA_TLS_LD_LARGE : Pseudo<(outs GPR:$dst), (ins GPR:$tmp, bare_symbol:$src), [], "la.tls.ld", "$dst, $tmp, $src">, Requires<[IsLA64]>; -def PseudoLA_TLS_GD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], - "la.tls.gd", "$dst, $src">; def PseudoLA_TLS_GD_LARGE : Pseudo<(outs GPR:$dst), (ins GPR:$tmp, bare_symbol:$src), [], "la.tls.gd", "$dst, $tmp, $src">, Requires<[IsLA64]>; +} // Defs = [R20], Size = 20 } // Load address inst alias: "la", "la.global" and "la.local". 
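For reference, a minimal IR sketch of what the new call pseudos mean for a direct call. This is an illustration only, not part of the patch; @callee and @caller are placeholder names. Compiled for loongarch64 with a hypothetical `llc --code-model=medium`, the call below is selected as PseudoCALL_MEDIUM and expanded to the pcaddu18i/jirl pair described in expandFunctionCALL above; with `--code-model=large` it goes through PseudoCALL_LARGE and the five-instruction `%got_pc`/`%pc` sequence ending in ldx.d or add.d. The updated tests that follow check the exact output:

  ; Illustrative sketch (assumed names, not taken from the patch):
  declare void @callee()

  define void @caller() nounwind {
    ; Medium: pcaddu18i $ra, %call36(callee)  then  jirl $ra, $ra, 0
    ; Large:  pcalau12i/addi.d/lu32i.d/lu52i.d/ldx.d via $t8, then jirl $ra, $ra, 0
    call void @callee()
    ret void
  }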
diff --git a/llvm/test/CodeGen/LoongArch/code-models.ll b/llvm/test/CodeGen/LoongArch/code-models.ll index 7c6f46d5e9262..f93c316709284 100644 --- a/llvm/test/CodeGen/LoongArch/code-models.ll +++ b/llvm/test/CodeGen/LoongArch/code-models.ll @@ -33,11 +33,11 @@ define i32 @call_globaladdress(i32 %a) nounwind { ; LARGE: # %bb.0: ; LARGE-NEXT: addi.d $sp, $sp, -16 ; LARGE-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill -; LARGE-NEXT: pcalau12i $a1, %got_pc_hi20(callee) -; LARGE-NEXT: addi.d $ra, $zero, %got_pc_lo12(callee) -; LARGE-NEXT: lu32i.d $ra, %got64_pc_lo20(callee) -; LARGE-NEXT: lu52i.d $ra, $ra, %got64_pc_hi12(callee) -; LARGE-NEXT: ldx.d $ra, $ra, $a1 +; LARGE-NEXT: pcalau12i $ra, %got_pc_hi20(callee) +; LARGE-NEXT: addi.d $t8, $zero, %got_pc_lo12(callee) +; LARGE-NEXT: lu32i.d $t8, %got64_pc_lo20(callee) +; LARGE-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(callee) +; LARGE-NEXT: ldx.d $ra, $t8, $ra ; LARGE-NEXT: jirl $ra, $ra, 0 ; LARGE-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload ; LARGE-NEXT: addi.d $sp, $sp, 16 @@ -82,11 +82,11 @@ define void @call_external_sym(ptr %dst) { ; LARGE-NEXT: .cfi_offset 1, -8 ; LARGE-NEXT: ori $a2, $zero, 1000 ; LARGE-NEXT: move $a1, $zero -; LARGE-NEXT: pcalau12i $a3, %pc_hi20(memset) -; LARGE-NEXT: addi.d $ra, $zero, %pc_lo12(memset) -; LARGE-NEXT: lu32i.d $ra, %pc64_lo20(memset) -; LARGE-NEXT: lu52i.d $ra, $ra, %pc64_hi12(memset) -; LARGE-NEXT: add.d $ra, $ra, $a3 +; LARGE-NEXT: pcalau12i $ra, %pc_hi20(memset) +; LARGE-NEXT: addi.d $t8, $zero, %pc_lo12(memset) +; LARGE-NEXT: lu32i.d $t8, %pc64_lo20(memset) +; LARGE-NEXT: lu52i.d $t8, $t8, %pc64_hi12(memset) +; LARGE-NEXT: add.d $ra, $t8, $ra ; LARGE-NEXT: jirl $ra, $ra, 0 ; LARGE-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload ; LARGE-NEXT: addi.d $sp, $sp, 16 @@ -105,17 +105,17 @@ define i32 @caller_tail(i32 %i) nounwind { ; ; MEDIUM-LABEL: caller_tail: ; MEDIUM: # %bb.0: # %entry -; MEDIUM-NEXT: pcaddu18i $a1, %call36(callee_tail) -; MEDIUM-NEXT: jr $a1 +; MEDIUM-NEXT: pcaddu18i $t8, %call36(callee_tail) +; MEDIUM-NEXT: jr $t8 ; ; LARGE-LABEL: caller_tail: ; LARGE: # %bb.0: # %entry -; LARGE-NEXT: pcalau12i $a1, %got_pc_hi20(callee_tail) -; LARGE-NEXT: addi.d $a2, $zero, %got_pc_lo12(callee_tail) -; LARGE-NEXT: lu32i.d $a2, %got64_pc_lo20(callee_tail) -; LARGE-NEXT: lu52i.d $a2, $a2, %got64_pc_hi12(callee_tail) -; LARGE-NEXT: ldx.d $a1, $a2, $a1 -; LARGE-NEXT: jr $a1 +; LARGE-NEXT: pcalau12i $t7, %got_pc_hi20(callee_tail) +; LARGE-NEXT: addi.d $t8, $zero, %got_pc_lo12(callee_tail) +; LARGE-NEXT: lu32i.d $t8, %got64_pc_lo20(callee_tail) +; LARGE-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(callee_tail) +; LARGE-NEXT: ldx.d $t7, $t8, $t7 +; LARGE-NEXT: jr $t7 entry: %r = tail call i32 @callee_tail(i32 %i) ret i32 %r diff --git a/llvm/test/CodeGen/LoongArch/expand-call.ll b/llvm/test/CodeGen/LoongArch/expand-call.ll index 86bf4292665b7..e0d179f92de68 100644 --- a/llvm/test/CodeGen/LoongArch/expand-call.ll +++ b/llvm/test/CodeGen/LoongArch/expand-call.ll @@ -1,6 +1,6 @@ ; RUN: llc --mtriple=loongarch64 --stop-before loongarch-prera-expand-pseudo \ ; RUN: --verify-machineinstrs < %s | FileCheck %s --check-prefix=NOEXPAND -; RUN: llc --mtriple=loongarch64 --stop-after loongarch-prera-expand-pseudo \ +; RUN: llc --mtriple=loongarch64 --stop-before machine-opt-remark-emitter \ ; RUN: --verify-machineinstrs < %s | FileCheck %s --check-prefix=EXPAND declare void @callee() diff --git a/llvm/test/CodeGen/LoongArch/global-address.ll b/llvm/test/CodeGen/LoongArch/global-address.ll index a8f0ef648aa7c..d32a17f488b14 100644 
--- a/llvm/test/CodeGen/LoongArch/global-address.ll +++ b/llvm/test/CodeGen/LoongArch/global-address.ll @@ -53,32 +53,32 @@ define void @foo() nounwind { ; LA64LARGENOPIC-LABEL: foo: ; LA64LARGENOPIC: # %bb.0: ; LA64LARGENOPIC-NEXT: pcalau12i $a0, %got_pc_hi20(G) -; LA64LARGENOPIC-NEXT: addi.d $a1, $zero, %got_pc_lo12(G) -; LA64LARGENOPIC-NEXT: lu32i.d $a1, %got64_pc_lo20(G) -; LA64LARGENOPIC-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(G) -; LA64LARGENOPIC-NEXT: ldx.d $a0, $a1, $a0 +; LA64LARGENOPIC-NEXT: addi.d $t8, $zero, %got_pc_lo12(G) +; LA64LARGENOPIC-NEXT: lu32i.d $t8, %got64_pc_lo20(G) +; LA64LARGENOPIC-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(G) +; LA64LARGENOPIC-NEXT: ldx.d $a0, $t8, $a0 ; LA64LARGENOPIC-NEXT: ld.w $a0, $a0, 0 ; LA64LARGENOPIC-NEXT: pcalau12i $a0, %pc_hi20(g) -; LA64LARGENOPIC-NEXT: addi.d $a1, $zero, %pc_lo12(g) -; LA64LARGENOPIC-NEXT: lu32i.d $a1, %pc64_lo20(g) -; LA64LARGENOPIC-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g) -; LA64LARGENOPIC-NEXT: add.d $a0, $a1, $a0 +; LA64LARGENOPIC-NEXT: addi.d $t8, $zero, %pc_lo12(g) +; LA64LARGENOPIC-NEXT: lu32i.d $t8, %pc64_lo20(g) +; LA64LARGENOPIC-NEXT: lu52i.d $t8, $t8, %pc64_hi12(g) +; LA64LARGENOPIC-NEXT: add.d $a0, $t8, $a0 ; LA64LARGENOPIC-NEXT: ld.w $a0, $a0, 0 ; LA64LARGENOPIC-NEXT: ret ; ; LA64LARGEPIC-LABEL: foo: ; LA64LARGEPIC: # %bb.0: ; LA64LARGEPIC-NEXT: pcalau12i $a0, %got_pc_hi20(G) -; LA64LARGEPIC-NEXT: addi.d $a1, $zero, %got_pc_lo12(G) -; LA64LARGEPIC-NEXT: lu32i.d $a1, %got64_pc_lo20(G) -; LA64LARGEPIC-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(G) -; LA64LARGEPIC-NEXT: ldx.d $a0, $a1, $a0 +; LA64LARGEPIC-NEXT: addi.d $t8, $zero, %got_pc_lo12(G) +; LA64LARGEPIC-NEXT: lu32i.d $t8, %got64_pc_lo20(G) +; LA64LARGEPIC-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(G) +; LA64LARGEPIC-NEXT: ldx.d $a0, $t8, $a0 ; LA64LARGEPIC-NEXT: ld.w $a0, $a0, 0 ; LA64LARGEPIC-NEXT: pcalau12i $a0, %pc_hi20(.Lg$local) -; LA64LARGEPIC-NEXT: addi.d $a1, $zero, %pc_lo12(.Lg$local) -; LA64LARGEPIC-NEXT: lu32i.d $a1, %pc64_lo20(.Lg$local) -; LA64LARGEPIC-NEXT: lu52i.d $a1, $a1, %pc64_hi12(.Lg$local) -; LA64LARGEPIC-NEXT: add.d $a0, $a1, $a0 +; LA64LARGEPIC-NEXT: addi.d $t8, $zero, %pc_lo12(.Lg$local) +; LA64LARGEPIC-NEXT: lu32i.d $t8, %pc64_lo20(.Lg$local) +; LA64LARGEPIC-NEXT: lu52i.d $t8, $t8, %pc64_hi12(.Lg$local) +; LA64LARGEPIC-NEXT: add.d $a0, $t8, $a0 ; LA64LARGEPIC-NEXT: ld.w $a0, $a0, 0 ; LA64LARGEPIC-NEXT: ret %V = load volatile i32, ptr @G diff --git a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll index 150a935d7bf82..d6f3e3469f751 100644 --- a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll +++ b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll @@ -52,13 +52,13 @@ define void @foo() nounwind { ; MEDIUM_SCH-NEXT: addi.d $sp, $sp, -16 ; MEDIUM_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ; MEDIUM_SCH-NEXT: pcalau12i $a0, %got_pc_hi20(G) -; MEDIUM_SCH-NEXT: pcaddu18i $ra, %call36(bar) ; MEDIUM_SCH-NEXT: ld.d $a0, $a0, %got_pc_lo12(G) ; MEDIUM_SCH-NEXT: ld.d $a0, $a0, 0 ; MEDIUM_SCH-NEXT: pcalau12i $a0, %pc_hi20(g) ; MEDIUM_SCH-NEXT: addi.d $a0, $a0, %pc_lo12(g) ; MEDIUM_SCH-NEXT: ld.d $a0, $a0, 0 ; MEDIUM_SCH-NEXT: ori $a0, $zero, 1 +; MEDIUM_SCH-NEXT: pcaddu18i $ra, %call36(bar) ; MEDIUM_SCH-NEXT: jirl $ra, $ra, 0 ; MEDIUM_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(gd) ; MEDIUM_SCH-NEXT: ld.d $a0, $a0, %ie_pc_lo12(gd) @@ -78,41 +78,41 @@ define void @foo() nounwind { ; LARGE_NO_SCH-NEXT: addi.d $sp, $sp, -16 ; LARGE_NO_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte 
Folded Spill ; LARGE_NO_SCH-NEXT: pcalau12i $a0, %got_pc_hi20(G) -; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %got_pc_lo12(G) -; LARGE_NO_SCH-NEXT: lu32i.d $a1, %got64_pc_lo20(G) -; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(G) -; LARGE_NO_SCH-NEXT: ldx.d $a0, $a1, $a0 +; LARGE_NO_SCH-NEXT: addi.d $t8, $zero, %got_pc_lo12(G) +; LARGE_NO_SCH-NEXT: lu32i.d $t8, %got64_pc_lo20(G) +; LARGE_NO_SCH-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(G) +; LARGE_NO_SCH-NEXT: ldx.d $a0, $t8, $a0 ; LARGE_NO_SCH-NEXT: ld.d $a0, $a0, 0 ; LARGE_NO_SCH-NEXT: pcalau12i $a0, %pc_hi20(g) -; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %pc_lo12(g) -; LARGE_NO_SCH-NEXT: lu32i.d $a1, %pc64_lo20(g) -; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g) -; LARGE_NO_SCH-NEXT: add.d $a0, $a1, $a0 +; LARGE_NO_SCH-NEXT: addi.d $t8, $zero, %pc_lo12(g) +; LARGE_NO_SCH-NEXT: lu32i.d $t8, %pc64_lo20(g) +; LARGE_NO_SCH-NEXT: lu52i.d $t8, $t8, %pc64_hi12(g) +; LARGE_NO_SCH-NEXT: add.d $a0, $t8, $a0 ; LARGE_NO_SCH-NEXT: ld.d $a0, $a0, 0 ; LARGE_NO_SCH-NEXT: ori $a0, $zero, 1 -; LARGE_NO_SCH-NEXT: pcalau12i $a1, %got_pc_hi20(bar) -; LARGE_NO_SCH-NEXT: addi.d $ra, $zero, %got_pc_lo12(bar) -; LARGE_NO_SCH-NEXT: lu32i.d $ra, %got64_pc_lo20(bar) -; LARGE_NO_SCH-NEXT: lu52i.d $ra, $ra, %got64_pc_hi12(bar) -; LARGE_NO_SCH-NEXT: ldx.d $ra, $ra, $a1 +; LARGE_NO_SCH-NEXT: pcalau12i $ra, %got_pc_hi20(bar) +; LARGE_NO_SCH-NEXT: addi.d $t8, $zero, %got_pc_lo12(bar) +; LARGE_NO_SCH-NEXT: lu32i.d $t8, %got64_pc_lo20(bar) +; LARGE_NO_SCH-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(bar) +; LARGE_NO_SCH-NEXT: ldx.d $ra, $t8, $ra ; LARGE_NO_SCH-NEXT: jirl $ra, $ra, 0 ; LARGE_NO_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(gd) -; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(gd) -; LARGE_NO_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(gd) -; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(gd) -; LARGE_NO_SCH-NEXT: ldx.d $a0, $a1, $a0 +; LARGE_NO_SCH-NEXT: addi.d $t8, $zero, %ie_pc_lo12(gd) +; LARGE_NO_SCH-NEXT: lu32i.d $t8, %ie64_pc_lo20(gd) +; LARGE_NO_SCH-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(gd) +; LARGE_NO_SCH-NEXT: ldx.d $a0, $t8, $a0 ; LARGE_NO_SCH-NEXT: ldx.d $a0, $a0, $tp ; LARGE_NO_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ld) -; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ld) -; LARGE_NO_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(ld) -; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ld) -; LARGE_NO_SCH-NEXT: ldx.d $a0, $a1, $a0 +; LARGE_NO_SCH-NEXT: addi.d $t8, $zero, %ie_pc_lo12(ld) +; LARGE_NO_SCH-NEXT: lu32i.d $t8, %ie64_pc_lo20(ld) +; LARGE_NO_SCH-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(ld) +; LARGE_NO_SCH-NEXT: ldx.d $a0, $t8, $a0 ; LARGE_NO_SCH-NEXT: ldx.d $a0, $a0, $tp ; LARGE_NO_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ie) -; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ie) -; LARGE_NO_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(ie) -; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ie) -; LARGE_NO_SCH-NEXT: ldx.d $a0, $a1, $a0 +; LARGE_NO_SCH-NEXT: addi.d $t8, $zero, %ie_pc_lo12(ie) +; LARGE_NO_SCH-NEXT: lu32i.d $t8, %ie64_pc_lo20(ie) +; LARGE_NO_SCH-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(ie) +; LARGE_NO_SCH-NEXT: ldx.d $a0, $t8, $a0 ; LARGE_NO_SCH-NEXT: ldx.d $a0, $a0, $tp ; LARGE_NO_SCH-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload ; LARGE_NO_SCH-NEXT: addi.d $sp, $sp, 16 @@ -122,42 +122,42 @@ define void @foo() nounwind { ; LARGE_SCH: # %bb.0: ; LARGE_SCH-NEXT: addi.d $sp, $sp, -16 ; LARGE_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill -; LARGE_SCH-NEXT: addi.d $a1, $zero, %got_pc_lo12(G) ; LARGE_SCH-NEXT: pcalau12i $a0, %got_pc_hi20(G) -; 
LARGE_SCH-NEXT: addi.d $ra, $zero, %got_pc_lo12(bar) -; LARGE_SCH-NEXT: lu32i.d $a1, %got64_pc_lo20(G) -; LARGE_SCH-NEXT: lu32i.d $ra, %got64_pc_lo20(bar) -; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(G) -; LARGE_SCH-NEXT: lu52i.d $ra, $ra, %got64_pc_hi12(bar) -; LARGE_SCH-NEXT: ldx.d $a0, $a1, $a0 -; LARGE_SCH-NEXT: addi.d $a1, $zero, %pc_lo12(g) -; LARGE_SCH-NEXT: lu32i.d $a1, %pc64_lo20(g) -; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g) +; LARGE_SCH-NEXT: addi.d $t8, $zero, %got_pc_lo12(G) +; LARGE_SCH-NEXT: lu32i.d $t8, %got64_pc_lo20(G) +; LARGE_SCH-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(G) +; LARGE_SCH-NEXT: ldx.d $a0, $t8, $a0 ; LARGE_SCH-NEXT: ld.d $a0, $a0, 0 ; LARGE_SCH-NEXT: pcalau12i $a0, %pc_hi20(g) -; LARGE_SCH-NEXT: add.d $a0, $a1, $a0 -; LARGE_SCH-NEXT: pcalau12i $a1, %got_pc_hi20(bar) +; LARGE_SCH-NEXT: addi.d $t8, $zero, %pc_lo12(g) +; LARGE_SCH-NEXT: lu32i.d $t8, %pc64_lo20(g) +; LARGE_SCH-NEXT: lu52i.d $t8, $t8, %pc64_hi12(g) +; LARGE_SCH-NEXT: add.d $a0, $t8, $a0 ; LARGE_SCH-NEXT: ld.d $a0, $a0, 0 -; LARGE_SCH-NEXT: ldx.d $ra, $ra, $a1 ; LARGE_SCH-NEXT: ori $a0, $zero, 1 +; LARGE_SCH-NEXT: pcalau12i $ra, %got_pc_hi20(bar) +; LARGE_SCH-NEXT: addi.d $t8, $zero, %got_pc_lo12(bar) +; LARGE_SCH-NEXT: lu32i.d $t8, %got64_pc_lo20(bar) +; LARGE_SCH-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(bar) +; LARGE_SCH-NEXT: ldx.d $ra, $t8, $ra ; LARGE_SCH-NEXT: jirl $ra, $ra, 0 -; LARGE_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(gd) ; LARGE_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(gd) -; LARGE_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(gd) -; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(gd) -; LARGE_SCH-NEXT: ldx.d $a0, $a1, $a0 -; LARGE_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ld) -; LARGE_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(ld) -; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ld) +; LARGE_SCH-NEXT: addi.d $t8, $zero, %ie_pc_lo12(gd) +; LARGE_SCH-NEXT: lu32i.d $t8, %ie64_pc_lo20(gd) +; LARGE_SCH-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(gd) +; LARGE_SCH-NEXT: ldx.d $a0, $t8, $a0 ; LARGE_SCH-NEXT: ldx.d $a0, $a0, $tp ; LARGE_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ld) -; LARGE_SCH-NEXT: ldx.d $a0, $a1, $a0 -; LARGE_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ie) -; LARGE_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(ie) -; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ie) +; LARGE_SCH-NEXT: addi.d $t8, $zero, %ie_pc_lo12(ld) +; LARGE_SCH-NEXT: lu32i.d $t8, %ie64_pc_lo20(ld) +; LARGE_SCH-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(ld) +; LARGE_SCH-NEXT: ldx.d $a0, $t8, $a0 ; LARGE_SCH-NEXT: ldx.d $a0, $a0, $tp ; LARGE_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ie) -; LARGE_SCH-NEXT: ldx.d $a0, $a1, $a0 +; LARGE_SCH-NEXT: addi.d $t8, $zero, %ie_pc_lo12(ie) +; LARGE_SCH-NEXT: lu32i.d $t8, %ie64_pc_lo20(ie) +; LARGE_SCH-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(ie) +; LARGE_SCH-NEXT: ldx.d $a0, $t8, $a0 ; LARGE_SCH-NEXT: ldx.d $a0, $a0, $tp ; LARGE_SCH-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload ; LARGE_SCH-NEXT: addi.d $sp, $sp, 16 diff --git a/llvm/test/CodeGen/LoongArch/tls-models.ll b/llvm/test/CodeGen/LoongArch/tls-models.ll index a2a3792a6a54b..3994df1da7163 100644 --- a/llvm/test/CodeGen/LoongArch/tls-models.ll +++ b/llvm/test/CodeGen/LoongArch/tls-models.ll @@ -45,15 +45,15 @@ define ptr @f1() nounwind { ; LA64LARGEPIC-NEXT: addi.d $sp, $sp, -16 ; LA64LARGEPIC-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ; LA64LARGEPIC-NEXT: pcalau12i $a0, %gd_pc_hi20(unspecified) -; LA64LARGEPIC-NEXT: addi.d $a1, $zero, %got_pc_lo12(unspecified) -; LA64LARGEPIC-NEXT: lu32i.d $a1, %got64_pc_lo20(unspecified) -; LA64LARGEPIC-NEXT: 
lu52i.d $a1, $a1, %got64_pc_hi12(unspecified) -; LA64LARGEPIC-NEXT: add.d $a0, $a1, $a0 -; LA64LARGEPIC-NEXT: pcalau12i $a1, %pc_hi20(__tls_get_addr) -; LA64LARGEPIC-NEXT: addi.d $ra, $zero, %pc_lo12(__tls_get_addr) -; LA64LARGEPIC-NEXT: lu32i.d $ra, %pc64_lo20(__tls_get_addr) -; LA64LARGEPIC-NEXT: lu52i.d $ra, $ra, %pc64_hi12(__tls_get_addr) -; LA64LARGEPIC-NEXT: add.d $ra, $ra, $a1 +; LA64LARGEPIC-NEXT: addi.d $t8, $zero, %got_pc_lo12(unspecified) +; LA64LARGEPIC-NEXT: lu32i.d $t8, %got64_pc_lo20(unspecified) +; LA64LARGEPIC-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(unspecified) +; LA64LARGEPIC-NEXT: add.d $a0, $t8, $a0 +; LA64LARGEPIC-NEXT: pcalau12i $ra, %pc_hi20(__tls_get_addr) +; LA64LARGEPIC-NEXT: addi.d $t8, $zero, %pc_lo12(__tls_get_addr) +; LA64LARGEPIC-NEXT: lu32i.d $t8, %pc64_lo20(__tls_get_addr) +; LA64LARGEPIC-NEXT: lu52i.d $t8, $t8, %pc64_hi12(__tls_get_addr) +; LA64LARGEPIC-NEXT: add.d $ra, $t8, $ra ; LA64LARGEPIC-NEXT: jirl $ra, $ra, 0 ; LA64LARGEPIC-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload ; LA64LARGEPIC-NEXT: addi.d $sp, $sp, 16 @@ -76,10 +76,10 @@ define ptr @f1() nounwind { ; LA64LARGENOPIC-LABEL: f1: ; LA64LARGENOPIC: # %bb.0: # %entry ; LA64LARGENOPIC-NEXT: pcalau12i $a0, %ie_pc_hi20(unspecified) -; LA64LARGENOPIC-NEXT: addi.d $a1, $zero, %ie_pc_lo12(unspecified) -; LA64LARGENOPIC-NEXT: lu32i.d $a1, %ie64_pc_lo20(unspecified) -; LA64LARGENOPIC-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(unspecified) -; LA64LARGENOPIC-NEXT: ldx.d $a0, $a1, $a0 +; LA64LARGENOPIC-NEXT: addi.d $t8, $zero, %ie_pc_lo12(unspecified) +; LA64LARGENOPIC-NEXT: lu32i.d $t8, %ie64_pc_lo20(unspecified) +; LA64LARGENOPIC-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(unspecified) +; LA64LARGENOPIC-NEXT: ldx.d $a0, $t8, $a0 ; LA64LARGENOPIC-NEXT: add.d $a0, $a0, $tp ; LA64LARGENOPIC-NEXT: ret entry: @@ -116,15 +116,15 @@ define ptr @f2() nounwind { ; LA64LARGEPIC-NEXT: addi.d $sp, $sp, -16 ; LA64LARGEPIC-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ; LA64LARGEPIC-NEXT: pcalau12i $a0, %ld_pc_hi20(ld) -; LA64LARGEPIC-NEXT: addi.d $a1, $zero, %got_pc_lo12(ld) -; LA64LARGEPIC-NEXT: lu32i.d $a1, %got64_pc_lo20(ld) -; LA64LARGEPIC-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(ld) -; LA64LARGEPIC-NEXT: add.d $a0, $a1, $a0 -; LA64LARGEPIC-NEXT: pcalau12i $a1, %pc_hi20(__tls_get_addr) -; LA64LARGEPIC-NEXT: addi.d $ra, $zero, %pc_lo12(__tls_get_addr) -; LA64LARGEPIC-NEXT: lu32i.d $ra, %pc64_lo20(__tls_get_addr) -; LA64LARGEPIC-NEXT: lu52i.d $ra, $ra, %pc64_hi12(__tls_get_addr) -; LA64LARGEPIC-NEXT: add.d $ra, $ra, $a1 +; LA64LARGEPIC-NEXT: addi.d $t8, $zero, %got_pc_lo12(ld) +; LA64LARGEPIC-NEXT: lu32i.d $t8, %got64_pc_lo20(ld) +; LA64LARGEPIC-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(ld) +; LA64LARGEPIC-NEXT: add.d $a0, $t8, $a0 +; LA64LARGEPIC-NEXT: pcalau12i $ra, %pc_hi20(__tls_get_addr) +; LA64LARGEPIC-NEXT: addi.d $t8, $zero, %pc_lo12(__tls_get_addr) +; LA64LARGEPIC-NEXT: lu32i.d $t8, %pc64_lo20(__tls_get_addr) +; LA64LARGEPIC-NEXT: lu52i.d $t8, $t8, %pc64_hi12(__tls_get_addr) +; LA64LARGEPIC-NEXT: add.d $ra, $t8, $ra ; LA64LARGEPIC-NEXT: jirl $ra, $ra, 0 ; LA64LARGEPIC-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload ; LA64LARGEPIC-NEXT: addi.d $sp, $sp, 16 @@ -147,10 +147,10 @@ define ptr @f2() nounwind { ; LA64LARGENOPIC-LABEL: f2: ; LA64LARGENOPIC: # %bb.0: # %entry ; LA64LARGENOPIC-NEXT: pcalau12i $a0, %ie_pc_hi20(ld) -; LA64LARGENOPIC-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ld) -; LA64LARGENOPIC-NEXT: lu32i.d $a1, %ie64_pc_lo20(ld) -; LA64LARGENOPIC-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ld) -; LA64LARGENOPIC-NEXT: ldx.d $a0, 
$a1, $a0
+; LA64LARGENOPIC-NEXT: addi.d $t8, $zero, %ie_pc_lo12(ld)
+; LA64LARGENOPIC-NEXT: lu32i.d $t8, %ie64_pc_lo20(ld)
+; LA64LARGENOPIC-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(ld)
+; LA64LARGENOPIC-NEXT: ldx.d $a0, $t8, $a0
 ; LA64LARGENOPIC-NEXT: add.d $a0, $a0, $tp
 ; LA64LARGENOPIC-NEXT: ret
 entry:
@@ -177,10 +177,10 @@ define ptr @f3() nounwind {
 ; LA64LARGEPIC-LABEL: f3:
 ; LA64LARGEPIC: # %bb.0: # %entry
 ; LA64LARGEPIC-NEXT: pcalau12i $a0, %ie_pc_hi20(ie)
-; LA64LARGEPIC-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ie)
-; LA64LARGEPIC-NEXT: lu32i.d $a1, %ie64_pc_lo20(ie)
-; LA64LARGEPIC-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ie)
-; LA64LARGEPIC-NEXT: ldx.d $a0, $a1, $a0
+; LA64LARGEPIC-NEXT: addi.d $t8, $zero, %ie_pc_lo12(ie)
+; LA64LARGEPIC-NEXT: lu32i.d $t8, %ie64_pc_lo20(ie)
+; LA64LARGEPIC-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(ie)
+; LA64LARGEPIC-NEXT: ldx.d $a0, $t8, $a0
 ; LA64LARGEPIC-NEXT: add.d $a0, $a0, $tp
 ; LA64LARGEPIC-NEXT: ret
 ;
@@ -201,10 +201,10 @@ define ptr @f3() nounwind {
 ; LA64LARGENOPIC-LABEL: f3:
 ; LA64LARGENOPIC: # %bb.0: # %entry
 ; LA64LARGENOPIC-NEXT: pcalau12i $a0, %ie_pc_hi20(ie)
-; LA64LARGENOPIC-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ie)
-; LA64LARGENOPIC-NEXT: lu32i.d $a1, %ie64_pc_lo20(ie)
-; LA64LARGENOPIC-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ie)
-; LA64LARGENOPIC-NEXT: ldx.d $a0, $a1, $a0
+; LA64LARGENOPIC-NEXT: addi.d $t8, $zero, %ie_pc_lo12(ie)
+; LA64LARGENOPIC-NEXT: lu32i.d $t8, %ie64_pc_lo20(ie)
+; LA64LARGENOPIC-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(ie)
+; LA64LARGENOPIC-NEXT: ldx.d $a0, $t8, $a0
 ; LA64LARGENOPIC-NEXT: add.d $a0, $a0, $tp
 ; LA64LARGENOPIC-NEXT: ret
 entry:
From f68647997b828eeaa580e74e1d7c8565a9ecd215 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?D=C3=A1vid=20Ferenc=20Szab=C3=B3?=
 <30732159+dfszabo@users.noreply.github.com>
Date: Fri, 5 Jan 2024 04:13:39 +0100
Subject: [PATCH 306/313] =?UTF-8?q?[GlobalISel]=20Adding=20support=20for?=
 =?UTF-8?q?=20handling=20G=5FASSERT=5F{SEXT,ZEXT,ALIGN}=20i=E2=80=A6=20(#7?=
 =?UTF-8?q?4196)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…n artifact combiner

These instructions are hints for optimizations and can be treated as
copies, and they are handled as such with this change. Without it, it is
possible to run into an assertion, since tryCombineUnmergeValues
rightfully uses getDefIgnoringCopies to get the source MI, which already
handles these hint instructions and treats them as copies. The problem is
that markDefDead only considers COPYs, which leads to a crash with an
assertion for cases like testUnmergeHintOfTrunc.
---
 .../GlobalISel/LegalizationArtifactCombiner.h | 21 +++---
 .../irtranslator-hoisted-constants.ll | 5 +-
 .../AArch64/GlobalISel/legalize-simple.mir | 64 +++++++++++++++++++
 3 files changed, 75 insertions(+), 15 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
index 851353042cc26..da330b517c280 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
@@ -1366,6 +1366,9 @@ class LegalizationArtifactCombiner {
         // Adding Use to ArtifactList.
WrapperObserver.changedInstr(Use); break; + case TargetOpcode::G_ASSERT_SEXT: + case TargetOpcode::G_ASSERT_ZEXT: + case TargetOpcode::G_ASSERT_ALIGN: case TargetOpcode::COPY: { Register Copy = Use.getOperand(0).getReg(); if (Copy.isVirtual()) @@ -1392,6 +1395,9 @@ class LegalizationArtifactCombiner { case TargetOpcode::G_ANYEXT: case TargetOpcode::G_SEXT: case TargetOpcode::G_EXTRACT: + case TargetOpcode::G_ASSERT_SEXT: + case TargetOpcode::G_ASSERT_ZEXT: + case TargetOpcode::G_ASSERT_ALIGN: return MI.getOperand(1).getReg(); case TargetOpcode::G_UNMERGE_VALUES: return MI.getOperand(MI.getNumOperands() - 1).getReg(); @@ -1425,7 +1431,8 @@ class LegalizationArtifactCombiner { if (MRI.hasOneUse(PrevRegSrc)) { if (TmpDef != &DefMI) { assert((TmpDef->getOpcode() == TargetOpcode::COPY || - isArtifactCast(TmpDef->getOpcode())) && + isArtifactCast(TmpDef->getOpcode()) || + isPreISelGenericOptimizationHint(TmpDef->getOpcode())) && "Expecting copy or artifact cast here"); DeadInsts.push_back(TmpDef); @@ -1509,16 +1516,8 @@ class LegalizationArtifactCombiner { /// Looks through copy instructions and returns the actual /// source register. Register lookThroughCopyInstrs(Register Reg) { - using namespace llvm::MIPatternMatch; - - Register TmpReg; - while (mi_match(Reg, MRI, m_Copy(m_Reg(TmpReg)))) { - if (MRI.getType(TmpReg).isValid()) - Reg = TmpReg; - else - break; - } - return Reg; + Register TmpReg = getSrcRegIgnoringCopies(Reg, MRI); + return TmpReg.isValid() ? TmpReg : Reg; } }; diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-hoisted-constants.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-hoisted-constants.ll index a7d7a1e81617e..5867326c18aa6 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-hoisted-constants.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-hoisted-constants.ll @@ -46,14 +46,11 @@ define i32 @test(i32 %a, i1 %c) { ; PRESELECTION-NEXT: {{ $}} ; PRESELECTION-NEXT: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0 ; PRESELECTION-NEXT: [[COPY1:%[0-9]+]]:gpr(s32) = COPY $w1 - ; PRESELECTION-NEXT: [[TRUNC:%[0-9]+]]:gpr(s8) = G_TRUNC [[COPY1]](s32) - ; PRESELECTION-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:gpr(s8) = G_ASSERT_ZEXT [[TRUNC]], 1 ; PRESELECTION-NEXT: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0 ; PRESELECTION-NEXT: [[C1:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 100000 ; PRESELECTION-NEXT: [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:gpr(s32) = G_CONSTANT_FOLD_BARRIER [[C1]] - ; PRESELECTION-NEXT: [[ANYEXT:%[0-9]+]]:gpr(s32) = G_ANYEXT [[ASSERT_ZEXT]](s8) ; PRESELECTION-NEXT: [[C2:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1 - ; PRESELECTION-NEXT: [[AND:%[0-9]+]]:gpr(s32) = G_AND [[ANYEXT]], [[C2]] + ; PRESELECTION-NEXT: [[AND:%[0-9]+]]:gpr(s32) = G_AND [[COPY1]], [[C2]] ; PRESELECTION-NEXT: G_BRCOND [[AND]](s32), %bb.3 ; PRESELECTION-NEXT: G_BR %bb.2 ; PRESELECTION-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-simple.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-simple.mir index 62865bcfdf081..ce02aa380efe1 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-simple.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-simple.mir @@ -136,6 +136,47 @@ body: | $x0 = COPY %3(s64) RET_ReallyLR implicit $x0 +... 
+---
+name: testExtOfHintOfTrunc
+body: |
+  bb.0:
+    liveins: $x0
+
+    ; CHECK-LABEL: name: testExtOfHintOfTrunc
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: $x0 = COPY [[COPY]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %1:_(s8) = G_TRUNC %0(s64)
+    %2:_(s8) = G_ASSERT_ZEXT %1(s8), 1
+    %3:_(s64) = G_ANYEXT %2(s8)
+    $x0 = COPY %3(s64)
+    RET_ReallyLR implicit $x0
+
+...
+---
+name: testExtOfCopyAndHintOfTrunc
+body: |
+  bb.0:
+    liveins: $x0
+
+    ; CHECK-LABEL: name: testExtOfCopyAndHintOfTrunc
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: $x0 = COPY [[COPY]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %1:_(s8) = G_TRUNC %0(s64)
+    %2:_(s8) = COPY %1(s8)
+    %3:_(s8) = G_ASSERT_ZEXT %2(s8), 1
+    %4:_(s64) = G_ANYEXT %3(s8)
+    $x0 = COPY %4(s64)
+    RET_ReallyLR implicit $x0
+
 ...
 ---
 name: testExtOf2CopyOfTrunc
@@ -158,3 +199,26 @@ body: |
     RET_ReallyLR implicit $x0
 
 ...
+---
+name: testUnmergeHintOfTrunc
+body: |
+  bb.0:
+    liveins: $x0
+
+    ; CHECK-LABEL: name: testUnmergeHintOfTrunc
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8), [[UV3:%[0-9]+]]:_(s8), [[UV4:%[0-9]+]]:_(s8), [[UV5:%[0-9]+]]:_(s8), [[UV6:%[0-9]+]]:_(s8), [[UV7:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[COPY]](s64)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UV1]](s8)
+    ; CHECK-NEXT: $x0 = COPY [[ANYEXT]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %1:_(s16) = G_TRUNC %0(s64)
+    %2:_(s16) = G_ASSERT_ZEXT %1(s16), 1
+    %3:_(s8), %4:_(s8) = G_UNMERGE_VALUES %2(s16)
+    %5:_(s64) = G_ANYEXT %4(s8)
+    $x0 = COPY %5(s64)
+    RET_ReallyLR implicit $x0
+
+...
From b4cfb50c65bd3893d1a0639deb33bd664214a20f Mon Sep 17 00:00:00 2001
From: Michal Paszkowski
Date: Thu, 4 Jan 2024 19:31:15 -0800
Subject: [PATCH 307/313] [SPIR-V] Emit SPIR-V bitcasts between
 source/expected pointer type (#69621)

This patch introduces a new spv_ptrcast intrinsic for tracking expected
pointer types. The change fixes multiple OpenCL CTS regressions due to
the switch to opaque pointers (e.g. basic/hiloeo).
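To make the effect concrete, here is a minimal sketch; it is an illustration only and mirrors the goo function in the updated alloca-load-store.ll test below. The i32 element type expected at the store and load does not match the i8 pointee deduced for the untyped pointer argument, so the expected type is now tracked through the new intrinsic and the backend emits an OpBitcast from the argument's deduced pointer type to the expected one before the access; when the types already agree, the cast is dropped and the source register is reused:

  ; Minimal sketch mirroring the updated alloca-load-store.ll test:
  define i32 @goo(i32 %a, ptr addrspace(1) %p) {
    store i32 %a, ptr addrspace(1) %p      ; expected pointee type: i32
    %b = load i32, ptr addrspace(1) %p     ; lowered via OpBitcast to a
    ret i32 %b                             ; pointer-to-i32, then OpStore/OpLoad
  }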
--- llvm/include/llvm/IR/IntrinsicsSPIRV.td | 1 + llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp | 7 +- llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp | 125 +++++++++++++++++- llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp | 24 +++- .../CodeGen/SPIRV/AtomicCompareExchange.ll | 31 ++--- .../SPIRV/function/alloca-load-store.ll | 22 +-- .../undef-nested-composite-store.ll | 3 +- .../undef-simple-composite-store.ll | 3 +- .../CodeGen/SPIRV/llvm-intrinsics/memset.ll | 4 +- llvm/test/CodeGen/SPIRV/opaque_pointers.ll | 20 +-- .../SPIRV/opencl/basic/get_global_offset.ll | 8 +- ...er-type-deduction-no-bitcast-to-generic.ll | 19 +++ .../SPIRV/pointers/two-bitcast-users.ll | 19 +++ .../SPIRV/pointers/two-subsequent-bitcasts.ll | 17 +++ llvm/test/CodeGen/SPIRV/sitofp-with-bool.ll | 15 ++- .../OpPhi_ArgumentsPlaceholders.ll | 1 + .../CodeGen/SPIRV/transcoding/spec_const.ll | 1 + .../spirv-private-array-initialization.ll | 6 +- llvm/test/CodeGen/SPIRV/uitofp-with-bool.ll | 4 +- 19 files changed, 264 insertions(+), 66 deletions(-) create mode 100644 llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type-deduction-no-bitcast-to-generic.ll create mode 100644 llvm/test/CodeGen/SPIRV/pointers/two-bitcast-users.ll create mode 100644 llvm/test/CodeGen/SPIRV/pointers/two-subsequent-bitcasts.ll diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index 736be8ca3212b..ea0074d22a441 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -28,6 +28,7 @@ let TargetPrefix = "spv" in { def int_spv_insertelt : Intrinsic<[llvm_any_ty], [llvm_any_ty, llvm_any_ty, llvm_anyint_ty]>; def int_spv_const_composite : Intrinsic<[llvm_i32_ty], [llvm_vararg_ty]>; def int_spv_bitcast : Intrinsic<[llvm_any_ty], [llvm_any_ty]>; + def int_spv_ptrcast : Intrinsic<[llvm_any_ty], [llvm_any_ty, llvm_metadata_ty, llvm_i32_ty], [ImmArg>]>; def int_spv_switch : Intrinsic<[], [llvm_any_ty, llvm_vararg_ty]>; def int_spv_cmpxchg : Intrinsic<[llvm_i32_ty], [llvm_any_ty, llvm_vararg_ty]>; def int_spv_unreachable : Intrinsic<[], []>; diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp index 629db8e2eb4d0..0a8b5499a1fc2 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp @@ -211,8 +211,8 @@ static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx, MDString *MDKernelArgType = getKernelArgAttribute(F, ArgIdx, "kernel_arg_type"); - if (!MDKernelArgType || (MDKernelArgType->getString().ends_with("*") && - MDKernelArgType->getString().ends_with("_t"))) + if (!MDKernelArgType || (!MDKernelArgType->getString().ends_with("*") && + !MDKernelArgType->getString().ends_with("_t"))) return GR->getOrCreateSPIRVType(OriginalArgType, MIRBuilder, ArgAccessQual); if (MDKernelArgType->getString().ends_with("*")) @@ -438,7 +438,8 @@ bool SPIRVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, assert(Arg.Regs.size() == 1 && "Call arg has multiple VRegs"); ArgVRegs.push_back(Arg.Regs[0]); SPIRVType *SPIRVTy = GR->getOrCreateSPIRVType(Arg.Ty, MIRBuilder); - GR->assignSPIRVTypeToVReg(SPIRVTy, Arg.Regs[0], MIRBuilder.getMF()); + if (!GR->getSPIRVTypeForVReg(Arg.Regs[0])) + GR->assignSPIRVTypeToVReg(SPIRVTy, Arg.Regs[0], MIRBuilder.getMF()); } if (auto Res = SPIRV::lowerBuiltin( DemangledName, SPIRV::InstructionSet::OpenCL_std, MIRBuilder, diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index 
660c574daf38f..fb4e9932dd2dc 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -74,6 +74,7 @@ class SPIRVEmitIntrinsics void processInstrAfterVisit(Instruction *I); void insertAssignPtrTypeIntrs(Instruction *I); void insertAssignTypeIntrs(Instruction *I); + void insertPtrCastInstr(Instruction *I); void processGlobalValue(GlobalVariable &GV); public: @@ -255,7 +256,19 @@ Instruction *SPIRVEmitIntrinsics::visitGetElementPtrInst(GetElementPtrInst &I) { } Instruction *SPIRVEmitIntrinsics::visitBitCastInst(BitCastInst &I) { - SmallVector Types = {I.getType(), I.getOperand(0)->getType()}; + Value *Source = I.getOperand(0); + + // SPIR-V, contrary to LLVM 17+ IR, supports bitcasts between pointers of + // varying element types. In case of IR coming from older versions of LLVM + // such bitcasts do not provide sufficient information, should be just skipped + // here, and handled in insertPtrCastInstr. + if (I.getType()->isPointerTy()) { + I.replaceAllUsesWith(Source); + I.eraseFromParent(); + return nullptr; + } + + SmallVector Types = {I.getType(), Source->getType()}; SmallVector Args(I.op_begin(), I.op_end()); auto *NewI = IRB->CreateIntrinsic(Intrinsic::spv_bitcast, {Types}, {Args}); std::string InstName = I.hasName() ? I.getName().str() : ""; @@ -265,6 +278,111 @@ Instruction *SPIRVEmitIntrinsics::visitBitCastInst(BitCastInst &I) { return NewI; } +void SPIRVEmitIntrinsics::insertPtrCastInstr(Instruction *I) { + Value *Pointer; + Type *ExpectedElementType; + unsigned OperandToReplace; + if (StoreInst *SI = dyn_cast(I)) { + Pointer = SI->getPointerOperand(); + ExpectedElementType = SI->getValueOperand()->getType(); + OperandToReplace = 1; + } else if (LoadInst *LI = dyn_cast(I)) { + Pointer = LI->getPointerOperand(); + ExpectedElementType = LI->getType(); + OperandToReplace = 0; + } else if (GetElementPtrInst *GEPI = dyn_cast(I)) { + Pointer = GEPI->getPointerOperand(); + ExpectedElementType = GEPI->getSourceElementType(); + OperandToReplace = 0; + } else { + return; + } + + // If Pointer is the result of nop BitCastInst (ptr -> ptr), use the source + // pointer instead. The BitCastInst should be later removed when visited. + while (BitCastInst *BC = dyn_cast(Pointer)) + Pointer = BC->getOperand(0); + + // Do not emit spv_ptrcast if Pointer is a GlobalValue of expected type. + GlobalValue *GV = dyn_cast(Pointer); + if (GV && GV->getValueType() == ExpectedElementType) + return; + + // Do not emit spv_ptrcast if Pointer is a result of alloca with expected + // type. + AllocaInst *A = dyn_cast(Pointer); + if (A && A->getAllocatedType() == ExpectedElementType) + return; + + if (dyn_cast(Pointer)) + return; + + setInsertPointSkippingPhis(*IRB, I); + Constant *ExpectedElementTypeConst = + Constant::getNullValue(ExpectedElementType); + ConstantAsMetadata *CM = + ValueAsMetadata::getConstant(ExpectedElementTypeConst); + MDTuple *TyMD = MDNode::get(F->getContext(), CM); + MetadataAsValue *VMD = MetadataAsValue::get(F->getContext(), TyMD); + unsigned AddressSpace = Pointer->getType()->getPointerAddressSpace(); + bool FirstPtrCastOrAssignPtrType = true; + + // Do not emit new spv_ptrcast if equivalent one already exists or when + // spv_assign_ptr_type already targets this pointer with the same element + // type. 
+ for (auto User : Pointer->users()) { + auto *II = dyn_cast(User); + if (!II || + (II->getIntrinsicID() != Intrinsic::spv_assign_ptr_type && + II->getIntrinsicID() != Intrinsic::spv_ptrcast) || + II->getOperand(0) != Pointer) + continue; + + // There is some spv_ptrcast/spv_assign_ptr_type already targeting this + // pointer. + FirstPtrCastOrAssignPtrType = false; + if (II->getOperand(1) != VMD || + dyn_cast(II->getOperand(2))->getSExtValue() != + AddressSpace) + continue; + + // The spv_ptrcast/spv_assign_ptr_type targeting this pointer is of the same + // element type and address space. + if (II->getIntrinsicID() != Intrinsic::spv_ptrcast) + return; + + // This must be a spv_ptrcast, do not emit new if this one has the same BB + // as I. Otherwise, search for other spv_ptrcast/spv_assign_ptr_type. + if (II->getParent() != I->getParent()) + continue; + + I->setOperand(OperandToReplace, II); + return; + } + + // Do not emit spv_ptrcast if it would cast to the default pointer element + // type (i8) of the same address space. + if (ExpectedElementType->isIntegerTy(8)) + return; + + // If this would be the first spv_ptrcast and there is no spv_assign_ptr_type + // for this pointer before, do not emit spv_ptrcast but emit + // spv_assign_ptr_type instead. + if (FirstPtrCastOrAssignPtrType && isa(Pointer)) { + buildIntrWithMD(Intrinsic::spv_assign_ptr_type, {Pointer->getType()}, + ExpectedElementTypeConst, Pointer, + {IRB->getInt32(AddressSpace)}); + return; + } else { + SmallVector Types = {Pointer->getType(), Pointer->getType()}; + SmallVector Args = {Pointer, VMD, IRB->getInt32(AddressSpace)}; + auto *PtrCastI = + IRB->CreateIntrinsic(Intrinsic::spv_ptrcast, {Types}, Args); + I->setOperand(OperandToReplace, PtrCastI); + return; + } +} + Instruction *SPIRVEmitIntrinsics::visitInsertElementInst(InsertElementInst &I) { SmallVector Types = {I.getType(), I.getOperand(0)->getType(), I.getOperand(1)->getType(), @@ -522,13 +640,18 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) { for (auto &I : Worklist) { insertAssignPtrTypeIntrs(I); insertAssignTypeIntrs(I); + insertPtrCastInstr(I); } for (auto *I : Worklist) { TrackConstants = true; if (!I->getType()->isVoidTy() || isa(I)) IRB->SetInsertPoint(I->getNextNode()); + // Visitors return either the original/newly created instruction for further + // processing, nullptr otherwise. 
I = visit(*I); + if (!I) + continue; processInstrAfterVisit(I); } return true; diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index 1bfce70fedc0e..429b08e199cdf 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -125,12 +125,32 @@ static void insertBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR, SmallVector ToErase; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { - if (!isSpvIntrinsic(MI, Intrinsic::spv_bitcast)) + if (!isSpvIntrinsic(MI, Intrinsic::spv_bitcast) && + !isSpvIntrinsic(MI, Intrinsic::spv_ptrcast)) continue; assert(MI.getOperand(2).isReg()); MIB.setInsertPt(*MI.getParent(), MI); - MIB.buildBitcast(MI.getOperand(0).getReg(), MI.getOperand(2).getReg()); ToErase.push_back(&MI); + if (isSpvIntrinsic(MI, Intrinsic::spv_bitcast)) { + MIB.buildBitcast(MI.getOperand(0).getReg(), MI.getOperand(2).getReg()); + continue; + } + Register Def = MI.getOperand(0).getReg(); + Register Source = MI.getOperand(2).getReg(); + SPIRVType *BaseTy = GR->getOrCreateSPIRVType( + getMDOperandAsType(MI.getOperand(3).getMetadata(), 0), MIB); + SPIRVType *AssignedPtrType = GR->getOrCreateSPIRVPointerType( + BaseTy, MI, *MF.getSubtarget().getInstrInfo(), + addressSpaceToStorageClass(MI.getOperand(4).getImm())); + + // If the bitcast would be redundant, replace all uses with the source + // register. + if (GR->getSPIRVTypeForVReg(Source) == AssignedPtrType) { + MIB.getMRI()->replaceRegWith(Def, Source); + } else { + GR->assignSPIRVTypeToVReg(AssignedPtrType, Def, MF); + MIB.buildBitcast(Def, Source); + } } } for (MachineInstr *MI : ToErase) diff --git a/llvm/test/CodeGen/SPIRV/AtomicCompareExchange.ll b/llvm/test/CodeGen/SPIRV/AtomicCompareExchange.ll index 8d12414c66a8f..323afec7f35f8 100644 --- a/llvm/test/CodeGen/SPIRV/AtomicCompareExchange.ll +++ b/llvm/test/CodeGen/SPIRV/AtomicCompareExchange.ll @@ -10,47 +10,40 @@ ; CHECK-SPIRV-DAG: %[[#Struct:]] = OpTypeStruct %[[#Int]] %[[#Bool]] ; CHECK-SPIRV-DAG: %[[#UndefStruct:]] = OpUndef %[[#Struct]] -; CHECK-SPIRV: %[[#Pointer:]] = OpFunctionParameter %[[#]] -; CHECK-SPIRV: %[[#Value_ptr:]] = OpFunctionParameter %[[#]] -; CHECK-SPIRV: %[[#Comparator:]] = OpFunctionParameter %[[#]] - -; CHECK-SPIRV: %[[#Value:]] = OpLoad %[[#Int]] %[[#Value_ptr]] -; CHECK-SPIRV: %[[#Res:]] = OpAtomicCompareExchange %[[#Int]] %[[#Pointer]] %[[#MemScope_Device]] -; CHECK-SPIRV-SAME: %[[#MemSemEqual_SeqCst]] %[[#MemSemUnequal_Acquire]] %[[#Value]] %[[#Comparator]] +; CHECK-SPIRV: %[[#Value:]] = OpLoad %[[#Int]] %[[#Value_ptr:]] +; CHECK-SPIRV: %[[#Res:]] = OpAtomicCompareExchange %[[#Int]] %[[#Pointer:]] %[[#MemScope_Device]] +; CHECK-SPIRV-SAME: %[[#MemSemEqual_SeqCst]] %[[#MemSemUnequal_Acquire]] %[[#Value]] %[[#Comparator:]] ; CHECK-SPIRV: %[[#Success:]] = OpIEqual %[[#]] %[[#Res]] %[[#Comparator]] ; CHECK-SPIRV: %[[#Composite_0:]] = OpCompositeInsert %[[#Struct]] %[[#Res]] %[[#UndefStruct]] 0 ; CHECK-SPIRV: %[[#Composite_1:]] = OpCompositeInsert %[[#Struct]] %[[#Success]] %[[#Composite_0]] 1 ; CHECK-SPIRV: %[[#]] = OpCompositeExtract %[[#Bool]] %[[#Composite_1]] 1 -define dso_local spir_func void @test(i32* %ptr, i32* %value_ptr, i32 %comparator) local_unnamed_addr { +define dso_local spir_func void @test(ptr %ptr, ptr %value_ptr, i32 %comparator) local_unnamed_addr { entry: - %0 = load i32, i32* %value_ptr, align 4 - %1 = cmpxchg i32* %ptr, i32 %comparator, i32 %0 seq_cst acquire + %0 = load i32, ptr %value_ptr, align 4 + %1 = cmpxchg ptr 
%ptr, i32 %comparator, i32 %0 seq_cst acquire %2 = extractvalue { i32, i1 } %1, 1 br i1 %2, label %cmpxchg.continue, label %cmpxchg.store_expected cmpxchg.store_expected: ; preds = %entry %3 = extractvalue { i32, i1 } %1, 0 - store i32 %3, i32* %value_ptr, align 4 + store i32 %3, ptr %value_ptr, align 4 br label %cmpxchg.continue cmpxchg.continue: ; preds = %cmpxchg.store_expected, %entry ret void } -; CHECK-SPIRV: %[[#Ptr:]] = OpFunctionParameter %[[#]] -; CHECK-SPIRV: %[[#Store_ptr:]] = OpFunctionParameter %[[#]] - -; CHECK-SPIRV: %[[#Res_1:]] = OpAtomicCompareExchange %[[#Int]] %[[#Ptr]] %[[#MemScope_Device]] +; CHECK-SPIRV: %[[#Res_1:]] = OpAtomicCompareExchange %[[#Int]] %[[#Ptr:]] %[[#MemScope_Device]] ; CHECK-SPIRV-SAME: %[[#MemSemEqual_SeqCst]] %[[#MemSemUnequal_Acquire]] %[[#Constant_456]] %[[#Constant_128]] ; CHECK-SPIRV: %[[#Success_1:]] = OpIEqual %[[#]] %[[#Res_1]] %[[#Constant_128]] ; CHECK-SPIRV: %[[#Composite:]] = OpCompositeInsert %[[#Struct]] %[[#Res_1]] %[[#UndefStruct]] 0 ; CHECK-SPIRV: %[[#Composite_1:]] = OpCompositeInsert %[[#Struct]] %[[#Success_1]] %[[#Composite]] 1 -; CHECK-SPIRV: OpStore %[[#Store_ptr]] %[[#Composite_1]] +; CHECK-SPIRV: OpStore %[[#Store_ptr:]] %[[#Composite_1]] -define dso_local spir_func void @test2(i32* %ptr, {i32, i1}* %store_ptr) local_unnamed_addr { +define dso_local spir_func void @test2(ptr %ptr, ptr %store_ptr) local_unnamed_addr { entry: - %0 = cmpxchg i32* %ptr, i32 128, i32 456 seq_cst acquire - store { i32, i1 } %0, { i32, i1 }* %store_ptr, align 4 + %0 = cmpxchg ptr %ptr, i32 128, i32 456 seq_cst acquire + store { i32, i1 } %0, ptr %store_ptr, align 4 ret void } diff --git a/llvm/test/CodeGen/SPIRV/function/alloca-load-store.ll b/llvm/test/CodeGen/SPIRV/function/alloca-load-store.ll index 5e76eb11de633..c64a708b28ebe 100644 --- a/llvm/test/CodeGen/SPIRV/function/alloca-load-store.ll +++ b/llvm/test/CodeGen/SPIRV/function/alloca-load-store.ll @@ -1,17 +1,16 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s -target triple = "spirv32-unknown-unknown" - ; CHECK-DAG: OpName %[[#BAR:]] "bar" ; CHECK-DAG: OpName %[[#FOO:]] "foo" ; CHECK-DAG: OpName %[[#GOO:]] "goo" ; CHECK-DAG: %[[#CHAR:]] = OpTypeInt 8 ; CHECK-DAG: %[[#INT:]] = OpTypeInt 32 -; CHECK-DAG: %[[#STACK_PTR:]] = OpTypePointer Function %[[#INT]] -; CHECK-DAG: %[[#GLOBAL_PTR:]] = OpTypePointer CrossWorkgroup %[[#CHAR]] +; CHECK-DAG: %[[#STACK_PTR_INT:]] = OpTypePointer Function %[[#INT]] +; CHECK-DAG: %[[#GLOBAL_PTR_INT:]] = OpTypePointer CrossWorkgroup %[[#INT]] +; CHECK-DAG: %[[#GLOBAL_PTR_CHAR:]] = OpTypePointer CrossWorkgroup %[[#CHAR]] ; CHECK-DAG: %[[#FN1:]] = OpTypeFunction %[[#INT]] %[[#INT]] -; CHECK-DAG: %[[#FN2:]] = OpTypeFunction %[[#INT]] %[[#INT]] %[[#GLOBAL_PTR]] +; CHECK-DAG: %[[#FN2:]] = OpTypeFunction %[[#INT]] %[[#INT]] %[[#GLOBAL_PTR_CHAR]] define i32 @bar(i32 %a) { %p = alloca i32 @@ -23,7 +22,7 @@ define i32 @bar(i32 %a) { ; CHECK: %[[#BAR]] = OpFunction %[[#INT]] None %[[#FN1]] ; CHECK: %[[#A:]] = OpFunctionParameter %[[#INT]] ; CHECK: OpLabel -; CHECK: %[[#P:]] = OpVariable %[[#STACK_PTR]] Function +; CHECK: %[[#P:]] = OpVariable %[[#STACK_PTR_INT]] Function ; CHECK: OpStore %[[#P]] %[[#A]] ; CHECK: %[[#B:]] = OpLoad %[[#INT]] %[[#P]] ; CHECK: OpReturnValue %[[#B]] @@ -40,7 +39,7 @@ define i32 @foo(i32 %a) { ; CHECK: %[[#FOO]] = OpFunction %[[#INT]] None %[[#FN1]] ; CHECK: %[[#A:]] = OpFunctionParameter %[[#INT]] ; CHECK: OpLabel -; CHECK: %[[#P:]] = OpVariable %[[#STACK_PTR]] Function +; CHECK: %[[#P:]] = OpVariable 
%[[#STACK_PTR_INT]] Function ; CHECK: OpStore %[[#P]] %[[#A]] Volatile ; CHECK: %[[#B:]] = OpLoad %[[#INT]] %[[#P]] Volatile ; CHECK: OpReturnValue %[[#B]] @@ -48,7 +47,7 @@ define i32 @foo(i32 %a) { ;; Test load and store in global address space. -define i32 @goo(i32 %a, i32 addrspace(1)* %p) { +define i32 @goo(i32 %a, ptr addrspace(1) %p) { store i32 %a, i32 addrspace(1)* %p %b = load i32, i32 addrspace(1)* %p ret i32 %b @@ -56,9 +55,10 @@ define i32 @goo(i32 %a, i32 addrspace(1)* %p) { ; CHECK: %[[#GOO]] = OpFunction %[[#INT]] None %[[#FN2]] ; CHECK: %[[#A:]] = OpFunctionParameter %[[#INT]] -; CHECK: %[[#P:]] = OpFunctionParameter %[[#GLOBAL_PTR]] +; CHECK: %[[#P:]] = OpFunctionParameter %[[#GLOBAL_PTR_CHAR]] ; CHECK: OpLabel -; CHECK: OpStore %[[#P]] %[[#A]] -; CHECK: %[[#B:]] = OpLoad %[[#INT]] %[[#P]] +; CHECK: %[[#C:]] = OpBitcast %[[#GLOBAL_PTR_INT]] %[[#P]] +; CHECK: OpStore %[[#C]] %[[#A]] +; CHECK: %[[#B:]] = OpLoad %[[#INT]] %[[#C]] ; CHECK: OpReturnValue %[[#B]] ; CHECK: OpFunctionEnd diff --git a/llvm/test/CodeGen/SPIRV/instructions/undef-nested-composite-store.ll b/llvm/test/CodeGen/SPIRV/instructions/undef-nested-composite-store.ll index 9fa94b8484631..5fa4a7a93ee0b 100644 --- a/llvm/test/CodeGen/SPIRV/instructions/undef-nested-composite-store.ll +++ b/llvm/test/CodeGen/SPIRV/instructions/undef-nested-composite-store.ll @@ -9,7 +9,8 @@ ; CHECK: %[[#]] = OpFunction %[[#]] None %[[#]] ; CHECK-NEXT: %[[#PTR:]] = OpFunctionParameter %[[#]] ; CHECK-NEXT: %[[#]] = OpLabel -; CHECK-NEXT: OpStore %[[#PTR]] %[[#UNDEF]] Aligned 4 +; CHECK-NEXT: %[[#BC:]] = OpBitcast %[[#]] %[[#PTR]] +; CHECK-NEXT: OpStore %[[#BC]] %[[#UNDEF]] Aligned 4 ; CHECK-NEXT: OpReturn ; CHECK-NEXT: OpFunctionEnd diff --git a/llvm/test/CodeGen/SPIRV/instructions/undef-simple-composite-store.ll b/llvm/test/CodeGen/SPIRV/instructions/undef-simple-composite-store.ll index 6a274732417d4..24dad10382ada 100644 --- a/llvm/test/CodeGen/SPIRV/instructions/undef-simple-composite-store.ll +++ b/llvm/test/CodeGen/SPIRV/instructions/undef-simple-composite-store.ll @@ -8,7 +8,8 @@ ; CHECK: %[[#]] = OpFunction %[[#]] None %[[#]] ; CHECK-NEXT: %[[#PTR:]] = OpFunctionParameter %[[#]] ; CHECK-NEXT: %[[#]] = OpLabel -; CHECK-NEXT: OpStore %[[#PTR]] %[[#UNDEF]] Aligned 4 +; CHECK-NEXT: %[[#BC:]] = OpBitcast %[[#]] %[[#PTR]] +; CHECK-NEXT: OpStore %[[#BC]] %[[#UNDEF]] Aligned 4 ; CHECK-NEXT: OpReturn ; CHECK-NEXT: OpFunctionEnd diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memset.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memset.ll index 3d7d9fbf90759..e7a986980f250 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memset.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memset.ll @@ -25,9 +25,7 @@ ; CHECK: %[[#VarNull:]] = OpVariable %[[#]] UniformConstant %[[#ConstNull]] ; CHECK-DAG: %[[#Int8PtrConst:]] = OpTypePointer UniformConstant %[[#Int8]] -; CHECK: %[[#Target:]] = OpBitcast %[[#Int8Ptr]] %[[#]] -; CHECK: %[[#Source:]] = OpBitcast %[[#Int8PtrConst]] %[[#VarNull]] -; CHECK: OpCopyMemorySized %[[#Target]] %[[#Source]] %[[#Const12]] Aligned 4 +; CHECK: OpCopyMemorySized %[[#Target:]] %[[#Source:]] %[[#Const12]] Aligned 4 ; CHECK: %[[#SourceComp:]] = OpBitcast %[[#Int8PtrConst]] %[[#VarComp]] ; CHECK: OpCopyMemorySized %[[#]] %[[#SourceComp]] %[[#Const4]] Aligned 4 diff --git a/llvm/test/CodeGen/SPIRV/opaque_pointers.ll b/llvm/test/CodeGen/SPIRV/opaque_pointers.ll index aaddea1372721..30c9d8dc774e2 100644 --- a/llvm/test/CodeGen/SPIRV/opaque_pointers.ll +++ b/llvm/test/CodeGen/SPIRV/opaque_pointers.ll @@ -1,19 +1,23 
@@ ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK ; CHECK-DAG: %[[#Int8Ty:]] = OpTypeInt 8 0 -; CHECK-DAG: %[[#PtrTy:]] = OpTypePointer Function %[[#Int8Ty]] -; CHECK-DAG: %[[#Int64Ty:]] = OpTypeInt 64 0 -; CHECK-DAG: %[[#FTy:]] = OpTypeFunction %[[#Int64Ty]] %[[#PtrTy]] +; CHECK-DAG: %[[#PtrInt8Ty:]] = OpTypePointer Function %[[#Int8Ty]] ; CHECK-DAG: %[[#Int32Ty:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#PtrInt32Ty:]] = OpTypePointer Function %[[#Int32Ty]] +; CHECK-DAG: %[[#Int64Ty:]] = OpTypeInt 64 0 +; CHECK-DAG: %[[#PtrInt64Ty:]] = OpTypePointer Function %[[#Int64Ty]] +; CHECK-DAG: %[[#FTy:]] = OpTypeFunction %[[#Int64Ty]] %[[#PtrInt8Ty]] ; CHECK-DAG: %[[#Const:]] = OpConstant %[[#Int32Ty]] 0 ; CHECK: OpFunction %[[#Int64Ty]] None %[[#FTy]] -; CHECK: %[[#Parm:]] = OpFunctionParameter %[[#PtrTy]] -; CHECK: OpStore %[[#Parm]] %[[#Const]] Aligned 4 -; CHECK: %[[#Res:]] = OpLoad %[[#Int64Ty]] %[[#Parm]] Aligned 8 +; CHECK: %[[#Parm:]] = OpFunctionParameter %[[#PtrInt8Ty]] +; CHECK-DAG: %[[#Bitcast1:]] = OpBitcast %[[#PtrInt32Ty]] %[[#Parm]] +; CHECK: OpStore %[[#Bitcast1]] %[[#Const]] Aligned 4 +; CHECK-DAG: %[[#Bitcast2:]] = OpBitcast %[[#PtrInt64Ty]] %[[#Parm]] +; CHECK: %[[#Res:]] = OpLoad %[[#Int64Ty]] %[[#Bitcast2]] Aligned 4 ; CHECK: OpReturnValue %[[#Res]] define i64 @test(ptr %p) { - store i32 0, ptr %p - %v = load i64, ptr %p + store i32 0, ptr %p, align 4 + %v = load i64, ptr %p, align 4 ret i64 %v } diff --git a/llvm/test/CodeGen/SPIRV/opencl/basic/get_global_offset.ll b/llvm/test/CodeGen/SPIRV/opencl/basic/get_global_offset.ll index 127804671cee4..43d6238360f6f 100644 --- a/llvm/test/CodeGen/SPIRV/opencl/basic/get_global_offset.ll +++ b/llvm/test/CodeGen/SPIRV/opencl/basic/get_global_offset.ll @@ -1,10 +1,7 @@ ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; CHECK: OpEntryPoint Kernel %[[#test_func:]] "test" -; CHECK: OpName %[[#outOffsets:]] "outOffsets" -; CHECK: OpName %[[#test_func]] "test" -; CHECK: OpName %[[#f2_decl:]] "BuiltInGlobalOffset" -; CHECK: OpDecorate %[[#f2_decl]] LinkageAttributes "BuiltInGlobalOffset" Import +; CHECK: OpDecorate %[[#f2_decl:]] LinkageAttributes "BuiltInGlobalOffset" Import ; CHECK: %[[#int_ty:]] = OpTypeInt 8 0 ; CHECK: %[[#void_ty:]] = OpTypeVoid ; CHECK: %[[#iptr_ty:]] = OpTypePointer CrossWorkgroup %[[#int_ty]] @@ -26,14 +23,13 @@ ; CHECK-NOT: %[[#vec_ty]] = OpFunction ; CHECK-NOT: %[[#func2_ty]] = OpFunction ; CHECK-NOT: %[[#f2_decl]] = OpFunction -; CHECK: %[[#outOffsets]] = OpFunctionParameter %[[#iptr_ty]] define spir_kernel void @test(i32 addrspace(1)* %outOffsets) { entry: %0 = call spir_func <3 x i64> @BuiltInGlobalOffset() #1 %call = extractelement <3 x i64> %0, i32 0 %conv = trunc i64 %call to i32 -; CHECK: %[[#i1:]] = OpInBoundsPtrAccessChain %[[#i32ptr_ty]] %[[#outOffsets]] +; CHECK: %[[#i1:]] = OpInBoundsPtrAccessChain %[[#i32ptr_ty]] %[[#outOffsets:]] ; CHECK: OpStore %[[#i1:]] %[[#]] Aligned 4 %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %outOffsets, i64 0 store i32 %conv, i32 addrspace(1)* %arrayidx, align 4 diff --git a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type-deduction-no-bitcast-to-generic.ll b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type-deduction-no-bitcast-to-generic.ll new file mode 100644 index 0000000000000..9e136ce887468 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type-deduction-no-bitcast-to-generic.ll @@ -0,0 +1,19 @@ +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown 
%s -o - | FileCheck %s + +; CHECK-DAG: %[[#IMAGE:]] = OpTypeImage %2 2D 0 0 0 0 Unknown ReadOnly + +; CHECK: %[[#PARAM:]] = OpFunctionParameter %[[#IMAGE:]] +; CHECK-NOT: OpBitcast +; CHECK: %[[#]] = OpImageQuerySizeLod %[[#]] %[[#PARAM]] %[[#]] + +define spir_kernel void @test(ptr addrspace(1) %srcimg) #0 !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_type_qual !4 !kernel_arg_base_type !3 { + %call = call spir_func <2 x i32> @_Z13get_image_dim14ocl_image2d_ro(ptr addrspace(1) %srcimg) + ret void +} + +declare spir_func <2 x i32> @_Z13get_image_dim14ocl_image2d_ro(ptr addrspace(1)) + +!1 = !{i32 1} +!2 = !{!"read_only"} +!3 = !{!"image2d_t"} +!4 = !{!""} diff --git a/llvm/test/CodeGen/SPIRV/pointers/two-bitcast-users.ll b/llvm/test/CodeGen/SPIRV/pointers/two-bitcast-users.ll new file mode 100644 index 0000000000000..f4ea710fa0431 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/pointers/two-bitcast-users.ll @@ -0,0 +1,19 @@ +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s + +; CHECK-DAG: %[[#CHAR:]] = OpTypeInt 8 +; CHECK-DAG: %[[#INT:]] = OpTypeInt 32 +; CHECK-DAG: %[[#GLOBAL_PTR_INT:]] = OpTypePointer CrossWorkgroup %[[#INT]] +; CHECK-DAG: %[[#GLOBAL_PTR_CHAR:]] = OpTypePointer CrossWorkgroup %[[#CHAR]] + +define i32 @foo(i32 %a, ptr addrspace(1) %p) { + store i32 %a, i32 addrspace(1)* %p + %b = load i32, i32 addrspace(1)* %p + ret i32 %b +} + +; CHECK: %[[#A:]] = OpFunctionParameter %[[#INT]] +; CHECK: %[[#P:]] = OpFunctionParameter %[[#GLOBAL_PTR_CHAR]] +; CHECK: %[[#C:]] = OpBitcast %[[#GLOBAL_PTR_INT]] %[[#P]] +; CHECK: OpStore %[[#C]] %[[#A]] +; CHECK: %[[#B:]] = OpLoad %[[#INT]] %[[#C]] +; CHECK-NOT: %[[#B:]] = OpLoad %[[#INT]] %[[#P]] diff --git a/llvm/test/CodeGen/SPIRV/pointers/two-subsequent-bitcasts.ll b/llvm/test/CodeGen/SPIRV/pointers/two-subsequent-bitcasts.ll new file mode 100644 index 0000000000000..8998329ea64fe --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/pointers/two-subsequent-bitcasts.ll @@ -0,0 +1,17 @@ +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s + +; CHECK-DAG: %[[#float:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#pointer:]] = OpTypePointer CrossWorkgroup %[[#float]] +; CHECK: %[[#A:]] = OpFunctionParameter %[[#]] + +define void @foo(float addrspace(1)* %A, i32 %B) { + %cmp = icmp sgt i32 %B, 0 + %conv = uitofp i1 %cmp to float +; CHECK: %[[#utof_res:]] = OpConvertUToF %[[#float]] %[[#]] +; CHECK: %[[#bitcast:]] = OpBitcast %[[#pointer]] %[[#A]] +; CHECK: OpStore %[[#bitcast]] %[[#utof_res]] + %BC1 = bitcast float addrspace(1)* %A to i32 addrspace(1)* + %BC2 = bitcast i32 addrspace(1)* %BC1 to float addrspace(1)* + store float %conv, float addrspace(1)* %BC2, align 4; + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/sitofp-with-bool.ll b/llvm/test/CodeGen/SPIRV/sitofp-with-bool.ll index 4dda0aadced49..06f27e2cb1f90 100644 --- a/llvm/test/CodeGen/SPIRV/sitofp-with-bool.ll +++ b/llvm/test/CodeGen/SPIRV/sitofp-with-bool.ll @@ -1,9 +1,11 @@ ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s -; CHECK: %[[#int_32:]] = OpTypeInt 32 0 -; CHECK: %[[#bool:]] = OpTypeBool -; CHECK: %[[#zero:]] = OpConstant %[[#int_32]] 0 -; CHECK: %[[#one:]] = OpConstant %[[#int_32]] 1 +; CHECK-DAG: %[[#int_32:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#float:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#bool:]] = OpTypeBool +; CHECK-DAG: %[[#zero:]] = OpConstant %[[#int_32]] 0 +; CHECK-DAG: %[[#one:]] = OpConstant %[[#int_32]] 1 +; CHECK-DAG: %[[#ptr:]] = OpTypePointer CrossWorkgroup %[[#float]] ; CHECK: 
OpFunction ; CHECK: %[[#A:]] = OpFunctionParameter %[[#]] @@ -11,9 +13,10 @@ ; CHECK: %[[#cmp_res:]] = OpSGreaterThan %[[#bool]] %[[#B]] %[[#zero]] ; CHECK: %[[#select_res:]] = OpSelect %[[#int_32]] %[[#cmp_res]] %[[#one]] %[[#zero]] ; CHECK: %[[#stof_res:]] = OpConvertSToF %[[#]] %[[#select_res]] -; CHECK: OpStore %[[#A]] %[[#stof_res]] +; CHECK: %[[#bitcast:]] = OpBitcast %[[#ptr]] %[[#A]] +; CHECK: OpStore %[[#bitcast]] %[[#stof_res]] -define dso_local spir_kernel void @K(float addrspace(1)* nocapture %A, i32 %B) local_unnamed_addr { +define dso_local spir_kernel void @K(ptr addrspace(1) nocapture %A, i32 %B) local_unnamed_addr { entry: %cmp = icmp sgt i32 %B, 0 %conv = sitofp i1 %cmp to float diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpPhi_ArgumentsPlaceholders.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpPhi_ArgumentsPlaceholders.ll index 9252e264cec8e..c98fef3631e04 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/OpPhi_ArgumentsPlaceholders.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/OpPhi_ArgumentsPlaceholders.ll @@ -13,6 +13,7 @@ ;; } ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; XFAIL: * %struct.Node = type { %struct.Node.0 addrspace(1)* } %struct.Node.0 = type opaque diff --git a/llvm/test/CodeGen/SPIRV/transcoding/spec_const.ll b/llvm/test/CodeGen/SPIRV/transcoding/spec_const.ll index 9fd7460885241..c47dccb35e14d 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/spec_const.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/spec_const.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; XFAIL: * ; CHECK-SPIRV-NOT: OpCapability Matrix ; CHECK-SPIRV-NOT: OpCapability Shader diff --git a/llvm/test/CodeGen/SPIRV/transcoding/spirv-private-array-initialization.ll b/llvm/test/CodeGen/SPIRV/transcoding/spirv-private-array-initialization.ll index 3c45fdd0481dc..a07af76c22324 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/spirv-private-array-initialization.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/spirv-private-array-initialization.ll @@ -19,10 +19,8 @@ ; CHECK-SPIRV: %[[#arr:]] = OpVariable %[[#i32x3_ptr]] Function ; CHECK-SPIRV: %[[#arr2:]] = OpVariable %[[#i32x3_ptr]] Function -; CHECK-SPIRV: %[[#arr_i8_ptr:]] = OpBitcast %[[#i8_ptr]] %[[#arr]] -; CHECK-SPIRV: OpCopyMemorySized %[[#arr_i8_ptr]] %[[#test_arr]] %[[#twelve]] Aligned 4 -; CHECK-SPIRV: %[[#arr2_i8_ptr:]] = OpBitcast %[[#i8_ptr]] %[[#arr2]] -; CHECK-SPIRV: OpCopyMemorySized %[[#arr2_i8_ptr]] %[[#test_arr2]] %[[#twelve]] Aligned 4 +; CHECK-SPIRV: OpCopyMemorySized %[[#arr]] %[[#test_arr]] %[[#twelve]] Aligned 4 +; CHECK-SPIRV: OpCopyMemorySized %[[#arr2]] %[[#test_arr2]] %[[#twelve]] Aligned 4 @__const.test.arr = private unnamed_addr addrspace(2) constant [3 x i32] [i32 1, i32 2, i32 3], align 4 diff --git a/llvm/test/CodeGen/SPIRV/uitofp-with-bool.ll b/llvm/test/CodeGen/SPIRV/uitofp-with-bool.ll index 2fe0dc91e7b4c..8a3c2853d9914 100644 --- a/llvm/test/CodeGen/SPIRV/uitofp-with-bool.ll +++ b/llvm/test/CodeGen/SPIRV/uitofp-with-bool.ll @@ -66,6 +66,7 @@ ; SPV-DAG: %[[#ones_16:]] = OpConstantComposite %[[#vec_16]] %[[#one_16]] %[[#one_16]] ; SPV-DAG: %[[#ones_32:]] = OpConstantComposite %[[#vec_32]] %[[#one_32]] %[[#one_32]] ; SPV-DAG: %[[#ones_64:]] = OpConstantComposite %[[#vec_64]] %[[#one_64]] %[[#one_64]] +; SPV-DAG: %[[#pointer:]] = OpTypePointer CrossWorkgroup %[[#float]] ; SPV-DAG: OpFunction ; SPV-DAG: %[[#A:]] = OpFunctionParameter %[[#]] @@ -81,7 +82,8 @@ entry: ; SPV-DAG: %[[#select_res:]] = 
OpSelect %[[#int_32]] %[[#cmp_res]] %[[#one_32]] %[[#zero_32]] ; SPV-DAG: %[[#utof_res:]] = OpConvertUToF %[[#float]] %[[#select_res]] %conv = uitofp i1 %cmp to float -; SPV-DAG: OpStore %[[#A]] %[[#utof_res]] +; SPV-DAG: %[[#bitcast:]] = OpBitcast %[[#pointer]] %[[#A]] +; SPV-DAG: OpStore %[[#bitcast]] %[[#utof_res]] store float %conv, float addrspace(1)* %A, align 4; ; SPV-DAG: %[[#s1]] = OpSelect %[[#int_8]] %[[#i1s]] %[[#mone_8]] %[[#zero_8]] From f5fd1836836e0d37dea61cc842199713cc0e2fc4 Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Fri, 5 Jan 2024 11:16:53 +0800 Subject: [PATCH 308/313] [NFC] [C++20] [Modules] Remove pr60085.cppm with deprecated practice See https://github.com/llvm/llvm-project/issues/60085 for the complete story. Previously I thought the problem got fixed surprisingly. But it is not true. I just tested it with a deprecated method. My bad. Then the deprecated style should be removed and the proper style can't work. So I'll remove the test and reopen that issue to look into it. --- clang/test/Modules/pr60085.cppm | 98 --------------------------------- 1 file changed, 98 deletions(-) delete mode 100644 clang/test/Modules/pr60085.cppm diff --git a/clang/test/Modules/pr60085.cppm b/clang/test/Modules/pr60085.cppm deleted file mode 100644 index fba6012064047..0000000000000 --- a/clang/test/Modules/pr60085.cppm +++ /dev/null @@ -1,98 +0,0 @@ -// RUN: rm -rf %t -// RUN: mkdir %t -// RUN: split-file %s %t -// -// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/d.cppm \ -// RUN: -emit-module-interface -o %t/d.pcm -// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/c.cppm \ -// RUN: -emit-module-interface -o %t/c.pcm -fmodule-file=%t/d.pcm -// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/b.cppm \ -// RUN: -emit-module-interface -o %t/b.pcm -fmodule-file=%t/d.pcm -// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/a.cppm \ -// RUN: -emit-module-interface -o %t/a.pcm -fmodule-file=%t/d.pcm \ -// RUN: -fmodule-file=%t/c.pcm -fmodule-file=%t/b.pcm -// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/a.pcm \ -// RUN: -S -emit-llvm -disable-llvm-passes -o - | FileCheck %t/a.cppm -// -// Use -fmodule-file== -// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/d.cppm \ -// RUN: -emit-module-interface -o %t/d.pcm -// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/c.cppm \ -// RUN: -emit-module-interface -o %t/c.pcm -fmodule-file=%t/d.pcm -// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/b.cppm \ -// RUN: -emit-module-interface -o %t/b.pcm -fmodule-file=%t/d.pcm -// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/a.cppm \ -// RUN: -emit-module-interface -o %t/a.pcm -fmodule-file=%t/d.pcm \ -// RUN: -fmodule-file=%t/c.pcm -fmodule-file=%t/b.pcm -// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/a.pcm \ -// RUN: -S -emit-llvm -disable-llvm-passes -o - | FileCheck %t/a.cppm - -//--- d.cppm -export module d; - -export template -struct integer { - using type = int; - - static constexpr auto value() { - return 0; - } - - friend constexpr void f(integer const x) { - x.value(); - } -}; - -export constexpr void ddd(auto const value) { - f(value); -} - - -template -constexpr auto dd = T(); - -export template -constexpr auto d() { - dd; -} - -//--- c.cppm -export module c; - -import d; - -template -auto cc = T(); - -auto c() { - cc>; - integer().value(); -} - -//--- b.cppm -export module b; - -import d; - -auto b() { - integer::type; -} - -//--- a.cppm -export module a; - -import b; -import c; 
-import d; - -constexpr void aa() { - d>(); - ddd(integer()); -} - -export extern "C" void a() { - aa(); -} - -// Checks that we emit the IR successfully. -// CHECK: define{{.*}}@a( From 054b5fc0fd41bcbadcc6967c39a5f6bb151bdcd1 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Thu, 4 Jan 2024 22:39:40 -0500 Subject: [PATCH 309/313] X86: add some missing lowerings for shuffles on `bf16` element type. (#76076) Some shuffles with `bf16` as element type were running into a `llvm_unreachable`. Key to reproducing was to chain two shuffles. ```llvm define <2 x bfloat> @shuffle_chained_v32bf16_v2bf16(<32 x bfloat> %a) { %s = shufflevector <32 x bfloat> %a, <32 x bfloat> zeroinitializer, <32 x i32> %s2 = shufflevector <32 x bfloat> %s, <32 x bfloat> zeroinitializer, <2 x i32> ret <2 x bfloat> %s2 } ``` This was hitting this UNREACHABLE: ``` Not a valid 512-bit x86 vector type! UNREACHABLE executed at /home/benoit/iree/third_party/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp:17124! PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace. Stack dump: 0. Program arguments: /home/benoit/mlir-build/bin/llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+avx512bf16 1. Running pass 'Function Pass Manager' on module ''. 2. Running pass 'X86 DAG->DAG Instruction Selection' on function '@shuffle_chained_v32bf16_v2bf16' ``` --- llvm/lib/Target/X86/X86ISelLowering.cpp | 11 +++- .../avx512-shuffles/shuffle-chained-bf16.ll | 63 +++++++++++++++++++ 2 files changed, 72 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/X86/avx512-shuffles/shuffle-chained-bf16.ll diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index fe3ba2ae29179..a72b14c2e2aae 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -14369,6 +14369,13 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { + if (VT == MVT::v8bf16) { + V1 = DAG.getBitcast(MVT::v8i16, V1); + V2 = DAG.getBitcast(MVT::v8i16, V2); + return DAG.getBitcast(VT, + DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask)); + } + switch (VT.SimpleTy) { case MVT::v2i64: return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); @@ -17096,14 +17103,14 @@ static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef Mask, return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false); } - if (VT == MVT::v32f16) { + if (VT == MVT::v32f16 || VT == MVT::v32bf16) { if (!Subtarget.hasBWI()) return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false); V1 = DAG.getBitcast(MVT::v32i16, V1); V2 = DAG.getBitcast(MVT::v32i16, V2); - return DAG.getBitcast(MVT::v32f16, + return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask)); } diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-chained-bf16.ll b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-chained-bf16.ll new file mode 100644 index 0000000000000..99d6049fc1d86 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-chained-bf16.ll @@ -0,0 +1,63 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+avx512bf16 | FileCheck %s + +target triple = "x86_64-unknown-linux-gnu" + +define <2 x bfloat> @shuffle_chained_v32bf16_v2bf16(<32 x bfloat> %a) { +; CHECK-LABEL: 
shuffle_chained_v32bf16_v2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: andq $-64, %rsp +; CHECK-NEXT: subq $128, %rsp +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,16,0,16,0,16,0,16] +; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm0, (%rsp) +; CHECK-NEXT: vmovaps (%rsp), %xmm0 +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %s = shufflevector <32 x bfloat> %a, <32 x bfloat> zeroinitializer, <32 x i32> + %s2 = shufflevector <32 x bfloat> %s, <32 x bfloat> zeroinitializer, <2 x i32> + ret <2 x bfloat> %s2 +} + +define <2 x bfloat> @shuffle_chained_v16bf16(<16 x bfloat> %a) { +; CHECK-LABEL: shuffle_chained_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: andq $-32, %rsp +; CHECK-NEXT: subq $96, %rsp +; CHECK-NEXT: vmovaps %ymm0, (%rsp) +; CHECK-NEXT: vmovdqa (%rsp), %xmm0 +; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %s = shufflevector <16 x bfloat> %a, <16 x bfloat> zeroinitializer, <16 x i32> + %s2 = shufflevector <16 x bfloat> %s, <16 x bfloat> zeroinitializer, <2 x i32> + ret <2 x bfloat> %s2 +} + +define <2 x bfloat> @shuffle_chained_v8bf16(<8 x bfloat> %a) { +; CHECK-LABEL: shuffle_chained_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15] +; CHECK-NEXT: retq + %s = shufflevector <8 x bfloat> %a, <8 x bfloat> zeroinitializer, <8 x i32> + %s2 = shufflevector <8 x bfloat> %s, <8 x bfloat> zeroinitializer, <2 x i32> + ret <2 x bfloat> %s2 +} From 52d1397e38ee88b170585c9c824d08e6975890ca Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Fri, 5 Jan 2024 12:05:23 +0800 Subject: [PATCH 310/313] [LoongArch] Fix -Wunused-variable in LoongArchExpandPseudoInsts.cpp (NFC) llvm-project/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp:480:20: error: unused variable 'MF' [-Werror,-Wunused-variable] MachineFunction *MF = MBB.getParent(); ^ 1 error generated. 
--- llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp index f977f176066a5..ad39658f698e7 100644 --- a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp @@ -477,12 +477,11 @@ bool LoongArchExpandPseudo::expandLargeAddressLoad( break; } - MachineFunction *MF = MBB.getParent(); MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); Register ScratchReg = LoongArch::R20; // $t8 - assert(MF->getSubtarget().is64Bit() && + assert(MBB.getParent()->getSubtarget().is64Bit() && "Large code model requires LA64"); auto Part1 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), DestReg); From 16dc82122bee915d122a68a4f1680ab810012906 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 4 Jan 2024 21:46:02 -0800 Subject: [PATCH 311/313] [RISCV] Remove isGPRF64AsFPR and isGPRPF64AsFPR functions from AsmParser. NFC These are identical to isGPRAsFPR. By overriding the PredicateMethod on the AsmOperands in tblgen we can share a single function. --- llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp | 4 ---- llvm/lib/Target/RISCV/RISCVInstrInfoD.td | 2 ++ 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 4759aa951664c..23c2b63c8c832 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -466,10 +466,6 @@ struct RISCVOperand final : public MCParsedAsmOperand { bool isGPRAsFPR() const { return isGPR() && Reg.IsGPRAsFPR; } - bool isGPRF64AsFPR() const { return isGPR() && Reg.IsGPRAsFPR; } - - bool isGPRPF64AsFPR() const { return isGPR() && Reg.IsGPRAsFPR; } - static bool evaluateConstantImm(const MCExpr *Expr, int64_t &Imm, RISCVMCExpr::VariantKind &VK) { if (auto *RE = dyn_cast(Expr)) { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td index 6af710049a9dd..418421b2a556f 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td @@ -36,11 +36,13 @@ def AddrRegImmINX : ComplexPattern; def GPRPF64AsFPR : AsmOperandClass { let Name = "GPRPF64AsFPR"; let ParserMethod = "parseGPRAsFPR"; + let PredicateMethod = "isGPRAsFPR"; let RenderMethod = "addRegOperands"; } def GPRF64AsFPR : AsmOperandClass { let Name = "GPRF64AsFPR"; + let PredicateMethod = "isGPRAsFPR"; let ParserMethod = "parseGPRAsFPR"; let RenderMethod = "addRegOperands"; } From 668165002543fd3a88413a5c2601774395bfd10e Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Fri, 5 Jan 2024 14:39:16 +0800 Subject: [PATCH 312/313] [InstCombine] Revert the `signed icmp -> unsigned icmp` canonicalization when folding `icmp Pred min|max(X, Y), Z` (#76685) This patch tries to flip the signedness of predicates when folding an unsigned icmp with a signed min/max. It will enable more optimizations as we canonicalizes a signed icmp into an unsigned icmp when both operands are known to have the same sign. Fixes #76672. 
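For illustration, a minimal sketch of the flipped fold, taken directly from the `test_smax_ugt` regression test added below (the names and constants are the test's own, and the "after" form is simply what its CHECK lines expect, not an additional guarantee):

```llvm
; Before: an unsigned compare against a signed max that is known non-negative.
define i1 @test_smax_ugt(i32 %a) {
entry:
  %cond.i = call i32 @llvm.smax.i32(i32 %a, i32 0)
  %cmp = icmp ugt i32 %cond.i, 1
  ret i1 %cmp
}

declare i32 @llvm.smax.i32(i32, i32)

; After this patch: since both smax(%a, 0) and the constant 1 are non-negative,
; the unsigned predicate can be flipped back to its signed form and the smax
; then folds through, leaving:
;   %cmp = icmp sgt i32 %a, 1
```

The `test_smax_ugt_neg*` cases added alongside it cover the situations where this flip is intentionally not applied.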
Compile-time impact: http://llvm-compile-time-tracker.com/compare.php?from=949ec83eaf6fa6dbffb94c2ea9c0a4d5efdbd239&to=2deca1aea8a4e13609bab72c522a97d424f0fc2d&stat=instructions:u |stage1-O3|stage1-ReleaseThinLTO|stage1-ReleaseLTO-g|stage1-O0-g|stage2-O3|stage2-O0-g|stage2-clang| |--|--|--|--|--|--|--| |-0.00%|+0.01%|+0.05%|-0.12%|-0.01%|-0.03%|-0.00%| NOTE: We can flip the signedness of predicate if both operands are negative. But I don't see the benefit of handling these cases. --- .../InstCombine/InstCombineCompares.cpp | 12 ++- llvm/test/Transforms/InstCombine/smax-icmp.ll | 41 +++++++++ .../LoopVectorize/X86/float-induction-x86.ll | 4 +- .../X86/invariant-load-gather.ll | 43 +++++----- .../X86/invariant-store-vectorization.ll | 63 +++++++------- .../LoopVectorize/float-induction.ll | 8 +- .../Transforms/LoopVectorize/induction.ll | 38 ++++----- .../interleaved-accesses-pred-stores.ll | 83 +++++++++---------- .../LoopVectorize/interleaved-accesses.ll | 30 +++---- .../invariant-store-vectorization-2.ll | 15 ++-- .../invariant-store-vectorization.ll | 37 ++++----- .../Transforms/LoopVectorize/loop-scalars.ll | 8 +- .../Transforms/LoopVectorize/reduction.ll | 4 +- .../Transforms/LoopVectorize/vector-geps.ll | 14 ++-- 14 files changed, 215 insertions(+), 185 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 3875e59c3ede3..2dc0b0b87f60a 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -5047,8 +5047,16 @@ Instruction *InstCombinerImpl::foldICmpWithMinMax(Instruction &I, Value *Y = MinMax->getRHS(); if (ICmpInst::isSigned(Pred) && !MinMax->isSigned()) return nullptr; - if (ICmpInst::isUnsigned(Pred) && MinMax->isSigned()) - return nullptr; + if (ICmpInst::isUnsigned(Pred) && MinMax->isSigned()) { + // Revert the transform signed pred -> unsigned pred + // TODO: We can flip the signedness of predicate if both operands of icmp + // are negative. 
+ if (isKnownNonNegative(Z, SQ.getWithInstruction(&I)) && + isKnownNonNegative(MinMax, SQ.getWithInstruction(&I))) { + Pred = ICmpInst::getFlippedSignednessPredicate(Pred); + } else + return nullptr; + } SimplifyQuery Q = SQ.getWithInstruction(&I); auto IsCondKnownTrue = [](Value *Val) -> std::optional { if (!Val) diff --git a/llvm/test/Transforms/InstCombine/smax-icmp.ll b/llvm/test/Transforms/InstCombine/smax-icmp.ll index 0ccbef34889fd..022ec6ad4f346 100644 --- a/llvm/test/Transforms/InstCombine/smax-icmp.ll +++ b/llvm/test/Transforms/InstCombine/smax-icmp.ll @@ -804,4 +804,45 @@ end: ret void } +; Tests from PR76672 + +define i1 @test_smax_ugt(i32 %a) { +; CHECK-LABEL: @test_smax_ugt( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[A:%.*]], 1 +; CHECK-NEXT: ret i1 [[CMP]] +; +entry: + %cond.i = call i32 @llvm.smax.i32(i32 %a, i32 0) + %cmp = icmp ugt i32 %cond.i, 1 + ret i1 %cmp +} + +; Negative tests + +define i1 @test_smax_ugt_neg1(i32 %a) { +; CHECK-LABEL: @test_smax_ugt_neg1( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret i1 false +; +entry: + %cond.i = call i32 @llvm.smax.i32(i32 %a, i32 0) + %cmp = icmp ugt i32 %cond.i, -5 + ret i1 %cmp +} + +define i1 @test_smax_ugt_neg2(i32 %a) { +; CHECK-LABEL: @test_smax_ugt_neg2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[COND_I:%.*]] = call i32 @llvm.smax.i32(i32 [[A:%.*]], i32 -5) +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[COND_I]], 1 +; CHECK-NEXT: ret i1 [[CMP]] +; +entry: + %cond.i = call i32 @llvm.smax.i32(i32 %a, i32 -5) + %cmp = icmp ugt i32 %cond.i, 1 + ret i1 %cmp +} + + declare i32 @llvm.smax.i32(i32, i32) diff --git a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll index c8c791b301633..88186584c2053 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll @@ -197,7 +197,7 @@ define double @external_use_with_fast_math(ptr %a, i64 %n) { ; AUTO_VEC-LABEL: @external_use_with_fast_math( ; AUTO_VEC-NEXT: entry: ; AUTO_VEC-NEXT: [[SMAX:%.*]] = tail call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) -; AUTO_VEC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 16 +; AUTO_VEC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 16 ; AUTO_VEC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] ; AUTO_VEC: vector.ph: ; AUTO_VEC-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792 @@ -264,7 +264,7 @@ define double @external_use_without_fast_math(ptr %a, i64 %n) { ; AUTO_VEC-NEXT: entry: ; AUTO_VEC-NEXT: [[SMAX:%.*]] = tail call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) ; AUTO_VEC-NEXT: [[XTRAITER:%.*]] = and i64 [[SMAX]], 7 -; AUTO_VEC-NEXT: [[TMP0:%.*]] = icmp ult i64 [[SMAX]], 8 +; AUTO_VEC-NEXT: [[TMP0:%.*]] = icmp slt i64 [[N]], 8 ; AUTO_VEC-NEXT: br i1 [[TMP0]], label [[FOR_END_UNR_LCSSA:%.*]], label [[ENTRY_NEW:%.*]] ; AUTO_VEC: entry.new: ; AUTO_VEC-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[SMAX]], 9223372036854775800 diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll index 9b432de35c9d1..8783326b1ef1a 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll @@ -9,19 +9,18 @@ define i32 @inv_load_conditional(ptr %a, i64 %n, ptr %b, i32 %k) { ; CHECK-NEXT: iter.check: ; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 ; CHECK-NEXT: [[SMAX2:%.*]] = 
call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX2]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 8 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[SMAX]], 2 -; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[B]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[A]] +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[N]], 2 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP1]], [[B]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[SCEVGEP]], [[A]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: -; CHECK-NEXT: [[MIN_ITERS_CHECK3:%.*]] = icmp ult i64 [[SMAX2]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK3:%.*]] = icmp slt i64 [[N]], 16 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK3]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX2]], 9223372036854775792 @@ -33,13 +32,13 @@ define i32 @inv_load_conditional(ptr %a, i64 %n, ptr %b, i32 %k) { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT5]], ptr [[TMP1]], align 4, !alias.scope !0, !noalias !3 +; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT5]], ptr [[TMP1]], align 4, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x ptr> [[BROADCAST_SPLAT]], zeroinitializer -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[BROADCAST_SPLAT]], i32 4, <16 x i1> [[TMP3]], <16 x i32> poison), !alias.scope !3 +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[BROADCAST_SPLAT]], i32 4, <16 x i1> [[TMP3]], <16 x i32> poison), !alias.scope [[META3]] ; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP3]], <16 x i32> [[WIDE_MASKED_GATHER]], <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x i32> [[PREDPHI]], i64 15 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]] @@ -51,23 +50,23 @@ define i32 @inv_load_conditional(ptr %a, i64 %n, ptr %b, i32 %k) { ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[N_VEC7:%.*]] = and i64 [[SMAX2]], 9223372036854775800 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <8 x ptr> poison, ptr [[A]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <8 x 
ptr> [[BROADCAST_SPLATINSERT11]], <8 x ptr> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT14:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT13]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <8 x ptr> poison, ptr [[A]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <8 x ptr> [[BROADCAST_SPLATINSERT10]], <8 x ptr> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT12]], <8 x i32> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT16:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX9]] -; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT14]], ptr [[TMP5]], align 4, !alias.scope !8, !noalias !11 -; CHECK-NEXT: [[INDEX_NEXT17]] = add nuw i64 [[INDEX9]], 8 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT17]], [[N_VEC7]] +; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT13]], ptr [[TMP5]], align 4, !alias.scope [[META8:![0-9]+]], !noalias [[META11:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT16]] = add nuw i64 [[INDEX9]], 8 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT16]], [[N_VEC7]] ; CHECK-NEXT: br i1 [[TMP6]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <8 x ptr> [[BROADCAST_SPLAT12]], zeroinitializer -; CHECK-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[BROADCAST_SPLAT12]], i32 4, <8 x i1> [[TMP7]], <8 x i32> poison), !alias.scope !11 -; CHECK-NEXT: [[PREDPHI16:%.*]] = select <8 x i1> [[TMP7]], <8 x i32> [[WIDE_MASKED_GATHER15]], <8 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[PREDPHI16]], i64 7 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <8 x ptr> [[BROADCAST_SPLAT11]], zeroinitializer +; CHECK-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[BROADCAST_SPLAT11]], i32 4, <8 x i1> [[TMP7]], <8 x i32> poison), !alias.scope [[META11]] +; CHECK-NEXT: [[PREDPHI15:%.*]] = select <8 x i1> [[TMP7]], <8 x i32> [[WIDE_MASKED_GATHER14]], <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[PREDPHI15]], i64 7 ; CHECK-NEXT: [[CMP_N8:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC7]] ; CHECK-NEXT: br i1 [[CMP_N8]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll index 292ab4e4b2c4d..330acb008d0dd 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll @@ -12,19 +12,18 @@ define i32 @inv_val_store_to_inv_address_with_reduction(ptr %a, i64 %n, ptr %b) ; CHECK-NEXT: iter.check: ; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 
[[N:%.*]] to i32 ; CHECK-NEXT: [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX2]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 8 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[SMAX]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[N]], 2 ; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP1]], [[A]] ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[SCEVGEP]], [[B]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: -; CHECK-NEXT: [[MIN_ITERS_CHECK3:%.*]] = icmp ult i64 [[SMAX2]], 64 +; CHECK-NEXT: [[MIN_ITERS_CHECK3:%.*]] = icmp slt i64 [[N]], 64 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK3]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX2]], 9223372036854775744 @@ -39,15 +38,15 @@ define i32 @inv_val_store_to_inv_address_with_reduction(ptr %a, i64 %n, ptr %b) ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 16 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 32 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 48 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP1]], align 8, !alias.scope !0 -; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr [[TMP2]], align 8, !alias.scope !0 -; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i32>, ptr [[TMP3]], align 8, !alias.scope !0 -; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i32>, ptr [[TMP4]], align 8, !alias.scope !0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP1]], align 8, !alias.scope [[META0:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr [[TMP2]], align 8, !alias.scope [[META0]] +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i32>, ptr [[TMP3]], align 8, !alias.scope [[META0]] +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i32>, ptr [[TMP4]], align 8, !alias.scope [[META0]] ; CHECK-NEXT: [[TMP5]] = add <16 x i32> [[VEC_PHI]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP6]] = add <16 x i32> [[VEC_PHI4]], [[WIDE_LOAD7]] ; CHECK-NEXT: [[TMP7]] = add <16 x i32> [[VEC_PHI5]], [[WIDE_LOAD8]] ; CHECK-NEXT: [[TMP8]] = add <16 x i32> [[VEC_PHI6]], [[WIDE_LOAD9]] -; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] @@ -72,9 +71,9 @@ define i32 @inv_val_store_to_inv_address_with_reduction(ptr %a, i64 %n, ptr %b) ; CHECK-NEXT: [[INDEX15:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT18:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI16:%.*]] = phi <8 x i32> [ [[TMP11]], [[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP12:%.*]] = 
getelementptr inbounds i32, ptr [[B]], i64 [[INDEX15]] -; CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <8 x i32>, ptr [[TMP12]], align 8, !alias.scope !8 +; CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <8 x i32>, ptr [[TMP12]], align 8, !alias.scope [[META8:![0-9]+]] ; CHECK-NEXT: [[TMP13]] = add <8 x i32> [[VEC_PHI16]], [[WIDE_LOAD17]] -; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope !11, !noalias !8 +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope [[META11:![0-9]+]], !noalias [[META8]] ; CHECK-NEXT: [[INDEX_NEXT18]] = add nuw i64 [[INDEX15]], 8 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT18]], [[N_VEC13]] ; CHECK-NEXT: br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] @@ -127,11 +126,10 @@ define void @inv_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b, i3 ; CHECK-NEXT: iter.check: ; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 ; CHECK-NEXT: [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX2]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 8 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[SMAX]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[N]], 2 ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP1]], [[B]] @@ -139,7 +137,7 @@ define void @inv_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b, i3 ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: -; CHECK-NEXT: [[MIN_ITERS_CHECK3:%.*]] = icmp ult i64 [[SMAX2]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK3:%.*]] = icmp slt i64 [[N]], 16 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK3]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX2]], 9223372036854775792 @@ -153,10 +151,10 @@ define void @inv_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b, i3 ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP1]], align 8, !alias.scope !15, !noalias !18 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP1]], align 8, !alias.scope [[META15:![0-9]+]], !noalias [[META18:![0-9]+]] ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <16 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT5]], ptr [[TMP1]], align 4, !alias.scope !15, !noalias !18 -; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> [[BROADCAST_SPLAT5]], <16 x ptr> [[BROADCAST_SPLAT7]], i32 4, <16 x i1> [[TMP2]]), !alias.scope !18 +; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT5]], ptr [[TMP1]], align 4, !alias.scope [[META15]], !noalias [[META18]] +; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> [[BROADCAST_SPLAT5]], <16 x ptr> [[BROADCAST_SPLAT7]], i32 4, <16 x i1> [[TMP2]]), !alias.scope [[META18]] ; 
CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] @@ -180,10 +178,10 @@ define void @inv_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b, i3 ; CHECK: vec.epilog.vector.body: ; CHECK-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT19:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX11]] -; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, ptr [[TMP4]], align 8, !alias.scope !21, !noalias !24 +; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, ptr [[TMP4]], align 8, !alias.scope [[META21:![0-9]+]], !noalias [[META24:![0-9]+]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD12]], [[BROADCAST_SPLAT14]] -; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT16]], ptr [[TMP4]], align 4, !alias.scope !21, !noalias !24 -; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[BROADCAST_SPLAT16]], <8 x ptr> [[BROADCAST_SPLAT18]], i32 4, <8 x i1> [[TMP5]]), !alias.scope !24 +; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT16]], ptr [[TMP4]], align 4, !alias.scope [[META21]], !noalias [[META24]] +; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[BROADCAST_SPLAT16]], <8 x ptr> [[BROADCAST_SPLAT18]], i32 4, <8 x i1> [[TMP5]]), !alias.scope [[META24]] ; CHECK-NEXT: [[INDEX_NEXT19]] = add nuw i64 [[INDEX11]], 8 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT19]], [[N_VEC9]] ; CHECK-NEXT: br i1 [[TMP6]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] @@ -240,11 +238,10 @@ define void @variant_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b ; CHECK-NEXT: iter.check: ; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 ; CHECK-NEXT: [[SMAX10:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX10]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 8 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[SMAX]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[N]], 2 ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 ; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP0]] @@ -261,7 +258,7 @@ define void @variant_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b ; CHECK-NEXT: [[CONFLICT_RDX9:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT8]] ; CHECK-NEXT: br i1 [[CONFLICT_RDX9]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: -; CHECK-NEXT: [[MIN_ITERS_CHECK11:%.*]] = icmp ult i64 [[SMAX10]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK11:%.*]] = icmp slt i64 [[N]], 16 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK11]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX10]], 9223372036854775792 @@ -275,12 +272,12 @@ define void @variant_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], 
[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP1]], align 8, !alias.scope !28, !noalias !31 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP1]], align 8, !alias.scope [[META28:![0-9]+]], !noalias [[META31:![0-9]+]] ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <16 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT13]], ptr [[TMP1]], align 4, !alias.scope !28, !noalias !31 +; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT13]], ptr [[TMP1]], align 4, !alias.scope [[META28]], !noalias [[META31]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[C]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP3]], i32 8, <16 x i1> [[TMP2]], <16 x i32> poison), !alias.scope !34 -; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> [[WIDE_MASKED_LOAD]], <16 x ptr> [[BROADCAST_SPLAT15]], i32 4, <16 x i1> [[TMP2]]), !alias.scope !35, !noalias !34 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP3]], i32 8, <16 x i1> [[TMP2]], <16 x i32> poison), !alias.scope [[META34:![0-9]+]] +; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> [[WIDE_MASKED_LOAD]], <16 x ptr> [[BROADCAST_SPLAT15]], i32 4, <16 x i1> [[TMP2]]), !alias.scope [[META35:![0-9]+]], !noalias [[META34]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] @@ -304,12 +301,12 @@ define void @variant_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b ; CHECK: vec.epilog.vector.body: ; CHECK-NEXT: [[INDEX19:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT28:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX19]] -; CHECK-NEXT: [[WIDE_LOAD20:%.*]] = load <8 x i32>, ptr [[TMP5]], align 8, !alias.scope !37, !noalias !40 +; CHECK-NEXT: [[WIDE_LOAD20:%.*]] = load <8 x i32>, ptr [[TMP5]], align 8, !alias.scope [[META37:![0-9]+]], !noalias [[META40:![0-9]+]] ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD20]], [[BROADCAST_SPLAT22]] -; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT24]], ptr [[TMP5]], align 4, !alias.scope !37, !noalias !40 +; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT24]], ptr [[TMP5]], align 4, !alias.scope [[META37]], !noalias [[META40]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[C]], i64 [[INDEX19]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP7]], i32 8, <8 x i1> [[TMP6]], <8 x i32> poison), !alias.scope !43 -; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[WIDE_MASKED_LOAD25]], <8 x ptr> [[BROADCAST_SPLAT27]], i32 4, <8 x i1> [[TMP6]]), !alias.scope !44, !noalias !43 +; CHECK-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP7]], i32 8, <8 x i1> [[TMP6]], <8 x i32> poison), !alias.scope [[META43:![0-9]+]] +; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[WIDE_MASKED_LOAD25]], <8 x ptr> [[BROADCAST_SPLAT27]], i32 4, <8 x i1> [[TMP6]]), !alias.scope [[META44:![0-9]+]], !noalias [[META43]] ; CHECK-NEXT: [[INDEX_NEXT28]] = add nuw i64 [[INDEX19]], 8 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT28]], [[N_VEC17]] 
; CHECK-NEXT: br i1 [[TMP8]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll index 24c52b7049525..736b0598043e6 100644 --- a/llvm/test/Transforms/LoopVectorize/float-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll @@ -1315,7 +1315,7 @@ define void @non_primary_iv_float_scalar(ptr %A, i64 %N) { ; VEC4_INTERL1-LABEL: @non_primary_iv_float_scalar( ; VEC4_INTERL1-NEXT: entry: ; VEC4_INTERL1-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) -; VEC4_INTERL1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4 +; VEC4_INTERL1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 4 ; VEC4_INTERL1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL1: vector.ph: ; VEC4_INTERL1-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804 @@ -1392,7 +1392,7 @@ define void @non_primary_iv_float_scalar(ptr %A, i64 %N) { ; VEC4_INTERL2-LABEL: @non_primary_iv_float_scalar( ; VEC4_INTERL2-NEXT: entry: ; VEC4_INTERL2-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) -; VEC4_INTERL2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 8 +; VEC4_INTERL2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 8 ; VEC4_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL2: vector.ph: ; VEC4_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775800 @@ -1508,7 +1508,7 @@ define void @non_primary_iv_float_scalar(ptr %A, i64 %N) { ; VEC1_INTERL2-LABEL: @non_primary_iv_float_scalar( ; VEC1_INTERL2-NEXT: entry: ; VEC1_INTERL2-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) -; VEC1_INTERL2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 2 +; VEC1_INTERL2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 2 ; VEC1_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC1_INTERL2: vector.ph: ; VEC1_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775806 @@ -1566,7 +1566,7 @@ define void @non_primary_iv_float_scalar(ptr %A, i64 %N) { ; VEC2_INTERL1_PRED_STORE-LABEL: @non_primary_iv_float_scalar( ; VEC2_INTERL1_PRED_STORE-NEXT: entry: ; VEC2_INTERL1_PRED_STORE-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) -; VEC2_INTERL1_PRED_STORE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 2 +; VEC2_INTERL1_PRED_STORE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 2 ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] ; VEC2_INTERL1_PRED_STORE: vector.ph: ; VEC2_INTERL1_PRED_STORE-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775806 diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll index d73193392e393..119c0aefaa73f 100644 --- a/llvm/test/Transforms/LoopVectorize/induction.ll +++ b/llvm/test/Transforms/LoopVectorize/induction.ll @@ -670,7 +670,7 @@ define i64 @scalarize_induction_variable_01(ptr %a, i64 %n) { ; IND-LABEL: @scalarize_induction_variable_01( ; IND-NEXT: entry: ; IND-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) -; IND-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 2 +; IND-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 2 ; IND-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; IND: 
vector.ph: ; IND-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775806 @@ -708,7 +708,7 @@ define i64 @scalarize_induction_variable_01(ptr %a, i64 %n) { ; UNROLL-LABEL: @scalarize_induction_variable_01( ; UNROLL-NEXT: entry: ; UNROLL-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) -; UNROLL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4 +; UNROLL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 4 ; UNROLL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; UNROLL: vector.ph: ; UNROLL-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804 @@ -799,7 +799,7 @@ define i64 @scalarize_induction_variable_01(ptr %a, i64 %n) { ; INTERLEAVE-LABEL: @scalarize_induction_variable_01( ; INTERLEAVE-NEXT: entry: ; INTERLEAVE-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) -; INTERLEAVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 8 +; INTERLEAVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 8 ; INTERLEAVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; INTERLEAVE: vector.ph: ; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775800 @@ -937,7 +937,7 @@ define float @scalarize_induction_variable_02(ptr %a, ptr %b, i64 %n) { ; IND-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1 ; IND-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 3 ; IND-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; IND-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 9 +; IND-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 9 ; IND-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; IND: vector.ph: ; IND-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 4611686018427387902 @@ -997,7 +997,7 @@ define float @scalarize_induction_variable_02(ptr %a, ptr %b, i64 %n) { ; UNROLL-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1 ; UNROLL-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 3 ; UNROLL-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; UNROLL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 25 +; UNROLL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 25 ; UNROLL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; UNROLL: vector.ph: ; UNROLL-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 4611686018427387900 @@ -1153,11 +1153,10 @@ define float @scalarize_induction_variable_02(ptr %a, ptr %b, i64 %n) { ; ; INTERLEAVE-LABEL: @scalarize_induction_variable_02( ; INTERLEAVE-NEXT: entry: -; INTERLEAVE-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 8) -; INTERLEAVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 65 +; INTERLEAVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N:%.*]], 65 ; INTERLEAVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; INTERLEAVE: vector.ph: -; INTERLEAVE-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1 +; INTERLEAVE-NEXT: [[TMP0:%.*]] = add nsw i64 [[N]], -1 ; INTERLEAVE-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 3 ; INTERLEAVE-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 ; INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = and i64 [[TMP2]], 7 @@ -1298,7 +1297,7 @@ define void @scalarize_induction_variable_03(ptr %p, i32 %y, i64 %n) { ; IND-LABEL: @scalarize_induction_variable_03( ; IND-NEXT: entry: ; IND-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) -; IND-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 2 +; IND-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 2 ; IND-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label 
[[VECTOR_PH:%.*]] ; IND: vector.ph: ; IND-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775806 @@ -1343,7 +1342,7 @@ define void @scalarize_induction_variable_03(ptr %p, i32 %y, i64 %n) { ; UNROLL-LABEL: @scalarize_induction_variable_03( ; UNROLL-NEXT: entry: ; UNROLL-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) -; UNROLL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4 +; UNROLL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 4 ; UNROLL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; UNROLL: vector.ph: ; UNROLL-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804 @@ -1460,14 +1459,13 @@ define void @scalarize_induction_variable_03(ptr %p, i32 %y, i64 %n) { ; ; INTERLEAVE-LABEL: @scalarize_induction_variable_03( ; INTERLEAVE-NEXT: entry: -; INTERLEAVE-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) -; INTERLEAVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 9 +; INTERLEAVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N:%.*]], 9 ; INTERLEAVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; INTERLEAVE: vector.ph: -; INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 7 +; INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = and i64 [[N]], 7 ; INTERLEAVE-NEXT: [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; INTERLEAVE-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 8, i64 [[N_MOD_VF]] -; INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]] +; INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[N]], [[TMP1]] ; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[Y:%.*]], i64 0 ; INTERLEAVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] @@ -2036,7 +2034,7 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) { ; IND-LABEL: @scalarize_induction_variable_05( ; IND-NEXT: entry: ; IND-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N:%.*]], i32 1) -; IND-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SMAX]], 2 +; IND-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i32 [[N]], 2 ; IND-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; IND: vector.ph: ; IND-NEXT: [[N_VEC:%.*]] = and i32 [[SMAX]], 2147483646 @@ -2103,7 +2101,7 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) { ; UNROLL-LABEL: @scalarize_induction_variable_05( ; UNROLL-NEXT: entry: ; UNROLL-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N:%.*]], i32 1) -; UNROLL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SMAX]], 4 +; UNROLL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i32 [[N]], 4 ; UNROLL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; UNROLL: vector.ph: ; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[SMAX]], 2147483644 @@ -2291,7 +2289,7 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) { ; INTERLEAVE-LABEL: @scalarize_induction_variable_05( ; INTERLEAVE-NEXT: entry: ; INTERLEAVE-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N:%.*]], i32 1) -; INTERLEAVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SMAX]], 8 +; INTERLEAVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i32 [[N]], 8 ; INTERLEAVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; INTERLEAVE: vector.ph: ; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i32 [[SMAX]], 2147483640 @@ -4921,7 +4919,7 @@ define void 
@non_primary_iv_trunc(ptr %a, i64 %n) { ; IND-LABEL: @non_primary_iv_trunc( ; IND-NEXT: entry: ; IND-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) -; IND-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 2 +; IND-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 2 ; IND-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; IND: vector.ph: ; IND-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775806 @@ -4959,7 +4957,7 @@ define void @non_primary_iv_trunc(ptr %a, i64 %n) { ; UNROLL-LABEL: @non_primary_iv_trunc( ; UNROLL-NEXT: entry: ; UNROLL-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) -; UNROLL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4 +; UNROLL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 4 ; UNROLL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; UNROLL: vector.ph: ; UNROLL-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804 @@ -5046,7 +5044,7 @@ define void @non_primary_iv_trunc(ptr %a, i64 %n) { ; INTERLEAVE-LABEL: @non_primary_iv_trunc( ; INTERLEAVE-NEXT: entry: ; INTERLEAVE-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) -; INTERLEAVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 8 +; INTERLEAVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 8 ; INTERLEAVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; INTERLEAVE: vector.ph: ; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775800 diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll index 7e647b8837527..86ca1222aa4ea 100644 --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll @@ -17,14 +17,13 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" define void @interleaved_with_cond_store_0(ptr %p, i64 %x, i64 %n) { ; CHECK-LABEL: @interleaved_with_cond_store_0( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 3 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N:%.*]], 3 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 1 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[N]], 1 ; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 2, i64 [[N_MOD_VF]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[N]], [[TMP1]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[X:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -33,16 +32,16 @@ define void @interleaved_with_cond_store_0(ptr %p, i64 %x, i64 %n) { ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[PAIR:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 1 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <2 x i64> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement 
<2 x i1> [[TMP4]], i64 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i64> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i64 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP2_1:%.*]] = getelementptr inbounds [[PAIR:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P]], i64 [[INDEX]], i32 1 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[WIDE_VEC]], i64 0 -; CHECK-NEXT: store i64 [[TMP6]], ptr [[TMP2_1]], align 8 +; CHECK-NEXT: store i64 [[TMP6]], ptr [[TMP5]], align 8 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP4]], i64 1 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP3]], i64 1 ; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] ; CHECK: pred.store.if1: ; CHECK-NEXT: [[TMP8:%.*]] = or disjoint i64 [[INDEX]], 1 @@ -71,7 +70,7 @@ define void @interleaved_with_cond_store_0(ptr %p, i64 %x, i64 %n) { ; CHECK: if.merge: ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -112,14 +111,13 @@ for.end: define void @interleaved_with_cond_store_1(ptr %p, i64 %x, i64 %n) { ; CHECK-LABEL: @interleaved_with_cond_store_1( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 3 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N:%.*]], 3 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 1 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[N]], 1 ; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 2, i64 [[N_MOD_VF]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[N]], [[TMP1]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[X:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -131,16 +129,16 @@ define void @interleaved_with_cond_store_1(ptr %p, i64 %x, i64 %n) { ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P]], i64 [[TMP2]], i32 1 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <2 x i64> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i64 0 -; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <2 x i64> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP6]], i64 0 +; CHECK-NEXT: br i1 [[TMP7]], 
label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: -; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [[PAIR:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P]], i64 [[INDEX]], i32 0 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[WIDE_VEC]], i64 0 -; CHECK-NEXT: store i64 [[TMP9]], ptr [[PTR0]], align 8 +; CHECK-NEXT: store i64 [[TMP9]], ptr [[TMP8]], align 8 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP7]], i64 1 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP6]], i64 1 ; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] ; CHECK: pred.store.if1: ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P]], i64 [[TMP2]], i32 0 @@ -149,13 +147,13 @@ define void @interleaved_with_cond_store_1(ptr %p, i64 %x, i64 %n) { ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]] ; CHECK: pred.store.continue2: ; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[WIDE_VEC3]], i64 0 -; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[WIDE_VEC3]], i64 2 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[WIDE_VEC3]], i64 0 +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[WIDE_VEC3]], i64 2 +; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP5]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -165,15 +163,15 @@ define void @interleaved_with_cond_store_1(ptr %p, i64 %x, i64 %n) { ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[IF_MERGE:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[P_0:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P]], i64 [[I]], i32 0 ; CHECK-NEXT: [[P_1:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P]], i64 [[I]], i32 1 -; CHECK-NEXT: [[TMP17:%.*]] = load i64, ptr [[P_1]], align 8 -; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP17]], [[X]] -; CHECK-NEXT: br i1 [[TMP18]], label [[IF_THEN:%.*]], label [[IF_MERGE]] +; CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr [[P_1]], align 8 +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[TMP16]], [[X]] +; CHECK-NEXT: br i1 [[TMP17]], label [[IF_THEN:%.*]], label [[IF_MERGE]] ; CHECK: if.then: -; CHECK-NEXT: store i64 [[TMP17]], ptr [[P_0]], align 8 +; CHECK-NEXT: store i64 [[TMP16]], ptr [[P_0]], align 8 ; CHECK-NEXT: br label [[IF_MERGE]] ; CHECK: if.merge: -; CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[P_0]], align 8 -; CHECK-NEXT: store i64 [[TMP19]], ptr [[P_1]], align 8 +; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[P_0]], align 8 +; CHECK-NEXT: store i64 [[TMP18]], ptr [[P_1]], align 8 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label 
[[FOR_END:%.*]], !llvm.loop [[LOOP5:![0-9]+]] @@ -219,14 +217,13 @@ for.end: define void @interleaved_with_cond_store_2(ptr %p, i64 %x, i64 %n) { ; CHECK-LABEL: @interleaved_with_cond_store_2( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 3 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N:%.*]], 3 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 1 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[N]], 1 ; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 2, i64 [[N_MOD_VF]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[N]], [[TMP1]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[X:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -240,16 +237,16 @@ define void @interleaved_with_cond_store_2(ptr %p, i64 %x, i64 %n) { ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> ; CHECK-NEXT: store i64 [[X]], ptr [[TMP3]], align 8 ; CHECK-NEXT: store i64 [[X]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <2 x i64> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i64 0 -; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <2 x i64> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP6]], i64 0 +; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: -; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P]], i64 [[INDEX]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P]], i64 [[INDEX]], i32 1 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[WIDE_VEC]], i64 0 -; CHECK-NEXT: store i64 [[TMP9]], ptr [[PTR1]], align 8 +; CHECK-NEXT: store i64 [[TMP9]], ptr [[TMP8]], align 8 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP7]], i64 1 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP6]], i64 1 ; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] ; CHECK: pred.store.if1: ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P]], i64 [[TMP2]], i32 1 diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll index 3b9edb11c5da2..9a27c294f1384 100644 --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll @@ -896,14 +896,13 @@ for.body: ; preds = %for.body, %entry define void @PR27626_0(ptr %p, i32 %z, i64 %n) { ; CHECK-LABEL: @PR27626_0( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N:%.*]], 5 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label 
[[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[N]], 3 ; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[N]], [[TMP1]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -984,14 +983,13 @@ for.end: define i32 @PR27626_1(ptr %p, i64 %n) { ; CHECK-LABEL: @PR27626_1( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N:%.*]], 5 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[N]], 3 ; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[N]], [[TMP1]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -1075,14 +1073,13 @@ for.end: define void @PR27626_2(ptr %p, i64 %n, i32 %z) { ; CHECK-LABEL: @PR27626_2( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N:%.*]], 5 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[N]], 3 ; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[N]], [[TMP1]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -1167,14 +1164,13 @@ for.end: define i32 @PR27626_3(ptr %p, i64 %n, i32 %z) { ; CHECK-LABEL: @PR27626_3( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N:%.*]], 5 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[N]], 3 ; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[N]], [[TMP1]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -1274,7 +1270,7 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) { ; 
CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 7 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 7 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775804 diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll index 1652a8e4ca37d..50c67040cfb2a 100644 --- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll +++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll @@ -22,11 +22,10 @@ define void @inv_val_store_to_inv_address_conditional_diff_values_ic(ptr %a, i64 ; CHECK-NEXT: entry: ; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 ; CHECK-NEXT: [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX2]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[SMAX]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[N]], 2 ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP1]], [[B]] @@ -123,11 +122,10 @@ define void @inv_val_store_to_inv_address_conditional_inv(ptr %a, i64 %n, ptr %b ; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[NTRUNC]], [[K:%.*]] ; CHECK-NEXT: [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX2]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[SMAX]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[N]], 2 ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP1]], [[B]] @@ -213,12 +211,11 @@ define i32 @variant_val_store_to_inv_address(ptr %a, i64 %n, ptr %b, i32 %k) { ; CHECK-LABEL: @variant_val_store_to_inv_address( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX2]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[SMAX]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[N]], 2 ; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP1]], [[A]] ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt 
ptr [[SCEVGEP]], [[B]] diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll index c4d7ef8e949b0..20d612a548b15 100644 --- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll @@ -21,12 +21,11 @@ define i32 @inv_val_store_to_inv_address_with_reduction(ptr %a, i64 %n, ptr %b) ; CHECK-NEXT: entry: ; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 ; CHECK-NEXT: [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX2]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[SMAX]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[N]], 2 ; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP1]], [[A]] ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[SCEVGEP]], [[B]] @@ -39,9 +38,9 @@ define i32 @inv_val_store_to_inv_address_with_reduction(ptr %a, i64 %n, ptr %b) ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8, !alias.scope !0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8, !alias.scope [[META0:![0-9]+]] ; CHECK-NEXT: [[TMP2]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] -; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] @@ -96,12 +95,11 @@ define void @inv_val_store_to_inv_address(ptr %a, i64 %n, ptr %b) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 ; CHECK-NEXT: [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX2]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[SMAX]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[N]], 2 ; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP1]], [[A]] ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[SCEVGEP]], [[B]] @@ -115,8 +113,8 @@ define void @inv_val_store_to_inv_address(ptr %a, i64 %n, ptr %b) { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; 
CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope !9, !noalias !12 -; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 4, !alias.scope !12 +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope [[META9:![0-9]+]], !noalias [[META12:![0-9]+]] +; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 4, !alias.scope [[META12]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] @@ -172,11 +170,10 @@ define void @inv_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b, i3 ; CHECK-NEXT: entry: ; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 ; CHECK-NEXT: [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX2]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[SMAX]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[N]], 2 ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP1]], [[B]] @@ -193,31 +190,31 @@ define void @inv_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b, i3 ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE10:%.*]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8, !alias.scope !16, !noalias !19 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8, !alias.scope [[META16:![0-9]+]], !noalias [[META19:![0-9]+]] ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT4]], ptr [[TMP1]], align 4, !alias.scope !16, !noalias !19 +; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT4]], ptr [[TMP1]], align 4, !alias.scope [[META16]], !noalias [[META19]] ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i64 0 ; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: -; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope !19 +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope [[META19]] ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1 ; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] ; CHECK: pred.store.if5: -; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope !19 +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope [[META19]] ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]] ; CHECK: pred.store.continue6: ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP2]], i64 2 ; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] ; CHECK: pred.store.if7: -; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 
4, !alias.scope !19 +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope [[META19]] ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]] ; CHECK: pred.store.continue8: ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP2]], i64 3 ; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10]] ; CHECK: pred.store.if9: -; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope !19 +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope [[META19]] ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]] ; CHECK: pred.store.continue10: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 @@ -391,7 +388,7 @@ define i32 @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP15]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[INVARIANT_GEP]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[GEP]], align 4, !alias.scope !23 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[GEP]], align 4, !alias.scope [[META23:![0-9]+]] ; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP17]] = add <4 x i32> [[TMP16]], ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/loop-scalars.ll b/llvm/test/Transforms/LoopVectorize/loop-scalars.ll index 24cbf7d0e3865..de298d20fc382 100644 --- a/llvm/test/Transforms/LoopVectorize/loop-scalars.ll +++ b/llvm/test/Transforms/LoopVectorize/loop-scalars.ll @@ -9,7 +9,7 @@ define void @vector_gep(ptr %a, ptr %b, i64 %n) { ; CHECK-LABEL: @vector_gep( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 2 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775806 @@ -65,7 +65,7 @@ define void @scalar_store(ptr %a, ptr %b, i64 %n) { ; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 3 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 3 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775806 @@ -125,7 +125,7 @@ define void @expansion(ptr %a, ptr %b, i64 %n) { ; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 3 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 3 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775806 @@ -182,7 +182,7 @@ define void @no_gep_or_bitcast(ptr noalias %a, i64 %n) { ; CHECK-LABEL: @no_gep_or_bitcast( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 2 +; 
CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 2 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775806 diff --git a/llvm/test/Transforms/LoopVectorize/reduction.ll b/llvm/test/Transforms/LoopVectorize/reduction.ll index 7c12eb1d4e59d..757b41db6a5b4 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction.ll @@ -1201,7 +1201,7 @@ define i64 @reduction_with_phi_with_one_incoming_on_backedge(i16 %n, ptr %A) { ; CHECK-NEXT: [[SMAX:%.*]] = call i16 @llvm.smax.i16(i16 [[N]], i16 2) ; CHECK-NEXT: [[TMP0:%.*]] = add nsw i16 [[SMAX]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = zext nneg i16 [[TMP0]] to i32 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i16 [[SMAX]], 5 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i16 [[N]], 5 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[TMP1]], 32764 @@ -1279,7 +1279,7 @@ define i64 @reduction_with_phi_with_two_incoming_on_backedge(i16 %n, ptr %A) { ; CHECK-NEXT: [[SMAX:%.*]] = call i16 @llvm.smax.i16(i16 [[N]], i16 2) ; CHECK-NEXT: [[TMP0:%.*]] = add nsw i16 [[SMAX]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = zext nneg i16 [[TMP0]] to i32 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i16 [[SMAX]], 5 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i16 [[N]], 5 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[TMP1]], 32764 diff --git a/llvm/test/Transforms/LoopVectorize/vector-geps.ll b/llvm/test/Transforms/LoopVectorize/vector-geps.ll index 4f4b4f6600da4..ec0b4864ac3fe 100644 --- a/llvm/test/Transforms/LoopVectorize/vector-geps.ll +++ b/llvm/test/Transforms/LoopVectorize/vector-geps.ll @@ -8,7 +8,7 @@ define void @vector_gep_stored(ptr %a, ptr %b, i64 %n) { ; CHECK-LABEL: @vector_gep_stored( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804 @@ -21,8 +21,8 @@ define void @vector_gep_stored(ptr %a, ptr %b, i64 %n) { ; CHECK-NEXT: store <4 x ptr> [[TMP0]], ptr [[TMP1]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -36,7 +36,7 @@ define void @vector_gep_stored(ptr %a, ptr %b, i64 %n) { ; CHECK-NEXT: store ptr [[VAR0]], ptr [[VAR1]], align 8 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP2:![0-9]+]] +; 
CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -61,7 +61,7 @@ define void @uniform_vector_gep_stored(ptr %a, ptr %b, i64 %n) { ; CHECK-LABEL: @uniform_vector_gep_stored( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804 @@ -74,8 +74,8 @@ define void @uniform_vector_gep_stored(ptr %a, ptr %b, i64 %n) { ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds ptr, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: store <4 x ptr> [[DOTSPLAT]], ptr [[TMP1]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] From e0c554ad87d18dcbfcb9b6485d0da800ae1338d1 Mon Sep 17 00:00:00 2001 From: Nick Anderson Date: Thu, 4 Jan 2024 22:47:56 -0800 Subject: [PATCH 313/313] =?UTF-8?q?Port=20CodeGenPrepare=20to=20new=20pass?= =?UTF-8?q?=20manager=20(and=20BasicBlockSectionsProfil=E2=80=A6=20(#75380?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Port CodeGenPrepare to new pass manager and dependency BasicBlockSectionsProfileReader Fixes: #64560 Co-authored-by: Krishna-13-cyber <84722531+Krishna-13-cyber@users.noreply.github.com> --- .../CodeGen/BasicBlockSectionsProfileReader.h | 83 +++++++++--- llvm/include/llvm/CodeGen/CodeGenPrepare.h | 35 +++++ llvm/include/llvm/CodeGen/Passes.h | 4 +- llvm/include/llvm/InitializePasses.h | 4 +- llvm/include/llvm/LinkAllPasses.h | 2 +- llvm/lib/CodeGen/BasicBlockPathCloning.cpp | 11 +- llvm/lib/CodeGen/BasicBlockSections.cpp | 8 +- .../BasicBlockSectionsProfileReader.cpp | 52 ++++++-- llvm/lib/CodeGen/CodeGen.cpp | 2 +- llvm/lib/CodeGen/CodeGenPrepare.cpp | 124 ++++++++++++------ llvm/lib/CodeGen/TargetPassConfig.cpp | 4 +- llvm/lib/Passes/PassBuilder.cpp | 2 + llvm/lib/Passes/PassRegistry.def | 2 + .../AArch64/aarch64-codegen-prepare-atp.ll | 2 +- llvm/test/CodeGen/AArch64/and-sink.ll | 4 +- .../CodeGen/AArch64/arm64-bitfield-extract.ll | 2 +- .../AArch64/arm64-codegen-prepare-extload.ll | 6 +- .../test/CodeGen/AArch64/arm64_32-gep-sink.ll | 2 +- .../CodeGen/AArch64/cgp-trivial-phi-node.ll | 2 +- llvm/test/CodeGen/AArch64/convertphitype.ll | 2 +- .../AArch64/scalable-vector-promotion.ll | 2 +- llvm/test/CodeGen/AArch64/sve-vscale.ll | 2 +- .../CodeGen/AArch64/sve2-vscale-sinking.ll | 2 +- .../AMDGPU/cgp-addressing-modes-flat.ll | 8 +- .../AMDGPU/cgp-addressing-modes-gfx1030.ll | 2 +- .../AMDGPU/cgp-addressing-modes-gfx908.ll | 2 +- .../CodeGen/AMDGPU/cgp-addressing-modes.ll | 8 +- llvm/test/CodeGen/ARM/vector-promotion.ll | 4 +- .../Generic/addr-sink-call-multi-arg.ll | 2 +- llvm/test/CodeGen/Generic/addr-use-count.ll | 2 +- 
.../test/CodeGen/X86/callbr-codegenprepare.ll | 2 +- .../X86/codegen-prepare-addrmode-sext.ll | 2 +- .../CodeGen/X86/codegen-prepare-extload.ll | 6 +- llvm/test/CodeGen/X86/convertphitype.ll | 2 +- .../CodeGen/X86/indirect-br-gep-unmerge.ll | 2 +- llvm/test/CodeGen/X86/pr58538.ll | 4 +- llvm/test/CodeGen/X86/tailcall-cgp-dup.ll | 2 +- llvm/test/CodeGen/X86/tailcall-extract.ll | 2 +- llvm/test/DebugInfo/ARM/salvage-debug-info.ll | 2 +- llvm/test/DebugInfo/X86/zextload.ll | 2 +- llvm/test/Other/codegenprepare-and-debug.ll | 2 +- .../AArch64/combine-address-mode.ll | 2 +- .../CodeGenPrepare/AArch64/free-zext.ll | 2 +- .../gather-scatter-opt-inseltpoison.ll | 2 +- .../AArch64/gather-scatter-opt.ll | 2 +- .../AArch64/overflow-intrinsics.ll | 6 +- .../AArch64/sink-gather-scatter-addressing.ll | 2 +- .../AArch64/trunc-weird-user.ll | 2 +- .../CodeGenPrepare/AArch64/zext-to-shuffle.ll | 2 +- .../CodeGenPrepare/AMDGPU/addressing-modes.ll | 2 +- .../AMDGPU/no-sink-addrspacecast.ll | 2 +- .../AMDGPU/sink-addrspacecast.ll | 2 +- .../CodeGenPrepare/ARM/branch-on-zero.ll | 2 +- .../Transforms/CodeGenPrepare/ARM/dead-gep.ll | 2 +- .../CodeGenPrepare/ARM/memory-intrinsics.ll | 2 +- .../CodeGenPrepare/ARM/overflow-intrinsics.ll | 2 +- .../CodeGenPrepare/ARM/sink-addrmode.ll | 2 +- .../Transforms/CodeGenPrepare/ARM/splitgep.ll | 2 +- .../CodeGenPrepare/ARM/tailcall-dup.ll | 2 +- .../bypass-slow-div-constant-numerator.ll | 2 +- .../NVPTX/bypass-slow-div-not-exact.ll | 2 +- .../NVPTX/bypass-slow-div-special-cases.ll | 2 +- .../CodeGenPrepare/NVPTX/bypass-slow-div.ll | 2 +- .../NVPTX/dont-introduce-addrspacecast.ll | 2 +- .../NVPTX/dont-sink-nop-addrspacecast.ll | 2 +- .../PowerPC/split-store-alignment.ll | 4 +- .../CodeGenPrepare/RISCV/and-mask-sink.ll | 8 +- .../CodeGenPrepare/RISCV/cttz-ctlz.ll | 2 +- .../SPARC/overflow-intrinsics.ll | 6 +- .../CodeGenPrepare/X86/catchpad-phi-cast.ll | 4 +- .../X86/cgp_shuffle_crash-inseltpoison.ll | 2 +- .../CodeGenPrepare/X86/cgp_shuffle_crash.ll | 2 +- .../CodeGenPrepare/X86/computedgoto.ll | 2 +- .../CodeGenPrepare/X86/cttz-ctlz.ll | 10 +- .../X86/delete-assume-dead-code.ll | 2 +- .../CodeGenPrepare/X86/extend-sink-hoist.ll | 2 +- .../CodeGenPrepare/X86/freeze-brcond.ll | 2 +- .../X86/gather-scatter-opt-inseltpoison.ll | 4 +- .../CodeGenPrepare/X86/gather-scatter-opt.ll | 2 +- .../CodeGenPrepare/X86/gep-unmerging.ll | 2 +- .../CodeGenPrepare/X86/invariant.group.ll | 2 +- .../X86/masked-gather-struct-gep.ll | 2 +- .../CodeGenPrepare/X86/nonintegral.ll | 4 +- .../CodeGenPrepare/X86/optimizeSelect-DT.ll | 2 +- .../CodeGenPrepare/X86/overflow-intrinsics.ll | 6 +- .../Transforms/CodeGenPrepare/X86/pr27536.ll | 2 +- .../Transforms/CodeGenPrepare/X86/pr35658.ll | 2 +- .../Transforms/CodeGenPrepare/X86/pr72046.ll | 2 +- .../recursively-delete-dead-instructions.ll | 2 +- .../CodeGenPrepare/X86/remove-assume-block.ll | 2 +- .../Transforms/CodeGenPrepare/X86/select.ll | 6 +- .../CodeGenPrepare/X86/sink-addrmode-base.ll | 4 +- .../X86/sink-addrmode-inseltpoison.ll | 2 +- .../X86/sink-addrmode-select.ll | 2 +- .../X86/sink-addrmode-two-phi.ll | 2 +- .../CodeGenPrepare/X86/sink-addrmode.ll | 2 +- .../CodeGenPrepare/X86/sink-addrspacecast.ll | 2 +- .../CodeGenPrepare/X86/split-indirect-loop.ll | 2 +- .../X86/split-store-alignment.ll | 2 +- .../CodeGenPrepare/X86/statepoint-relocate.ll | 2 +- .../CodeGenPrepare/X86/tailcall-assume-xbb.ll | 2 +- .../X86/vec-shift-inseltpoison.ll | 14 +- .../CodeGenPrepare/X86/vec-shift.ll | 14 +- .../CodeGenPrepare/X86/widenable-condition.ll | 2 +- 
.../X86/x86-shuffle-sink-inseltpoison.ll | 8 +- .../CodeGenPrepare/X86/x86-shuffle-sink.ll | 8 +- .../CodeGenPrepare/dead-allocation.ll | 2 +- .../CodeGenPrepare/skip-merging-case-block.ll | 2 +- .../Transforms/HotColdSplit/coldentrycount.ll | 2 +- .../codegenprepare-produced-address-math.ll | 2 +- .../section-accurate-samplepgo.ll | 6 +- llvm/tools/opt/opt.cpp | 2 +- 112 files changed, 399 insertions(+), 238 deletions(-) create mode 100644 llvm/include/llvm/CodeGen/CodeGenPrepare.h diff --git a/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h b/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h index dfb8d5d9f2f5d..bba675f1d3eb7 100644 --- a/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h +++ b/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h @@ -21,11 +21,14 @@ #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Error.h" #include "llvm/Support/LineIterator.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Target/TargetMachine.h" + using namespace llvm; namespace llvm { @@ -72,25 +75,13 @@ template <> struct DenseMapInfo { } }; -class BasicBlockSectionsProfileReader : public ImmutablePass { +class BasicBlockSectionsProfileReader { public: - static char ID; - + friend class BasicBlockSectionsProfileReaderWrapperPass; BasicBlockSectionsProfileReader(const MemoryBuffer *Buf) - : ImmutablePass(ID), MBuf(Buf), - LineIt(*Buf, /*SkipBlanks=*/true, /*CommentMarker=*/'#') { - initializeBasicBlockSectionsProfileReaderPass( - *PassRegistry::getPassRegistry()); - }; + : MBuf(Buf), LineIt(*Buf, /*SkipBlanks=*/true, /*CommentMarker=*/'#'){}; - BasicBlockSectionsProfileReader() : ImmutablePass(ID) { - initializeBasicBlockSectionsProfileReaderPass( - *PassRegistry::getPassRegistry()); - } - - StringRef getPassName() const override { - return "Basic Block Sections Profile Reader"; - } + BasicBlockSectionsProfileReader(){}; // Returns true if basic block sections profile exist for function \p // FuncName. @@ -109,10 +100,6 @@ class BasicBlockSectionsProfileReader : public ImmutablePass { SmallVector> getClonePathsForFunction(StringRef FuncName) const; - // Initializes the FunctionNameToDIFilename map for the current module and - // then reads the profile for the matching functions. - bool doInitialization(Module &M) override; - private: StringRef getAliasName(StringRef FuncName) const { auto R = FuncAliasMap.find(FuncName); @@ -170,7 +157,61 @@ class BasicBlockSectionsProfileReader : public ImmutablePass { // sections profile. \p Buf is a memory buffer that contains the list of // functions and basic block ids to selectively enable basic block sections. ImmutablePass * -createBasicBlockSectionsProfileReaderPass(const MemoryBuffer *Buf); +createBasicBlockSectionsProfileReaderWrapperPass(const MemoryBuffer *Buf); + +/// Analysis pass providing the \c BasicBlockSectionsProfileReader. +/// +/// Note that this pass's result cannot be invalidated, it is immutable for the +/// life of the module. 
+class BasicBlockSectionsProfileReaderAnalysis + : public AnalysisInfoMixin { + +public: + static AnalysisKey Key; + typedef BasicBlockSectionsProfileReader Result; + BasicBlockSectionsProfileReaderAnalysis(const TargetMachine *TM) : TM(TM) {} + + Result run(Function &F, FunctionAnalysisManager &AM); + +private: + const TargetMachine *TM; +}; + +class BasicBlockSectionsProfileReaderWrapperPass : public ImmutablePass { +public: + static char ID; + BasicBlockSectionsProfileReader BBSPR; + + BasicBlockSectionsProfileReaderWrapperPass(const MemoryBuffer *Buf) + : ImmutablePass(ID), BBSPR(BasicBlockSectionsProfileReader(Buf)) { + initializeBasicBlockSectionsProfileReaderWrapperPassPass( + *PassRegistry::getPassRegistry()); + }; + + BasicBlockSectionsProfileReaderWrapperPass() + : ImmutablePass(ID), BBSPR(BasicBlockSectionsProfileReader()) { + initializeBasicBlockSectionsProfileReaderWrapperPassPass( + *PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { + return "Basic Block Sections Profile Reader"; + } + + bool isFunctionHot(StringRef FuncName) const; + + std::pair> + getClusterInfoForFunction(StringRef FuncName) const; + + SmallVector> + getClonePathsForFunction(StringRef FuncName) const; + + // Initializes the FunctionNameToDIFilename map for the current module and + // then reads the profile for the matching functions. + bool doInitialization(Module &M) override; + + BasicBlockSectionsProfileReader &getBBSPR(); +}; } // namespace llvm #endif // LLVM_CODEGEN_BASICBLOCKSECTIONSPROFILEREADER_H diff --git a/llvm/include/llvm/CodeGen/CodeGenPrepare.h b/llvm/include/llvm/CodeGen/CodeGenPrepare.h new file mode 100644 index 0000000000000..dee3a9ee53d76 --- /dev/null +++ b/llvm/include/llvm/CodeGen/CodeGenPrepare.h @@ -0,0 +1,35 @@ +//===- CodeGenPrepare.h -----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// Defines an IR pass for CodeGen Prepare. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_PREPARE_H +#define LLVM_CODEGEN_PREPARE_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class Function; +class TargetMachine; + +class CodeGenPreparePass : public PassInfoMixin { +private: + const TargetMachine *TM; + +public: + CodeGenPreparePass(const TargetMachine *TM) : TM(TM) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +} // end namespace llvm + +#endif // LLVM_CODEGEN_PREPARE_H diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h index ca9fbb1def762..bbfb8a0dbe26a 100644 --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -93,9 +93,9 @@ namespace llvm { MachineFunctionPass *createResetMachineFunctionPass(bool EmitFallbackDiag, bool AbortOnFailedISel); - /// createCodeGenPreparePass - Transform the code to expose more pattern + /// createCodeGenPrepareLegacyPass - Transform the code to expose more pattern /// matching during instruction selection. 
- FunctionPass *createCodeGenPreparePass(); + FunctionPass *createCodeGenPrepareLegacyPass(); /// This pass implements generation of target-specific intrinsics to support /// handling of complex number arithmetic diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 46b1e95c3c15f..3db639a687240 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -54,7 +54,7 @@ void initializeAssignmentTrackingAnalysisPass(PassRegistry &); void initializeAssumptionCacheTrackerPass(PassRegistry&); void initializeAtomicExpandPass(PassRegistry&); void initializeBasicBlockPathCloningPass(PassRegistry &); -void initializeBasicBlockSectionsProfileReaderPass(PassRegistry &); +void initializeBasicBlockSectionsProfileReaderWrapperPassPass(PassRegistry &); void initializeBasicBlockSectionsPass(PassRegistry &); void initializeBarrierNoopPass(PassRegistry&); void initializeBasicAAWrapperPassPass(PassRegistry&); @@ -75,7 +75,7 @@ void initializeCallGraphDOTPrinterPass(PassRegistry&); void initializeCallGraphViewerPass(PassRegistry&); void initializeCallGraphWrapperPassPass(PassRegistry&); void initializeCheckDebugMachineModulePass(PassRegistry &); -void initializeCodeGenPreparePass(PassRegistry&); +void initializeCodeGenPrepareLegacyPassPass(PassRegistry &); void initializeComplexDeinterleavingLegacyPassPass(PassRegistry&); void initializeConstantHoistingLegacyPassPass(PassRegistry&); void initializeCycleInfoWrapperPassPass(PassRegistry &); diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h index 7a21876e565a7..fe7fedad18bc0 100644 --- a/llvm/include/llvm/LinkAllPasses.h +++ b/llvm/include/llvm/LinkAllPasses.h @@ -113,7 +113,7 @@ namespace { (void) llvm::createTailCallEliminationPass(); (void)llvm::createTLSVariableHoistPass(); (void) llvm::createConstantHoistingPass(); - (void) llvm::createCodeGenPreparePass(); + (void)llvm::createCodeGenPrepareLegacyPass(); (void) llvm::createEarlyCSEPass(); (void) llvm::createGVNPass(); (void) llvm::createPostDomTree(); diff --git a/llvm/lib/CodeGen/BasicBlockPathCloning.cpp b/llvm/lib/CodeGen/BasicBlockPathCloning.cpp index 5d5f3c3da4816..901542e8507bd 100644 --- a/llvm/lib/CodeGen/BasicBlockPathCloning.cpp +++ b/llvm/lib/CodeGen/BasicBlockPathCloning.cpp @@ -196,7 +196,7 @@ class BasicBlockPathCloning : public MachineFunctionPass { public: static char ID; - BasicBlockSectionsProfileReader *BBSectionsProfileReader = nullptr; + BasicBlockSectionsProfileReaderWrapperPass *BBSectionsProfileReader = nullptr; BasicBlockPathCloning() : MachineFunctionPass(ID) { initializeBasicBlockPathCloningPass(*PassRegistry::getPassRegistry()); @@ -218,7 +218,7 @@ INITIALIZE_PASS_BEGIN( BasicBlockPathCloning, "bb-path-cloning", "Applies path clonings for the -basic-block-sections=list option", false, false) -INITIALIZE_PASS_DEPENDENCY(BasicBlockSectionsProfileReader) +INITIALIZE_PASS_DEPENDENCY(BasicBlockSectionsProfileReaderWrapperPass) INITIALIZE_PASS_END( BasicBlockPathCloning, "bb-path-cloning", "Applies path clonings for the -basic-block-sections=list option", false, @@ -230,13 +230,14 @@ bool BasicBlockPathCloning::runOnMachineFunction(MachineFunction &MF) { if (hasInstrProfHashMismatch(MF)) return false; - return ApplyCloning(MF, getAnalysis() - .getClonePathsForFunction(MF.getName())); + return ApplyCloning(MF, + getAnalysis() + .getClonePathsForFunction(MF.getName())); } void BasicBlockPathCloning::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); - 
AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } diff --git a/llvm/lib/CodeGen/BasicBlockSections.cpp b/llvm/lib/CodeGen/BasicBlockSections.cpp index 42997d2287d61..94b5a503fbd0f 100644 --- a/llvm/lib/CodeGen/BasicBlockSections.cpp +++ b/llvm/lib/CodeGen/BasicBlockSections.cpp @@ -103,7 +103,7 @@ class BasicBlockSections : public MachineFunctionPass { public: static char ID; - BasicBlockSectionsProfileReader *BBSectionsProfileReader = nullptr; + BasicBlockSectionsProfileReaderWrapperPass *BBSectionsProfileReader = nullptr; BasicBlockSections() : MachineFunctionPass(ID) { initializeBasicBlockSectionsPass(*PassRegistry::getPassRegistry()); @@ -128,7 +128,7 @@ INITIALIZE_PASS_BEGIN( "Prepares for basic block sections, by splitting functions " "into clusters of basic blocks.", false, false) -INITIALIZE_PASS_DEPENDENCY(BasicBlockSectionsProfileReader) +INITIALIZE_PASS_DEPENDENCY(BasicBlockSectionsProfileReaderWrapperPass) INITIALIZE_PASS_END(BasicBlockSections, "bbsections-prepare", "Prepares for basic block sections, by splitting functions " "into clusters of basic blocks.", @@ -306,7 +306,7 @@ bool BasicBlockSections::runOnMachineFunction(MachineFunction &MF) { DenseMap FuncClusterInfo; if (BBSectionsType == BasicBlockSection::List) { auto [HasProfile, ClusterInfo] = - getAnalysis() + getAnalysis() .getClusterInfoForFunction(MF.getName()); if (!HasProfile) return false; @@ -362,7 +362,7 @@ bool BasicBlockSections::runOnMachineFunction(MachineFunction &MF) { void BasicBlockSections::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); - AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } diff --git a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp index 15b6f63e86327..79e42d9304dfe 100644 --- a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp +++ b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp @@ -30,8 +30,9 @@ using namespace llvm; -char BasicBlockSectionsProfileReader::ID = 0; -INITIALIZE_PASS(BasicBlockSectionsProfileReader, "bbsections-profile-reader", +char BasicBlockSectionsProfileReaderWrapperPass::ID = 0; +INITIALIZE_PASS(BasicBlockSectionsProfileReaderWrapperPass, + "bbsections-profile-reader", "Reads and parses a basic block sections profile.", false, false) @@ -395,11 +396,11 @@ Error BasicBlockSectionsProfileReader::ReadProfile() { } } -bool BasicBlockSectionsProfileReader::doInitialization(Module &M) { - if (!MBuf) +bool BasicBlockSectionsProfileReaderWrapperPass::doInitialization(Module &M) { + if (!BBSPR.MBuf) return false; // Get the function name to debug info filename mapping. 
-  FunctionNameToDIFilename.clear();
+  BBSPR.FunctionNameToDIFilename.clear();
   for (const Function &F : M) {
     SmallString<128> DIFilename;
     if (F.isDeclaration())
@@ -411,15 +412,46 @@ bool BasicBlockSectionsProfileReader::doInitialization(Module &M) {
       DIFilename = sys::path::remove_leading_dotslash(CU->getFilename());
     }
     [[maybe_unused]] bool inserted =
-        FunctionNameToDIFilename.try_emplace(F.getName(), DIFilename).second;
+        BBSPR.FunctionNameToDIFilename.try_emplace(F.getName(), DIFilename)
+            .second;
     assert(inserted);
   }
 
-  if (auto Err = ReadProfile())
+  if (auto Err = BBSPR.ReadProfile())
     report_fatal_error(std::move(Err));
   return false;
 }
 
-ImmutablePass *
-llvm::createBasicBlockSectionsProfileReaderPass(const MemoryBuffer *Buf) {
-  return new BasicBlockSectionsProfileReader(Buf);
+AnalysisKey BasicBlockSectionsProfileReaderAnalysis::Key;
+
+BasicBlockSectionsProfileReader
+BasicBlockSectionsProfileReaderAnalysis::run(Function &F,
+                                             FunctionAnalysisManager &AM) {
+  return BasicBlockSectionsProfileReader(TM->getBBSectionsFuncListBuf());
+}
+
+bool BasicBlockSectionsProfileReaderWrapperPass::isFunctionHot(
+    StringRef FuncName) const {
+  return BBSPR.isFunctionHot(FuncName);
+}
+
+std::pair<bool, SmallVector<BBClusterInfo>>
+BasicBlockSectionsProfileReaderWrapperPass::getClusterInfoForFunction(
+    StringRef FuncName) const {
+  return BBSPR.getClusterInfoForFunction(FuncName);
+}
+
+SmallVector<SmallVector<unsigned>>
+BasicBlockSectionsProfileReaderWrapperPass::getClonePathsForFunction(
+    StringRef FuncName) const {
+  return BBSPR.getClonePathsForFunction(FuncName);
+}
+
+BasicBlockSectionsProfileReader &
+BasicBlockSectionsProfileReaderWrapperPass::getBBSPR() {
+  return BBSPR;
+}
+
+ImmutablePass *llvm::createBasicBlockSectionsProfileReaderWrapperPass(
+    const MemoryBuffer *Buf) {
+  return new BasicBlockSectionsProfileReaderWrapperPass(Buf);
 }
diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp
index 7b73a7b11ddf1..418066452c172 100644
--- a/llvm/lib/CodeGen/CodeGen.cpp
+++ b/llvm/lib/CodeGen/CodeGen.cpp
@@ -30,7 +30,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeCFIFixupPass(Registry);
   initializeCFIInstrInserterPass(Registry);
   initializeCheckDebugMachineModulePass(Registry);
-  initializeCodeGenPreparePass(Registry);
+  initializeCodeGenPrepareLegacyPassPass(Registry);
   initializeDeadMachineInstructionElimPass(Registry);
   initializeDebugifyMachineModulePass(Registry);
   initializeDetectDeadLanesPass(Registry);
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 5bd4c6b067d79..35a70f0b8c23a 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -12,6 +12,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/CodeGen/CodeGenPrepare.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
@@ -301,7 +302,8 @@ using ValueToSExts = MapVector<Value *, SExts>;
 
 class TypePromotionTransaction;
 
-class CodeGenPrepare : public FunctionPass {
+class CodeGenPrepare {
+  friend class CodeGenPrepareLegacyPass;
   const TargetMachine *TM = nullptr;
   const TargetSubtargetInfo *SubtargetInfo = nullptr;
   const TargetLowering *TLI = nullptr;
@@ -365,6 +367,8 @@ class CodeGenPrepare : public FunctionPass {
   std::unique_ptr<DominatorTree> DT;
 
 public:
+  CodeGenPrepare(){};
+  CodeGenPrepare(const TargetMachine *TM) : TM(TM){};
   /// If encounter huge function, we need to limit the build time.
   bool IsHugeFunc = false;
 
@@ -374,15 +378,7 @@ class CodeGenPrepare : public FunctionPass {
   /// to insert such BB into FreshBBs for huge function.
   SmallSet<BasicBlock *, 32> FreshBBs;
 
-  static char ID; // Pass identification, replacement for typeid
-
-  CodeGenPrepare() : FunctionPass(ID) {
-    initializeCodeGenPreparePass(*PassRegistry::getPassRegistry());
-  }
-
-  bool runOnFunction(Function &F) override;
-
-  void releaseMemory() override {
+  void releaseMemory() {
     // Clear per function information.
     InsertedInsts.clear();
     PromotedInsts.clear();
@@ -391,17 +387,7 @@ class CodeGenPrepare : public FunctionPass {
     BFI.reset();
   }
 
-  StringRef getPassName() const override { return "CodeGen Prepare"; }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    // FIXME: When we can selectively preserve passes, preserve the domtree.
-    AU.addRequired<ProfileSummaryInfoWrapperPass>();
-    AU.addRequired<TargetLibraryInfoWrapperPass>();
-    AU.addRequired<TargetPassConfig>();
-    AU.addRequired<TargetTransformInfoWrapperPass>();
-    AU.addRequired<LoopInfoWrapperPass>();
-    AU.addUsedIfAvailable<BasicBlockSectionsProfileReader>();
-  }
+  bool run(Function &F, FunctionAnalysisManager &AM);
 
 private:
   template <typename F>
@@ -488,45 +474,107 @@ class CodeGenPrepare : public FunctionPass {
   bool combineToUSubWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
   bool combineToUAddWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
   void verifyBFIUpdates(Function &F);
+  bool _run(Function &F);
+};
+
+class CodeGenPrepareLegacyPass : public FunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+
+  CodeGenPrepareLegacyPass() : FunctionPass(ID) {
+    initializeCodeGenPrepareLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override;
+
+  StringRef getPassName() const override { return "CodeGen Prepare"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    // FIXME: When we can selectively preserve passes, preserve the domtree.
+    AU.addRequired<ProfileSummaryInfoWrapperPass>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<TargetPassConfig>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addUsedIfAvailable<BasicBlockSectionsProfileReaderWrapperPass>();
+  }
+};
 
 } // end anonymous namespace
 
-char CodeGenPrepare::ID = 0;
+char CodeGenPrepareLegacyPass::ID = 0;
 
-INITIALIZE_PASS_BEGIN(CodeGenPrepare, DEBUG_TYPE,
+bool CodeGenPrepareLegacyPass::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+  auto TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
+  CodeGenPrepare CGP(TM);
+  CGP.DL = &F.getParent()->getDataLayout();
+  CGP.SubtargetInfo = TM->getSubtargetImpl(F);
+  CGP.TLI = CGP.SubtargetInfo->getTargetLowering();
+  CGP.TRI = CGP.SubtargetInfo->getRegisterInfo();
+  CGP.TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+  CGP.TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+  CGP.LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  CGP.BPI.reset(new BranchProbabilityInfo(F, *CGP.LI));
+  CGP.BFI.reset(new BlockFrequencyInfo(F, *CGP.BPI, *CGP.LI));
+  CGP.PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  auto BBSPRWP =
+      getAnalysisIfAvailable<BasicBlockSectionsProfileReaderWrapperPass>();
+  CGP.BBSectionsProfileReader = BBSPRWP ? &BBSPRWP->getBBSPR() : nullptr;
+
+  return CGP._run(F);
+}
+
+INITIALIZE_PASS_BEGIN(CodeGenPrepareLegacyPass, DEBUG_TYPE,
                       "Optimize for code generation", false, false)
-INITIALIZE_PASS_DEPENDENCY(BasicBlockSectionsProfileReader)
+INITIALIZE_PASS_DEPENDENCY(BasicBlockSectionsProfileReaderWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(CodeGenPrepare, DEBUG_TYPE, "Optimize for code generation",
-                    false, false)
+INITIALIZE_PASS_END(CodeGenPrepareLegacyPass, DEBUG_TYPE,
+                    "Optimize for code generation", false, false)
 
-FunctionPass *llvm::createCodeGenPreparePass() { return new CodeGenPrepare(); }
+FunctionPass *llvm::createCodeGenPrepareLegacyPass() {
+  return new CodeGenPrepareLegacyPass();
+}
 
-bool CodeGenPrepare::runOnFunction(Function &F) {
-  if (skipFunction(F))
-    return false;
+PreservedAnalyses CodeGenPreparePass::run(Function &F,
+                                          FunctionAnalysisManager &AM) {
+  CodeGenPrepare CGP(TM);
 
-  DL = &F.getParent()->getDataLayout();
+  bool Changed = CGP.run(F, AM);
+  if (!Changed)
+    return PreservedAnalyses::all();
 
-  bool EverMadeChange = false;
+  PreservedAnalyses PA;
+  PA.preserveSet();
+  PA.preserve();
+  return PA;
+}
 
-  TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
+bool CodeGenPrepare::run(Function &F, FunctionAnalysisManager &AM) {
+  DL = &F.getParent()->getDataLayout();
   SubtargetInfo = TM->getSubtargetImpl(F);
   TLI = SubtargetInfo->getTargetLowering();
   TRI = SubtargetInfo->getRegisterInfo();
-  TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
-  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  TLInfo = &AM.getResult<TargetLibraryAnalysis>(F);
+  TTI = &AM.getResult<TargetIRAnalysis>(F);
+  LI = &AM.getResult<LoopAnalysis>(F);
   BPI.reset(new BranchProbabilityInfo(F, *LI));
   BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI));
-  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
+  PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
   BBSectionsProfileReader =
-      getAnalysisIfAvailable<BasicBlockSectionsProfileReader>();
+      AM.getCachedResult<BasicBlockSectionsProfileReaderAnalysis>(F);
+  return _run(F);
+}
+
+bool CodeGenPrepare::_run(Function &F) {
+  bool EverMadeChange = false;
+
   OptSize = F.hasOptSize();
   // Use the basic-block-sections profile to promote hot functions to .text.hot
   // if requested.
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index 4003a08a5422d..3bbc792f4cbf4 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -978,7 +978,7 @@ void TargetPassConfig::addPassesToHandleExceptions() {
 /// before exception handling preparation passes.
 void TargetPassConfig::addCodeGenPrepare() {
   if (getOptLevel() != CodeGenOptLevel::None && !DisableCGP)
-    addPass(createCodeGenPreparePass());
+    addPass(createCodeGenPrepareLegacyPass());
 }
 
 /// Add common passes that perform LLVM IR to IR transforms in preparation for
@@ -1271,7 +1271,7 @@ void TargetPassConfig::addMachinePasses() {
   // together. Update this check once we have addressed any issues.
if (TM->getBBSectionsType() != llvm::BasicBlockSection::None) { if (TM->getBBSectionsType() == llvm::BasicBlockSection::List) { - addPass(llvm::createBasicBlockSectionsProfileReaderPass( + addPass(llvm::createBasicBlockSectionsProfileReaderWrapperPass( TM->getBBSectionsFuncListBuf())); addPass(llvm::createBasicBlockPathCloningPass()); } diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 439f749bda8bb..ca943b43404f2 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -72,7 +72,9 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/TypeBasedAliasAnalysis.h" #include "llvm/Analysis/UniformityAnalysis.h" +#include "llvm/CodeGen/BasicBlockSectionsProfileReader.h" #include "llvm/CodeGen/CallBrPrepare.h" +#include "llvm/CodeGen/CodeGenPrepare.h" #include "llvm/CodeGen/DwarfEHPrepare.h" #include "llvm/CodeGen/ExpandLargeDivRem.h" #include "llvm/CodeGen/ExpandLargeFpConvert.h" diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 82ce040c64962..a8965335b2ead 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -229,6 +229,7 @@ CGSCC_PASS_WITH_PARAMS( FUNCTION_ANALYSIS("aa", AAManager()) FUNCTION_ANALYSIS("access-info", LoopAccessAnalysis()) FUNCTION_ANALYSIS("assumptions", AssumptionAnalysis()) +FUNCTION_ANALYSIS("bb-sections-profile-reader", BasicBlockSectionsProfileReaderAnalysis(TM)) FUNCTION_ANALYSIS("block-freq", BlockFrequencyAnalysis()) FUNCTION_ANALYSIS("branch-prob", BranchProbabilityAnalysis()) FUNCTION_ANALYSIS("cycles", CycleAnalysis()) @@ -290,6 +291,7 @@ FUNCTION_PASS("break-crit-edges", BreakCriticalEdgesPass()) FUNCTION_PASS("callbrprepare", CallBrPreparePass()) FUNCTION_PASS("callsite-splitting", CallSiteSplittingPass()) FUNCTION_PASS("chr", ControlHeightReductionPass()) +FUNCTION_PASS("codegenprepare", CodeGenPreparePass(TM)) FUNCTION_PASS("consthoist", ConstantHoistingPass()) FUNCTION_PASS("constraint-elimination", ConstraintEliminationPass()) FUNCTION_PASS("coro-elide", CoroElidePass()) diff --git a/llvm/test/CodeGen/AArch64/aarch64-codegen-prepare-atp.ll b/llvm/test/CodeGen/AArch64/aarch64-codegen-prepare-atp.ll index 92f29dac13fa4..10b594978bee2 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-codegen-prepare-atp.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-codegen-prepare-atp.ll @@ -1,4 +1,4 @@ -; RUN: opt -codegenprepare < %s -S | FileCheck %s +; RUN: opt -passes='require,function(codegenprepare)' < %s -S | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnu" diff --git a/llvm/test/CodeGen/AArch64/and-sink.ll b/llvm/test/CodeGen/AArch64/and-sink.ll index f4e9551259e4e..4d085869de244 100644 --- a/llvm/test/CodeGen/AArch64/and-sink.ll +++ b/llvm/test/CodeGen/AArch64/and-sink.ll @@ -1,6 +1,6 @@ ; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs < %s | FileCheck %s -; RUN: opt -S -codegenprepare -mtriple=aarch64-linux %s | FileCheck --check-prefix=CHECK-CGP %s -; RUN: opt -S -codegenprepare -cgpp-huge-func=0 -mtriple=aarch64-linux %s | FileCheck --check-prefix=CHECK-CGP %s +; RUN: opt -S -passes='require,function(codegenprepare)' -mtriple=aarch64-linux %s | FileCheck --check-prefix=CHECK-CGP %s +; RUN: opt -S -passes='require,function(codegenprepare)' -cgpp-huge-func=0 -mtriple=aarch64-linux %s | FileCheck --check-prefix=CHECK-CGP %s @A = dso_local global i32 zeroinitializer @B = dso_local global i32 zeroinitializer diff --git 
a/llvm/test/CodeGen/AArch64/arm64-bitfield-extract.ll b/llvm/test/CodeGen/AArch64/arm64-bitfield-extract.ll index 6041904dc0f31..2b42a3f29a72b 100644 --- a/llvm/test/CodeGen/AArch64/arm64-bitfield-extract.ll +++ b/llvm/test/CodeGen/AArch64/arm64-bitfield-extract.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -codegenprepare -mtriple=arm64-apple=ios -S -o - %s | FileCheck --check-prefix=OPT %s +; RUN: opt -passes='require,function(codegenprepare)' -mtriple=arm64-apple=ios -S -o - %s | FileCheck --check-prefix=OPT %s ; RUN: llc < %s -mtriple=arm64-eabi | FileCheck --check-prefix=LLC %s %struct.X = type { i8, i8, [2 x i8] } diff --git a/llvm/test/CodeGen/AArch64/arm64-codegen-prepare-extload.ll b/llvm/test/CodeGen/AArch64/arm64-codegen-prepare-extload.ll index 23cbad0d15b4c..78a0bb4982b64 100644 --- a/llvm/test/CodeGen/AArch64/arm64-codegen-prepare-extload.ll +++ b/llvm/test/CodeGen/AArch64/arm64-codegen-prepare-extload.ll @@ -1,6 +1,6 @@ -; RUN: opt -codegenprepare < %s -mtriple=aarch64-apple-ios -S | FileCheck -enable-var-scope %s --check-prefix=OPTALL --check-prefix=OPT --check-prefix=NONSTRESS -; RUN: opt -codegenprepare < %s -mtriple=aarch64-apple-ios -S -stress-cgp-ext-ld-promotion | FileCheck -enable-var-scope %s --check-prefix=OPTALL --check-prefix=OPT --check-prefix=STRESS -; RUN: opt -codegenprepare < %s -mtriple=aarch64-apple-ios -S -disable-cgp-ext-ld-promotion | FileCheck -enable-var-scope %s --check-prefix=OPTALL --check-prefix=DISABLE +; RUN: opt -passes='require,function(codegenprepare)' < %s -mtriple=aarch64-apple-ios -S | FileCheck -enable-var-scope %s --check-prefix=OPTALL --check-prefix=OPT --check-prefix=NONSTRESS +; RUN: opt -passes='require,function(codegenprepare)' < %s -mtriple=aarch64-apple-ios -S -stress-cgp-ext-ld-promotion | FileCheck -enable-var-scope %s --check-prefix=OPTALL --check-prefix=OPT --check-prefix=STRESS +; RUN: opt -passes='require,function(codegenprepare)' < %s -mtriple=aarch64-apple-ios -S -disable-cgp-ext-ld-promotion | FileCheck -enable-var-scope %s --check-prefix=OPTALL --check-prefix=DISABLE ; CodeGenPrepare should move the zext into the block with the load ; so that SelectionDAG can select it with the load. diff --git a/llvm/test/CodeGen/AArch64/arm64_32-gep-sink.ll b/llvm/test/CodeGen/AArch64/arm64_32-gep-sink.ll index 964c669439d4d..c33159163901f 100644 --- a/llvm/test/CodeGen/AArch64/arm64_32-gep-sink.ll +++ b/llvm/test/CodeGen/AArch64/arm64_32-gep-sink.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt -codegenprepare -mtriple=arm64_32-apple-ios %s -S -o - | FileCheck %s +; RUN: opt -passes='require,function(codegenprepare)' -mtriple=arm64_32-apple-ios %s -S -o - | FileCheck %s define void @test_simple_sink(ptr %base, i64 %offset) { ; CHECK-LABEL: define void @test_simple_sink( diff --git a/llvm/test/CodeGen/AArch64/cgp-trivial-phi-node.ll b/llvm/test/CodeGen/AArch64/cgp-trivial-phi-node.ll index 98b820709e829..dfcedc8dd984d 100644 --- a/llvm/test/CodeGen/AArch64/cgp-trivial-phi-node.ll +++ b/llvm/test/CodeGen/AArch64/cgp-trivial-phi-node.ll @@ -1,5 +1,5 @@ ; Checks that case when GEP is bound to trivial PHI node is correctly handled. 
-; RUN: opt %s -mtriple=aarch64-linux-gnu -codegenprepare -S -o - | FileCheck %s +; RUN: opt %s -mtriple=aarch64-linux-gnu -passes='require,function(codegenprepare)' -S -o - | FileCheck %s ; CHECK: define void @crash(ptr %s, i32 %n) { ; CHECK-NEXT: entry: diff --git a/llvm/test/CodeGen/AArch64/convertphitype.ll b/llvm/test/CodeGen/AArch64/convertphitype.ll index a5fc46d2abcaa..b723b470266a7 100644 --- a/llvm/test/CodeGen/AArch64/convertphitype.ll +++ b/llvm/test/CodeGen/AArch64/convertphitype.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -codegenprepare -cgp-optimize-phi-types %s -S | FileCheck %s +; RUN: opt -passes='require,function(codegenprepare)' -cgp-optimize-phi-types %s -S | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnu" diff --git a/llvm/test/CodeGen/AArch64/scalable-vector-promotion.ll b/llvm/test/CodeGen/AArch64/scalable-vector-promotion.ll index e6ab52dc9e619..a45f3733dbdae 100644 --- a/llvm/test/CodeGen/AArch64/scalable-vector-promotion.ll +++ b/llvm/test/CodeGen/AArch64/scalable-vector-promotion.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=aarch64 -codegenprepare -S < %s | FileCheck %s +; RUN: opt -mtriple=aarch64 -passes='require,function(codegenprepare)' -S < %s | FileCheck %s ; This test intends to check vector promotion for scalable vector. Current target lowering ; rejects scalable vector before reaching getConstantVector() in CodeGenPrepare. This test diff --git a/llvm/test/CodeGen/AArch64/sve-vscale.ll b/llvm/test/CodeGen/AArch64/sve-vscale.ll index fa48808ff7f81..7214c98cc3186 100644 --- a/llvm/test/CodeGen/AArch64/sve-vscale.ll +++ b/llvm/test/CodeGen/AArch64/sve-vscale.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple aarch64 -mattr=+sve -asm-verbose=0 < %s | FileCheck %s -; RUN: opt -mtriple=aarch64 -codegenprepare -S < %s | llc -mtriple=aarch64 -mattr=+sve -asm-verbose=0 | FileCheck %s +; RUN: opt -mtriple=aarch64 -passes='require,function(codegenprepare)' -S < %s | llc -mtriple=aarch64 -mattr=+sve -asm-verbose=0 | FileCheck %s ; ; RDVL diff --git a/llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll b/llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll index c80aa82ef9a83..bf3082d99c669 100644 --- a/llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll +++ b/llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt -codegenprepare -S -o - %s | FileCheck %s +; RUN: opt -passes='require,function(codegenprepare)' -S -o - %s | FileCheck %s target triple = "aarch64-unknown-linux-gnu" diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll index 3005b17e0524c..88a8e7cc42203 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck --check-prefixes=OPT,OPT-GFX7 %s -; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga < %s | FileCheck --check-prefixes=OPT,OPT-GFX8 %s -; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=gfx900 < %s | FileCheck --check-prefixes=OPT,OPT-GFX9 %s -; RUN: opt -S -codegenprepare 
-mtriple=amdgcn-unknown-unknown -mcpu=gfx1030 < %s | FileCheck --check-prefixes=OPT,OPT-GFX10 %s +; RUN: opt -S -passes='require,function(codegenprepare)' -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck --check-prefixes=OPT,OPT-GFX7 %s +; RUN: opt -S -passes='require,function(codegenprepare)' -mtriple=amdgcn-unknown-unknown -mcpu=tonga < %s | FileCheck --check-prefixes=OPT,OPT-GFX8 %s +; RUN: opt -S -passes='require,function(codegenprepare)' -mtriple=amdgcn-unknown-unknown -mcpu=gfx900 < %s | FileCheck --check-prefixes=OPT,OPT-GFX9 %s +; RUN: opt -S -passes='require,function(codegenprepare)' -mtriple=amdgcn-unknown-unknown -mcpu=gfx1030 < %s | FileCheck --check-prefixes=OPT,OPT-GFX10 %s ; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck --check-prefix=GFX7 %s ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=GFX8 %s diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll index aee6f0e82d251..1588dde19cfb7 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: opt -S -codegenprepare -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=OPT %s +; RUN: opt -S -passes='require,function(codegenprepare)' -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=OPT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GCN %s ; Make sure we match the addressing mode offset of csub intrinsics across blocks. diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll index 494b4b5c48ba9..ac50fb86c96f7 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: opt -S -codegenprepare -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=OPT %s +; RUN: opt -S -passes='require,function(codegenprepare)' -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=OPT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GCN %s ; Make sure we match the addressing mode offset of globla.atomic.fadd intrinsics across blocks. 
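Note on the RUN-line updates in these tests: the legacy -codegenprepare flag is replaced by an explicit new-pass-manager pipeline. CodeGenPreparePass only reads ProfileSummaryInfo from the module analysis cache, so the pipeline must compute it first (the require<profile-summary> element) before running function(codegenprepare). The C++ below is a rough equivalent of that pipeline, shown for illustration only; the module M, the TargetMachine pointer, and the helper name are placeholders, not part of this patch.

// Sketch only: drives the same pipeline the updated RUN lines request.
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/CodeGenPrepare.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Target/TargetMachine.h"

static void runCodeGenPrepare(llvm::Module &M, llvm::TargetMachine *TM) {
  // Standard new-pass-manager analysis setup.
  llvm::LoopAnalysisManager LAM;
  llvm::FunctionAnalysisManager FAM;
  llvm::CGSCCAnalysisManager CGAM;
  llvm::ModuleAnalysisManager MAM;
  llvm::PassBuilder PB(TM);
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  // Equivalent of -passes='require<profile-summary>,function(codegenprepare)':
  // compute the module-level profile summary, then run CodeGenPrepare on each
  // function, constructing the pass with the TargetMachine as PassRegistry.def
  // does for the "codegenprepare" entry.
  llvm::ModulePassManager MPM;
  MPM.addPass(
      llvm::RequireAnalysisPass<llvm::ProfileSummaryAnalysis, llvm::Module>());
  llvm::FunctionPassManager FPM;
  FPM.addPass(llvm::CodeGenPreparePass(TM));
  MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM)));
  MPM.run(M, MAM);
}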
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll index 0e61547c27b45..65a604c98c4bd 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll @@ -1,7 +1,7 @@ -; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI -check-prefix=OPT-SICIVI %s -; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI -check-prefix=OPT-SICIVI %s -; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI -check-prefix=OPT-SICIVI %s -; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=gfx900 < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-GFX9 %s +; RUN: opt -S -passes='require,function(codegenprepare)' -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI -check-prefix=OPT-SICIVI %s +; RUN: opt -S -passes='require,function(codegenprepare)' -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI -check-prefix=OPT-SICIVI %s +; RUN: opt -S -passes='require,function(codegenprepare)' -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI -check-prefix=OPT-SICIVI %s +; RUN: opt -S -passes='require,function(codegenprepare)' -mtriple=amdgcn-unknown-unknown -mcpu=gfx900 < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-GFX9 %s ; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICIVI %s ; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=SICIVI %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s diff --git a/llvm/test/CodeGen/ARM/vector-promotion.ll b/llvm/test/CodeGen/ARM/vector-promotion.ll index 3e314306ff083..f4a2a4a4521e9 100644 --- a/llvm/test/CodeGen/ARM/vector-promotion.ll +++ b/llvm/test/CodeGen/ARM/vector-promotion.ll @@ -1,5 +1,5 @@ -; RUN: opt -codegenprepare -mtriple=thumbv7-apple-ios %s -o - -mattr=+neon -S | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-NORMAL %s -; RUN: opt -codegenprepare -mtriple=thumbv7-apple-ios %s -o - -mattr=+neon -S -stress-cgp-store-extract | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-STRESS %s +; RUN: opt -passes='require,function(codegenprepare)' -mtriple=thumbv7-apple-ios %s -o - -mattr=+neon -S | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-NORMAL %s +; RUN: opt -passes='require,function(codegenprepare)' -mtriple=thumbv7-apple-ios %s -o - -mattr=+neon -S -stress-cgp-store-extract | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-STRESS %s ; RUN: llc -mtriple=thumbv7-apple-ios %s -o - -mattr=+neon | FileCheck --check-prefix=ASM %s ; IR-BOTH-LABEL: @simpleOneInstructionPromotion diff --git a/llvm/test/CodeGen/Generic/addr-sink-call-multi-arg.ll b/llvm/test/CodeGen/Generic/addr-sink-call-multi-arg.ll index b02bdc3b57242..2a8d05c4f26e9 100644 --- a/llvm/test/CodeGen/Generic/addr-sink-call-multi-arg.ll +++ 
b/llvm/test/CodeGen/Generic/addr-sink-call-multi-arg.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -codegenprepare < %s | FileCheck %s +; RUN: opt -S -passes='require,function(codegenprepare)' < %s | FileCheck %s ; REQUIRES: aarch64-registered-target ; Check that we don't give up if unable to sink the first argument. diff --git a/llvm/test/CodeGen/Generic/addr-use-count.ll b/llvm/test/CodeGen/Generic/addr-use-count.ll index 00943b5a58e2b..5c4c0c618794c 100644 --- a/llvm/test/CodeGen/Generic/addr-use-count.ll +++ b/llvm/test/CodeGen/Generic/addr-use-count.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -codegenprepare < %s | FileCheck %s +; RUN: opt -S -passes='require,function(codegenprepare)' < %s | FileCheck %s ; REQUIRES: aarch64-registered-target ; Test that `%addr` is sunk, after we've increased limit on the number of the memory uses to scan. diff --git a/llvm/test/CodeGen/X86/callbr-codegenprepare.ll b/llvm/test/CodeGen/X86/callbr-codegenprepare.ll index 854cc4bf7a9cc..d2380a730b5b3 100644 --- a/llvm/test/CodeGen/X86/callbr-codegenprepare.ll +++ b/llvm/test/CodeGen/X86/callbr-codegenprepare.ll @@ -1,4 +1,4 @@ -;; RUN: opt -S -codegenprepare < %s | FileCheck %s +;; RUN: opt -S -passes='require,function(codegenprepare)' < %s | FileCheck %s ;; Ensure that codegenprepare (via InstSimplify) doesn't eliminate the ;; phi here (which would cause a module verification error). diff --git a/llvm/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll b/llvm/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll index 6e95c91e73989..c611e89f27865 100644 --- a/llvm/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll +++ b/llvm/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -codegenprepare %s -o - | FileCheck %s +; RUN: opt -S -passes='require,function(codegenprepare)' %s -o - | FileCheck %s ; This file tests the different cases what are involved when codegen prepare ; tries to get sign/zero extension out of the way of addressing mode. 
; This tests require an actual target as addressing mode decisions depends diff --git a/llvm/test/CodeGen/X86/codegen-prepare-extload.ll b/llvm/test/CodeGen/X86/codegen-prepare-extload.ll index ff0ac5bfd6a7d..ce87985fb3a05 100644 --- a/llvm/test/CodeGen/X86/codegen-prepare-extload.ll +++ b/llvm/test/CodeGen/X86/codegen-prepare-extload.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s ; RUN: llc < %s -mtriple=x86_64-win64 | FileCheck %s -; RUN: opt -codegenprepare < %s -mtriple=x86_64-apple-macosx -S | FileCheck %s --check-prefix=OPTALL --check-prefix=OPT --check-prefix=NONSTRESS -; RUN: opt -codegenprepare < %s -mtriple=x86_64-apple-macosx -S -stress-cgp-ext-ld-promotion | FileCheck %s --check-prefix=OPTALL --check-prefix=OPT --check-prefix=STRESS -; RUN: opt -codegenprepare < %s -mtriple=x86_64-apple-macosx -S -disable-cgp-ext-ld-promotion | FileCheck %s --check-prefix=OPTALL --check-prefix=DISABLE +; RUN: opt -passes='require,function(codegenprepare)' < %s -mtriple=x86_64-apple-macosx -S | FileCheck %s --check-prefix=OPTALL --check-prefix=OPT --check-prefix=NONSTRESS +; RUN: opt -passes='require,function(codegenprepare)' < %s -mtriple=x86_64-apple-macosx -S -stress-cgp-ext-ld-promotion | FileCheck %s --check-prefix=OPTALL --check-prefix=OPT --check-prefix=STRESS +; RUN: opt -passes='require,function(codegenprepare)' < %s -mtriple=x86_64-apple-macosx -S -disable-cgp-ext-ld-promotion | FileCheck %s --check-prefix=OPTALL --check-prefix=DISABLE ; rdar://7304838 ; CodeGenPrepare should move the zext into the block with the load diff --git a/llvm/test/CodeGen/X86/convertphitype.ll b/llvm/test/CodeGen/X86/convertphitype.ll index df01612252bf1..6c77236fc6988 100644 --- a/llvm/test/CodeGen/X86/convertphitype.ll +++ b/llvm/test/CodeGen/X86/convertphitype.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -codegenprepare -cgp-optimize-phi-types=true %s -S | FileCheck %s +; RUN: opt -passes='require,function(codegenprepare)' -cgp-optimize-phi-types=true %s -S | FileCheck %s target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" target triple = "i386-unknown-linux-gnu" diff --git a/llvm/test/CodeGen/X86/indirect-br-gep-unmerge.ll b/llvm/test/CodeGen/X86/indirect-br-gep-unmerge.ll index 6b953e3004256..ac23de49d3bd6 100644 --- a/llvm/test/CodeGen/X86/indirect-br-gep-unmerge.ll +++ b/llvm/test/CodeGen/X86/indirect-br-gep-unmerge.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt -S -codegenprepare %s -o - | FileCheck %s +; RUN: opt -S -passes='require,function(codegenprepare)' %s -o - | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/CodeGen/X86/pr58538.ll b/llvm/test/CodeGen/X86/pr58538.ll index 6c4103718950e..7bae2594fc020 100644 --- a/llvm/test/CodeGen/X86/pr58538.ll +++ b/llvm/test/CodeGen/X86/pr58538.ll @@ -1,5 +1,5 @@ -; RUN: opt -codegenprepare -mtriple=x86_64 %s -S -o - | FileCheck %s -; RUN: opt -codegenprepare -mtriple=i386 %s -S -o - | FileCheck %s +; RUN: opt -passes='require,function(codegenprepare)' -mtriple=x86_64 %s -S -o - | FileCheck %s +; RUN: opt -passes='require,function(codegenprepare)' -mtriple=i386 %s -S -o - | FileCheck %s define i32 @f(i32 %0) { ; CHECK-LABEL: @f diff --git 
a/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll b/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll index 48440558283d4..75bbae1050d61 100644 --- a/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll +++ b/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s -; RUN: opt -S -codegenprepare %s -mtriple=x86_64-apple-darwin -o - | FileCheck %s --check-prefix OPT +; RUN: opt -S -passes='require,function(codegenprepare)' %s -mtriple=x86_64-apple-darwin -o - | FileCheck %s --check-prefix OPT ; Teach CGP to dup returns to enable tail call optimization. ; rdar://9147433 diff --git a/llvm/test/CodeGen/X86/tailcall-extract.ll b/llvm/test/CodeGen/X86/tailcall-extract.ll index c3597a8e5b99e..7a6c75c44ca7d 100644 --- a/llvm/test/CodeGen/X86/tailcall-extract.ll +++ b/llvm/test/CodeGen/X86/tailcall-extract.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=x86_64-linux < %s | FileCheck %s -; RUN: opt -codegenprepare -S -mtriple=x86_64-linux < %s | FileCheck %s --check-prefix OPT +; RUN: opt -passes='require,function(codegenprepare)' -S -mtriple=x86_64-linux < %s | FileCheck %s --check-prefix OPT ; The exit block containing extractvalue can be duplicated into the BB diff --git a/llvm/test/DebugInfo/ARM/salvage-debug-info.ll b/llvm/test/DebugInfo/ARM/salvage-debug-info.ll index 3717abada42e1..1564a80ded0e8 100644 --- a/llvm/test/DebugInfo/ARM/salvage-debug-info.ll +++ b/llvm/test/DebugInfo/ARM/salvage-debug-info.ll @@ -1,4 +1,4 @@ -; RUN: opt -codegenprepare -S %s -o - | FileCheck %s +; RUN: opt -passes='require,function(codegenprepare)' -S %s -o - | FileCheck %s ; typedef struct info { ; unsigned long long size; ; } info_t; diff --git a/llvm/test/DebugInfo/X86/zextload.ll b/llvm/test/DebugInfo/X86/zextload.ll index 888e230c258da..05d92b09c20c2 100644 --- a/llvm/test/DebugInfo/X86/zextload.ll +++ b/llvm/test/DebugInfo/X86/zextload.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -codegenprepare < %s | FileCheck %s +; RUN: opt -S -passes='require,function(codegenprepare)' < %s | FileCheck %s ; ; This test case was generated from the following source code: ; diff --git a/llvm/test/Other/codegenprepare-and-debug.ll b/llvm/test/Other/codegenprepare-and-debug.ll index 9023e8f219976..95f48c630efb6 100644 --- a/llvm/test/Other/codegenprepare-and-debug.ll +++ b/llvm/test/Other/codegenprepare-and-debug.ll @@ -1,4 +1,4 @@ -; RUN: opt -codegenprepare -S < %s | FileCheck %s +; RUN: opt -passes='require,function(codegenprepare)' -S < %s | FileCheck %s ; RUN: opt -strip-debug -codegenprepare -S < %s | FileCheck %s ; REQUIRES: x86-registered-target diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/combine-address-mode.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/combine-address-mode.ll index 91194864c013f..25d4492f4c169 100644 --- a/llvm/test/Transforms/CodeGenPrepare/AArch64/combine-address-mode.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/combine-address-mode.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -codegenprepare -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s +; RUN: opt -S -passes='require,function(codegenprepare)' -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s @_MergedGlobals = external dso_local global <{ i32, i32 }>, align 4 diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/free-zext.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/free-zext.ll index adb13f1a4c9fc..de7eeada6024a 100644 --- 
a/llvm/test/Transforms/CodeGenPrepare/AArch64/free-zext.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/free-zext.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -codegenprepare -mtriple=aarch64-linux %s | FileCheck -enable-var-scope %s +; RUN: opt -S -passes='require,function(codegenprepare)' -mtriple=aarch64-linux %s | FileCheck -enable-var-scope %s ; Test for CodeGenPrepare::optimizeLoadExt(): simple case: two loads ; feeding a phi that zext's each loaded value. diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/gather-scatter-opt-inseltpoison.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/gather-scatter-opt-inseltpoison.ll index 0114d7f9f4091..469d818af28f0 100644 --- a/llvm/test/Transforms/CodeGenPrepare/AArch64/gather-scatter-opt-inseltpoison.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/gather-scatter-opt-inseltpoison.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -codegenprepare < %s | FileCheck %s +; RUN: opt -S -passes='require,function(codegenprepare)' < %s | FileCheck %s target triple = "aarch64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/gather-scatter-opt.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/gather-scatter-opt.ll index e4c5b4ceee542..6444f6adcdcca 100644 --- a/llvm/test/Transforms/CodeGenPrepare/AArch64/gather-scatter-opt.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/gather-scatter-opt.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -codegenprepare < %s | FileCheck %s +; RUN: opt -S -passes='require,function(codegenprepare)' < %s | FileCheck %s target triple = "aarch64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/overflow-intrinsics.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/overflow-intrinsics.ll index 4caf6d0dc8931..f72679f55e114 100644 --- a/llvm/test/Transforms/CodeGenPrepare/AArch64/overflow-intrinsics.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/overflow-intrinsics.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -codegenprepare -S < %s | FileCheck %s -; RUN: opt -enable-debugify -codegenprepare -S < %s 2>&1 | FileCheck %s -check-prefix=DEBUG +; RUN: opt -passes='require,function(codegenprepare)' -S < %s | FileCheck %s +; RUN: opt -enable-debugify -passes='require,function(codegenprepare)' -S < %s 2>&1 | FileCheck %s -check-prefix=DEBUG ; Subset of tests from llvm/tests/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll ; to test shouldFormOverflowOp on SPARC, where it is not profitable to create @@ -167,5 +167,5 @@ define i1 @usubo_ult_i64_math_overflow_used(i64 %x, i64 %y, ptr %p) { ret i1 %ov } -; Check that every instruction inserted by -codegenprepare has a debug location. +; Check that every instruction inserted by -passes='require,function(codegenprepare)' has a debug location. 
; DEBUG: CheckModuleDebugify: PASS diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-gather-scatter-addressing.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-gather-scatter-addressing.ll index 73322836d1b84..f170c8ff18c1f 100644 --- a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-gather-scatter-addressing.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-gather-scatter-addressing.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt -S --codegenprepare < %s | FileCheck %s +; RUN: opt -S --passes='require,function(codegenprepare)' < %s | FileCheck %s target triple = "aarch64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/trunc-weird-user.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/trunc-weird-user.ll index 6a8b5733889e4..fa53d536fa389 100644 --- a/llvm/test/Transforms/CodeGenPrepare/AArch64/trunc-weird-user.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/trunc-weird-user.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -codegenprepare -mtriple=arm64-apple-ios7.0 %s | FileCheck %s +; RUN: opt -S -passes='require,function(codegenprepare)' -mtriple=arm64-apple-ios7.0 %s | FileCheck %s %foo = type { i8 } diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll index 60b1e81e3dcd3..8999e8d901add 100644 --- a/llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -codegenprepare -S %s | FileCheck %s +; RUN: opt -passes='require,function(codegenprepare)' -S %s | FileCheck %s target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" target triple = "arm64-apple-ios" diff --git a/llvm/test/Transforms/CodeGenPrepare/AMDGPU/addressing-modes.ll b/llvm/test/Transforms/CodeGenPrepare/AMDGPU/addressing-modes.ll index acd40d744f717..20a5a9ededac0 100644 --- a/llvm/test/Transforms/CodeGenPrepare/AMDGPU/addressing-modes.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AMDGPU/addressing-modes.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -codegenprepare -mtriple=amdgcn--amdhsa < %s | FileCheck %s +; RUN: opt -S -passes='require,function(codegenprepare)' -mtriple=amdgcn--amdhsa < %s | FileCheck %s define amdgpu_kernel void @test_sink_as999_small_max_mubuf_offset(ptr addrspace(999) %out, ptr addrspace(999) %in) { ; CHECK-LABEL: @test_sink_as999_small_max_mubuf_offset( diff --git a/llvm/test/Transforms/CodeGenPrepare/AMDGPU/no-sink-addrspacecast.ll b/llvm/test/Transforms/CodeGenPrepare/AMDGPU/no-sink-addrspacecast.ll index 63098ab098a22..c64d5a357f3a9 100644 --- a/llvm/test/Transforms/CodeGenPrepare/AMDGPU/no-sink-addrspacecast.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AMDGPU/no-sink-addrspacecast.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown < %s | FileCheck -check-prefix=ASC -check-prefix=COMMON %s +; RUN: opt -S -passes='require,function(codegenprepare)' -mtriple=amdgcn-unknown-unknown < %s | FileCheck -check-prefix=ASC -check-prefix=COMMON %s ; COMMON-LABEL: @test_sink_ptrtoint_asc( ; ASC: addrspacecast diff --git a/llvm/test/Transforms/CodeGenPrepare/AMDGPU/sink-addrspacecast.ll b/llvm/test/Transforms/CodeGenPrepare/AMDGPU/sink-addrspacecast.ll index 7e6020629df82..77e79a902f3be 100644 --- 
a/llvm/test/Transforms/CodeGenPrepare/AMDGPU/sink-addrspacecast.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AMDGPU/sink-addrspacecast.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt -S -codegenprepare -mtriple=amdgcn--amdhsa < %s | FileCheck %s +; RUN: opt -S -passes='require,function(codegenprepare)' -mtriple=amdgcn--amdhsa < %s | FileCheck %s define i64 @no_sink_local_to_flat(i1 %pred, ptr addrspace(3) %ptr) { ; CHECK-LABEL: define i64 @no_sink_local_to_flat( diff --git a/llvm/test/Transforms/CodeGenPrepare/ARM/branch-on-zero.ll b/llvm/test/Transforms/CodeGenPrepare/ARM/branch-on-zero.ll index 996cab1d1d2c4..ff5cef7e781fe 100644 --- a/llvm/test/Transforms/CodeGenPrepare/ARM/branch-on-zero.ll +++ b/llvm/test/Transforms/CodeGenPrepare/ARM/branch-on-zero.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -codegenprepare < %s | FileCheck %s +; RUN: opt -S -passes='require,function(codegenprepare)' < %s | FileCheck %s target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv8.1m.main-none-eabi" diff --git a/llvm/test/Transforms/CodeGenPrepare/ARM/dead-gep.ll b/llvm/test/Transforms/CodeGenPrepare/ARM/dead-gep.ll index 52c06fd52b7b3..f84491933b259 100644 --- a/llvm/test/Transforms/CodeGenPrepare/ARM/dead-gep.ll +++ b/llvm/test/Transforms/CodeGenPrepare/ARM/dead-gep.ll @@ -1,4 +1,4 @@ -; RUN: opt -codegenprepare -S %s -o - | FileCheck %s +; RUN: opt -passes='require,function(codegenprepare)' -S %s -o - | FileCheck %s target triple = "thumbv7-apple-ios7.0.0" diff --git a/llvm/test/Transforms/CodeGenPrepare/ARM/memory-intrinsics.ll b/llvm/test/Transforms/CodeGenPrepare/ARM/memory-intrinsics.ll index ae76dbda4aa15..9bec9b53ed0cd 100644 --- a/llvm/test/Transforms/CodeGenPrepare/ARM/memory-intrinsics.ll +++ b/llvm/test/Transforms/CodeGenPrepare/ARM/memory-intrinsics.ll @@ -1,4 +1,4 @@ -; RUN: opt -codegenprepare -mtriple=arm7-unknown-unknown -S < %s | FileCheck %s +; RUN: opt -passes='require,function(codegenprepare)' -mtriple=arm7-unknown-unknown -S < %s | FileCheck %s declare void @llvm.memcpy.p0.p0.i32(ptr, ptr, i32, i1) nounwind declare void @llvm.memmove.p0.p0.i32(ptr, ptr, i32, i1) nounwind diff --git a/llvm/test/Transforms/CodeGenPrepare/ARM/overflow-intrinsics.ll b/llvm/test/Transforms/CodeGenPrepare/ARM/overflow-intrinsics.ll index 3fbc21331410d..2f32ef94f82cd 100644 --- a/llvm/test/Transforms/CodeGenPrepare/ARM/overflow-intrinsics.ll +++ b/llvm/test/Transforms/CodeGenPrepare/ARM/overflow-intrinsics.ll @@ -1,4 +1,4 @@ -; RUN: opt -codegenprepare -S < %s | FileCheck %s +; RUN: opt -passes='require,function(codegenprepare)' -S < %s | FileCheck %s target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv8m.main-arm-none-eabi" diff --git a/llvm/test/Transforms/CodeGenPrepare/ARM/sink-addrmode.ll b/llvm/test/Transforms/CodeGenPrepare/ARM/sink-addrmode.ll index 838486aa24860..49cffe286a545 100644 --- a/llvm/test/Transforms/CodeGenPrepare/ARM/sink-addrmode.ll +++ b/llvm/test/Transforms/CodeGenPrepare/ARM/sink-addrmode.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -codegenprepare -mtriple=thumbv7m -disable-complex-addr-modes=false -addr-sink-new-select=true -addr-sink-new-phis=true < %s | FileCheck %s +; RUN: opt -S -passes='require,function(codegenprepare)' -mtriple=thumbv7m -disable-complex-addr-modes=false -addr-sink-new-select=true -addr-sink-new-phis=true < %s | FileCheck %s target datalayout = 
"e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" diff --git a/llvm/test/Transforms/CodeGenPrepare/ARM/splitgep.ll b/llvm/test/Transforms/CodeGenPrepare/ARM/splitgep.ll index cd2087d941497..69c8b92a3558b 100644 --- a/llvm/test/Transforms/CodeGenPrepare/ARM/splitgep.ll +++ b/llvm/test/Transforms/CodeGenPrepare/ARM/splitgep.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -codegenprepare %s | FileCheck %s +; RUN: opt -S -passes='require,function(codegenprepare)' %s | FileCheck %s target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv6m-arm-none-eabi" diff --git a/llvm/test/Transforms/CodeGenPrepare/ARM/tailcall-dup.ll b/llvm/test/Transforms/CodeGenPrepare/ARM/tailcall-dup.ll index 76b119fe36aa6..3f113e6ea1632 100644 --- a/llvm/test/Transforms/CodeGenPrepare/ARM/tailcall-dup.ll +++ b/llvm/test/Transforms/CodeGenPrepare/ARM/tailcall-dup.ll @@ -1,4 +1,4 @@ -; RUN: opt -codegenprepare -S < %s | FileCheck %s +; RUN: opt -passes='require,function(codegenprepare)' -S < %s | FileCheck %s target triple = "armv8m.main-none-eabi" diff --git a/llvm/test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div-constant-numerator.ll b/llvm/test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div-constant-numerator.ll index 94adf1970938e..b8a45e847afc8 100644 --- a/llvm/test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div-constant-numerator.ll +++ b/llvm/test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div-constant-numerator.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -codegenprepare < %s | FileCheck %s +; RUN: opt -S -passes='require,function(codegenprepare)' < %s | FileCheck %s target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div-not-exact.ll b/llvm/test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div-not-exact.ll index c571da4411e76..d46aaf51a59c1 100644 --- a/llvm/test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div-not-exact.ll +++ b/llvm/test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div-not-exact.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -codegenprepare < %s | FileCheck %s +; RUN: opt -S -passes='require,function(codegenprepare)' < %s | FileCheck %s target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div-special-cases.ll b/llvm/test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div-special-cases.ll index 21e47c614ad05..9ec533c2e6531 100644 --- a/llvm/test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div-special-cases.ll +++ b/llvm/test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div-special-cases.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -codegenprepare < %s | FileCheck %s +; RUN: opt -S -passes='require,function(codegenprepare)' < %s | FileCheck %s target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div.ll b/llvm/test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div.ll index 424f7c3b02715..463fa0d487a7c 100644 --- a/llvm/test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div.ll +++ b/llvm/test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -codegenprepare < %s | FileCheck %s +; RUN: opt -S -passes='require,function(codegenprepare)' < %s | FileCheck %s target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" diff --git 
a/llvm/test/Transforms/CodeGenPrepare/NVPTX/dont-introduce-addrspacecast.ll b/llvm/test/Transforms/CodeGenPrepare/NVPTX/dont-introduce-addrspacecast.ll index 17b0dbf81ac20..af9b5ccd72874 100644 --- a/llvm/test/Transforms/CodeGenPrepare/NVPTX/dont-introduce-addrspacecast.ll +++ b/llvm/test/Transforms/CodeGenPrepare/NVPTX/dont-introduce-addrspacecast.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -codegenprepare < %s | FileCheck %s +; RUN: opt -S -passes='require,function(codegenprepare)' < %s | FileCheck %s target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/Transforms/CodeGenPrepare/NVPTX/dont-sink-nop-addrspacecast.ll b/llvm/test/Transforms/CodeGenPrepare/NVPTX/dont-sink-nop-addrspacecast.ll index 374f30dba5087..7ee7f0550318e 100644 --- a/llvm/test/Transforms/CodeGenPrepare/NVPTX/dont-sink-nop-addrspacecast.ll +++ b/llvm/test/Transforms/CodeGenPrepare/NVPTX/dont-sink-nop-addrspacecast.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -codegenprepare < %s | FileCheck %s +; RUN: opt -S -passes='require,function(codegenprepare)' < %s | FileCheck %s target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/Transforms/CodeGenPrepare/PowerPC/split-store-alignment.ll b/llvm/test/Transforms/CodeGenPrepare/PowerPC/split-store-alignment.ll index 79aebb9c247ad..65177d5ae3d7d 100644 --- a/llvm/test/Transforms/CodeGenPrepare/PowerPC/split-store-alignment.ll +++ b/llvm/test/Transforms/CodeGenPrepare/PowerPC/split-store-alignment.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -codegenprepare -mtriple=powerpc64-unknown-linux-gnu -data-layout="E-m:e-i64:64-n32:64" -force-split-store < %s | FileCheck --check-prefix=BE %s -; RUN: opt -S -codegenprepare -mtriple=powerpc64le-unknown-linux-gnu -data-layout="e-m:e-i64:64-n32:64" -force-split-store < %s | FileCheck --check-prefix=LE %s +; RUN: opt -S -passes='require,function(codegenprepare)' -mtriple=powerpc64-unknown-linux-gnu -data-layout="E-m:e-i64:64-n32:64" -force-split-store < %s | FileCheck --check-prefix=BE %s +; RUN: opt -S -passes='require,function(codegenprepare)' -mtriple=powerpc64le-unknown-linux-gnu -data-layout="e-m:e-i64:64-n32:64" -force-split-store < %s | FileCheck --check-prefix=LE %s define void @split_store_align1(float %x, ptr %p) { ; BE-LABEL: @split_store_align1( diff --git a/llvm/test/Transforms/CodeGenPrepare/RISCV/and-mask-sink.ll b/llvm/test/Transforms/CodeGenPrepare/RISCV/and-mask-sink.ll index 130401d781718..863b0b4ad26f6 100644 --- a/llvm/test/Transforms/CodeGenPrepare/RISCV/and-mask-sink.ll +++ b/llvm/test/Transforms/CodeGenPrepare/RISCV/and-mask-sink.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -codegenprepare -mtriple=riscv32 %s \ +; RUN: opt -S -passes='require,function(codegenprepare)' -mtriple=riscv32 %s \ ; RUN: | FileCheck --check-prefixes=CHECK,NOZBS %s -; RUN: opt -S -codegenprepare -mtriple=riscv32 -mattr=+zbs %s \ +; RUN: opt -S -passes='require,function(codegenprepare)' -mtriple=riscv32 -mattr=+zbs %s \ ; RUN: | FileCheck --check-prefixes=CHECK,ZBS %s -; RUN: opt -S -codegenprepare -mtriple=riscv64 %s \ +; RUN: opt -S -passes='require,function(codegenprepare)' -mtriple=riscv64 %s \ ; RUN: | FileCheck --check-prefixes=CHECK,NOZBS %s -; RUN: opt -S -codegenprepare -mtriple=riscv64 -mattr=zbs %s \ +; RUN: opt -S -passes='require,function(codegenprepare)' -mtriple=riscv64 -mattr=zbs %s \ ; RUN: | FileCheck 
--check-prefixes=CHECK,ZBS %s @A = global i32 zeroinitializer diff --git a/llvm/test/Transforms/CodeGenPrepare/RISCV/cttz-ctlz.ll b/llvm/test/Transforms/CodeGenPrepare/RISCV/cttz-ctlz.ll index c70112e91ebd9..00ad32e967489 100644 --- a/llvm/test/Transforms/CodeGenPrepare/RISCV/cttz-ctlz.ll +++ b/llvm/test/Transforms/CodeGenPrepare/RISCV/cttz-ctlz.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -codegenprepare < %s | FileCheck %s +; RUN: opt -S -passes='require,function(codegenprepare)' < %s | FileCheck %s target triple = "riscv64-unknown-unknown" diff --git a/llvm/test/Transforms/CodeGenPrepare/SPARC/overflow-intrinsics.ll b/llvm/test/Transforms/CodeGenPrepare/SPARC/overflow-intrinsics.ll index 7525ae14fa352..ec60238cbf927 100644 --- a/llvm/test/Transforms/CodeGenPrepare/SPARC/overflow-intrinsics.ll +++ b/llvm/test/Transforms/CodeGenPrepare/SPARC/overflow-intrinsics.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -codegenprepare -S < %s | FileCheck %s -; RUN: opt -enable-debugify -codegenprepare -S < %s 2>&1 | FileCheck %s -check-prefix=DEBUG +; RUN: opt -passes='require,function(codegenprepare)' -S < %s | FileCheck %s +; RUN: opt -enable-debugify -passes='require,function(codegenprepare)' -S < %s 2>&1 | FileCheck %s -check-prefix=DEBUG ; Subset of tests from llvm/tests/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll ; to test shouldFormOverflowOp on SPARC, where it is not profitable to create @@ -119,5 +119,5 @@ define i1 @usubo_ult_i64_math_overflow_used(i64 %x, i64 %y, ptr %p) { ret i1 %ov } -; Check that every instruction inserted by -codegenprepare has a debug location. +; Check that every instruction inserted by -passes='require,function(codegenprepare)' has a debug location. ; DEBUG: CheckModuleDebugify: PASS diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/catchpad-phi-cast.ll b/llvm/test/Transforms/CodeGenPrepare/X86/catchpad-phi-cast.ll index 614e80e3e89c2..cb617671827e7 100644 --- a/llvm/test/Transforms/CodeGenPrepare/X86/catchpad-phi-cast.ll +++ b/llvm/test/Transforms/CodeGenPrepare/X86/catchpad-phi-cast.ll @@ -1,5 +1,5 @@ -; RUN: opt -codegenprepare -S < %s | FileCheck %s -; RUN: opt -codegenprepare -S < %s --try-experimental-debuginfo-iterators | FileCheck %s +; RUN: opt -passes='require,function(codegenprepare)' -S < %s | FileCheck %s +; RUN: opt -passes='require,function(codegenprepare)' -S < %s --try-experimental-debuginfo-iterators | FileCheck %s ; The following target lines are needed for the test to exercise what it should. ; Without these lines, CodeGenPrepare does not try to sink the bitcasts. 
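For downstream code that still schedules CodeGenPrepare through the legacy pass manager, the only API change in this patch is the name of the creation function. A minimal migration sketch follows, assuming the declaration stays in llvm/CodeGen/Passes.h and an already-configured legacy PassManager is passed in; the helper name is illustrative.

#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/LegacyPassManager.h"

// Adds CodeGenPrepare to an existing legacy pipeline.
static void addCodeGenPrepare(llvm::legacy::PassManagerBase &PM) {
  // Before this patch: PM.add(llvm::createCodeGenPreparePass());
  PM.add(llvm::createCodeGenPrepareLegacyPass());
}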
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/cgp_shuffle_crash-inseltpoison.ll b/llvm/test/Transforms/CodeGenPrepare/X86/cgp_shuffle_crash-inseltpoison.ll
index 9eede8cf361bc..bcb7edd6e91be 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/cgp_shuffle_crash-inseltpoison.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/cgp_shuffle_crash-inseltpoison.ll
@@ -1,4 +1,4 @@
-; RUN: opt -codegenprepare -S %s | FileCheck %s
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -S %s | FileCheck %s

 target triple = "x86_64-unknown-linux-gnu"

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/cgp_shuffle_crash.ll b/llvm/test/Transforms/CodeGenPrepare/X86/cgp_shuffle_crash.ll
index 7433ff74bab54..9ce830c394777 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/cgp_shuffle_crash.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/cgp_shuffle_crash.ll
@@ -1,4 +1,4 @@
-; RUN: opt -codegenprepare -S %s | FileCheck %s
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -S %s | FileCheck %s

 target triple = "x86_64-unknown-linux-gnu"

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/computedgoto.ll b/llvm/test/Transforms/CodeGenPrepare/X86/computedgoto.ll
index fbf294128163f..c1b919d31e076 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/computedgoto.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/computedgoto.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -codegenprepare -S < %s | FileCheck %s
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -S < %s | FileCheck %s

 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/cttz-ctlz.ll b/llvm/test/Transforms/CodeGenPrepare/X86/cttz-ctlz.ll
index 5f368faf46ee3..3a3a5327da8df 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/cttz-ctlz.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/cttz-ctlz.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -codegenprepare < %s | FileCheck %s --check-prefix=SLOW
-; RUN: opt -S -codegenprepare -mattr=+bmi < %s | FileCheck %s --check-prefix=FAST_TZ
-; RUN: opt -S -codegenprepare -mattr=+lzcnt < %s | FileCheck %s --check-prefix=FAST_LZ
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' < %s | FileCheck %s --check-prefix=SLOW
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mattr=+bmi < %s | FileCheck %s --check-prefix=FAST_TZ
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mattr=+lzcnt < %s | FileCheck %s --check-prefix=FAST_LZ

-; RUN: opt -S -debugify -codegenprepare < %s | FileCheck %s --check-prefix=DEBUGINFO
-; RUN: opt -S -debugify -codegenprepare --try-experimental-debuginfo-iterators < %s | FileCheck %s --check-prefix=DEBUGINFO
+; RUN: opt -S -enable-debugify -passes='require<profile-summary>,function(codegenprepare)' < %s | FileCheck %s --check-prefix=DEBUGINFO
+; RUN: opt -S -enable-debugify -passes='require<profile-summary>,function(codegenprepare)' --try-experimental-debuginfo-iterators < %s | FileCheck %s --check-prefix=DEBUGINFO

 target triple = "x86_64-unknown-unknown"
 target datalayout = "e-n32:64"

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/delete-assume-dead-code.ll b/llvm/test/Transforms/CodeGenPrepare/X86/delete-assume-dead-code.ll
index abdf54c6e5426..e1d99fd932fb7 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/delete-assume-dead-code.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/delete-assume-dead-code.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -codegenprepare -S -mtriple=x86_64-linux < %s | FileCheck %s
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -S -mtriple=x86_64-linux < %s | FileCheck %s

 define i32 @test1(ptr %d) nounwind {
 ; CHECK-LABEL: @test1(

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/extend-sink-hoist.ll b/llvm/test/Transforms/CodeGenPrepare/X86/extend-sink-hoist.ll
index a0814e0a5f20c..5349afc18d84d 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/extend-sink-hoist.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/extend-sink-hoist.ll
@@ -1,4 +1,4 @@
-; RUN: opt -codegenprepare -disable-cgp-branch-opts -S < %s | FileCheck %s
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -disable-cgp-branch-opts -S < %s | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/freeze-brcond.ll b/llvm/test/Transforms/CodeGenPrepare/X86/freeze-brcond.ll
index c37227f5fa820..e9ecfd1615d45 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/freeze-brcond.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/freeze-brcond.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -codegenprepare < %s | FileCheck %s
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' < %s | FileCheck %s

 target triple = "x86_64-unknown-linux-gnu"

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt-inseltpoison.ll b/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt-inseltpoison.ll
index 41b1ac2c05fcc..e62ba5d5a7f55 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt-inseltpoison.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt-inseltpoison.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -codegenprepare < %s | FileCheck %s
-; RUN: opt -S -codegenprepare -cgpp-huge-func=0 < %s | FileCheck %s
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' < %s | FileCheck %s
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -cgpp-huge-func=0 < %s | FileCheck %s

 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt.ll b/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt.ll
index 2cf98491acb95..7899477afdb2a 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -codegenprepare < %s | FileCheck %s
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' < %s | FileCheck %s

 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/gep-unmerging.ll b/llvm/test/Transforms/CodeGenPrepare/X86/gep-unmerging.ll
index d2eae6954c5d8..a2ea3f7e36a02 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/gep-unmerging.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/gep-unmerging.ll
@@ -1,4 +1,4 @@
-; RUN: opt -codegenprepare -S -mtriple=x86_64 < %s | FileCheck %s
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -S -mtriple=x86_64 < %s | FileCheck %s

 @exit_addr = constant ptr blockaddress(@gep_unmerging, %exit)
 @op1_addr = constant ptr blockaddress(@gep_unmerging, %op1)

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/invariant.group.ll b/llvm/test/Transforms/CodeGenPrepare/X86/invariant.group.ll
index 3c81a4beb8344..a0bac01d11657 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/invariant.group.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/invariant.group.ll
@@ -1,4 +1,4 @@
-; RUN: opt -codegenprepare -S -mtriple=x86_64 < %s | FileCheck %s
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -S -mtriple=x86_64 < %s | FileCheck %s

 @tmp = global i8 0

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/masked-gather-struct-gep.ll b/llvm/test/Transforms/CodeGenPrepare/X86/masked-gather-struct-gep.ll
index ea07a5fe9bc5b..dbd5e87f2c28d 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/masked-gather-struct-gep.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/masked-gather-struct-gep.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
-; RUN: opt -S -codegenprepare < %s | FileCheck %s
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' < %s | FileCheck %s
 ; REQUIRES: x86-registered-target

 target triple = "x86_64-pc-linux"

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/nonintegral.ll b/llvm/test/Transforms/CodeGenPrepare/X86/nonintegral.ll
index 2f42ad889b422..9d53855ada794 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/nonintegral.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/nonintegral.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S -codegenprepare < %s | FileCheck %s
-; RUN: opt -S -codegenprepare -addr-sink-using-gep=false < %s | FileCheck %s
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' < %s | FileCheck %s
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -addr-sink-using-gep=false < %s | FileCheck %s

 ; This target data layout is modified to have a non-integral addrspace(1),
 ; in order to verify that codegenprepare does not try to introduce illegal

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/optimizeSelect-DT.ll b/llvm/test/Transforms/CodeGenPrepare/X86/optimizeSelect-DT.ll
index be651d7eb0044..aaf3df0934681 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/optimizeSelect-DT.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/optimizeSelect-DT.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -codegenprepare < %s | FileCheck %s
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' < %s | FileCheck %s

 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll b/llvm/test/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll
index a324f6f44e5c3..653f346356488 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -codegenprepare -S < %s | FileCheck %s
-; RUN: opt -enable-debugify -codegenprepare -S < %s 2>&1 | FileCheck %s -check-prefix=DEBUG
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -S < %s | FileCheck %s
+; RUN: opt -enable-debugify -passes='require<profile-summary>,function(codegenprepare)' -S < %s 2>&1 | FileCheck %s -check-prefix=DEBUG

 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10.0.0"

@@ -636,6 +636,6 @@ exit:
   ret void
 }

-; Check that every instruction inserted by -codegenprepare has a debug location.
+; Check that every instruction inserted by -passes='require<profile-summary>,function(codegenprepare)' has a debug location.
 ; DEBUG: CheckModuleDebugify: PASS

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/pr27536.ll b/llvm/test/Transforms/CodeGenPrepare/X86/pr27536.ll
index 3ef27c7c950fb..51fba2229f3c1 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/pr27536.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/pr27536.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -codegenprepare < %s | FileCheck %s
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' < %s | FileCheck %s
 target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-pc-windows-msvc"

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/pr35658.ll b/llvm/test/Transforms/CodeGenPrepare/X86/pr35658.ll
index eec9475a1c48e..e9d0806235c9f 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/pr35658.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/pr35658.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -codegenprepare -disable-complex-addr-modes=false -addr-sink-new-phis=true -addr-sink-new-select=true %s | FileCheck %s
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -disable-complex-addr-modes=false -addr-sink-new-phis=true -addr-sink-new-select=true %s | FileCheck %s
 target triple = "x86_64-unknown-linux-gnu"
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/pr72046.ll b/llvm/test/Transforms/CodeGenPrepare/X86/pr72046.ll
index d75e5632ebb2e..b6296871f5754 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/pr72046.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/pr72046.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; RUN: opt -S -codegenprepare -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mtriple=x86_64-unknown-unknown < %s | FileCheck %s

 ; Make sure the nneg flag is dropped when lshr and zext are interchanged.

 define i8 @get(ptr %box, i32 %in) {

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/recursively-delete-dead-instructions.ll b/llvm/test/Transforms/CodeGenPrepare/X86/recursively-delete-dead-instructions.ll
index 0366b7d7e6d2e..eff88bba3773b 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/recursively-delete-dead-instructions.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/recursively-delete-dead-instructions.ll
@@ -1,4 +1,4 @@
-; RUN: opt -codegenprepare -S -mtriple=x86_64-linux < %s | FileCheck %s
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -S -mtriple=x86_64-linux < %s | FileCheck %s

 declare void @llvm.assume(i1 noundef) nounwind willreturn

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/remove-assume-block.ll b/llvm/test/Transforms/CodeGenPrepare/X86/remove-assume-block.ll
index 1d5e6ea0978ad..6b7a122b3e261 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/remove-assume-block.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/remove-assume-block.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -codegenprepare -mtriple=x86_64-linux < %s | FileCheck %s
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mtriple=x86_64-linux < %s | FileCheck %s
 ;
 ; Ensure that blocks that only contain @llvm.assume are removed completely
 ; during CodeGenPrepare.

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/select.ll b/llvm/test/Transforms/CodeGenPrepare/X86/select.ll
index a0f34a882d306..08dd77e9e4c3d 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/select.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/select.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -codegenprepare -S < %s | FileCheck %s
-; RUN: opt -debugify -codegenprepare -S < %s | FileCheck %s -check-prefix=DEBUG
-; RUN: opt -debugify -codegenprepare -S < %s --try-experimental-debuginfo-iterators | FileCheck %s -check-prefix=DEBUG
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -S < %s | FileCheck %s
+; RUN: opt -enable-debugify -passes='require<profile-summary>,function(codegenprepare)' -S < %s | FileCheck %s -check-prefix=DEBUG
+; RUN: opt -enable-debugify -passes='require<profile-summary>,function(codegenprepare)' -S < %s --try-experimental-debuginfo-iterators | FileCheck %s -check-prefix=DEBUG

 target triple = "x86_64-unknown-unknown"

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-base.ll b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-base.ll
index 45ddbe76c8f1e..08e822f7e2112 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-base.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-base.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S -codegenprepare -disable-complex-addr-modes=false -addr-sink-new-phis=true -addr-sink-new-select=true -disable-cgp-delete-phis %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-YES
-; RUN: opt -S -codegenprepare -disable-complex-addr-modes=false -addr-sink-new-phis=false -addr-sink-new-select=true -disable-cgp-delete-phis %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NO
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -disable-complex-addr-modes=false -addr-sink-new-phis=true -addr-sink-new-select=true -disable-cgp-delete-phis %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-YES
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -disable-complex-addr-modes=false -addr-sink-new-phis=false -addr-sink-new-select=true -disable-cgp-delete-phis %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NO

 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-unknown-linux-gnu"

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-inseltpoison.ll b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-inseltpoison.ll
index d5e69b9d802ef..7660ee47fdbd6 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-inseltpoison.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-inseltpoison.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -codegenprepare < %s | FileCheck %s
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' < %s | FileCheck %s

 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-select.ll b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-select.ll
index 336421e4c5008..076915028aef6 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-select.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-select.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -codegenprepare -disable-complex-addr-modes=false -addr-sink-new-select=true %s | FileCheck %s --check-prefix=CHECK
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -disable-complex-addr-modes=false -addr-sink-new-select=true %s | FileCheck %s --check-prefix=CHECK
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-unknown-linux-gnu"

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-two-phi.ll b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-two-phi.ll
index 611ef908d706d..6a6b029b67b51 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-two-phi.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-two-phi.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -codegenprepare -disable-complex-addr-modes=false -disable-cgp-delete-phis %s | FileCheck %s --check-prefix=CHECK
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -disable-complex-addr-modes=false -disable-cgp-delete-phis %s | FileCheck %s --check-prefix=CHECK
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-unknown-linux-gnu"

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll
index 97b11a2e1f1c9..f75af606eff05 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -codegenprepare < %s | FileCheck %s
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' < %s | FileCheck %s

 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrspacecast.ll b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrspacecast.ll
index f2e82212d0fac..a760f56b151fa 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrspacecast.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrspacecast.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -codegenprepare < %s | FileCheck %s -check-prefix=CHECK -check-prefix=GEP
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' < %s | FileCheck %s -check-prefix=CHECK -check-prefix=GEP

 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/split-indirect-loop.ll b/llvm/test/Transforms/CodeGenPrepare/X86/split-indirect-loop.ll
index c5d18ff503099..7f7f80910b487 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/split-indirect-loop.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/split-indirect-loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt -codegenprepare -S -mtriple=x86_64 < %s | FileCheck %s
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -S -mtriple=x86_64 < %s | FileCheck %s

 ; Test that an invalid CFG is not created by splitIndirectCriticalEdges
 ; transformation when the 'target' block is a loop to itself.

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/split-store-alignment.ll b/llvm/test/Transforms/CodeGenPrepare/X86/split-store-alignment.ll
index 3bced480f31ac..0335da94ea504 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/split-store-alignment.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/split-store-alignment.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -codegenprepare -mtriple=x86_64-unknown-unknown -force-split-store -S < %s | FileCheck %s
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -mtriple=x86_64-unknown-unknown -force-split-store -S < %s | FileCheck %s

 target datalayout = "e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:32-n8:16:32-a:0:32-S32"
 target triple = "i686-w64-windows-gnu"

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/statepoint-relocate.ll b/llvm/test/Transforms/CodeGenPrepare/X86/statepoint-relocate.ll
index a8a6f7baf9b48..babaa08a959b3 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/statepoint-relocate.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/statepoint-relocate.ll
@@ -1,4 +1,4 @@
-; RUN: opt -codegenprepare -S < %s | FileCheck %s
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -S < %s | FileCheck %s

 target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-pc-linux-gnu"

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/tailcall-assume-xbb.ll b/llvm/test/Transforms/CodeGenPrepare/X86/tailcall-assume-xbb.ll
index 9dc88a100daad..dd47d5eb6cc46 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/tailcall-assume-xbb.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/tailcall-assume-xbb.ll
@@ -1,4 +1,4 @@
-; RUN: opt -codegenprepare -S -mtriple=x86_64-linux < %s | FileCheck %s
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -S -mtriple=x86_64-linux < %s | FileCheck %s

 ; The ret instruction can be duplicated into BB case2 even though there is an
 ; intermediate BB exit1 and call to llvm.assume.
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/vec-shift-inseltpoison.ll b/llvm/test/Transforms/CodeGenPrepare/X86/vec-shift-inseltpoison.ll
index 557974fcfe54f..db7d960899ca6 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/vec-shift-inseltpoison.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/vec-shift-inseltpoison.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -codegenprepare -mtriple=x86_64-- -mattr=+avx -S < %s | FileCheck %s --check-prefixes=AVX1
-; RUN: opt -codegenprepare -mtriple=x86_64-- -mattr=+avx2 -S < %s | FileCheck %s --check-prefixes=AVX2
-; RUN: opt -codegenprepare -mtriple=x86_64-- -mattr=+avx512bw -S < %s | FileCheck %s --check-prefixes=AVX512BW
-; RUN: opt -codegenprepare -mtriple=x86_64-- -mattr=+avx,+xop -S < %s | FileCheck %s --check-prefixes=XOP
-; RUN: opt -codegenprepare -mtriple=x86_64-- -mattr=+avx2,+xop -S < %s | FileCheck %s --check-prefixes=XOP
-; RUN: opt -codegenprepare -mtriple=x86_64-- -mattr=+avx -S -enable-debugify < %s 2>&1 | FileCheck %s -check-prefix=DEBUG
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -mtriple=x86_64-- -mattr=+avx -S < %s | FileCheck %s --check-prefixes=AVX1
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -mtriple=x86_64-- -mattr=+avx2 -S < %s | FileCheck %s --check-prefixes=AVX2
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -mtriple=x86_64-- -mattr=+avx512bw -S < %s | FileCheck %s --check-prefixes=AVX512BW
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -mtriple=x86_64-- -mattr=+avx,+xop -S < %s | FileCheck %s --check-prefixes=XOP
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -mtriple=x86_64-- -mattr=+avx2,+xop -S < %s | FileCheck %s --check-prefixes=XOP
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -mtriple=x86_64-- -mattr=+avx -S -enable-debugify < %s 2>&1 | FileCheck %s -check-prefix=DEBUG

 define <4 x i32> @vector_variable_shift_right_v4i32(<4 x i1> %cond, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; AVX1-LABEL: @vector_variable_shift_right_v4i32(
@@ -409,5 +409,5 @@ exit:

 declare <8 x i32> @llvm.fshl.v8i32(<8 x i32>, <8 x i32>, <8 x i32>) #1

-; Check that every instruction inserted by -codegenprepare has a debug location.
+; Check that every instruction inserted by -passes='require<profile-summary>,function(codegenprepare)' has a debug location.
 ; DEBUG: CheckModuleDebugify: PASS

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/vec-shift.ll b/llvm/test/Transforms/CodeGenPrepare/X86/vec-shift.ll
index 482e822ea3d80..e0f04f77efa09 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/vec-shift.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/vec-shift.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -codegenprepare -mtriple=x86_64-- -mattr=+avx -S < %s | FileCheck %s --check-prefixes=AVX1
-; RUN: opt -codegenprepare -mtriple=x86_64-- -mattr=+avx2 -S < %s | FileCheck %s --check-prefixes=AVX2
-; RUN: opt -codegenprepare -mtriple=x86_64-- -mattr=+avx512bw -S < %s | FileCheck %s --check-prefixes=AVX512BW
-; RUN: opt -codegenprepare -mtriple=x86_64-- -mattr=+avx,+xop -S < %s | FileCheck %s --check-prefixes=XOP
-; RUN: opt -codegenprepare -mtriple=x86_64-- -mattr=+avx2,+xop -S < %s | FileCheck %s --check-prefixes=XOP
-; RUN: opt -codegenprepare -mtriple=x86_64-- -mattr=+avx -S -enable-debugify < %s 2>&1 | FileCheck %s -check-prefix=DEBUG
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -mtriple=x86_64-- -mattr=+avx -S < %s | FileCheck %s --check-prefixes=AVX1
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -mtriple=x86_64-- -mattr=+avx2 -S < %s | FileCheck %s --check-prefixes=AVX2
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -mtriple=x86_64-- -mattr=+avx512bw -S < %s | FileCheck %s --check-prefixes=AVX512BW
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -mtriple=x86_64-- -mattr=+avx,+xop -S < %s | FileCheck %s --check-prefixes=XOP
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -mtriple=x86_64-- -mattr=+avx2,+xop -S < %s | FileCheck %s --check-prefixes=XOP
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -mtriple=x86_64-- -mattr=+avx -S -enable-debugify < %s 2>&1 | FileCheck %s -check-prefix=DEBUG

 define <4 x i32> @vector_variable_shift_right_v4i32(<4 x i1> %cond, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; AVX1-LABEL: @vector_variable_shift_right_v4i32(
@@ -409,5 +409,5 @@ exit:

 declare <8 x i32> @llvm.fshl.v8i32(<8 x i32>, <8 x i32>, <8 x i32>) #1

-; Check that every instruction inserted by -codegenprepare has a debug location.
+; Check that every instruction inserted by -passes='require<profile-summary>,function(codegenprepare)' has a debug location.
 ; DEBUG: CheckModuleDebugify: PASS

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/widenable-condition.ll b/llvm/test/Transforms/CodeGenPrepare/X86/widenable-condition.ll
index b26876e0e1e26..12230ec689cf9 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/widenable-condition.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/widenable-condition.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -codegenprepare -S -mtriple=x86_64 < %s | FileCheck %s
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -S -mtriple=x86_64 < %s | FileCheck %s

 ; Check the idiomatic guard pattern to ensure it's lowered correctly.
 define void @test_guard(i1 %cond_0) {

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/x86-shuffle-sink-inseltpoison.ll b/llvm/test/Transforms/CodeGenPrepare/X86/x86-shuffle-sink-inseltpoison.ll
index 72d1672eb4f7d..ce1b6bd5ae63d 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/x86-shuffle-sink-inseltpoison.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/x86-shuffle-sink-inseltpoison.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -codegenprepare -mcpu=corei7 %s | FileCheck %s --check-prefixes=CHECK,CHECK-SSE2
-; RUN: opt -S -codegenprepare -mcpu=bdver2 %s | FileCheck %s --check-prefixes=CHECK,CHECK-XOP
-; RUN: opt -S -codegenprepare -mcpu=core-avx2 %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX2
-; RUN: opt -S -codegenprepare -mcpu=skylake-avx512 %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX512BW
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mcpu=corei7 %s | FileCheck %s --check-prefixes=CHECK,CHECK-SSE2
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mcpu=bdver2 %s | FileCheck %s --check-prefixes=CHECK,CHECK-XOP
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mcpu=core-avx2 %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX2
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mcpu=skylake-avx512 %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX512BW

 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-darwin10.9.0"

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/x86-shuffle-sink.ll b/llvm/test/Transforms/CodeGenPrepare/X86/x86-shuffle-sink.ll
index c14918a6956f1..9e82844dfc2fa 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/x86-shuffle-sink.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/x86-shuffle-sink.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -codegenprepare -mcpu=corei7 %s | FileCheck %s --check-prefixes=CHECK,CHECK-SSE2
-; RUN: opt -S -codegenprepare -mcpu=bdver2 %s | FileCheck %s --check-prefixes=CHECK,CHECK-XOP
-; RUN: opt -S -codegenprepare -mcpu=core-avx2 %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX2
-; RUN: opt -S -codegenprepare -mcpu=skylake-avx512 %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX512BW
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mcpu=corei7 %s | FileCheck %s --check-prefixes=CHECK,CHECK-SSE2
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mcpu=bdver2 %s | FileCheck %s --check-prefixes=CHECK,CHECK-XOP
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mcpu=core-avx2 %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX2
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mcpu=skylake-avx512 %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX512BW

 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-darwin10.9.0"

diff --git a/llvm/test/Transforms/CodeGenPrepare/dead-allocation.ll b/llvm/test/Transforms/CodeGenPrepare/dead-allocation.ll
index 637040a0d56d2..9550e748da6d1 100644
--- a/llvm/test/Transforms/CodeGenPrepare/dead-allocation.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/dead-allocation.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; Eliminate the dead allocation instruction
 ; REQUIRES: arm-registered-target
-; RUN: opt -codegenprepare < %s -S | FileCheck %s
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' < %s -S | FileCheck %s

 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "armv7--linux-gnueabihf"

diff --git a/llvm/test/Transforms/CodeGenPrepare/skip-merging-case-block.ll b/llvm/test/Transforms/CodeGenPrepare/skip-merging-case-block.ll
index 608ad4c0a32fa..d25b9c91aff60 100644
--- a/llvm/test/Transforms/CodeGenPrepare/skip-merging-case-block.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/skip-merging-case-block.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: aarch64-registered-target
-; RUN: opt -codegenprepare < %s -mtriple=aarch64-none-linux-gnu -S | FileCheck %s
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' < %s -mtriple=aarch64-none-linux-gnu -S | FileCheck %s

 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"

diff --git a/llvm/test/Transforms/HotColdSplit/coldentrycount.ll b/llvm/test/Transforms/HotColdSplit/coldentrycount.ll
index 1a113ff161883..6e5ef1aa25392 100644
--- a/llvm/test/Transforms/HotColdSplit/coldentrycount.ll
+++ b/llvm/test/Transforms/HotColdSplit/coldentrycount.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: x86-registered-target
-; RUN: opt -passes=hotcoldsplit -hotcoldsplit-threshold=0 < %s | opt -codegenprepare -S | FileCheck %s
+; RUN: opt -passes=hotcoldsplit -hotcoldsplit-threshold=0 < %s | opt -passes='require<profile-summary>,function(codegenprepare)' -S | FileCheck %s

 ; Test to ensure that split cold function gets 0 entry count profile
 ; metadata when compiling with pgo.

diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll
index ff8a804beeb51..a56efe8dd3f35 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll
@@ -1,4 +1,4 @@
-; RUN: opt -codegenprepare -load-store-vectorizer %s -S -o - | FileCheck %s
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -passes=load-store-vectorizer %s -S -o - | FileCheck %s
 ; RUN: opt -passes=load-store-vectorizer %s -S -o - | FileCheck %s
 ; RUN: opt -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' %s -S -o - | FileCheck %s

diff --git a/llvm/test/Transforms/SampleProfile/section-accurate-samplepgo.ll b/llvm/test/Transforms/SampleProfile/section-accurate-samplepgo.ll
index a404220056c8e..ef2ddbc33cee4 100644
--- a/llvm/test/Transforms/SampleProfile/section-accurate-samplepgo.ll
+++ b/llvm/test/Transforms/SampleProfile/section-accurate-samplepgo.ll
@@ -1,7 +1,7 @@
 ; REQUIRES: x86-registered-target
-; RUN: opt -S %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline.prof | opt -S -codegenprepare | FileCheck %s
-; RUN: opt -S %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline.prof | opt -S -codegenprepare -profile-unknown-in-special-section -partial-profile | FileCheck %s --check-prefix=UNKNOWN
-; RUN: opt -S %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline.prof -profile-sample-accurate -S | opt -S -codegenprepare | FileCheck %s --check-prefix=ACCURATE
+; RUN: opt -S %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline.prof | opt -S -passes='require<profile-summary>,function(codegenprepare)' | FileCheck %s
+; RUN: opt -S %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline.prof | opt -S -passes='require<profile-summary>,function(codegenprepare)' -profile-unknown-in-special-section -partial-profile | FileCheck %s --check-prefix=UNKNOWN
+; RUN: opt -S %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline.prof -profile-sample-accurate -S | opt -S -passes='require<profile-summary>,function(codegenprepare)' | FileCheck %s --check-prefix=ACCURATE

 target triple = "x86_64-pc-linux-gnu"

diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp
index b6068513d2306..c649e6ecddc08 100644
--- a/llvm/tools/opt/opt.cpp
+++ b/llvm/tools/opt/opt.cpp
@@ -426,7 +426,7 @@ int main(int argc, char **argv) {
   initializeScalarizeMaskedMemIntrinLegacyPassPass(Registry);
   initializeSelectOptimizePass(Registry);
   initializeCallBrPreparePass(Registry);
-  initializeCodeGenPreparePass(Registry);
+  initializeCodeGenPrepareLegacyPassPass(Registry);
   initializeAtomicExpandPass(Registry);
   initializeWinEHPreparePass(Registry);
   initializeDwarfEHPrepareLegacyPassPass(Registry);